<a href="https://colab.research.google.com/github/Chandan1905/Encryptix/blob/Project-3/Encryptix_Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read while downloading.
CHUNK_SIZE = 40960
# Comma-separated list of '<directory>:<url-encoded download URL>' entries; each
# archive is extracted into <KAGGLE_INPUT_PATH>/<directory>.
# NOTE: the URL is a signed link (X-Goog-Date=20240526T034439Z,
# X-Goog-Expires=259200 seconds), so the download fails once the signature has
# expired and the mapping has to be regenerated.
DATA_SOURCE_MAPPING = 'fraud-detection:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F817870%2F1399887%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240526%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240526T034439Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D681d43afb315e3121f5d34fe2990dafb5d1922541d2f9be80e84a327e19b3ea439907c9f94026f7809eb4616439d2d14ef32c152321b1fb09022de2f269c155a11632f06ef7fed5bbd8116e10a02e63d469e756c66d9dc326d68db492f2cff2eb3479aba10a6a0edcbbb9e7ce70759f2323485a5318ebba09b57216fbad5f8703a0e0c9badf33882eba546a319ddbc555d9edba368059821eff880e7d23fdc4e84f006b1d7266790a046607a94ec840f4f205a2816bc220f8ad48de0a30cf6637a123c921d69e4c10d8e217304220ac933c1a4ffcfbc8198c4b1e73d5ebed72c7cad8ad1c4e54cf369c2a18879bc19c9ffbfabc61631f1f55a5c3dfa3467184f'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every '<directory>:<url-encoded URL>' entry of DATA_SOURCE_MAPPING
# into a temporary file and extract it into KAGGLE_INPUT_PATH/<directory>.
# A failed download is reported and skipped so the remaining sources still load.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray ':' later in the entry cannot
    # break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header can be missing; fall back to 0 so the progress bar
            # stays empty instead of int(None) raising TypeError.
            expected_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            chunk = fileres.read(CHUNK_SIZE)
            while chunk:
                dl += len(chunk)
                tfile.write(chunk)
                if expected_bytes > 0:
                    done = min(50, int(50 * dl / expected_bytes))
                else:
                    done = 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                chunk = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to the file and rewind before the archive
            # readers start parsing it.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Read through the already-open handle and bind the archive to
                # its own name so the `tarfile` module is not overwritten.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Read the training CSV into a DataFrame and preview its first five rows.
train_data = pd.read_csv(filepath_or_buffer='/kaggle/input/fraud-detection/fraudTrain.csv')
train_data.head(n=5)

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the training frame.
train_data.info()

In [None]:
# Build the working frame by removing the listed columns from the training
# data, then print a summary of what is left.
columns_to_drop = ['Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'trans_num']
data = train_data.drop(columns=columns_to_drop)
data.info()

In [None]:
# Feature Engineering: extract calendar features from trans_date_trans_time
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])
data['hour'] = data['trans_date_trans_time'].dt.hour
data['day'] = data['trans_date_trans_time'].dt.day
data['month'] = data['trans_date_trans_time'].dt.month
data['day_of_week'] = data['trans_date_trans_time'].dt.dayofweek

# Birth-date features must come from 'dob'; deriving them from the transaction
# timestamp only duplicated 'day'/'month' and added the transaction year.
data['dob'] = pd.to_datetime(data['dob'])
data['day_of_birth'] = data['dob'].dt.day
data['month_of_birth'] = data['dob'].dt.month
data['year_of_birth'] = data['dob'].dt.year


In [None]:
# The raw datetime columns have been expanded into separate features above,
# so remove both of them in a single call.
data = data.drop(columns=['trans_date_trans_time', 'dob'])

In [None]:
# Replace each categorical column with integer codes, keeping one fitted
# encoder per column in `label_encoders`.
categorical_columns = ['merchant', 'category', 'gender', 'city', 'state', 'job']
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    label_encoders[col] = le
    data[col] = le.fit_transform(data[col])
data

In [None]:
# Standardize the numerical features.
# This cell runs before `X` is created, so scale the feature columns of `data`
# directly; the later `X = data.drop('is_fraud', axis=1)` then picks up the
# scaled values. The target column is excluded so the labels stay 0/1.
feature_columns = data.columns.drop('is_fraud')
# 'number' covers every numeric dtype, including the int32 columns that newer
# pandas versions return from the `.dt` accessors.
numerical_columns = data[feature_columns].select_dtypes(include='number').columns
# Keep `scaler` and `numerical_columns` around: the same fitted transform has
# to be applied to the test features later.
scaler = StandardScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])


In [None]:
# Separate the target column from the feature matrix.
y = data['is_fraud']
X = data.drop(columns=['is_fraud'])

In [None]:
import matplotlib.pyplot as plt

# Histogram of the target column.
data['is_fraud'].plot.hist()

In [None]:
# Bar chart of every feature's correlation with the target column.
feature_correlations = X.corrwith(data['is_fraud'])
feature_correlations.plot.bar(
    figsize=(16, 9),
    title='Correlation with the variables',
    rot=45,
    grid=True,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training features; the fixed
# random_state keeps repeated runs reproducible.
logistic_model = LogisticRegression(random_state=0)
clf = logistic_model.fit(X, y)

In [None]:
# Read the test CSV into a DataFrame and print a summary of its columns.
test_data = pd.read_csv(filepath_or_buffer='/kaggle/input/fraud-detection/fraudTest.csv')
test_data.info()

In [None]:
# Apply the same column removal and calendar-feature extraction to the test
# frame as was applied to the training frame.
test = test_data.drop(['Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'trans_num'], axis=1)
test['trans_date_trans_time'] = pd.to_datetime(test['trans_date_trans_time'])
test['hour'] = test['trans_date_trans_time'].dt.hour
test['day'] = test['trans_date_trans_time'].dt.day
test['month'] = test['trans_date_trans_time'].dt.month
test['day_of_week'] = test['trans_date_trans_time'].dt.dayofweek

# Birth-date features must come from 'dob'; deriving them from the transaction
# timestamp only duplicated 'day'/'month' and added the transaction year.
test['dob'] = pd.to_datetime(test['dob'])
test['day_of_birth'] = test['dob'].dt.day
test['month_of_birth'] = test['dob'].dt.month
test['year_of_birth'] = test['dob'].dt.year

In [None]:
# The raw datetime columns have been expanded into separate features above,
# so remove both of them in a single call.
test = test.drop(columns=['trans_date_trans_time', 'dob'])

In [None]:
# Encode categorical variables with the encoders fitted on the training data.
# Fitting fresh encoders on the test frame would give the same label a
# different integer code than the one the model was trained on, and would
# overwrite the training encoders stored in `label_encoders`.
categorical_columns = ['merchant', 'category', 'gender', 'city', 'state', 'job']
for col in categorical_columns:
    le = label_encoders[col]
    # classes_ holds the training labels in code order (index == code).
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    # Labels that never occurred in training get the sentinel code -1 instead
    # of making the encoder raise.
    test[col] = test[col].map(code_by_label).fillna(-1).astype('int64')
test

In [None]:
# Split the test frame into features and target.
X_test = test.drop('is_fraud', axis=1)
y_test = test['is_fraud']

# Standardize the numerical test features with the scaler that was fitted on
# the training features (`scaler` / `numerical_columns` from the training
# standardization cell). Re-fitting on `X` here would rescale the training
# data after the model was trained and leave `X_test` unscaled.
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])


In [None]:
# Predict a label for every row of the test features with the fitted classifier.
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted label equals the true label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
acc