In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of '<directory>:<percent-encoded download URL>' entries.
# The URL is a signed link: its query string carries X-Goog-Date=20240530T053146Z
# and X-Goog-Expires=259200 (seconds), so it stops working after that window.
DATA_SOURCE_MAPPING = 'fraud-detection:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F817870%2F1399887%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240530%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240530T053146Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D040ca734753b051b23a3b38aec45a7589b475ac3f3ea0b00b7c89285230fd8e1890ef9f65835e9329be5f6dc30732963cf8fb3cda549c4f36d53925f75d82304a6861912f5625b4773d791e179025fa07347910a8813a6209a42f8accacbb1ec84aee466c31b8dedca1396b306b1dd9839e3ce5b79ba1d67bd951db307c3c7ede79c8ceea1ee9214ea373db63b73c8b6144dc7b916f9e8516b0e39b86d619574dfd6cdbbb5f047dd28eb586d6ea00f58b84d296e057a83ff350dc00144670434ad37ff25b64ff024836d98f6cb7eeee0e9d5ee24dde755091538a415e85c7e0c8f532b6e4cb6fa5a90bdb546baf8c38e91484aefd486d2d5837ac19464db3336'

# Directory under which each data source is extracted (one sub-directory per entry).
KAGGLE_INPUT_PATH='/kaggle/input'
# Scratch/output directory created alongside the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible cells.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into a temporary file and unpack it
# under KAGGLE_INPUT_PATH/<directory>. Failures are reported and skipped so the
# remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so the URL part may itself contain colons.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header is absent for some responses (e.g. chunked transfer);
            # in that case the bar stays empty and only the byte count advances.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # tarfile reopens the file by name, so buffered bytes must be on
            # disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would
                # replace the module and break the next loop iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading fraud-detection, 211766662 bytes compressed
Downloaded and uncompressed: fraud-detection
Data source import complete.


# Credit Card Fraud Detection
#### In this notebook, models are built to detect fraudulent credit card transactions from a dataset of transaction records, using Logistic Regression, Decision Tree, and K-Nearest Neighbors (KNN) classifiers.

## Installing & Importing libraries

In [None]:
# !pip install numpy
# !pip install pandas
# !pip install matplotlib
# !pip install -U scikit-learn

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.svm import SVC
import sklearn.metrics as metrics
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

## Data Loading & Framing

In [None]:
# Read the training CSV from the directory the download cell extracts into.
df = pd.read_csv("/kaggle/input/fraud-detection/fraudTrain.csv")
# Bare expression: the notebook renders the frame as this cell's output.
df

## Data Pre-Processing
### a) For Train set

In [None]:
# Number of missing values in each column of the training frame.
df.isnull().sum()

In [None]:
# Column dtypes; object columns are the ones get_dummies will expand later.
df.dtypes

In [None]:
# Remove three columns before encoding. Rebinding `df` (instead of an in-place
# drop) together with errors="ignore" keeps this cell safe to re-run: a second
# execution no longer raises KeyError for the already-removed columns.
df = df.drop(columns=["Unnamed: 0", "trans_num", "street"], errors="ignore")
df

In [None]:
# Keep only the first 20,000 rows (file order, no shuffling) as the training
# subset, then show how many rows fall into each is_fraud class.
data = df.iloc[:20000]
data["is_fraud"].value_counts()

In [None]:
# One-hot encode the training subset; get_dummies is applied to the whole
# frame, so no explicit column list is given here.
df_processed = pd.get_dummies(data=data)
df_processed

In [None]:
# Separate the target column from the encoded training features.
y_train = df_processed['is_fraud']
x_train = df_processed.drop(columns=['is_fraud'])

### b) For Test set

In [None]:
# Read the held-out test CSV from the same extracted dataset directory.
df_test = pd.read_csv("/kaggle/input/fraud-detection/fraudTest.csv")
# Bare expression: the notebook renders the frame as this cell's output.
df_test

In [None]:
# Remove the same three columns that were dropped from the training frame.
# Rebinding with errors="ignore" keeps the cell safe to re-run (an in-place
# drop raises KeyError on the second execution).
df_test = df_test.drop(columns=["Unnamed: 0", "trans_num", "street"], errors="ignore")
df_test

In [None]:
# Shuffle the test frame with a fixed seed and keep 5,000 rows.
# drop=True discards the pre-shuffle row labels; without it reset_index() adds
# them back as an `index` column, which would then be treated as a feature.
data_test = df_test.sample(frac=1, random_state=1).reset_index(drop=True)
data_test = data_test.head(n = 5000)
data_test.is_fraud.value_counts()

In [None]:
# One-hot encode the test subset. This is done independently of the training
# frame, so the resulting dummy columns are not guaranteed to match it.
df_processed_test = pd.get_dummies(data=data_test)
df_processed_test

In [None]:
# Build the evaluation set from the encoded *test* frame. Using df_processed
# here would score every model on the rows it was trained on.
# The columns are aligned to x_train: dummy columns seen only in training are
# added as all-zero, and columns seen only in the test data are dropped, so the
# fitted models receive exactly the features they were trained with.
x_test = (
    df_processed_test
    .drop(columns='is_fraud')
    .reindex(columns=x_train.columns, fill_value=0)
)
y_test = df_processed_test['is_fraud']

# Modeling
Next, several models are built and their accuracies are computed in order to select the best one.
### Logistic Regression

In [None]:
# Logistic regression classifier configured with the liblinear solver.
LR = LogisticRegression(solver='liblinear')

# Train on the encoded training features and their is_fraud labels.
LR.fit(x_train, y_train)

In [None]:
# Predicted class labels from the logistic regression model. The name
# `predictions` is overwritten by the Decision Tree and KNN cells further down.
predictions = LR.predict(x_test)

In [None]:
# Per-class probability estimates; not referenced by any later visible cell.
predict_proba = LR.predict_proba(x_test)

In [None]:
# Share of evaluation rows whose predicted label equals the true label.
LR_Accuracy_Score = accuracy_score(y_true=y_test, y_pred=predictions)
print(LR_Accuracy_Score)

### Decision Tree

In [None]:
# Decision tree classifier. A fixed random_state makes the fitted tree (and the
# metrics reported below) reproducible across kernel restarts; without it,
# splits that tie on quality can be resolved differently from run to run.
Tree = DecisionTreeClassifier(random_state=1)

# Train on the encoded training features and their is_fraud labels.
Tree.fit(x_train, y_train)

In [None]:
# Predicted class labels from the decision tree; replaces the logistic
# regression output previously stored under the same name.
predictions = Tree.predict(x_test)

In [None]:
# Score the decision tree predictions with accuracy, Jaccard index and F1.
Tree_Accuracy_Score = accuracy_score(y_true=y_test, y_pred=predictions)
Tree_JaccardIndex = jaccard_score(y_true=y_test, y_pred=predictions)
Tree_F1_Score = f1_score(y_true=y_test, y_pred=predictions)

# One metric per output line, in the order computed above.
print(Tree_Accuracy_Score, Tree_JaccardIndex, Tree_F1_Score, sep='\n')

### KNN

In [None]:
# k-nearest-neighbours classifier that votes over the 4 closest training rows.
# NOTE(review): features are passed in unscaled — confirm this is intended,
# since distance-based models are sensitive to feature magnitude.
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(x_train, y_train)

In [None]:
# Predicted class labels from KNN; replaces the decision tree output
# previously stored under the same name.
predictions = knn.predict(x_test)

In [None]:
# Score the KNN predictions with accuracy, Jaccard index and F1.
KNN_Accuracy_Score = accuracy_score(y_true=y_test, y_pred=predictions)
KNN_JaccardIndex = jaccard_score(y_true=y_test, y_pred=predictions)
KNN_F1_Score = f1_score(y_true=y_test, y_pred=predictions)

# One metric per output line, in the order computed above.
print(KNN_Accuracy_Score, KNN_JaccardIndex, KNN_F1_Score, sep='\n')