# import libraries

In [1]:
import pandas as pd
import numpy as np

import os

import requests

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

from xgboost import XGBClassifier

pd.set_option('display.max_columns', None)


# import data

In [3]:
# Remote location of the raw CSV and the local path it is cached at.
dateset_url = 'https://lead-program-assets.s3.eu-west-3.amazonaws.com/M05-Projects/fraudTest.csv'
filepath = './src/fraudTest.csv'

if not os.path.exists(filepath):
    print("downloading dataset...")
    # The target folder may not exist on a fresh checkout; open() would fail without it.
    target_dir = os.path.dirname(filepath)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    # A timeout keeps the cell from hanging forever if the server stops responding.
    response = requests.get(dateset_url, timeout=60)
    response.raise_for_status()  # Check if the request was successful
    with open(filepath, 'wb') as file:
        file.write(response.content)
    print(f"File downloaded and saved as {filepath}")
else:
    print(f'dataset file exists at :{filepath}')

# The first CSV column is used as the DataFrame index.
df = pd.read_csv(filepath, index_col=0)


downloading dataset...
File downloaded and saved as ./src/fraudTest.csv


# Basic statistics

In [58]:
# Size of the dataset and a preview of its first rows.
n_rows, n_cols = df.shape
print(f"The dataset contains {n_rows} rows and {n_cols} columns")
display(df.head())


# Summary statistics for every column, whatever its dtype.
summary = df.describe(include='all')
display(summary)

print("Type of each column :\n")
print(df.dtypes)
print("\n")

# Share of missing values per column, expressed in percent.
print("Missing value percentage :\n")
missing_pct = df.isnull().sum() / len(df) * 100
print(missing_pct)

The dataset contains 555719 rows and 29 columns


Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,trans_num,unix_time,merch_lat,merch_long,is_fraud,trans_year,trans_month,trans_day,trans_hour,trans_minutes,trans_seconds,dob_year,dob_month,dob_day
0,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0,2020,6,21,12,14,25,1968,3,19
1,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0,2020,6,21,12,14,33,1990,1,17
2,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0,2020,6,21,12,14,53,1970,10,21
3,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0,2020,6,21,12,15,15,1987,7,25
4,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0,2020,6,21,12,15,17,1955,7,6


Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,trans_num,unix_time,merch_lat,merch_long,is_fraud,trans_year,trans_month,trans_day,trans_hour,trans_minutes,trans_seconds,dob_year,dob_month,dob_day
count,555719.0,555719,555719,555719.0,555719,555719,555719,555719,555719,555719,555719.0,555719.0,555719.0,555719.0,555719,555719,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0
unique,,693,14,,341,471,2,924,849,50,,,,,478,555719,,,,,,,,,,,,,
top,,fraud_Kilback LLC,gas_transport,,Christopher,Smith,F,444 Robert Mews,Birmingham,TX,,,,,Film/video editor,2da90c7d74bd46a0caf3777415b3ebd3,,,,,,,,,,,,,
freq,,1859,56370,,11443,12146,304886,1474,2423,40393,,,,,4119,1,,,,,,,,,,,,,
mean,4.178387e+17,,,69.39281,,,,,,,48842.628015,38.543253,-90.231325,88221.89,,,1380679000.0,38.542798,-90.23138,0.00386,2020.0,9.508536,16.463904,12.809062,29.50202,29.493391,1973.363763,6.527894,15.673367
std,1.309837e+18,,,156.745941,,,,,,,26855.283328,5.061336,13.72178,300390.9,,,5201104.0,5.095829,13.733071,0.062008,0.0,1.978205,8.955311,6.810924,17.340482,17.30428,17.418528,3.399485,8.888683
min,60416210000.0,,,1.0,,,,,,,1257.0,20.0271,-165.6723,23.0,,,1371817000.0,19.027422,-166.671575,0.0,2020.0,6.0,1.0,0.0,0.0,0.0,1924.0,1.0,1.0
25%,180042900000000.0,,,9.63,,,,,,,26292.0,34.6689,-96.798,741.0,,,1376029000.0,34.755302,-96.905129,0.0,2020.0,8.0,9.0,7.0,14.0,15.0,1962.0,4.0,8.0
50%,3521417000000000.0,,,47.29,,,,,,,48174.0,39.3716,-87.4769,2408.0,,,1380762000.0,39.376593,-87.445204,0.0,2020.0,10.0,17.0,14.0,29.0,29.0,1975.0,7.0,15.0
75%,4635331000000000.0,,,83.01,,,,,,,72011.0,41.8948,-80.1752,19685.0,,,1385867000.0,41.954163,-80.264637,0.0,2020.0,12.0,24.0,19.0,45.0,44.0,1987.0,9.0,23.0


Type of each column :

cc_num             int64
merchant          object
category          object
amt              float64
first             object
last              object
gender            object
street            object
city              object
state             object
zip                int64
lat              float64
long             float64
city_pop           int64
job               object
trans_num         object
unix_time          int64
merch_lat        float64
merch_long       float64
is_fraud           int64
trans_year         int32
trans_month        int32
trans_day          int32
trans_hour         int32
trans_minutes      int32
trans_seconds      int32
dob_year           int32
dob_month          int32
dob_day            int32
dtype: object


Missing value percentage :

cc_num           0.0
merchant         0.0
category         0.0
amt              0.0
first            0.0
last             0.0
gender           0.0
street           0.0
city             0.0
state            0.

# Data cleaning

## Date & Time

In [34]:
# Parse the transaction timestamp, expand it into one column per component,
# then drop the original column.
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
trans_ts = df['trans_date_trans_time']
for trans_column, trans_component in (
    ('trans_year', 'year'),
    ('trans_month', 'month'),
    ('trans_day', 'day'),
    ('trans_hour', 'hour'),
    ('trans_minutes', 'minute'),
    ('trans_seconds', 'second'),
):
    df[trans_column] = getattr(trans_ts.dt, trans_component)
df = df.drop(columns=['trans_date_trans_time'])

In [35]:
# Parse the date of birth, expand it into year / month / day columns,
# then drop the original column.
df['dob'] = pd.to_datetime(df['dob'])
dob_ts = df['dob']
for dob_component in ('year', 'month', 'day'):
    df[f'dob_{dob_component}'] = getattr(dob_ts.dt, dob_component)
df = df.drop(columns=['dob'])

In [65]:
# Inspect the dtype of every column.
df.dtypes

cc_num             int64
merchant          object
category          object
amt              float64
first             object
last              object
gender            object
street            object
city              object
state             object
zip                int64
lat              float64
long             float64
city_pop           int64
job               object
trans_num         object
unix_time          int64
merch_lat        float64
merch_long       float64
is_fraud           int64
trans_year         int32
trans_month        int32
trans_day          int32
trans_hour         int32
trans_minutes      int32
trans_seconds      int32
dob_year           int32
dob_month          int32
dob_day            int32
dtype: object

In [96]:
def convert_to_string(dataset, columns):
    """Cast the given columns of ``dataset`` to ``str``, modifying it in place.

    Args:
        dataset: DataFrame whose columns are converted.
        columns: Iterable of column names to convert.

    Returns:
        None. One confirmation line is printed per converted column.
    """
    for name in columns:
        as_text = dataset[name].astype(str)
        dataset[name] = as_text
        print(f'column {name} converted to string')
        
        
# Text columns that are cast to str before modelling.
columns_to_convert = [
    'merchant',
    'category',
    'first',
    'last',
    'gender',
    'street',
    'city',
    'state',
    'job',
]

convert_to_string(df, columns_to_convert)

column merchant converted to string
column category converted to string
column first converted to string
column last converted to string
column gender converted to string
column street converted to string
column city converted to string
column state converted to string
column job converted to string


columns to drop for a dummy model: 

remove columns with no significance or too many categories :
- merchant
- category
- first
- last
- street
- city
- zip
- job

remove unique ID columns :
- trans_num

remove redundant information :
- unix_time


 column types to change :

 - merchant(string)
 - category(string)
 - first(string)
 - last(string)
 - gender(string)
 - street(string)
 - city(string)
 - state(string)
 - job(string)
 - dob(Date)
 - trans_num

columns to encode 

 - category
 - first
 - last
 - gender
 - city
 - state
 - job
 - 

# Model

In [107]:
# Target vector, and the feature matrix without cc_num, zip, trans_num,
# unix_time and the target itself.
y = df['is_fraud']
X = df.drop(columns=['cc_num', 'zip', 'trans_num', 'unix_time', 'is_fraud'])

In [108]:
# List the columns kept in the feature matrix.
X.columns

Index(['merchant', 'category', 'amt', 'first', 'last', 'gender', 'street',
       'city', 'state', 'lat', 'long', 'city_pop', 'job', 'merch_lat',
       'merch_long', 'trans_year', 'trans_month', 'trans_day', 'trans_hour',
       'trans_minutes', 'trans_seconds', 'dob_year', 'dob_month', 'dob_day'],
      dtype='object')

## Preprocessing

In [109]:
# Hold out 30% of the rows for testing, stratifying on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [110]:
# Defining categorical and numeric features:
# a column is numeric when its dtype is numeric but not boolean; every other
# column is treated as categorical.

numeric_features = [
    column
    for column in X.columns
    if pd.api.types.is_numeric_dtype(X[column])
    and not pd.api.types.is_bool_dtype(X[column])
]
categorical_features = [
    column for column in X.columns if column not in numeric_features
]


print("Numeric features : ", numeric_features)
print("Categorical features : ", categorical_features)

Numeric features :  ['amt', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long', 'trans_year', 'trans_month', 'trans_day', 'trans_hour', 'trans_minutes', 'trans_seconds', 'dob_year', 'dob_month', 'dob_day']
Categorical features :  ['merchant', 'category', 'first', 'last', 'gender', 'street', 'city', 'state', 'job']


In [111]:
# Numeric columns are standardised.
numeric_transformer = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns are one-hot encoded, dropping the first level of each feature.
categorical_transformer = Pipeline(
    steps=[
        # A category may be present in one split and absent from the other.
        # OneHotEncoder raises on unseen categories by default, so
        # handle_unknown='ignore' is needed to actually ignore them: they are
        # then encoded as all zeros (same encoding as the dropped first level).
        # NOTE(review): combining drop= with handle_unknown='ignore' requires
        # scikit-learn >= 1.0 — confirm the environment version.
        ("encoder", OneHotEncoder(drop='first', handle_unknown='ignore'))
    ]
)

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)


In [None]:
# import joblib

# preprocessor = joblib.load('preprocessor.pkl')

In [112]:
# Fit the preprocessor on the training split only, then apply the same fitted
# transformation to the test split.
# NOTE(review): both names are rebound to the transformed output, so re-running
# this cell without first re-running the train/test split would feed already
# transformed data back into the preprocessor.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

## Logistic Regression

In [124]:
# Baseline model: logistic regression with scikit-learn's default settings,
# fitted on the training split.
log_regressor = LogisticRegression()
log_regressor.fit(X_train, y_train)

In [125]:
# Predict on both splits and report the F1 score of each.
y_train_pred = log_regressor.predict(X_train)
y_test_pred = log_regressor.predict(X_test)

print("F1 score on training set :", f1_score(y_train, y_train_pred))
# This line reports the test split, so it must be labelled as such.
print("F1 score on test set :", f1_score(y_test, y_test_pred))

F1 score on training set : 0.26082130965593786
F1 score on training set : 0.1769436997319035


## Random Forest

In [139]:
# Random forest with default hyperparameters. The seed is fixed (same value as
# the train/test split) so the fitted model and its scores are reproducible.
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train, y_train)


In [140]:
# Predict on both splits and report the F1 score of each.
y_train_pred = classifier.predict(X_train)
y_test_pred = classifier.predict(X_test)

print("F1 score on training set :", f1_score(y_train, y_train_pred))
# This line reports the test split, so it must be labelled as such.
print("F1 score on test set :", f1_score(y_test, y_test_pred))

F1 score on training set : 1.0
F1 score on training set : 0.6645962732919255


## XGBoost

In [137]:
# Exhaustive grid search over a small XGBoost hyperparameter grid,
# using 3-fold cross-validation scored with F1.
print("Grid search...")
xgboost = XGBClassifier()

params = {
    "max_depth": [2, 4, 6],
    "min_child_weight": [1,2,3],
    "n_estimators": [2,4,6,8],
}
print(params)
gridsearch_xgb = GridSearchCV(
    xgboost, param_grid=params, cv=3, verbose=3, scoring='f1'
)
gridsearch_xgb.fit(X_train, y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch_xgb.best_params_)
# The search is scored with scoring='f1', so best_score_ and score() are F1
# values, not accuracies; the labels below say so.
print("Best validation F1 score : ", gridsearch_xgb.best_score_)
print()
print("F1 score on training set : ", gridsearch_xgb.score(X_train, y_train))
print("F1 score on test set : ", gridsearch_xgb.score(X_test, y_test))


Grid search...
{'max_depth': [2, 4, 6], 'min_child_weight': [1, 2, 3], 'n_estimators': [2, 4, 6, 8]}
Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV 1/3] END max_depth=2, min_child_weight=1, n_estimators=2;, score=0.347 total time=   0.2s
[CV 2/3] END max_depth=2, min_child_weight=1, n_estimators=2;, score=0.335 total time=   0.2s
[CV 3/3] END max_depth=2, min_child_weight=1, n_estimators=2;, score=0.000 total time=   0.2s
[CV 1/3] END max_depth=2, min_child_weight=1, n_estimators=4;, score=0.347 total time=   0.2s
[CV 2/3] END max_depth=2, min_child_weight=1, n_estimators=4;, score=0.335 total time=   0.2s
[CV 3/3] END max_depth=2, min_child_weight=1, n_estimators=4;, score=0.355 total time=   0.2s
[CV 1/3] END max_depth=2, min_child_weight=1, n_estimators=6;, score=0.628 total time=   0.2s
[CV 2/3] END max_depth=2, min_child_weight=1, n_estimators=6;, score=0.623 total time=   0.2s
[CV 3/3] END max_depth=2, min_child_weight=1, n_estimators=6;, score=0.316 total time

In [138]:
# Predict with the best estimator found by the grid search and report the F1
# score on both splits.
y_train_pred = gridsearch_xgb.best_estimator_.predict(X_train)
y_test_pred = gridsearch_xgb.best_estimator_.predict(X_test)

print("F1 score on training set :", f1_score(y_train, y_train_pred))
# This line reports the test split, so it must be labelled as such.
print("F1 score on test set :", f1_score(y_test, y_test_pred))

F1 score on training set : 0.8061068702290076
F1 score on training set : 0.7870619946091644
