In [45]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

import pickle

In [2]:
df = pd.read_csv("fraudTest.csv")

In [3]:
len(df)

555719

In [4]:
df[:10]

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0
5,5,2020-06-21 12:15:37,30407675418785,fraud_Daugherty LLC,kids_pets,19.55,Danielle,Evans,F,76752 David Lodge Apt. 064,...,42.1939,-76.7361,520,Psychotherapist,1991-10-13,798db04aaceb4febd084f1a7c404da93,1371816937,41.747157,-77.584197,0
6,6,2020-06-21 12:15:44,213180742685905,fraud_Romaguera Ltd,health_fitness,133.93,Kayla,Sutton,F,010 Weaver Land,...,40.507,-123.9743,1139,"Therapist, occupational",1951-01-15,17003d7ce534440eadb10c4750e020e5,1371816944,41.499458,-124.888729,0
7,7,2020-06-21 12:15:50,3589289942931264,fraud_Reichel LLC,personal_care,10.37,Paula,Estrada,F,350 Stacy Glens,...,43.7557,-97.5936,343,"Development worker, international aid",1972-03-05,8be473af4f05fc6146ea55ace73e7ca2,1371816950,44.495498,-97.728453,0
8,8,2020-06-21 12:16:10,3596357274378601,"fraud_Goyette, Howell and Collier",shopping_pos,4.37,David,Everett,M,4138 David Fall,...,41.0001,-78.2357,3688,Advice worker,1973-05-27,71a1da150d1ce510193d7622e08e784e,1371816970,41.546067,-78.120238,0
9,9,2020-06-21 12:16:11,3546897637165774,fraud_Kilback Group,food_dining,66.54,Kayla,Obrien,F,7921 Robert Port Suite 343,...,31.6591,-96.8094,263,Barrister,1956-05-30,a7915132c7c4240996ba03a47f81e3bd,1371816971,31.782919,-96.366185,0


In [5]:
df = df.drop(['Unnamed: 0', 'trans_date_trans_time', 'first', 'last', 'street', 'dob', 'trans_num'], axis=1)

In [6]:
df = pd.get_dummies(df, columns=['merchant', 'category', 'gender', 'job'], drop_first=True)

In [7]:
df['trans_date'] = pd.to_datetime(df['unix_time'], unit='s')
df['hour'] = df['trans_date'].dt.hour
df['day'] = df['trans_date'].dt.day
df['month'] = df['trans_date'].dt.month
df['weekday'] = df['trans_date'].dt.weekday
df = df.drop(['unix_time', 'trans_date'], axis=1)

In [8]:
x = df.drop(['is_fraud'], axis=1)
y = df['is_fraud']

In [9]:
high_cardinality_threshold = 1000
high_cardinality_cols = [col for col in x.columns if x[col].nunique() > high_cardinality_threshold]

In [29]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [11]:
for col in high_cardinality_cols:
    freq_encoding = x[col].value_counts().to_dict()
    x[col] = x[col].map(freq_encoding)

In [13]:
remaining_categorical_cols = x.select_dtypes(include=['object']).columns
x = pd.get_dummies(x, columns=remaining_categorical_cols, drop_first=True, sparse=True)

In [32]:
x_train_sparse = csr_matrix(x_train)

  arg1 = np.asarray(arg1)


In [21]:
classes = np.array([0, 1])
class_weights = compute_class_weight('balanced', classes=classes, y=y)
class_weight_dict = {cls: weight for cls, weight in zip(classes, class_weights)}

In [22]:
print("Class Weight Dictionary:", class_weight_dict)

Class Weight Dictionary: {0: 0.5019374103552551, 1: 129.53822843822843}


In [36]:
model = RandomForestClassifier(random_state=42, n_estimators = 100, class_weight=class_weight_dict)

In [34]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
}

In [37]:
model.fit(x_train_sparse, y_train)

In [38]:
model.score(x_test, y_test)



0.9978946231915353

In [40]:
y_pred = model.predict(x_test)



In [44]:
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9978946231915353
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    110718
           1       0.98      0.46      0.63       426

    accuracy                           1.00    111144
   macro avg       0.99      0.73      0.81    111144
weighted avg       1.00      1.00      1.00    111144

Confusion Matrix:
 [[110714      4]
 [   230    196]]


In [47]:
with open('random_forest_fraud_detection.pkl', 'wb') as file:
    pickle.dump(model, file)