# TASK_2: CREDIT CARD FRAUD DETECTION

In [1]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Lift the column display limit so wide DataFrames are shown with every column
# instead of having the middle ones elided.
pd.set_option('display.max_columns', None)

In [26]:
# Load the raw transaction table from the working directory and preview it.
# NOTE(review): the preview shows an `Unnamed: 0` column that mirrors the row
# number -- looks like a saved index. Confirm whether it should be read with
# `index_col=0` instead of being carried along as a model feature.
csv_path = 'fraudTrain.csv'
df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [27]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

(1296675, 23)

In [29]:
# Columns removed before modelling. In the preview of the raw frame all of
# them hold text or date values (names, addresses, merchant/category labels,
# timestamps, transaction hashes).
columns_to_drop = [
    'trans_date_trans_time', 'merchant', 'category', 'first', 'last',
    'gender', 'street', 'city', 'state', 'job', 'dob', 'trans_num',
]

# Rebind instead of mutating in place, and tolerate already-missing columns,
# so that re-running this cell on the same kernel is a no-op rather than a
# KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')


In [30]:
# Inspect the dtype of every column still present in the frame.
df.dtypes

Unnamed: 0      int64
cc_num          int64
amt           float64
zip             int64
lat           float64
long          float64
city_pop        int64
unix_time       int64
merch_lat     float64
merch_long    float64
is_fraud        int64
dtype: object

In [31]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0    0
cc_num        0
amt           0
zip           0
lat           0
long          0
city_pop      0
unix_time     0
merch_lat     0
merch_long    0
is_fraud      0
dtype: int64

In [32]:
# Class balance of the full dataset: number of rows per label and the share
# of all rows (in percent) that each label represents.
total_rows = len(df)
fraud_count = df["is_fraud"].value_counts()
fraud_rate = fraud_count * 100 / total_rows

# Put counts and percentages side by side; the label moves from the index
# into an ordinary column before the headers are renamed.
fraud_data = pd.concat(
    [fraud_count, fraud_rate],
    axis=1,
).reset_index()
fraud_data.columns = ['Class', 'Count', 'Percentage']

print(fraud_data)

   Class    Count  Percentage
0      0  1289169   99.421135
1      1     7506    0.578865


# Data Preparation

In [33]:
# Build a 50/50 dataset by undersampling the majority (non-fraud) class down
# to the number of fraud rows. Seeds are fixed so the sample is repeatable.
df_fraud = df[df['is_fraud'] == 1]
df_not_fraud = df[df['is_fraud'] == 0]
df_not_fraud_sampled = df_not_fraud.sample(
    n=len(df_fraud),
    replace=False,
    random_state=101,
)

# Stack both classes, shuffle the rows (frac=1 draws every row once) and
# discard the old row labels in favour of a fresh 0..n-1 index.
df_balanced = (
    pd.concat([df_not_fraud_sampled, df_fraud], axis=0)
    .sample(frac=1, replace=False, random_state=101)
    .reset_index(drop=True)
)
df_balanced

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,535778,4562827002127,68.26,45331,40.0987,-84.6342,22930,1345284274,39.945159,-84.266574,0
1,256079,30131826429364,87.21,37932,35.9335,-84.1481,391389,1336390918,36.721193,-84.069282,0
2,1116101,213120463918358,3.52,71277,32.9550,-92.5870,1758,1365514372,33.621144,-91.623247,0
3,624802,2283743876903625,1.17,30009,34.0770,-84.3033,165556,1348339676,33.187166,-83.628333,0
4,1188709,3533742182628021,7.21,29455,32.8357,-79.8217,20478,1368290186,32.526751,-79.309593,0
...,...,...,...,...,...,...,...,...,...,...,...
15007,1272848,4158945090781624,10.49,15217,40.4308,-79.9205,687276,1371074867,40.964928,-79.472869,0
15008,52484,561942763351,271.41,20036,38.9087,-77.0414,601723,1328052454,38.066476,-76.794198,1
15009,591175,4092452671396169678,172.96,28423,34.3091,-78.3372,2290,1347077127,34.401023,-77.496899,0
15010,1050573,2712209726293386,202.85,70808,30.4066,-91.1468,378909,1363013375,30.135138,-91.801888,0


In [34]:
# Same class-balance summary as for the full dataset, now computed on the
# undersampled frame to confirm both labels are equally represented.
balanced_total = len(df_balanced)
fraud_count = df_balanced["is_fraud"].value_counts()
fraud_rate = fraud_count * 100 / balanced_total

fraud_data = pd.concat(
    [fraud_count, fraud_rate],
    axis=1,
).reset_index()
fraud_data.columns = ['Class', 'Count', 'Percentage']
fraud_data

Unnamed: 0,Class,Count,Percentage
0,0,7506,50.0
1,1,7506,50.0


In [35]:
# Separate the label from the feature columns, then hold out 20% of the
# balanced rows for evaluation (seeded for a repeatable split).
# NOTE(review): the split is taken from the balanced frame, so the held-out
# set is roughly 50/50 as well, unlike the class ratio of the full dataset.
features = df_balanced.drop(columns='is_fraud')
labels = df_balanced['is_fraud']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=101,
)

print(f'''X_train: {X_train.shape}
X_test: {X_test.shape}
y_train: {y_train.shape}
y_test: {y_test.shape}''')

X_train: (12009, 10)
X_test: (3003, 10)
y_train: (12009,)
y_test: (3003,)


# Fitting a Random Forest Model

In [36]:
# Sanity check before fitting: list any column whose dtype is neither numeric
# nor boolean. An empty result means every remaining column is numeric/bool.
non_numeric_frame = df.select_dtypes(exclude=['number', 'bool'])
non_numeric_columns = non_numeric_frame.columns
print("Non-numeric columns:", non_numeric_columns)


Non-numeric columns: Index([], dtype='object')


In [38]:
# Standardise the features and fit a random forest as one pipeline, so the
# scaler fitted on the training data is applied automatically at predict time
# and travels with the model when it is saved.
randomForestModel = Pipeline([
    ('scaler', StandardScaler()),
    # Seeded like the sampling and split steps (101) so that a fresh
    # Restart & Run All reproduces the same forest and the same metrics.
    ('classifier', RandomForestClassifier(random_state=101)),
])

randomForestModel.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier', RandomForestClassifier())])

In [39]:
# Predict labels for the held-out rows with the fitted pipeline.
# NOTE(review): the `_logis` suffix suggests logistic regression, but the
# model here is a random forest -- consider renaming this variable together
# with every cell that consumes it.
y_pred_logis = randomForestModel.predict(X_test)
y_pred_logis

array([1, 1, 0, ..., 1, 0, 1], dtype=int64)

In [40]:
# Per-class precision / recall / F1 of the predictions against the held-out labels.
cr = classification_report(y_test, y_pred_logis)
print(cr)

              precision    recall  f1-score   support

           0       0.90      0.93      0.91      1495
           1       0.93      0.89      0.91      1508

    accuracy                           0.91      3003
   macro avg       0.91      0.91      0.91      3003
weighted avg       0.91      0.91      0.91      3003



# Save The Model

In [42]:
# Serialise the fitted pipeline (scaler + classifier) to model.pkl in the
# current working directory.
# NOTE(review): pickle files execute code on load -- only unpickle this file
# from a trusted location.
with open('./model.pkl','wb') as fp:
    pickle.dump(randomForestModel, fp)