In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,classification_report,f1_score,roc_auc_score

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
dataset= pd.read_csv('credit_card_dataset_DP.csv')
dataset

Unnamed: 0,amount,transaction_hour,foreign_transaction,location_mismatch,device_trust_score,velocity_last_24h,cardholder_age,is_fraud,merchant_category_Electronics,merchant_category_Food,merchant_category_Grocery,merchant_category_Travel
0,84.47,22,0,0,66,3,40,0,1,0,0,0
1,541.82,3,1,0,87,1,64,0,0,0,0,1
2,237.01,17,0,0,49,1,61,0,0,0,1,0
3,164.33,4,0,1,72,3,34,0,0,0,1,0
4,30.53,15,0,0,79,0,44,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,350.91,22,0,0,99,4,37,0,0,1,0,0
9996,410.04,5,0,0,70,3,25,0,0,0,0,0
9997,527.75,21,0,0,44,2,45,0,1,0,0,0
9998,91.20,2,0,0,38,0,37,0,1,0,0,0


In [4]:
# Split data into indep and dep
indep = dataset.drop('is_fraud',axis=1)
dep = dataset['is_fraud']

In [5]:
indep

Unnamed: 0,amount,transaction_hour,foreign_transaction,location_mismatch,device_trust_score,velocity_last_24h,cardholder_age,merchant_category_Electronics,merchant_category_Food,merchant_category_Grocery,merchant_category_Travel
0,84.47,22,0,0,66,3,40,1,0,0,0
1,541.82,3,1,0,87,1,64,0,0,0,1
2,237.01,17,0,0,49,1,61,0,0,1,0
3,164.33,4,0,1,72,3,34,0,0,1,0
4,30.53,15,0,0,79,0,44,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,350.91,22,0,0,99,4,37,0,1,0,0
9996,410.04,5,0,0,70,3,25,0,0,0,0
9997,527.75,21,0,0,44,2,45,1,0,0,0
9998,91.20,2,0,0,38,0,37,1,0,0,0


In [6]:
dep

0       0
1       0
2       0
3       0
4       0
       ..
9995    0
9996    0
9997    0
9998    0
9999    0
Name: is_fraud, Length: 10000, dtype: int64

In [7]:
# train_test_split data 
X_train,X_test,y_train,y_test = train_test_split(indep,dep,test_size= 0.2,random_state=0)

#Standard scaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#Smote for data imbalance
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
print("Before SMOTE:\n", y_train.value_counts())
print("After SMOTE:\n", y_train_smote.value_counts())

Before SMOTE:
 is_fraud
0    7874
1     126
Name: count, dtype: int64
After SMOTE:
 is_fraud
0    7874
1    7874
Name: count, dtype: int64


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
param_grid={'criterion':['gini', 'entropy', 'log_loss'],'max_features': ['auto','sqrt','log2'],
              'n_estimators':[10,100]}
grid=GridSearchCV(RandomForestClassifier(),param_grid,refit=True,verbose=3,n_jobs=-1,scoring='f1')

# fitting the model for grid search 
grid.fit(X_train_smote, y_train_smote)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [9]:
re=grid.cv_results_
grid_predictions = grid.predict(X_test_scaled) 
cm = confusion_matrix(y_test, grid_predictions)
clf_report = classification_report(y_test, grid_predictions)
f1_macro=f1_score(y_test,grid_predictions,average='weighted')
roc_score = roc_auc_score(y_test,grid.predict_proba(X_test_scaled)[:,1])

In [10]:
print("The f1_macro value for best parameter {}:".format(grid.best_params_),f1_macro)
print("\nThe confusion Matrix:\n",cm)
print("\nThe report:\n",clf_report)
print("\nROC_AUC_Score:",roc_score)

The f1_macro value for best parameter {'criterion': 'log_loss', 'max_features': 'sqrt', 'n_estimators': 100}: 0.992699192892173

The confusion Matrix:
 [[1970    5]
 [   9   16]]

The report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1975
           1       0.76      0.64      0.70        25

    accuracy                           0.99      2000
   macro avg       0.88      0.82      0.85      2000
weighted avg       0.99      0.99      0.99      2000


ROC_AUC_Score: 0.9966886075949367


In [11]:
table=pd.DataFrame.from_dict(re)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.013239,0.004337,0.0,0.0,gini,auto,10,"{'criterion': 'gini', 'max_features': 'auto', ...",,,,,,,,13
1,0.01143,0.00343,0.0,0.0,gini,auto,100,"{'criterion': 'gini', 'max_features': 'auto', ...",,,,,,,,13
2,0.341382,0.031272,0.020339,0.003752,gini,sqrt,10,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.997146,0.998098,0.997778,0.999365,0.997149,0.997907,0.000816,11
3,2.382531,0.111144,0.043046,0.001378,gini,sqrt,100,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.997777,0.999049,0.998732,0.998731,0.998099,0.998477,0.000467,5
4,0.308674,0.020714,0.026754,0.011563,gini,log2,10,"{'criterion': 'gini', 'max_features': 'log2', ...",0.995541,0.998099,0.997465,0.998731,0.998099,0.997587,0.001098,12
5,2.397356,0.091849,0.048906,0.004091,gini,log2,100,"{'criterion': 'gini', 'max_features': 'log2', ...",0.99714,0.998732,0.998415,0.999048,0.997783,0.998224,0.000685,7
6,0.00874,0.001854,0.0,0.0,entropy,auto,10,"{'criterion': 'entropy', 'max_features': 'auto...",,,,,,,,13
7,0.011503,0.001587,0.0,0.0,entropy,auto,100,"{'criterion': 'entropy', 'max_features': 'auto...",,,,,,,,13
8,0.281567,0.03698,0.011971,0.003605,entropy,sqrt,10,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.996817,0.996829,0.999047,0.999048,0.998415,0.998031,0.001013,9
9,2.575311,0.063188,0.044928,0.004795,entropy,sqrt,100,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.998094,0.999365,0.998731,0.999048,0.997783,0.998604,0.000588,3


In [12]:
amount_input = float(input("Enter amount:"))
transhr_input = int(input("Enter transaction hour:"))
foreigntrans_input = int(input("Enter foreign transaction (yes(1)/No(0)):"))
loc_mismatch_input = int(input("Enter Location mismatch (yes(1)/No(0)):"))
dev_ts_input = int(input("Enter Device trust score:"))
velocity24h_input = int(input("Enter frequency of transaction in 24 hrs:"))
cardholder_age_input = int(input("Enter card holder age:"))
MCE_input = int(input("Enter Merchant category - Electronics (yes(1)/No(0)):"))
MCF_input = int(input("Enter Merchant category - Food (yes(1)/No(0)):")) 
MCG_input = int(input("Enter Merchant category - Grocery (yes(1)/No(0)):"))
MCT_input = int(input("Enter Merchant category - Travel (yes(1)/No(0)):"))

Enter amount: 900000
Enter transaction hour: 1
Enter foreign transaction (yes(1)/No(0)): 1
Enter Location mismatch (yes(1)/No(0)): 1
Enter Device trust score: 1
Enter frequency of transaction in 24 hrs: 15
Enter card holder age: 45
Enter Merchant category - Electronics (yes(1)/No(0)): 1
Enter Merchant category - Food (yes(1)/No(0)): 0
Enter Merchant category - Grocery (yes(1)/No(0)): 0
Enter Merchant category - Travel (yes(1)/No(0)): 0


In [13]:
user_data=[[amount_input,transhr_input,foreigntrans_input,loc_mismatch_input,dev_ts_input,velocity24h_input,cardholder_age_input,MCE_input,MCF_input,MCG_input,MCT_input]]
user_data_scaled = scaler.transform(user_data)
future_prediction = grid.predict(user_data_scaled)
print("Credit Card Fraud Prediction =", future_prediction[0])

Credit Card Fraud Prediction = 1


In [18]:
#using pipeline to store scaler and best model.
from sklearn.pipeline import Pipeline
final_pipeline = Pipeline([
    ('scaler', scaler),
    ('model', grid.best_estimator_)   # best RandomForest model
])

In [19]:
import pickle
filename="Finalized_credit_card_fraud_detection_model_GridRandomForest.pkl"
pickle.dump(final_pipeline, open(filename, 'wb'))