In [None]:
!pip install -q kaggle

In [None]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"shrikantlokare","key":"dc4926d6bfb859d71ab28d38d25d9da1"}'}

In [None]:
! mkdir ~/.kaggle

In [None]:
! cp kaggle.json ~/.kaggle/

In [None]:
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
! kaggle datasets download -d kartik2112/fraud-detection

Downloading fraud-detection.zip to /content
 95% 191M/202M [00:01<00:00, 121MB/s]
100% 202M/202M [00:01<00:00, 134MB/s]


In [None]:
!unzip /content/fraud-detection.zip

Archive:  /content/fraud-detection.zip
  inflating: fraudTest.csv           
  inflating: fraudTrain.csv          


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time

In [None]:
warnings.filterwarnings('ignore')
sns.set_style("darkgrid")

In [None]:
# Adjusting the display to fit rows and columns effectively
start_time = time.time()
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [None]:
from datetime import datetime, date
import math
from math import radians, sin, cos, acos, atan2
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
df = pd.read_csv('/content/fraudTrain.csv')

In [None]:
def full_name(df): 
    df['Customer_name'] = df['first']+" " + df['last']

In [None]:
def calculating_distance(df):
    
    R = 6373.0  # radius of the Earth
    
    df['lat'] = df['lat'].astype('float')
    df['long'] = df['long'].astype('float')
    df['merch_lat'] = df['merch_lat'].astype('float')
    df['merch_long'] = df['merch_long'].astype('float')#coordinates
    
    df['lat'] = np.radians(df['lat'])
    df['long'] = np.radians(df['long'])
    df['merch_lat'] = np.radians(df['merch_lat'])
    df['merch_long'] = np.radians(df['merch_long'])
    
    df['dlon'] = df['merch_long'] - df['long'] #change in coordinates
    df['dlat'] = df['merch_lat'] - df['lat']
    
    a = np.sin(df['dlat'] / 2)**2 + np.cos(df['lat']) * np.cos(df['merch_lat']) * np.sin(df['dlon'] / 2)**2 
    
    c = 2*2*np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    df['distance'] = R * c
    

In [None]:
def calculating_age(df) :
    
    df['dob'] = pd.to_datetime(df['dob'])
    
    def calculate_age(born):
        today = date.today()
        return today.year - born.year - ((today.month, today.day) < (born.month, born.day))
    
    df['age'] = df["dob"].apply(calculate_age)
    


In [None]:
def binning(df):
    
    df["Population_group"] = pd.cut(df["city_pop"], 
                                bins=list(range(0,3000001,500000)), 
                                labels = ["<5lac","5-10lac","10-15lac","15-20lac","20-25lac","25-30lac"])
    df["dist_range_km"] = pd.cut(df["distance"],
                             bins=[0,25,50,100,150,200,250,300,9999],
                             labels = ["<25","25-50","50-100","100-150","150-200","200-250","250-300","300+"])
    df["age_group"] = pd.cut(df["age"], 
                             bins=[0,25,40,60,80,200], 
                             labels = ["<25","25-40","40-60","60-80","80+"])

In [None]:
def extracting_month_date_hr(df):
    
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
    df['month'] = pd.DatetimeIndex(df['trans_date_trans_time']).month
    df['day_of_week'] = df['trans_date_trans_time'].dt.day_name()
    df['transaction_hour'] = df['trans_date_trans_time'].dt.hour
    


In [None]:
def feature_selection(df):
    
    df = df[['category','amt','gender','state','Population_group','age_group', 
            'dist_range_km','month','day_of_week','transaction_hour','is_fraud']]
    cat_cols = ["category", "state", "month", "day_of_week", "transaction_hour", 'gender', 
                'Population_group','age_group', 'dist_range_km']
    dummy = pd.get_dummies(df[cat_cols], drop_first=True)
    df = pd.concat([df, dummy], axis=1)
    df.drop(cat_cols,axis = 1, inplace = True)
    
    global df1
    df1 = df
    return df1

In [None]:
def main_function():
    full_name(df)
    calculating_distance(df)
    calculating_age(df)
    binning(df)
    extracting_month_date_hr(df)
    feature_selection(df)

In [None]:
main_function()

In [None]:
df = df1

In [None]:
df.shape

(1296675, 88)

In [None]:
X = df.drop('is_fraud', axis = 1)
y = df['is_fraud']
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size= 0.3 , random_state = 42)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.metrics import precision_score, recall_score,confusion_matrix, classification_report, accuracy_score, f1_score

model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = RandomForestClassifier()
model4 = XGBClassifier()

model_dict = {'LogisticRegression':model1,
              'DecisionTreeClassifier':model2,
              'RandomForestClassifier':model3,
              'xgb':model4
             }


result_dict = {'parameter' : ['Accuracy','F1 score','Recall','Precision','True_Positive','False_Positive','False_negative','True_negative'],
               'LogisticRegression':[],
               'DecisionTreeClassifier':[],
               'RandomForestClassifier':[],
               'xgb':[],
              }

for model_name,model in model_dict.items() :
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    result_dict[model_name].append(accuracy_score(y_test, y_pred))
    result_dict[model_name].append(f1_score(y_test, y_pred))
    result_dict[model_name].append(recall_score(y_test, y_pred))
    result_dict[model_name].append(precision_score(y_test, y_pred))
    result_dict[model_name].append(confusion_matrix(y_test, y_pred)[0,0])
    result_dict[model_name].append(confusion_matrix(y_test, y_pred)[0,1])
    result_dict[model_name].append(confusion_matrix(y_test, y_pred)[1,0])
    result_dict[model_name].append(confusion_matrix(y_test, y_pred)[1,1])
    
result = pd.DataFrame(result_dict)
result.set_index('parameter').T

parameter,Accuracy,F1 score,Recall,Precision,True_Positive,False_Positive,False_negative,True_negative
LogisticRegression,0.993589,0.0,0.0,0.0,386509.0,209.0,2285.0,0.0
DecisionTreeClassifier,0.995815,0.644231,0.645077,0.643387,385901.0,817.0,811.0,1474.0
RandomForestClassifier,0.99708,0.715288,0.624508,0.83695,386440.0,278.0,858.0,1427.0
xgb,0.996977,0.711057,0.63326,0.810644,386380.0,338.0,838.0,1447.0


## handling imbalance dataset with modern technique

In [None]:
# SMOTE

In [None]:
X = df.drop('is_fraud', axis = 1)
y = df['is_fraud']
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size= 0.3 , random_state = 42)

In [None]:
from imblearn.over_sampling import SMOTE
smt = SMOTE(random_state=42, k_neighbors=5)
X_train, y_train = smt.fit_resample(X_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.metrics import precision_score, recall_score,confusion_matrix, classification_report, accuracy_score, f1_score

model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = RandomForestClassifier()
model4 = XGBClassifier()

model_dict = {'LogisticRegression':model1,
              'DecisionTreeClassifier':model2,
              'RandomForestClassifier':model3,
              'xgb':model4
             }


result_dict = {'parameter' : ['Accuracy','F1 score','Recall','Precision','True_Positive','False_Positive','False_negative','True_negative'],
               'LogisticRegression':[],
               'DecisionTreeClassifier':[],
               'RandomForestClassifier':[],
               'xgb':[],
              }

for model_name,model in model_dict.items() :
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    result_dict[model_name].append(accuracy_score(y_test, y_pred))
    result_dict[model_name].append(f1_score(y_test, y_pred))
    result_dict[model_name].append(recall_score(y_test, y_pred))
    result_dict[model_name].append(precision_score(y_test, y_pred))
    result_dict[model_name].append(confusion_matrix(y_test, y_pred)[0,0])
    result_dict[model_name].append(confusion_matrix(y_test, y_pred)[0,1])
    result_dict[model_name].append(confusion_matrix(y_test, y_pred)[1,0])
    result_dict[model_name].append(confusion_matrix(y_test, y_pred)[1,1])
    
result_with_SMOTE = pd.DataFrame(result_dict)
result_with_SMOTE.set_index('parameter').T

parameter,Accuracy,F1 score,Recall,Precision,True_Positive,False_Positive,False_negative,True_negative
LogisticRegression,0.942132,0.040738,0.20919,0.022566,366014.0,20704.0,1807.0,478.0
DecisionTreeClassifier,0.993908,0.550626,0.635449,0.485781,385181.0,1537.0,833.0,1452.0
RandomForestClassifier,0.996342,0.660948,0.607002,0.725418,386193.0,525.0,898.0,1387.0
xgb,0.975774,0.295033,0.86302,0.17793,377607.0,9111.0,313.0,1972.0


In [None]:
# SMOTETomek

In [None]:
# X = df.drop('is_fraud', axis = 1)
# y = df['is_fraud']
# X_train,X_test,y_train,y_test = train_test_split(X,y, test_size= 0.3 , random_state = 42)

In [None]:
# from imblearn.combine import SMOTETomek
# smtomek = SMOTETomek(random_state=42)
# X_train, y_train = smtomek.fit_resample(X_train, y_train)

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# # import xgboost as xgb
# from xgboost import XGBClassifier

# from sklearn.metrics import precision_score, recall_score,confusion_matrix, classification_report, accuracy_score, f1_score

# model1 = LogisticRegression()
# model2 = DecisionTreeClassifier()
# model3 = RandomForestClassifier()
# model4 = XGBClassifier()

# model_dict = {'LogisticRegression':model1,
#               'DecisionTreeClassifier':model2,
#               'RandomForestClassifier':model3,
#               'xgb':model4
#              }


# result_dict = {'parameter' : ['Accuracy','F1 score','Recall','Precision','True_Positive','False_Positive','False_negative','True_negative'],
#                'LogisticRegression':[],
#                'DecisionTreeClassifier':[],
#                'RandomForestClassifier':[],
#                'xgb':[],
#               }

# for model_name,model in model_dict.items() :
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
    
#     result_dict[model_name].append(accuracy_score(y_test, y_pred))
#     result_dict[model_name].append(f1_score(y_test, y_pred))
#     result_dict[model_name].append(recall_score(y_test, y_pred))
#     result_dict[model_name].append(precision_score(y_test, y_pred))
#     result_dict[model_name].append(confusion_matrix(y_test, y_pred)[0,0])
#     result_dict[model_name].append(confusion_matrix(y_test, y_pred)[0,1])
#     result_dict[model_name].append(confusion_matrix(y_test, y_pred)[1,0])
#     result_dict[model_name].append(confusion_matrix(y_test, y_pred)[1,1])
    
# result_with_SMOTETomek = pd.DataFrame(result_dict)
# result_with_SMOTETomek.set_index('parameter').T

In [None]:
# ADASYN

In [None]:
X = df.drop('is_fraud', axis = 1)
y = df['is_fraud']
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size= 0.3 , random_state = 42)

In [None]:
from imblearn.over_sampling import ADASYN
ada = ADASYN(random_state=42)
X_train, y_train = ada.fit_resample(X_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.metrics import precision_score, recall_score,confusion_matrix, classification_report, accuracy_score, f1_score

model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = RandomForestClassifier()
model4 = XGBClassifier()

model_dict = {'LogisticRegression':model1,
              'DecisionTreeClassifier':model2,
              'RandomForestClassifier':model3,
              'xgb':model4
             }


result_dict = {'parameter' : ['Accuracy','F1 score','Recall','Precision','True_Positive','False_Positive','False_negative','True_negative'],
               'LogisticRegression':[],
               'DecisionTreeClassifier':[],
               'RandomForestClassifier':[],
               'xgb':[],
              }

for model_name,model in model_dict.items() :
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    result_dict[model_name].append(accuracy_score(y_test, y_pred))
    result_dict[model_name].append(f1_score(y_test, y_pred))
    result_dict[model_name].append(recall_score(y_test, y_pred))
    result_dict[model_name].append(precision_score(y_test, y_pred))
    result_dict[model_name].append(confusion_matrix(y_test, y_pred)[0,0])
    result_dict[model_name].append(confusion_matrix(y_test, y_pred)[0,1])
    result_dict[model_name].append(confusion_matrix(y_test, y_pred)[1,0])
    result_dict[model_name].append(confusion_matrix(y_test, y_pred)[1,1])
    
result_with_ADASYN = pd.DataFrame(result_dict)
result_with_ADASYN.set_index('parameter').T

parameter,Accuracy,F1 score,Recall,Precision,True_Positive,False_Positive,False_negative,True_negative
LogisticRegression,0.928579,0.029144,0.182495,0.015836,360803.0,25915.0,1868.0,417.0
DecisionTreeClassifier,0.994345,0.571984,0.643326,0.514886,385333.0,1385.0,815.0,1470.0
RandomForestClassifier,0.996219,0.631236,0.550985,0.73885,386273.0,445.0,1026.0,1259.0
xgb,0.94618,0.165032,0.90547,0.090789,365998.0,20720.0,216.0,2069.0


In [None]:
display(result.set_index('parameter').T)
display(result_with_SMOTE.set_index('parameter').T)
#display(result_with_SMOTETomek.set_index('parameter').T)
display(result_with_ADASYN.set_index('parameter').T)

parameter,Accuracy,F1 score,Recall,Precision,True_Positive,False_Positive,False_negative,True_negative
LogisticRegression,0.993589,0.0,0.0,0.0,386509.0,209.0,2285.0,0.0
DecisionTreeClassifier,0.995815,0.644231,0.645077,0.643387,385901.0,817.0,811.0,1474.0
RandomForestClassifier,0.99708,0.715288,0.624508,0.83695,386440.0,278.0,858.0,1427.0
xgb,0.996977,0.711057,0.63326,0.810644,386380.0,338.0,838.0,1447.0


parameter,Accuracy,F1 score,Recall,Precision,True_Positive,False_Positive,False_negative,True_negative
LogisticRegression,0.942132,0.040738,0.20919,0.022566,366014.0,20704.0,1807.0,478.0
DecisionTreeClassifier,0.993908,0.550626,0.635449,0.485781,385181.0,1537.0,833.0,1452.0
RandomForestClassifier,0.996342,0.660948,0.607002,0.725418,386193.0,525.0,898.0,1387.0
xgb,0.975774,0.295033,0.86302,0.17793,377607.0,9111.0,313.0,1972.0


parameter,Accuracy,F1 score,Recall,Precision,True_Positive,False_Positive,False_negative,True_negative
LogisticRegression,0.928579,0.029144,0.182495,0.015836,360803.0,25915.0,1868.0,417.0
DecisionTreeClassifier,0.994345,0.571984,0.643326,0.514886,385333.0,1385.0,815.0,1470.0
RandomForestClassifier,0.996219,0.631236,0.550985,0.73885,386273.0,445.0,1026.0,1259.0
xgb,0.94618,0.165032,0.90547,0.090789,365998.0,20720.0,216.0,2069.0


as we can see the xgb is performoing way better than other models in terms of finding true negative values but at the sametime it is giving so many flase positive values. Let's try to ajust performance by tunning the parameters of the model.
Since RandomForest is giving least number of false positive we will also try to increase its acuuracy by tunning the parameters if XGB  is giving so many flase positive values even after parameter tunning.  

In [1]:
from xgboost import XGBClassifier
model4 = XGBClassifier()