# Importing Data from Kaggle

In [1]:
!pip install -q kaggle

In [2]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"shrikantlokare","key":"dc4926d6bfb859d71ab28d38d25d9da1"}'}

In [3]:
! mkdir ~/.kaggle

In [4]:
! cp kaggle.json ~/.kaggle/

In [5]:
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
! kaggle datasets download -d kartik2112/fraud-detection

Downloading fraud-detection.zip to /content
 98% 197M/202M [00:01<00:00, 154MB/s]
100% 202M/202M [00:01<00:00, 142MB/s]


In [7]:
!unzip /content/fraud-detection.zip

Archive:  /content/fraud-detection.zip
  inflating: fraudTest.csv           
  inflating: fraudTrain.csv          


In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time

In [9]:
warnings.filterwarnings('ignore')
sns.set_style("darkgrid")

In [10]:
# Adjusting the display to fit rows and columns effectively
start_time = time.time()
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [11]:
from datetime import datetime, date
import math
from math import radians, sin, cos, acos, atan2
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Reading Data

In [12]:
df = pd.read_csv("/content/fraudTrain.csv")

In [13]:
df = df.drop('Unnamed: 0',axis=1)

In [14]:
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


# Data Cleaning and Preparation

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 22 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1296675 non-null  object 
 1   cc_num                 1296675 non-null  int64  
 2   merchant               1296675 non-null  object 
 3   category               1296675 non-null  object 
 4   amt                    1296675 non-null  float64
 5   first                  1296675 non-null  object 
 6   last                   1296675 non-null  object 
 7   gender                 1296675 non-null  object 
 8   street                 1296675 non-null  object 
 9   city                   1296675 non-null  object 
 10  state                  1296675 non-null  object 
 11  zip                    1296675 non-null  int64  
 12  lat                    1296675 non-null  float64
 13  long                   1296675 non-null  float64
 14  city_pop          

In [16]:
# Create a column customer name with columns first and last 
# feature extraction
df['Customer_name'] = df['first']+" " + df['last']
df = df.drop(['first','last'], axis=1)

In [17]:
# Create a categorical column Population_group by binning the variable city_pop

df["Population_group"] = pd.cut(df["city_pop"], bins=list(range(0,3000001,500000)), labels = ["<5lac","5-10lac","10-15lac","15-20lac","20-25lac","25-30lac"])

In [18]:
# Create a column age from dob variable

df['dob'] = pd.to_datetime(df['dob'])

In [19]:
def calculate_age(born):
    today = date.today()
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

In [20]:
# Create a column age
df['age'] = df["dob"].apply(calculate_age)

In [21]:
# Create a column age_group from the column age
df["age_group"] = pd.cut(df["age"], bins=[0,25,40,60,80,200], labels = ["<25","25-40","40-60","60-80","80+"])

In [22]:
# Calculating distance between the customer and merchant location using customer location co-ordinates(lat and long) 
# and merchant location co-ordinates(merch_lat and merch_long)

R = 6373.0  # radius of the Earth

df['lat'] = df['lat'].astype('float')
df['long'] = df['long'].astype('float')
df['merch_lat'] = df['merch_lat'].astype('float')
df['merch_long'] = df['merch_long'].astype('float')#coordinates

In [23]:
df['lat'] = np.radians(df['lat'])
df['long'] = np.radians(df['long'])
df['merch_lat'] = np.radians(df['merch_lat'])
df['merch_long'] = np.radians(df['merch_long'])

In [24]:
df['dlon'] = df['merch_long'] - df['long'] #change in coordinates
df['dlat'] = df['merch_lat'] - df['lat']

In [25]:
a = np.sin(df['dlat'] / 2)**2 + np.cos(df['lat']) * np.cos(df['merch_lat']) * np.sin(df['dlon'] / 2)**2  #Haversine formula

In [26]:
c = 2*2*np.arctan2(np.sqrt(a), np.sqrt(1 - a))
df['distance'] = R * c

In [27]:
df['distance'].head()

0    157.244484
1     60.443320
2    216.480102
3    191.406530
4    155.162181
Name: distance, dtype: float64

In [28]:
df = df.drop(['dlat', 'dlon'], axis=1)

In [29]:
df = df.drop(['dob','city_pop'], axis=1)

In [30]:
# Create a column dist_range_km from the column distance
df["dist_range_km"] = pd.cut(df["distance"], bins=[0,25,50,100,150,200,250,300,9999], labels = ["<25","25-50","50-100","100-150","150-200","200-250","250-300","300+"])

In [31]:
# Transaction date and time column
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

In [32]:
# Extract year and month from trans_date_trans_time column

df['year'] = pd.DatetimeIndex(df['trans_date_trans_time']).year
df['month'] = pd.DatetimeIndex(df['trans_date_trans_time']).month


In [33]:
# Extract day of the week and transaction hour from trans_date_trans_time column

df['day_of_week'] = df['trans_date_trans_time'].dt.day_name()
df['transaction_hour'] = df['trans_date_trans_time'].dt.hour

In [34]:
df.drop(['trans_date_trans_time', "lat", "long", "merch_lat", "merch_long", "Customer_name", "year"], axis=1, inplace=True)

In [35]:
df.drop(["merchant", "city", "job"], axis=1, inplace=True)

In [36]:
df.head()

Unnamed: 0,cc_num,category,amt,gender,street,state,zip,trans_num,unix_time,is_fraud,Population_group,age,age_group,distance,dist_range_km,month,day_of_week,transaction_hour
0,2703186189652095,misc_net,4.97,F,561 Perry Cove,NC,28654,0b242abb623afc578575680df30655b9,1325376018,0,<5lac,34,25-40,157.244484,150-200,1,Tuesday,0
1,630423337322,grocery_pos,107.23,F,43039 Riley Greens Suite 393,WA,99160,1f76529f8574734946361c461b024d99,1325376044,0,<5lac,44,40-60,60.44332,50-100,1,Tuesday,0
2,38859492057661,entertainment,220.11,M,594 White Dale Suite 530,ID,83252,a1a22d70485983eac12b5b88dad1cf95,1325376051,0,<5lac,60,40-60,216.480102,200-250,1,Tuesday,0
3,3534093764340240,gas_transport,45.0,M,9443 Cynthia Court Apt. 038,MT,59632,6b849c168bdad6f867558c3793159a81,1325376076,0,<5lac,55,40-60,191.40653,150-200,1,Tuesday,0
4,375534208663984,misc_pos,41.96,M,408 Bradley Rest,VA,24433,a41d7549acf90789359a9aa5346dcb46,1325376186,0,<5lac,36,25-40,155.162181,150-200,1,Tuesday,0


In [37]:
df= df[['category',	'amt',	'gender',	'state',	'Population_group',	'age_group', 'dist_range_km',	'month',	'day_of_week','transaction_hour','is_fraud']]

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 11 columns):
 #   Column            Non-Null Count    Dtype   
---  ------            --------------    -----   
 0   category          1296675 non-null  object  
 1   amt               1296675 non-null  float64 
 2   gender            1296675 non-null  object  
 3   state             1296675 non-null  object  
 4   Population_group  1296675 non-null  category
 5   age_group         1296675 non-null  category
 6   dist_range_km     1296675 non-null  category
 7   month             1296675 non-null  int64   
 8   day_of_week       1296675 non-null  object  
 9   transaction_hour  1296675 non-null  int64   
 10  is_fraud          1296675 non-null  int64   
dtypes: category(3), float64(1), int64(3), object(4)
memory usage: 82.9+ MB


In [39]:
df['transaction_hour']= df['transaction_hour'].astype(str)
df['month']= df['month'].astype(str)
df['transaction_hour']= df['transaction_hour'].astype(str)
df['month']= df['month'].astype(str)

In [40]:
cat_cols = ["category", "state", "month", "day_of_week", "transaction_hour", 'gender', 'Population_group','age_group', 'dist_range_km']

In [41]:
dummy = pd.get_dummies(df[cat_cols], drop_first=True)

In [42]:
df = pd.concat([df, dummy], axis=1)

In [43]:
df

Unnamed: 0,category,amt,gender,state,Population_group,age_group,dist_range_km,month,day_of_week,transaction_hour,is_fraud,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,state_AL,state_AR,state_AZ,state_CA,state_CO,state_CT,state_DC,state_DE,state_FL,state_GA,state_HI,state_IA,state_ID,state_IL,state_IN,state_KS,state_KY,state_LA,state_MA,state_MD,state_ME,state_MI,state_MN,state_MO,state_MS,state_MT,state_NC,state_ND,state_NE,state_NH,state_NJ,state_NM,state_NV,state_NY,state_OH,state_OK,state_OR,state_PA,state_RI,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,month_10,month_11,month_12,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,transaction_hour_1,transaction_hour_10,transaction_hour_11,transaction_hour_12,transaction_hour_13,transaction_hour_14,transaction_hour_15,transaction_hour_16,transaction_hour_17,transaction_hour_18,transaction_hour_19,transaction_hour_2,transaction_hour_20,transaction_hour_21,transaction_hour_22,transaction_hour_23,transaction_hour_3,transaction_hour_4,transaction_hour_5,transaction_hour_6,transaction_hour_7,transaction_hour_8,transaction_hour_9,gender_M,Population_group_5-10lac,Population_group_10-15lac,Population_group_15-20lac,Population_group_20-25lac,Population_group_25-30lac,age_group_25-40,age_group_40-60,age_group_60-80,age_group_80+,dist_range_km_25-50,dist_range_km_50-100,dist_range_km_100-150,dist_range_km_150-200,dist_range_km_200-250,dist_range_km_250-300,dist_range_km_300+
0,misc_net,4.97,F,NC,<5lac,25-40,150-200,1,Tuesday,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
1,grocery_pos,107.23,F,WA,<5lac,40-60,50-100,1,Tuesday,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
2,entertainment,220.11,M,ID,<5lac,40-60,200-250,1,Tuesday,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
3,gas_transport,45.00,M,MT,<5lac,40-60,150-200,1,Tuesday,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
4,misc_pos,41.96,M,VA,<5lac,25-40,150-200,1,Tuesday,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,entertainment,15.56,M,UT,<5lac,40-60,200-250,6,Sunday,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
1296671,food_dining,51.70,M,MD,<5lac,40-60,150-200,6,Sunday,12,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
1296672,food_dining,105.93,M,NM,<5lac,40-60,150-200,6,Sunday,12,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
1296673,food_dining,74.90,M,SD,<5lac,40-60,150-200,6,Sunday,12,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0


In [44]:
df = df.drop(cat_cols,axis = 1)

In [None]:
df.to_csv('fraud_detection_train.csv')

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Columns: 122 entries, amt to dist_range_km_300+
dtypes: float64(1), int64(1), uint8(120)
memory usage: 168.2 MB


In [46]:
X = df.drop('is_fraud',axis = 1)

In [47]:
y = df['is_fraud']

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [50]:
from sklearn.preprocessing import StandardScaler

In [51]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score

In [53]:
# Base model
lreg = LogisticRegression(penalty='l1',solver='saga',multi_class="ovr")
lreg.fit(X_train, y_train)

LogisticRegression(multi_class='ovr', penalty='l1', solver='saga')

In [54]:
y_pred = lreg.predict(X_test)

In [55]:
print ('Accuracy: ', accuracy_score(y_test, y_pred))
print ('F1 score: ', f1_score(y_test, y_pred))
print ('Recall: ', recall_score(y_test, y_pred))
print ('Precision: ', precision_score(y_test, y_pred))
print ('\n clasification report:\n', classification_report(y_test,y_pred))
print ('\n confussion matrix:\n',confusion_matrix(y_test, y_pred))

Accuracy:  0.993760965546494
F1 score:  0.015815085158150853
Recall:  0.008552631578947369
Precision:  0.10483870967741936

 clasification report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00    257815
           1       0.10      0.01      0.02      1520

    accuracy                           0.99    259335
   macro avg       0.55      0.50      0.51    259335
weighted avg       0.99      0.99      0.99    259335


 confussion matrix:
 [[257704    111]
 [  1507     13]]


In [56]:
# Random OverSampling

from imblearn.over_sampling import RandomOverSampler

over_sample = RandomOverSampler(sampling_strategy = 1)
X_resampled_os, y_resampled_os = over_sample.fit_resample(X_train, y_train)
len(X_resampled_os)

2062708

In [None]:
# checking performane after oversampling

lreg.fit(X_resampled_os, y_resampled_os)


In [59]:
y_pred_os = lreg.predict(X_test)

In [60]:
print ('Accuracy: ', accuracy_score(y_test, y_pred_os))
print ('F1 score: ', f1_score(y_test, y_pred_os))
print ('Recall: ', recall_score(y_test, y_pred_os))
print ('Precision: ', precision_score(y_test, y_pred_os))
print ('\n clasification report:\n', classification_report(y_test,y_pred_os))
print ('\n confussion matrix:\n',confusion_matrix(y_test, y_pred_os))

Accuracy:  0.8747990051477819
F1 score:  0.07902425187916608
Recall:  0.9164473684210527
Precision:  0.04129242626352453

 clasification report:
               precision    recall  f1-score   support

           0       1.00      0.87      0.93    257815
           1       0.04      0.92      0.08      1520

    accuracy                           0.87    259335
   macro avg       0.52      0.90      0.51    259335
weighted avg       0.99      0.87      0.93    259335


 confussion matrix:
 [[225473  32342]
 [   127   1393]]


In [None]:
# SMOTE
from imblearn.over_sampling import SMOTE
smt = SMOTE(random_state=45, k_neighbors=5)
X_resampled_smt, y_resampled_smt = smt.fit_resample(X_train, y_train)
len(X_resampled_smt)

In [None]:
# checking performane after smote
lreg_smt.fit(X_resampled_smt, y_resampled_smt)
y_pred_smt = lreg_smt.predict(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
def run_exps(X_train: pd.DataFrame , y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame):
    
    dfs = []

    models = [
          ('LogReg', LogisticRegression()), 
          ('RF', RandomForestClassifier()),
          ('KNN', KNeighborsClassifier()),
          ('SVM', SVC()), 
          ('GNB', GaussianNB()),
          ('XGB', XGBClassifier())
              ]
    
    results = []
    names = []
    scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']

    target_names = ['non_fraud', 'fraud']

    for name, model in models:
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(name)
        print(classification_report(y_test, y_pred, target_names=target_names))
        names.append(name)
        this_df['model'] = name
        dfs.append(this_df)
        final = pd.concat(dfs, ignore_index=True)

    return final

In [None]:

result_dict = {}

result_dict[name].append(accuracy_score(y_test, y_pred_os))
result_dict[name].append(f1_score(y_test, y_pred_os))
result_dict[name].append(recall_score(y_test, y_pred_os))
result_dict[name].append(recall_score(y_test, y_pred_os))







In [None]:
result_disct = {}

In [None]:
models = [
          ('LogReg', LogisticRegression()), 
          ('RF', RandomForestClassifier()),
          ('KNN', KNeighborsClassifier()),
          ('SVM', SVC()), 
          ('GNB', GaussianNB()),
          ('XGB', XGBClassifier())
              ]

In [None]:
pd.Dataframe()