In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/creditcardfraud/creditcard.csv


In [2]:
from sklearn.preprocessing import StandardScaler # data normalization
from sklearn.model_selection import train_test_split # data split
from sklearn.ensemble import RandomForestClassifier # Random forest tree algorithm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, confusion_matrix

In [3]:
df = pd.read_csv("/kaggle/input/creditcardfraud/creditcard.csv")

In [4]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
df.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [6]:
df.drop('Time', axis = 1, inplace = True)

In [7]:
df['Class'].value_counts()
#1=fraud, 0=notfraud

0    284315
1       492
Name: Class, dtype: int64

In [8]:
nonfraud = df[df.Class==0]
fraud = df[df.Class==1]

print("Amount stats for non fraud class")
print(nonfraud.Amount.describe())

print("\nAmount stats for fraud class")
print(fraud.Amount.describe())

Amount stats for non fraud class
count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

Amount stats for fraud class
count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64


Values of amount variable are varying a lot, needs normalising
## standard scaling

In [9]:

sc = StandardScaler()

amt = df['Amount'].values

df['Amount'] = sc.fit_transform(amt.reshape(-1,1))

df['Amount']

0         0.244964
1        -0.342475
2         1.160686
3         0.140534
4        -0.073403
            ...   
284802   -0.350151
284803   -0.254117
284804   -0.081839
284805   -0.313249
284806    0.514355
Name: Amount, Length: 284807, dtype: float64

In [10]:
X = df.drop('Class',axis=1)
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, stratify =y, random_state=1)

In [11]:
X_train[:1]

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
9341,1.148607,-0.004154,-0.231106,1.124256,0.696077,1.177523,-0.160142,0.201478,1.568269,-0.32029,...,-0.122816,-0.294334,-0.55062,-0.309693,-1.773489,0.791006,-0.263954,-0.008489,-0.011284,-0.118142


# Unbalanced data

In [12]:
rf = RandomForestClassifier(max_depth = 4)
rf.fit(X_train, y_train)
rf_ypred = rf.predict(X_test)

In [13]:
print(classification_report(y_test, rf_ypred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.91      0.80      0.85        98

    accuracy                           1.00     56962
   macro avg       0.95      0.90      0.92     56962
weighted avg       1.00      1.00      1.00     56962



# SMOTE - oversampling

In [14]:
from imblearn.over_sampling import SMOTE

In [15]:
sm = SMOTE(sampling_strategy=1, k_neighbors=5, random_state=7)
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)

In [16]:
y_train_over.value_counts()

0    227451
1    227451
Name: Class, dtype: int64

In [17]:
rf2 = RandomForestClassifier()
rf2.fit(X_train_over, y_train_over)
rf2_ypred = rf2.predict(X_test)

In [18]:
print(classification_report(y_test, rf2_ypred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.83      0.87      0.85        98

    accuracy                           1.00     56962
   macro avg       0.91      0.93      0.92     56962
weighted avg       1.00      1.00      1.00     56962



# Undersampling

In [19]:
from imblearn.under_sampling import RandomUnderSampler

In [20]:
rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)

In [21]:
y_train_un.value_counts()

0    394
1    394
Name: Class, dtype: int64

In [22]:
rf3 = RandomForestClassifier()
rf3.fit(X_train_un, y_train_un)
rf3_ypred = rf3.predict(X_test)

In [23]:
print(classification_report(y_test, rf3_ypred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     56864
           1       0.06      0.93      0.11        98

    accuracy                           0.98     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.98      0.99     56962



In [24]:
print(confusion_matrix(y_test,rf3_ypred))

[[55462  1402]
 [    7    91]]


# Class weightage

In [25]:
count_class_1 = y_train.value_counts()[0]
count_class_2 = y_train.value_counts()[1]
ratio = count_class_1/count_class_2
rf4 = RandomForestClassifier(class_weight={1:ratio, 0:1})

In [26]:
rf4.fit(X_train, y_train)
rf4_ypred = rf4.predict(X_test)

In [27]:
print(classification_report(y_test,rf4_ypred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.91      0.85      0.88        98

    accuracy                           1.00     56962
   macro avg       0.96      0.92      0.94     56962
weighted avg       1.00      1.00      1.00     56962



In [28]:
print(confusion_matrix(y_test,rf4_ypred))

[[56856     8]
 [   15    83]]


In [29]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score,balanced_accuracy_score

In [30]:
acc=[]
acc.append(accuracy_score(y_test, rf_ypred))
acc.append(accuracy_score(y_test, rf2_ypred))
acc.append(accuracy_score(y_test, rf3_ypred))
acc.append(accuracy_score(y_test, rf4_ypred))

In [31]:
bacc=[]  #average of recall obtained on each class
bacc.append(balanced_accuracy_score(y_test, rf_ypred))
bacc.append(balanced_accuracy_score(y_test, rf2_ypred))
bacc.append(balanced_accuracy_score(y_test, rf3_ypred))
bacc.append(balanced_accuracy_score(y_test, rf4_ypred))

In [32]:
rfm = precision_recall_fscore_support(y_test, rf_ypred, average='macro')
rfm2 = precision_recall_fscore_support(y_test, rf2_ypred, average='macro')
rfm3 = precision_recall_fscore_support(y_test, rf3_ypred, average='macro')
rfm4 = precision_recall_fscore_support(y_test, rf4_ypred, average='macro')


In [33]:
precision = [rfm[0],rfm2[0],rfm3[0],rfm4[0]]
recall = [rfm[1],rfm2[1],rfm3[1],rfm4[1]]
f1 = [rfm[2],rfm2[2],rfm3[2],rfm4[2]]
f1 = [rfm[2],rfm2[2],rfm3[2],rfm4[2]]


In [34]:
metrics = pd.DataFrame()
metrics['Models'] = ['Unbalanced','SMOTE','Undersampling','ClassWeightage']


In [35]:
metrics['Precision'] = precision
metrics['Recall'] = recall
metrics['F1score'] = f1
metrics['Accuracy'] = acc
metrics['balanced_accuracy_score'] = bacc
metrics

Unnamed: 0,Models,Precision,Recall,F1score,Accuracy,balanced_accuracy_score
0,Unbalanced,0.953313,0.897889,0.92379,0.999508,0.897889
1,SMOTE,0.912507,0.933515,0.922749,0.999456,0.933515
2,Undersampling,0.530412,0.951958,0.550925,0.975264,0.951958
3,ClassWeightage,0.955912,0.923399,0.939052,0.999596,0.923399


In [36]:
import plotly.express as px

In [37]:
metricsdf=pd.melt(metrics,id_vars=['Models'],var_name='metrics', value_name='value')
metricsdf

Unnamed: 0,Models,metrics,value
0,Unbalanced,Precision,0.953313
1,SMOTE,Precision,0.912507
2,Undersampling,Precision,0.530412
3,ClassWeightage,Precision,0.955912
4,Unbalanced,Recall,0.897889
5,SMOTE,Recall,0.933515
6,Undersampling,Recall,0.951958
7,ClassWeightage,Recall,0.923399
8,Unbalanced,F1score,0.92379
9,SMOTE,F1score,0.922749


In [38]:
fig = px.bar(metricsdf, x="Models", color="metrics",
             y='value',
             title="A Grouped Bar Chart With Plotly Express in Python",
             barmode='group',
             
            )

fig.show()

In [39]:
fig = px.bar(metricsdf, x="Models", color="metrics",
             y='value',
             barmode='relative',
            )
fig.show()

In [40]:
fig = px.bar(metricsdf, x="Models", color="metrics",
             y='value',
             title="A Grouped Bar Chart With Plotly Express in Python",
             barmode='group',
             facet_col='metrics'
            )

fig.show()

## after evaluating all the modelling methods for imbalanced classes, Class weightage method performs the best.
### Another advantage of this method is its simplicity; while SMOTE takes a significant amount of time to run, class weights does not add to the time complexity of the machine learning pipeline.