In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


ModuleNotFoundError: No module named 'pandas'

In [66]:
# Load the credit-card transactions CSV into the working frame `df`.
csv_path = 'Data/creditcard.csv'
df = pd.read_csv(filepath_or_buffer = csv_path)

In [75]:
# Show the working frame as the cell output.
# NOTE(review): this cell's execution count (75) is later than the cells that
# drop 'Time' and rescale 'Amount', so the stored output appears to show the
# frame after those steps rather than the raw CSV - confirm with Restart & Run All.
df

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,1.783274,0
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,-0.269825,0
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,4.983721,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,1.418291,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,0.670579,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,-0.296653,0
284803,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,0.038986,0
284804,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,0.641096,0
284805,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,-0.167680,0


In [68]:
# Remove the 'Time' column from the working frame.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' makes
# the cell idempotent: re-running it after 'Time' is already gone no longer
# raises KeyError.
df = df.drop(columns = ['Time'], errors = 'ignore')

In [69]:
# Count how many rows carry each 'Class' label.
class_counts = df['Class'].value_counts()
class_counts

0    284315
1       492
Name: Class, dtype: int64

In [70]:
# Percentage of rows labelled as fraud (Class == 1).
# Counting through a boolean mask gives 0 when no row has label 1, whereas
# indexing value_counts() with [1] raises KeyError in that case.
fraud_count = (df['Class'] == 1).sum()
per_of_fraud = (fraud_count / len(df)) * 100

Here, per_of_fraud is the percentage of fraud cases among all transactions, and its value is 0.17%.
The percentage of fraud cases is very low compared to the total number of transactions, so the dataset is highly imbalanced. From the dataframe, we can also see that the 'Amount' column varies heavily; the summary statistics below show this for each class.

In [71]:
# Summary statistics of 'Amount' over the rows whose Class is 0.
non_fraud_filt = df['Class'].eq(0)
non_fraud_df = df[non_fraud_filt]
non_fraud_amount = non_fraud_df['Amount']
non_fraud_amount.describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [72]:
# Summary statistics of 'Amount' over the rows whose Class is 1.
fraud_filt = df['Class'].eq(1)
fraud_df = df[fraud_filt]
fraud_amount = fraud_df['Amount']
fraud_amount.describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

From the statistics, we can see that the values in the 'Amount' column vary hugely, so we need to normalise them. We use RobustScaler because the data has outliers, and StandardScaler does not work well when outliers are present.

In [73]:
# Rescale 'Amount' with a RobustScaler and write the result back into `df`.
# The scaler expects a 2-D input, so the column is selected as a one-column
# frame and converted to an (n_rows, 1) array before fitting.
rbs = RobustScaler()
amount_2d = df[['Amount']].to_numpy()
df['Amount'] = rbs.fit_transform(amount_2d)

In [86]:
# Candidate estimators and the hyper-parameter grid searched for each one.
# Each entry maps a short model name to:
#   'model'  - an estimator instance passed to GridSearchCV
#   'params' - the parameter grid searched for that estimator
#
# The tree, forest and boosting estimators involve randomness, so they share a
# fixed seed; without it the cross-validated scores change from run to run.
SEED = 42

model_params = {
    'tree_model':{
        'model': DecisionTreeClassifier(random_state = SEED),
        'params':{
            'criterion': ['entropy'],
            'max_depth': [4, 5]
        }
    },
    'knn':{
        'model': KNeighborsClassifier(),
        'params':{
            'n_neighbors': [5 , 6, 7]
        }
    },
    'lr':{
        'model': LogisticRegression(),
        'params':{
            # Single value: raises the iteration cap rather than tuning it.
            'max_iter': [1000]
        }
    },
    'rf':{
        'model': RandomForestClassifier(random_state = SEED),
        'params':{
            'max_depth': [4,5]
        }
    },
    'xgb':{
        'model': XGBClassifier(random_state = SEED),
        'params':{
            'max_depth': [4,5]
        }
    }

}

In [78]:
# Split the frame into NumPy arrays: y holds the 'Class' labels and
# X holds every remaining column as features.
y = df['Class'].to_numpy()
X = df.drop(columns = ['Class']).to_numpy()

In [87]:
# Run a 5-fold cross-validated grid search for every entry in `model_params`
# and collect one summary record per model in `scores`.
#
# Only about 0.17% of the rows are fraud, so accuracy alone barely separates
# the models. Each search therefore also evaluates F1 on the positive class.
# Selection is still driven by accuracy (refit = 'accuracy'), so 'best_score'
# and 'best_params' keep their original meaning; the F1 of the selected
# parameter set is recorded under an additional key.
cv_metrics = {'accuracy': 'accuracy', 'f1': 'f1'}
scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(
        mp['model'],
        mp['params'],
        scoring = cv_metrics,
        refit = 'accuracy',
        cv = 5,
        n_jobs = -1
    )
    clf.fit(X, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
        # Mean cross-validated F1 for the same parameter set that won on accuracy.
        'f1_at_best_params': clf.cv_results_['mean_test_f1'][clf.best_index_]
    }
    )





In [88]:
# Tabulate the per-model grid-search records, keeping just the listed columns
# in this order.
summary_columns = ['model', 'best_score', 'best_params']
scores_df = pd.DataFrame(data = scores, columns = summary_columns)
scores_df

Unnamed: 0,model,best_score,best_params
0,tree_model,0.999319,"{'criterion': 'entropy', 'max_depth': 4}"
1,knn,0.999259,{'n_neighbors': 7}
2,lr,0.999126,{'max_iter': 1000}
3,rf,0.999326,{'max_depth': 5}
4,xgb,0.999396,{'max_depth': 5}


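We have calculated the cross-validated accuracy scores of multiple models (decision tree, nearest neighbors, logistic regression, random forest and XGBClassifier) while tuning the hyperparameters of each model, and looked at which parameters give the best accuracy for each model.
From the accuracy scores, XGBClassifier has the best accuracy with a max depth of 5, so we conclude that XGBClassifier is the most appropriate model for our case. Note that a model predicting every transaction as non-fraud would already reach about 99.83% accuracy on this dataset, so the differences between the models are small and should be confirmed with an imbalance-aware metric such as the F1 score.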