In [1]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from imblearn.over_sampling import RandomOverSampler
import math
import random

In [2]:
data = pd.read_csv("/content/Creditcard_data.csv")

In [3]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
Amount = normalize([data['Amount']])[0]
data['Amount'] = Amount
data = data.iloc[:, 1:]

In [5]:
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.025729,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0.000463,1
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0.065115,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.021237,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0.012036,0


In [6]:
X = data.drop(['Class'], axis = 1)
y = data['Class']

# Balancing dataset using over-sampling technique


In [7]:
oversampler = RandomOverSampler(random_state = 42)

X_balanced, y_balanced = oversampler.fit_resample(X,y)

In [8]:
X_dummy, X_test, y_dummy, y_test = train_test_split(X_balanced, y_balanced, test_size=0.33, random_state=42)

In [9]:
data_new = pd.concat([X_balanced, y_balanced], axis = 1)
data_new

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.025729,0
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,0.000463,1
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0.065115,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.021237,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,0.012036,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521,-2.000567,-2.495484,2.467149,1.140053,2.462010,0.594262,-2.110183,0.788347,0.958809,-0.328631,...,0.422452,1.195394,0.297836,-0.857105,-0.219322,0.861019,-0.124622,-0.171060,0.000258,1
1522,0.073497,0.551033,0.451890,0.114964,0.822947,0.251480,0.296319,0.139497,-0.123050,-0.142617,...,-0.128758,-0.381932,0.151012,-1.363967,-1.389079,0.075412,0.231750,0.230171,0.000170,1
1523,0.073497,0.551033,0.451890,0.114964,0.822947,0.251480,0.296319,0.139497,-0.123050,-0.142617,...,-0.128758,-0.381932,0.151012,-1.363967,-1.389079,0.075412,0.231750,0.230171,0.000170,1
1524,-1.738582,0.052740,1.187057,-0.656652,0.920623,-0.291788,0.269083,0.140631,0.023464,-0.066407,...,-0.179545,-0.192036,-0.261879,-0.237477,-0.335040,0.240323,-0.345129,-0.383563,0.000172,1


# Defining model in functions

In [10]:
#Logistic Regression
def LR(X_train, y_train, X_test, y_test):
  lf = LogisticRegression()
  lf.fit(X_train, y_train)
  y_pred = lf.predict(X_test)
  accuracy = accuracy_score(y_test,y_pred)
  return accuracy

In [11]:
#Decision Trees
def DT(X_train, y_train, X_test, y_test):
  tree = DecisionTreeClassifier(max_depth = 2).fit(X_train, y_train)
  y_pred = tree.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  return accuracy

In [12]:
#Random Forest
def RF(X_train, y_train, X_test, y_test):
  rfc = RandomForestClassifier(n_estimators=2)
  rfc.fit(X_train, y_train)
  y_pred = rfc.predict(X_test)
  accuracy = accuracy_score(y_test,y_pred)
  return accuracy

In [13]:
#SVM
def SVM(X_train, y_train, X_test, y_test):
  svm=SGDClassifier(loss = 'hinge')
  svm.fit(X_train, y_train)
  y_pred = svm.predict(X_test)
  accuracy = accuracy_score(y_test,y_pred)
  return accuracy

In [14]:
#Gradient Boosting
def GB(X_train, y_train, X_test, y_test):
  gradient = GradientBoostingClassifier().fit(X_train, y_train)
  y_pred = gradient.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  return accuracy

In [15]:
z = 1.96
p = 0.5
e = 0.05

# Simple Random Sampling


In [16]:
n_srs = (z**2*p*(1-p))/(e**2)
n_srs = math.floor(n_srs)
print(n_srs)

384


In [17]:
def randomSampling(data, n_srs):
  random_sample = data.sample(n=n_srs)
  return (random_sample)
srs = randomSampling(data_new, n_srs)

In [18]:
srs.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
144,1.276134,0.21147,0.143001,0.37005,0.154748,-0.032255,-0.00303,-0.038887,0.030715,-0.121891,...,-0.279654,-0.747704,0.056808,-0.759464,0.271641,0.174757,-0.01077,0.008104,0.00034,0
1168,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,-2.772272,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.17784,0.261145,-0.143276,0.0,1
332,-0.398935,0.941953,1.359288,0.01349,-0.19194,-0.824441,0.474948,0.175557,-0.362987,-0.283274,...,-0.22221,-0.704461,0.020807,0.429138,-0.286397,0.048579,0.229096,0.085802,0.000463,0
956,-0.928088,0.398194,1.741131,0.182673,0.966387,-0.901004,0.879016,-0.15659,-0.142117,-0.574775,...,0.066353,0.281378,-0.257966,0.385384,0.391117,-0.453853,-0.104448,-0.125765,0.000172,1
928,-1.738582,0.05274,1.187057,-0.656652,0.920623,-0.291788,0.269083,0.140631,0.023464,-0.066407,...,-0.179545,-0.192036,-0.261879,-0.237477,-0.33504,0.240323,-0.345129,-0.383563,0.000172,1


In [19]:
X_srs = srs.drop('Class', axis = 1)
y_srs = srs['Class']
len(y_srs)

384

In [20]:
srs_lr = LR(X_srs, y_srs, X_test, y_test)
srs_dt = DT(X_srs, y_srs, X_test, y_test)
srs_rf = RF(X_srs, y_srs, X_test, y_test)
srs_svm = SVM(X_srs, y_srs, X_test, y_test)
srs_gb = GB(X_srs, y_srs, X_test, y_test)

d1 = [srs_lr, srs_dt, srs_rf, srs_svm, srs_gb]
d1

[0.9047619047619048,
 0.9345238095238095,
 0.9940476190476191,
 0.9166666666666666,
 0.9841269841269841]

# Systematic Sampling

In [21]:
interval = 4
systematic_data = data_new.iloc[::interval]
systematic_data

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.025729,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,0.012036,0
8,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,-0.410430,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,0.016027,0
12,1.249999,-1.221637,0.383930,-1.234899,-1.485419,-0.753230,-0.689405,-0.227487,-2.094011,1.323729,...,-0.231809,-0.483285,0.084668,0.392831,0.161135,-0.354990,0.026416,0.042422,0.020893,0
16,1.103215,-0.040296,1.267332,1.289091,-0.735997,0.288069,-0.586057,0.189380,0.782333,-0.267975,...,-0.024612,0.196002,0.013802,0.103758,0.364298,-0.382261,0.092809,0.037051,0.002234,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1508,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,-0.838587,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,0.090968,1
1512,-0.928088,0.398194,1.741131,0.182673,0.966387,-0.901004,0.879016,-0.156590,-0.142117,-0.574775,...,0.066353,0.281378,-0.257966,0.385384,0.391117,-0.453853,-0.104448,-0.125765,0.000172,1
1516,-2.000567,-2.495484,2.467149,1.140053,2.462010,0.594262,-2.110183,0.788347,0.958809,-0.328631,...,0.422452,1.195394,0.297836,-0.857105,-0.219322,0.861019,-0.124622,-0.171060,0.000258,1
1520,1.257719,0.364739,0.306923,0.690638,-0.357792,-1.067481,0.094272,-0.210300,0.014455,-0.286012,...,-0.286856,-0.820658,0.127663,0.343128,0.221120,0.094391,-0.022189,0.030944,0.000222,1


In [22]:
X_ss = systematic_data.drop('Class', axis=1)
y_ss = systematic_data['Class']

In [23]:
ss_lr = LR(X_ss, y_ss, X_test, y_test)
ss_dt = DT(X_ss, y_ss, X_test, y_test)
ss_rf = RF(X_ss, y_ss, X_test, y_test)
ss_svm = SVM(X_ss, y_ss, X_test, y_test)
ss_gb = GB(X_ss, y_ss, X_test, y_test)

d2 = [ss_lr, ss_dt, ss_rf, ss_svm, ss_gb]
d2

[0.9146825396825397,
 0.9265873015873016,
 0.996031746031746,
 0.9464285714285714,
 0.9920634920634921]

#Stratified Sampling

In [24]:
s = 2
n_strat = (z**2*p*(1-p))/(e**2/s**2)
n_strat = math.floor(n_strat)
print(n_strat)

1536


In [25]:
strata = data_new.groupby('Class')
stratified_data = strata.apply(lambda x: x.sample(n_srs))
stratified_data

Unnamed: 0_level_0,Unnamed: 1_level_0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,606,1.294157,1.132084,-2.148605,1.230031,1.447894,-0.767676,0.575635,-0.094475,-0.454617,-1.349215,...,-0.257463,-0.726440,-0.374828,-1.358994,0.968892,-0.225584,0.027084,0.071322,0.000514,0
0,451,-0.386633,0.953379,1.851726,1.623108,-0.603151,0.204916,0.438894,-0.313317,0.359461,0.585055,...,0.141432,0.996964,-0.205808,0.481312,-0.224335,-0.105636,-0.104202,-0.070726,0.009589,0
0,323,-0.438337,1.028364,1.455502,-0.230275,0.198899,-0.358325,0.659133,0.059988,-0.741863,-0.167486,...,-0.187516,-0.485442,-0.074138,-0.002586,-0.153928,0.071891,0.264233,0.109265,0.001228,0
0,368,1.320626,-1.174338,0.713431,-2.177941,-1.511234,0.059314,-1.308064,0.298796,0.788814,-0.200099,...,-0.354221,-0.365280,0.064822,-0.322003,0.152701,-0.003131,0.075103,0.010941,0.000447,0
0,335,1.311258,0.075028,-1.292832,-0.483154,2.139257,3.181564,-0.505045,0.804860,-0.074071,-0.217426,...,-0.342676,-1.148504,0.095279,0.946121,0.369713,0.110626,-0.021837,0.023638,0.000340,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,830,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,-2.772272,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.177840,0.261145,-0.143276,0.000000,1
1,1285,1.257719,0.364739,0.306923,0.690638,-0.357792,-1.067481,0.094272,-0.210300,0.014455,-0.286012,...,-0.286856,-0.820658,0.127663,0.343128,0.221120,0.094391,-0.022189,0.030944,0.000222,1
1,1337,1.254914,0.350287,0.302488,0.693114,-0.371470,-1.070256,0.086781,-0.202836,0.035154,-0.282617,...,-0.287592,-0.832682,0.128083,0.339427,0.215944,0.094704,-0.023354,0.030892,0.000463,1
1,1081,1.257719,0.364739,0.306923,0.690638,-0.357792,-1.067481,0.094272,-0.210300,0.014455,-0.286012,...,-0.286856,-0.820658,0.127663,0.343128,0.221120,0.094391,-0.022189,0.030944,0.000222,1


In [26]:
X_st = stratified_data.drop('Class', axis=1) #X_train
y_st = stratified_data['Class']

In [27]:
st_lr = LR(X_st, y_st, X_test, y_test)
st_dt = DT(X_st, y_st, X_test, y_test)
st_rf = RF(X_st, y_st, X_test, y_test)
st_svm = SVM(X_st, y_st, X_test, y_test)
st_gb = GB(X_st, y_st, X_test, y_test)

d3 = [st_lr, st_dt, st_rf, st_svm, st_gb]
d3

[0.9186507936507936, 0.9325396825396826, 1.0, 0.8809523809523809, 1.0]

#Clustering Sample

In [28]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10, random_state=42).fit(data_new)
cluster_assignments = kmeans.labels_
cluster = [1,3,5,6,8]
cluster_series = pd.Series(cluster_assignments)
cluster_data = data_new[cluster_series.isin(cluster)]
cluster_data.head()



Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.025729,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0.065115,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.021237,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0.012036,0
5,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,-0.371407,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,0.000631,0


In [29]:
X_c = cluster_data.drop('Class', axis = 1)
y_c = cluster_data['Class']

In [30]:
cs_lr = LR(X_c, y_c, X_test, y_test)
cs_dt = DT(X_c, y_c, X_test, y_test)
cs_rf = RF(X_c, y_c, X_test, y_test)
cs_svm = SVM(X_c, y_c, X_test, y_test)
cs_gb = GB(X_c, y_c, X_test, y_test)

d4 = [cs_lr, cs_dt, cs_rf, cs_svm, cs_gb]
d4

[0.7638888888888888,
 0.7936507936507936,
 0.7738095238095238,
 0.7698412698412699,
 0.7698412698412699]

#Bootstrap Sampling

In [31]:
bootstrap_samples = 20
a1 = []
a2 = []
a3 = []
a4 = []
a5 = []

In [32]:
def bootstrap(classifier, alist):
  for i in range(bootstrap_samples):
    indices = np.random.choice(range(len(data_new)), size=len(data_new), replace = True)
    X_bs = data_new.iloc[indices].drop('Class', axis=1)
    y_bs = data_new['Class'].iloc[indices]

    alist = classifier(X_bs, y_bs, X_test, y_test)
  mean_acc = np.mean(alist)
  return mean_acc

In [33]:
bs_lr = bootstrap(LR, a1)
bs_dt = bootstrap(DT, a2)
bs_rf = bootstrap(RF, a3)
bs_svm = bootstrap(SVM, a4)
bs_gb = bootstrap(GB, a5)

d5 = [bs_lr, bs_dt, bs_rf, bs_svm, bs_gb]
d5

[0.9246031746031746,
 0.9365079365079365,
 1.0,
 0.9424603174603174,
 0.996031746031746]

In [34]:
result = {
    'Model' : ['Logistic Regression','Decision Tree','Random Tree','SVC','Gradient Boosting'],
    'Simple Random Sampling' : d1,
    'Systematic Sampling' : d2,
    'Stratified Sampling' : d3,
    'Cluster Sampling' : d4,
    'Bootstrap Sampling' : d5
}

In [35]:
result_df = pd.DataFrame(result)

# Final Result Table

In [36]:
result_df

Unnamed: 0,Model,Simple Random Sampling,Systematic Sampling,Stratified Sampling,Cluster Sampling,Bootstrap Sampling
0,Logistic Regression,0.904762,0.914683,0.918651,0.763889,0.924603
1,Decision Tree,0.934524,0.926587,0.93254,0.793651,0.936508
2,Random Tree,0.994048,0.996032,1.0,0.77381,1.0
3,SVC,0.916667,0.946429,0.880952,0.769841,0.94246
4,Gradient Boosting,0.984127,0.992063,1.0,0.769841,0.996032
