**Importing Libraries**

In [1]:
import pandas as pd
import numpy as np

**Importing data**

In [2]:
df=pd.read_csv(r"C:\Users\chaha\Downloads\Creditcard_data.csv")
X=df.drop(columns=['Class'])
Y=df['Class']
print(X.shape)
print(Y.shape)

(772, 30)
(772,)


**Checking for null values**

In [3]:
df.isnull().values.any()

False

**Checking if the dataset is balanced**

In [4]:
count1 = df[df['Class']==1].count().max()
count0 = df[df['Class']==0].count().max()
print(count1," objects belong to class 1")
print(count0," objects belong to class 0")

9  objects belong to class 1
763  objects belong to class 0


*Hence, the dataset is imbalanced because number of objects belonging to negative class are more*

**Splitting the dataset into two different sets of the individual classes**

In [8]:
fraud = df[df['Class']==1]
notfraud = df[df['Class']==0]

*To fix this imbalance, we use the techniques of oversampling to create a balanced dataset*

In [8]:
# Oversampling

from imblearn.over_sampling import RandomOverSampler

ROS = RandomOverSampler()
X2_res,Y2_res = ROS.fit_resample(X,Y)

In [6]:
# Error = 0.05 for simple random sampling
n = (1.96*1.96*0.5*0.5)//(0.05**2)

In [9]:
df2=X2_res
df2['Class']=Y2_res
df2

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521,539,-1.738582,0.052740,1.187057,-0.656652,0.920623,-0.291788,0.269083,0.140631,0.023464,...,-0.179545,-0.192036,-0.261879,-0.237477,-0.335040,0.240323,-0.345129,-0.383563,1.00,1
1522,484,-0.928088,0.398194,1.741131,0.182673,0.966387,-0.901004,0.879016,-0.156590,-0.142117,...,0.066353,0.281378,-0.257966,0.385384,0.391117,-0.453853,-0.104448,-0.125765,1.00,1
1523,164,0.073497,0.551033,0.451890,0.114964,0.822947,0.251480,0.296319,0.139497,-0.123050,...,-0.128758,-0.381932,0.151012,-1.363967,-1.389079,0.075412,0.231750,0.230171,0.99,1
1524,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,1


**Simple Random Sampling**

In [10]:
s1 = df2.sample(n=int(n), random_state=42)
s1

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
1439,539,-1.738582,0.052740,1.187057,-0.656652,0.920623,-0.291788,0.269083,0.140631,0.023464,...,-0.179545,-0.192036,-0.261879,-0.237477,-0.335040,0.240323,-0.345129,-0.383563,1.00,1
76,49,-0.549626,0.418949,1.729833,0.203065,-0.187012,0.253878,0.500894,0.251256,-0.227985,...,0.115062,0.418529,-0.065133,0.264981,0.003958,0.395969,0.027182,0.043506,59.99,0
1010,406,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.177840,0.261145,-0.143276,0.00,1
660,499,1.255439,0.307729,0.292700,0.699873,-0.428876,-1.088456,0.043840,-0.167739,0.128854,...,-0.294795,-0.882126,0.136846,0.327949,0.194459,0.096516,-0.027271,0.029491,1.98,0
1132,529,-2.000567,-2.495484,2.467149,1.140053,2.462010,0.594262,-2.110183,0.788347,0.958809,...,0.422452,1.195394,0.297836,-0.857105,-0.219322,0.861019,-0.124622,-0.171060,1.50,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1481,484,-0.928088,0.398194,1.741131,0.182673,0.966387,-0.901004,0.879016,-0.156590,-0.142117,...,0.066353,0.281378,-0.257966,0.385384,0.391117,-0.453853,-0.104448,-0.125765,1.00,1
756,564,-0.203837,0.532747,-0.339857,-0.730934,2.728163,3.535882,0.263680,0.919169,-0.194501,...,-0.082087,-0.271636,-0.157778,0.989458,0.228821,-0.545156,0.058120,0.035573,12.90,0
1074,529,-2.000567,-2.495484,2.467149,1.140053,2.462010,0.594262,-2.110183,0.788347,0.958809,...,0.422452,1.195394,0.297836,-0.857105,-0.219322,0.861019,-0.124622,-0.171060,1.50,1
867,529,-2.000567,-2.495484,2.467149,1.140053,2.462010,0.594262,-2.110183,0.788347,0.958809,...,0.422452,1.195394,0.297836,-0.857105,-0.219322,0.861019,-0.124622,-0.171060,1.50,1


**Bootstrap Sampling**

In [11]:
# Oversampling
s2 = df2.sample(n=int(n), replace=True, random_state=42)

s2


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
1126,484,-0.928088,0.398194,1.741131,0.182673,0.966387,-0.901004,0.879016,-0.156590,-0.142117,...,0.066353,0.281378,-0.257966,0.385384,0.391117,-0.453853,-0.104448,-0.125765,1.00,1
1459,484,-0.928088,0.398194,1.741131,0.182673,0.966387,-0.901004,0.879016,-0.156590,-0.142117,...,0.066353,0.281378,-0.257966,0.385384,0.391117,-0.453853,-0.104448,-0.125765,1.00,1
860,164,0.073497,0.551033,0.451890,0.114964,0.822947,0.251480,0.296319,0.139497,-0.123050,...,-0.128758,-0.381932,0.151012,-1.363967,-1.389079,0.075412,0.231750,0.230171,0.99,1
1294,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,1
1130,118,1.254914,0.350287,0.302488,0.693114,-0.371470,-1.070256,0.086781,-0.202836,0.035154,...,-0.287592,-0.832682,0.128083,0.339427,0.215944,0.094704,-0.023354,0.030892,2.69,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
565,423,1.174846,-0.097105,-0.361083,0.914490,0.594571,1.106664,-0.114835,0.307650,0.347846,...,-0.157769,-0.388798,-0.303408,-1.737288,0.805329,-0.221755,0.018558,-0.008570,43.34,0
569,426,-0.424126,0.943262,1.133354,-0.166338,0.387243,-0.030382,0.466045,0.275041,-0.527932,...,-0.211158,-0.582126,-0.021227,-0.379104,-0.241854,0.106796,0.251811,0.080462,4.95,0
1346,164,0.073497,0.551033,0.451890,0.114964,0.822947,0.251480,0.296319,0.139497,-0.123050,...,-0.128758,-0.381932,0.151012,-1.363967,-1.389079,0.075412,0.231750,0.230171,0.99,1
685,517,-0.639474,-0.048355,2.452755,0.310804,-0.430963,-0.290032,0.166889,0.006196,0.651675,...,0.004189,0.110847,0.057008,0.389171,-0.449642,0.218186,-0.067664,-0.073760,59.90,0


**Stratified Sampling**

In [12]:
z=1.96
p=0.5
E=0.05
S=0.7
sample_size=round((z**2*p*(1-p))/((E/S)**2))
s3=df2.groupby('Class', group_keys=False).apply(lambda x: x.sample(sample_size))
s3['Class'].value_counts()
s3

  s3=df2.groupby('Class', group_keys=False).apply(lambda x: x.sample(sample_size))


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
351,259,-1.569485,-1.932133,1.249203,-4.434211,1.244282,0.402688,-0.649554,0.534756,0.886183,...,-0.074659,0.397405,0.199030,-1.386013,-0.141955,-0.984011,0.274079,-0.019784,55.45,0
253,172,1.264699,0.168286,-0.275340,0.220631,0.459215,0.468624,-0.172794,0.209104,-0.028623,...,-0.304789,-0.908607,-0.009439,-1.376774,0.266823,0.191967,-0.016844,0.002034,1.29,0
100,68,1.156939,0.037215,0.556799,0.519507,-0.479754,-0.352714,-0.222487,0.158242,0.011252,...,-0.182662,-0.612268,0.197305,0.174883,0.032497,0.099480,-0.026816,0.004199,2.69,0
311,222,1.081027,-0.139455,0.483881,0.642057,-0.186845,0.538283,-0.302749,0.315920,0.277328,...,-0.124039,-0.190064,0.057896,-0.269354,0.253835,0.311886,0.001591,-0.003468,17.24,0
43,33,-0.935732,0.170416,2.746261,-1.077965,-0.305594,0.011577,-0.296178,0.402776,-0.040472,...,0.401212,1.064864,-0.158325,0.295505,-0.259370,0.754195,0.046664,0.093948,9.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912,574,1.257719,0.364739,0.306923,0.690638,-0.357792,-1.067481,0.094272,-0.210300,0.014455,...,-0.286856,-0.820658,0.127663,0.343128,0.221120,0.094391,-0.022189,0.030944,1.29,1
952,574,1.257719,0.364739,0.306923,0.690638,-0.357792,-1.067481,0.094272,-0.210300,0.014455,...,-0.286856,-0.820658,0.127663,0.343128,0.221120,0.094391,-0.022189,0.030944,1.29,1
244,164,0.073497,0.551033,0.451890,0.114964,0.822947,0.251480,0.296319,0.139497,-0.123050,...,-0.128758,-0.381932,0.151012,-1.363967,-1.389079,0.075412,0.231750,0.230171,0.99,1
849,472,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.00,1


**Systematic Sampling**

In [13]:
n=len(df)
k=int(n**0.5)
s4=df2.iloc[::k]
s4

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
27,23,1.322707,-0.174041,0.434555,0.576038,-0.836758,-0.831083,-0.264905,-0.220982,-1.071425,...,-0.284376,-0.323357,-0.03771,0.347151,0.559639,-0.280158,0.042335,0.028822,16.0,0
54,37,1.295668,0.341483,0.081505,0.566746,-0.110459,-0.766325,0.073155,-0.168304,0.071837,...,-0.323607,-0.929781,0.063809,-0.193565,0.287574,0.127881,-0.023731,0.0252,0.99,0
81,52,1.147369,0.059035,0.263632,1.211023,-0.044096,0.301067,-0.13296,0.227885,0.252191,...,-0.087813,-0.110756,-0.097771,-0.323374,0.633279,-0.305328,0.027394,-0.00058,6.67,0
108,73,1.162281,1.248178,-1.581317,1.475024,1.138357,-1.020373,0.638387,-0.136762,-0.805505,...,-0.124012,-0.22715,-0.199185,-0.289757,0.776244,-0.28395,0.056747,0.084706,1.0,0
135,84,1.119272,-0.669639,0.803807,-0.651693,-1.395666,-0.800698,-0.601605,0.01439,2.019905,...,0.163687,0.546516,-0.176836,0.402556,0.563402,-0.534236,0.075047,0.042001,67.3,0
162,103,-0.940893,1.074155,1.759398,-0.601446,0.101693,-0.18852,0.455756,-3.460682,0.441525,...,2.270069,-0.143518,0.153908,0.700927,-0.413235,1.374031,-0.996161,-0.836301,9.99,0
189,124,-1.710935,-1.366799,2.217311,0.404714,-0.114375,-0.075942,-0.259943,0.320897,-0.175355,...,0.390634,0.481111,0.405839,0.066433,0.156732,1.286201,-0.093975,0.098826,230.0,0
216,142,1.288256,0.085828,-1.179482,0.064357,2.195225,3.383363,-0.448437,0.799347,-0.147006,...,0.017485,-0.051355,-0.14548,1.007613,0.833293,-0.265485,0.020539,0.015394,4.9,0
243,164,-0.433211,1.020835,2.01973,3.003261,0.031308,0.187063,0.850856,-0.143932,-0.918043,...,-0.177298,-0.18026,0.00776,0.382658,-0.187193,0.100067,0.204039,-0.01815,65.26,0


**Cluster Sampling**

In [14]:
z=1.96
p=0.5
E=0.05
C=1.7
sample_size=round((z**2*p*(1-p))/((E/C)**2))
clusters=2
df_new=df2
N = len(df2)
K = int(N/sample_size)
data = None
for k in range(K):
    sample_k = df_new.sample(sample_size)
    sample_k["cluster"] = np.repeat(k,len(sample_k))
    df_new = df_new.drop(index = sample_k.index)
    data = pd.concat([data,sample_k],axis = 0)

random_chosen_clusters = np.random.randint(0,K,size = clusters)
s5 = data[data.cluster.isin(random_chosen_clusters)]
s5.drop(['cluster'], axis=1, inplace=True)
s5

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
273,194,-1.131517,1.016399,0.735810,1.166614,0.790236,-1.187196,0.736469,-0.327992,-0.555549,...,0.009613,0.315739,0.054210,0.294232,0.003877,-0.314159,-0.099512,0.122697,1.00,0
985,118,1.254914,0.350287,0.302488,0.693114,-0.371470,-1.070256,0.086781,-0.202836,0.035154,...,-0.287592,-0.832682,0.128083,0.339427,0.215944,0.094704,-0.023354,0.030892,2.69,1
362,266,-2.564961,2.470985,2.649417,-1.564256,1.794297,-0.614742,4.185906,-3.855359,5.436633,...,-1.672706,-0.463149,-0.532466,0.306494,0.226844,-0.365416,-0.936735,-2.733887,10.35,0
1335,539,-1.738582,0.052740,1.187057,-0.656652,0.920623,-0.291788,0.269083,0.140631,0.023464,...,-0.179545,-0.192036,-0.261879,-0.237477,-0.335040,0.240323,-0.345129,-0.383563,1.00,1
101,68,-0.770196,0.483572,1.901072,-0.001876,0.024245,-1.171903,0.666483,-0.094603,-0.136666,...,-0.057336,-0.155801,0.018108,0.941555,-0.168582,0.056171,0.058979,0.134188,22.36,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
725,548,-2.565417,1.285936,1.446036,0.685430,0.496797,0.970669,-1.994767,-5.916808,0.893287,...,5.200062,-1.191485,-0.057648,-0.457314,-0.492138,-0.330366,0.404640,-0.378188,41.96,0
506,373,-1.254476,0.322250,1.784507,-0.307396,1.009086,-0.998491,0.328482,0.131284,-0.485255,...,-0.160980,-0.778999,-0.025359,0.000828,0.154781,0.042034,-0.019550,0.081176,1.79,0
1286,484,-0.928088,0.398194,1.741131,0.182673,0.966387,-0.901004,0.879016,-0.156590,-0.142117,...,0.066353,0.281378,-0.257966,0.385384,0.391117,-0.453853,-0.104448,-0.125765,1.00,1
667,504,-1.031079,0.626547,2.622436,1.411312,-0.296553,1.122952,0.286100,0.088802,1.103565,...,-0.446253,-0.374942,-0.075503,0.207437,-0.151028,-0.471124,-0.039909,-0.191513,24.90,0


In [15]:
samples=[]
samples.append(s1)
samples.append(s2)
samples.append(s3)
samples.append(s4)
samples.append(s5)

In [16]:
sol=pd.DataFrame(columns=['Simple-Random','Bootstrap','Stratified','Systematic','Cluster'], index=['Logistic Regression','Decision Tree','Random Forest','Naive Bayes','SVM'])

**Logistic Regression**

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [18]:
from sklearn.linear_model import LogisticRegression

for i in range(5):
    x_s=samples[i].drop('Class',axis=1)
    y_s=samples[i]['Class']
    xtrain, xtest, y_train, y_test = train_test_split(x_s ,y_s , random_state=104,test_size=0.25, shuffle=True)

    model = LogisticRegression(random_state = 0,max_iter=2000)
    model.fit(xtrain, y_train)
    y_pred = model.predict(xtest)
    acc = accuracy_score(y_test, y_pred)
    sol.iloc[0,i]=acc*100


**Decision Trees**

In [19]:
from sklearn.tree import DecisionTreeClassifier

for i in range(5):
    x_s=samples[i].drop('Class',axis=1)
    y_s=samples[i]['Class']
    xtrain, xtest, y_train, y_test = train_test_split(x_s ,y_s , random_state=104,test_size=0.25, shuffle=True)
    
    model = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    model.fit(xtrain, y_train)
    y_pred = model.predict(xtest)
    acc = accuracy_score(y_test, y_pred)
    sol.iloc[1,i]=acc*100

**Random Forest**

In [20]:
from sklearn.ensemble import RandomForestClassifier

for i in range(5):
    x_s=samples[i].drop('Class',axis=1)
    y_s=samples[i]['Class']
    xtrain, xtest, y_train, y_test = train_test_split(x_s ,y_s , random_state=104,test_size=0.25, shuffle=True)

    model = RandomForestClassifier(n_estimators = 100)
    model.fit(xtrain, y_train)
    y_pred = model.predict(xtest)
    acc = accuracy_score(y_test, y_pred)
    sol.iloc[2,i]=acc*100

**Naive Bayes**

In [21]:
from sklearn.naive_bayes import GaussianNB

for i in range(5):
    x_s=samples[i].drop('Class',axis=1)
    y_s=samples[i]['Class']
    xtrain, xtest, y_train, y_test = train_test_split(x_s ,y_s , random_state=104,test_size=0.25, shuffle=True)

    model = GaussianNB()
    model.fit(xtrain,y_train)
    y_pred= model.predict(xtest)
    acc = accuracy_score(y_test, y_pred)
    sol.iloc[3,i]=acc*100

**Support Vector Machines**

In [22]:
from sklearn.svm import SVC

for i in range(5):
    x_s=samples[i].drop('Class',axis=1)
    y_s=samples[i]['Class']
    xtrain, xtest, y_train, y_test = train_test_split(x_s ,y_s , random_state=104,test_size=0.25, shuffle=True)

    model = SVC(kernel='rbf')
    model.fit(xtrain, y_train)
    y_pred=model.predict(xtest)
    acc = accuracy_score(y_test, y_pred)
    sol.iloc[4,i]=acc*100

In [23]:
print(sol)

                    Simple-Random  Bootstrap Stratified Systematic    Cluster
Logistic Regression        90.625  91.666667  87.234043  86.666667  93.165468
Decision Tree           94.791667  97.916667   97.87234       80.0  98.561151
Random Forest               100.0      100.0      100.0  93.333333      100.0
Naive Bayes                84.375  64.583333  71.276596  46.666667  63.309353
SVM                     76.041667  73.958333  63.829787  53.333333  71.942446


In [26]:
for i in range(sol.shape[0]):  # Iterate through each row (model)
    model_name = sol.index[i]  # Get the model name
    best_sample_name = sol.iloc[i].idxmax()  # Get the column name with the highest accuracy
    best_accuracy = sol.iloc[i].max()  # Get the best accuracy value
    print(f"{model_name}: Best sampling is '{best_sample_name}' with accuracy {best_accuracy:.2f}%")


Logistic Regression: Best sampling is 'Cluster' with accuracy 93.17%
Decision Tree: Best sampling is 'Cluster' with accuracy 98.56%
Random Forest: Best sampling is 'Simple-Random' with accuracy 100.00%
Naive Bayes: Best sampling is 'Simple-Random' with accuracy 84.38%
SVM: Best sampling is 'Simple-Random' with accuracy 76.04%


*Random Forest has the best overall performance*