# Sampling
### types of sampling : 
- simple random sampling
- systematic sampling
- stratified sampling
- cluster sampling

## Handling imbalanced datasets : 
- oversampling => reuse the samples and SMOTE
- undersampling
- ensemble different resampled datasets
- class weights / change evaluation metrics

## Dealing with Credit Card Fraud Detection

In [62]:
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import random as rd

In [2]:
df = pd.read_csv('Creditcard_data.csv')

In [3]:
df


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767,575,-0.572263,0.731748,1.541254,0.150506,1.108974,0.372152,1.084879,-0.146329,-0.274447,...,-0.143508,-0.107582,-0.418263,-0.731029,0.877525,-0.364150,-0.177509,-0.256545,26.72,0
768,579,-1.296845,-0.511605,2.404726,-0.310762,-0.319551,-0.542842,-0.173310,0.260423,-1.202688,...,-0.071270,-0.161175,0.088496,0.285390,0.281069,-0.370130,0.043410,0.092318,80.00,0
769,579,1.214170,0.210481,0.484651,0.479768,-0.261955,-0.527039,0.021782,-0.106888,-0.037631,...,-0.224292,-0.594609,0.159877,0.091873,0.140964,0.227406,-0.017389,0.016030,5.98,0
770,580,1.267030,-0.071114,0.037680,0.512683,0.242392,0.705212,-0.226582,0.109483,0.657565,...,-0.164468,-0.177225,-0.222918,-1.245505,0.678360,0.525059,0.002920,-0.003333,12.36,0


In [4]:
print(df.shape)

(772, 31)


In [5]:
print("Overview of the data:" )
df.head()

Overview of the data:


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [6]:
print("Description of the dataset:" )
df.describe()

Description of the dataset:


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,772.0,772.0,772.0,772.0,772.0,772.0,772.0,772.0,772.0,772.0,...,772.0,772.0,772.0,772.0,772.0,772.0,772.0,772.0,772.0,772.0
mean,283.005181,-0.176963,0.217169,0.875172,0.285628,-0.005029,0.159081,0.123329,-0.057547,-0.030384,...,0.004888,-0.096995,-0.040344,-0.002501,0.114337,0.022782,0.023353,-0.017045,68.66829,0.011658
std,171.834196,1.294724,1.173401,1.031878,1.258758,1.098143,1.225682,0.852075,0.830144,0.878183,...,0.609335,0.607228,0.358724,0.621507,0.429667,0.484227,0.300934,0.278332,197.838269,0.107411
min,0.0,-6.093248,-12.114213,-5.694973,-4.657545,-6.631951,-3.498447,-4.925568,-7.494658,-2.770089,...,-4.134608,-2.776923,-3.553381,-1.867208,-1.389079,-1.243924,-2.377933,-2.735623,0.0,0.0
25%,126.5,-0.896416,-0.174684,0.308677,-0.460058,-0.534567,-0.630717,-0.296289,-0.16788,-0.517068,...,-0.213746,-0.525289,-0.176915,-0.379766,-0.166227,-0.313631,-0.047868,-0.033083,5.9875,0.0
50%,282.0,-0.382618,0.285843,0.905435,0.395919,-0.116612,-0.109581,0.116329,0.034755,-0.08227,...,-0.075802,-0.076551,-0.048353,0.091886,0.143723,-0.026414,0.023199,0.021034,16.665,0.0
75%,432.0,1.110739,0.885745,1.532969,1.117559,0.452818,0.482972,0.57539,0.252395,0.412261,...,0.095149,0.307438,0.070085,0.426339,0.425798,0.260408,0.112199,0.087023,55.5275,0.0
max,581.0,1.586093,5.267376,3.772857,4.075817,7.672544,5.122103,4.808426,2.134599,5.459274,...,5.27342,1.57475,3.150413,1.215279,1.13672,3.087444,2.490503,1.57538,3828.04,1.0


In [7]:
class_counts = df['Class'].value_counts()
class_counts_percentage = df['Class'].value_counts(normalize=True) * 100
print("Class counts: \n", class_counts)
#print("\nClass counts percentage: \n", class_counts_percentage)

Class counts: 
 0    763
1      9
Name: Class, dtype: int64


In [8]:
df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,282.036697,-0.170866,0.221982,0.877498,0.277333,-0.012087,0.167587,0.128422,-0.060726,-0.027426,...,0.062458,0.004213,-0.096844,-0.042388,-0.000297,0.116503,0.021647,0.024137,-0.016615,68.770328
1,365.111111,-0.693891,-0.190851,0.67797,0.988833,0.593371,-0.562057,-0.30837,0.211968,-0.281084,...,0.274256,0.062134,-0.109865,0.132977,-0.18934,-0.069308,0.11893,-0.043178,-0.053463,60.017778


In [9]:
x = df.iloc[:,:30]
y = df.iloc[:,30:31]

In [10]:
def all_models() :
    all_models = dict()
    all_models['Bagging_Classifier']= BaggingClassifier()
    all_models['LGBMClassifier']= LGBMClassifier()
    all_models['RF']= RandomForestClassifier()
    all_models['KNN'] = KNeighborsClassifier()
    all_models['LogisticRegression'] = LogisticRegression()
    return all_models

def get_model_scores(x_train,y_train,x_test,y_test) :
    
    estimators = all_models()
    
    for name,model in estimators.items() :
        model.fit(x_train,y_train)
        y_pred = model.predict(x_test)
        print('*****************************************************************')
        print('For the model : ',name)
        print('Classification Report : \n',classification_report(y_test,y_pred))
        print('*****************************************************************')

        
        
   
    
    
    
    
    

## Without any Data Preprocessing

In [11]:
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state = 0)

In [12]:
x_train.shape,y_train.shape

((579, 30), (579, 1))

In [13]:
get_model_scores(x_train,y_train,x_test,y_test)

*****************************************************************
For the model :  Bagging_Classifier
Classification Report : 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       191
           1       0.00      0.00      0.00         2

    accuracy                           0.99       193
   macro avg       0.49      0.50      0.50       193
weighted avg       0.98      0.99      0.98       193

*****************************************************************
*****************************************************************
For the model :  LGBMClassifier
Classification Report : 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       191
           1       0.00      0.00      0.00         2

    accuracy                           0.99       193
   macro avg       0.49      0.50      0.50       193
weighted avg       0.98      0.99      0.98       193

****************************

## Stratified Sampling

In [14]:
count_class_0,count_class_1 = df.Class.value_counts()
df_class_0 = df[df['Class'] == 0]
df_class_1 = df[df['Class'] == 1]


In [15]:
df_class_0.shape,df_class_1.shape

((763, 31), (9, 31))

In [16]:
df_class_0 = df_class_0.sample(count_class_1)

In [17]:
df_class_1

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
182,118,1.254914,0.350287,0.302488,0.693114,-0.37147,-1.070256,0.086781,-0.202836,0.035154,...,-0.287592,-0.832682,0.128083,0.339427,0.215944,0.094704,-0.023354,0.030892,2.69,1
244,164,0.073497,0.551033,0.45189,0.114964,0.822947,0.25148,0.296319,0.139497,-0.12305,...,-0.128758,-0.381932,0.151012,-1.363967,-1.389079,0.075412,0.23175,0.230171,0.99,1
541,406,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.17784,0.261145,-0.143276,0.0,1
623,472,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.0,1
639,484,-0.928088,0.398194,1.741131,0.182673,0.966387,-0.901004,0.879016,-0.15659,-0.142117,...,0.066353,0.281378,-0.257966,0.385384,0.391117,-0.453853,-0.104448,-0.125765,1.0,1
699,529,-2.000567,-2.495484,2.467149,1.140053,2.46201,0.594262,-2.110183,0.788347,0.958809,...,0.422452,1.195394,0.297836,-0.857105,-0.219322,0.861019,-0.124622,-0.17106,1.5,1
717,539,-1.738582,0.05274,1.187057,-0.656652,0.920623,-0.291788,0.269083,0.140631,0.023464,...,-0.179545,-0.192036,-0.261879,-0.237477,-0.33504,0.240323,-0.345129,-0.383563,1.0,1
766,574,1.257719,0.364739,0.306923,0.690638,-0.357792,-1.067481,0.094272,-0.2103,0.014455,...,-0.286856,-0.820658,0.127663,0.343128,0.22112,0.094391,-0.022189,0.030944,1.29,1


In [18]:
concat_df = pd.concat([df_class_0,df_class_1],ignore_index = True)

In [19]:
concat_df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,496,-2.009893,0.522693,2.04351,0.223521,-0.597859,0.078163,-0.450347,0.772471,0.532385,...,-0.126642,0.080104,0.272598,0.088534,0.317082,0.419253,0.281494,0.060872,16.5,0
1,528,-0.378417,0.751515,1.772256,0.31102,-0.32913,-0.746206,0.719034,-0.081805,-0.152417,...,-0.120891,-0.240394,-0.057803,0.733812,-0.049448,0.207357,0.023386,0.057469,25.41,0
2,303,1.254258,1.218376,-2.148615,1.155957,1.813892,-0.238358,0.623888,-0.060265,-0.739258,...,-0.210083,-0.463849,-0.370852,-1.644707,0.96267,-0.200548,0.055746,0.071654,2.95,0
3,231,0.2831,0.819284,1.054309,0.348488,-0.156817,-0.509169,0.050382,-0.908179,-0.406903,...,0.509395,-0.962118,0.040063,0.381392,0.754119,0.179193,0.051791,0.114121,1.98,0
4,509,-0.152397,-0.748114,1.659571,-2.160601,-1.448594,0.558854,-0.274352,0.010159,-1.964295,...,0.004382,0.474039,-0.084319,-0.353695,-0.226044,-0.143999,-0.173635,-0.241563,114.0,0
5,172,-1.428535,1.793578,0.545758,-0.399309,0.673577,0.745919,-0.358475,-4.044724,0.232362,...,3.844729,-1.093705,0.256804,-0.869687,0.252491,-0.452726,0.055281,-0.153211,2.29,0
6,153,-0.648576,0.740565,1.800382,0.140826,0.050646,-0.257415,0.92047,-0.422028,0.590522,...,-0.229107,-0.161205,-0.11253,0.42075,-0.139914,0.163193,-0.469014,-0.350543,22.99,0
7,123,0.968784,-0.501798,-1.333558,0.388804,2.036091,3.661714,-0.236795,0.809586,0.154501,...,-0.091052,-0.56128,-0.260983,1.0177,0.837225,-0.342889,-0.003467,0.039004,160.86,0
8,417,-0.473731,0.69734,2.2796,1.359875,0.342429,1.392886,0.289971,0.170677,0.578966,...,-0.462425,-0.48681,-0.235667,-0.726568,0.085981,-0.351095,0.289067,-0.04303,8.61,0
9,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1


In [20]:
x_undersample = concat_df.iloc[:,:30]
y_undersample = concat_df.iloc[:,30:31]

In [21]:
y_undersample.value_counts()

Class
0        9
1        9
dtype: int64

In [22]:
#x_train_under,x_test_under,y_train_under,y_test_under = train_test_split(x_undersample,y_undersample,random_state = 0)

In [23]:
get_model_scores(x_undersample,y_undersample,x_test,y_test)

*****************************************************************
For the model :  Bagging_Classifier
Classification Report : 
               precision    recall  f1-score   support

           0       0.99      0.89      0.94       191
           1       0.05      0.50      0.08         2

    accuracy                           0.89       193
   macro avg       0.52      0.70      0.51       193
weighted avg       0.98      0.89      0.93       193

*****************************************************************
*****************************************************************
For the model :  LGBMClassifier
Classification Report : 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       191
           1       0.00      0.00      0.00         2

    accuracy                           0.99       193
   macro avg       0.49      0.50      0.50       193
weighted avg       0.98      0.99      0.98       193

****************************

In [24]:
x_test.shape,y_test.shape

((193, 30), (193, 1))

Random forest and Bagging Classifier give the most f1 score.
However the training data was very less, so now we will use oversampling

## Random Sampling

In [25]:
count_class_0,count_class_1

(763, 9)

In [26]:
df_class_0 = df[df['Class'] == 0]
df_class_1 = df[df['Class'] == 1]


In [27]:
df_class_1 = df_class_1.sample(count_class_0,replace = True)

In [28]:
df_class_0.shape,df_class_1.shape

((763, 31), (763, 31))

In [29]:
concat_df = pd.concat([df_class_0,df_class_1],ignore_index = True)

In [30]:
concat_df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
2,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
3,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
4,2,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.081080,3.67,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521,484,-0.928088,0.398194,1.741131,0.182673,0.966387,-0.901004,0.879016,-0.156590,-0.142117,...,0.066353,0.281378,-0.257966,0.385384,0.391117,-0.453853,-0.104448,-0.125765,1.00,1
1522,574,1.257719,0.364739,0.306923,0.690638,-0.357792,-1.067481,0.094272,-0.210300,0.014455,...,-0.286856,-0.820658,0.127663,0.343128,0.221120,0.094391,-0.022189,0.030944,1.29,1
1523,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,1
1524,574,1.257719,0.364739,0.306923,0.690638,-0.357792,-1.067481,0.094272,-0.210300,0.014455,...,-0.286856,-0.820658,0.127663,0.343128,0.221120,0.094391,-0.022189,0.030944,1.29,1


In [31]:
x_over = concat_df.iloc[:,:30]
y_over = concat_df.iloc[:,30:31]

In [32]:
concat_df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,282.036697,-0.170866,0.221982,0.877498,0.277333,-0.012087,0.167587,0.128422,-0.060726,-0.027426,...,0.062458,0.004213,-0.096844,-0.042388,-0.000297,0.116503,0.021647,0.024137,-0.016615,68.770328
1,364.014417,-0.659711,-0.194198,0.682627,1.014166,0.600635,-0.545038,-0.361535,0.227061,-0.270364,...,0.263467,0.064689,-0.098373,0.129402,-0.197157,-0.071074,0.133564,-0.035327,-0.049377,54.646435


In [33]:
x_train_over,x_test_over,y_train_over,y_test_over = train_test_split(x_over,y_over,random_state = 0,stratify = y_over)

In [34]:
get_model_scores(x_train_over,y_train_over,x_test_over,y_test_over)

*****************************************************************
For the model :  Bagging_Classifier
Classification Report : 
               precision    recall  f1-score   support

           0       1.00      0.98      0.99       191
           1       0.98      1.00      0.99       191

    accuracy                           0.99       382
   macro avg       0.99      0.99      0.99       382
weighted avg       0.99      0.99      0.99       382

*****************************************************************
*****************************************************************
For the model :  LGBMClassifier
Classification Report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       191
           1       1.00      1.00      1.00       191

    accuracy                           1.00       382
   macro avg       1.00      1.00      1.00       382
weighted avg       1.00      1.00      1.00       382

****************************

Oversampling with replacement of data worked very well, but the same data points were used again and again for the sampling, so overfitting might be there.<br> We will use SMOTE

## SMOTE 
This will create synthetic samples 

In [35]:
x = df.iloc[:,:30]
y = df.iloc[:,30:31]


In [36]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy = 'minority')
x_sm,y_sm = smote.fit_resample(x,y)

In [37]:
x_sm.shape,y_sm.shape

((1526, 30), (1526, 1))

In [38]:
y_sm.value_counts()

Class
0        763
1        763
dtype: int64

In [39]:
x_train_sm,x_test_sm,y_train_sm,y_test_sm = train_test_split(x_sm,y_sm,random_state = 0,stratify = y_sm)

In [40]:
y_sm.value_counts()

Class
0        763
1        763
dtype: int64

In [41]:
get_model_scores(x_train_sm,y_train_sm,x_test_sm,y_test_sm)

*****************************************************************
For the model :  Bagging_Classifier
Classification Report : 
               precision    recall  f1-score   support

           0       0.97      0.98      0.98       191
           1       0.98      0.97      0.98       191

    accuracy                           0.98       382
   macro avg       0.98      0.98      0.98       382
weighted avg       0.98      0.98      0.98       382

*****************************************************************
*****************************************************************
For the model :  LGBMClassifier
Classification Report : 
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       191
           1       0.98      0.99      0.99       191

    accuracy                           0.99       382
   macro avg       0.99      0.99      0.99       382
weighted avg       0.99      0.99      0.99       382

****************************

## Systematic Sampling 

In [42]:
def systematic_sampling(df,step) :
    indexes = np.arange(0,len(df),step = step)
    systematic_sample = df.iloc[indexes]
    return systematic_sample


In [52]:
df_systematic_sample = systematic_sampling(df,2)

In [53]:
print(df_systematic_sample.Class.value_counts())

0    383
1      3
Name: Class, dtype: int64


In [54]:
x = df.iloc[:,:30]
y = df.iloc[:,30:31]

In [55]:
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state = 0,stratify = y)

In [56]:
get_model_scores(x_train,y_train,x_test,y_test)

*****************************************************************
For the model :  Bagging_Classifier
Classification Report : 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       191
           1       0.00      0.00      0.00         2

    accuracy                           0.99       193
   macro avg       0.49      0.50      0.50       193
weighted avg       0.98      0.99      0.98       193

*****************************************************************
*****************************************************************
For the model :  LGBMClassifier
Classification Report : 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       191
           1       0.00      0.00      0.00         2

    accuracy                           0.99       193
   macro avg       0.49      0.50      0.50       193
weighted avg       0.98      0.99      0.98       193

****************************

## Cluster Sampling

In [58]:
length = len(df)
groups = 2
elements = length / groups
print('elements by group:',elements)

elements by group: 386.0


In [59]:
group_list = []
group_id = 0
element_count = 0

for _ in df.iterrows():
    group_list.append(group_id)
    element_count += 1
    if element_count > elements:
        element_count = 0
        group_id += 1

In [60]:
np.unique(group_list, return_counts=True)

(array([0, 1]), array([387, 385], dtype=int64))

In [68]:
group_list

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [65]:
group_selected = rd.randint(0, groups - 1)
print('group selected:',group_selected)

group selected: 1


In [67]:
df_cluster_sample = df[df['Class'] == group_selected]

display(df_cluster_sample.shape)
print('')

display(df_cluster_sample['Class'].value_counts())
print('')

(9, 31)




1    9
Name: Class, dtype: int64




In [69]:
df['group'] = group_list
display(df.head(3))

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,Amount,Class,group
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1,0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0,0


In [70]:
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,Amount,Class,group
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0,0
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,1,0
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767,575,-0.572263,0.731748,1.541254,0.150506,1.108974,0.372152,1.084879,-0.146329,-0.274447,...,-0.107582,-0.418263,-0.731029,0.877525,-0.364150,-0.177509,-0.256545,26.72,0,1
768,579,-1.296845,-0.511605,2.404726,-0.310762,-0.319551,-0.542842,-0.173310,0.260423,-1.202688,...,-0.161175,0.088496,0.285390,0.281069,-0.370130,0.043410,0.092318,80.00,0,1
769,579,1.214170,0.210481,0.484651,0.479768,-0.261955,-0.527039,0.021782,-0.106888,-0.037631,...,-0.594609,0.159877,0.091873,0.140964,0.227406,-0.017389,0.016030,5.98,0,1
770,580,1.267030,-0.071114,0.037680,0.512683,0.242392,0.705212,-0.226582,0.109483,0.657565,...,-0.177225,-0.222918,-1.245505,0.678360,0.525059,0.002920,-0.003333,12.36,0,1


In [71]:
group_selected = rd.randint(0, groups - 1)
print('group selected:',group_selected)

group selected: 1


In [72]:
df_cluster_sample = df[df['group'] == group_selected]

display(df_cluster_sample.shape)
print('')

display(df_cluster_sample['group'].value_counts())
print('')

(385, 32)




1    385
Name: group, dtype: int64




In [77]:
x = df_cluster_sample.iloc[:,:30]
y = df_cluster_sample.iloc[:,30:31]

In [78]:
x.shape,y.shape

((385, 30), (385, 1))

In [79]:
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state = 0,stratify = y)

In [80]:
get_model_scores(x_train,y_train,x_test,y_test)

*****************************************************************
For the model :  Bagging_Classifier
Classification Report : 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        95
           1       0.00      0.00      0.00         2

    accuracy                           0.98        97
   macro avg       0.49      0.50      0.49        97
weighted avg       0.96      0.98      0.97        97

*****************************************************************
*****************************************************************
For the model :  LGBMClassifier
Classification Report : 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        95
           1       0.00      0.00      0.00         2

    accuracy                           0.98        97
   macro avg       0.49      0.50      0.49        97
weighted avg       0.96      0.98      0.97        97

****************************