#### Handling Inbalanced Dataset with Machine Learning

In [None]:
import pandas as pd
df=pd.read_csv('creditcard.csv')
df.head()

In [3]:
## Dependent feature -- Class
df.shape

(284807, 31)

In [4]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [5]:
df['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [6]:
## Independent and dependent features
X = df.drop('Class',axis=1)
y = df.Class

#### Cross Validation like KFold and Hyperparameter Tuning

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import GridSearchCV

In [8]:
log_class = LogisticRegression()
grid = {'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}
cv = KFold(n_splits=5, random_state = None, shuffle=False)

In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7)

In [10]:
clf = GridSearchCV(log_class,grid,cv=cv,n_jobs=-1,scoring='f1_macro')
clf.fit(X_train,y_train)


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\bhargavi\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\bhargavi\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\bhargavi\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.85032

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'penalty': ['l1', 'l2']},
             scoring='f1_macro')

In [11]:
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85237    54]
 [   31   121]]
0.9990051847430451
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85291
           1       0.69      0.80      0.74       152

    accuracy                           1.00     85443
   macro avg       0.85      0.90      0.87     85443
weighted avg       1.00      1.00      1.00     85443



In [12]:
y_train.value_counts()

0    199024
1       340
Name: Class, dtype: int64

In [13]:
class_weight = dict({0:1,1:100})

In [14]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train,y_train)

RandomForestClassifier()

In [15]:
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85274    17]
 [   30   122]]
0.9994499256814484
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85291
           1       0.88      0.80      0.84       152

    accuracy                           1.00     85443
   macro avg       0.94      0.90      0.92     85443
weighted avg       1.00      1.00      1.00     85443



In [16]:
#classifier = RandomForestClassifier(class_weight=class_weight)
#classifier.fit(X_train,y_train)

In [17]:
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85274    17]
 [   30   122]]
0.9994499256814484
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85291
           1       0.88      0.80      0.84       152

    accuracy                           1.00     85443
   macro avg       0.94      0.90      0.92     85443
weighted avg       1.00      1.00      1.00     85443



#### Under Sampling
* Use when dataset size is small

In [18]:
y_train.value_counts()

0    199024
1       340
Name: Class, dtype: int64

In [19]:
from collections import Counter
from imblearn.under_sampling import NearMiss
ns = NearMiss(0.8)
X_train_ns,y_train_ns=ns.fit_resample(X_train,y_train)
print("The number of classes before fit{}".format(Counter(y_train)))
print("The number of classes after fit{}".format(Counter(y_train_ns)))



The number of classes before fitCounter({0: 199024, 1: 340})
The number of classes after fitCounter({0: 425, 1: 340})


In [16]:
X_train

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
147572,88698.0,2.058949,0.136886,-1.669613,0.434400,0.382913,-0.854899,0.159688,-0.205423,0.516325,...,-0.177671,-0.354725,-0.920221,0.360377,0.573642,-0.292591,0.171593,-0.060751,-0.030449,1.98
176617,122843.0,0.360336,0.912758,-0.549064,0.949526,0.967822,-0.542551,1.022748,-0.294865,-0.793735,...,0.153243,0.198797,0.748838,-0.255829,0.663539,-0.411170,1.046227,0.145262,0.186936,0.76
32560,36851.0,-0.562679,0.541107,1.689286,0.399807,0.362725,-0.768328,0.511264,-0.014275,-0.020889,...,-0.080848,-0.140417,-0.357213,-0.031185,0.376766,0.047927,-0.703519,0.058508,0.054311,5.55
44961,42150.0,-0.431354,1.090529,1.736730,0.506751,-0.102729,-1.027930,0.681786,-0.129837,-0.724023,...,0.079780,-0.177755,-0.553491,-0.028086,0.673502,-0.306897,0.153292,0.072919,0.123958,4.49
278868,168487.0,1.842403,0.271779,0.091046,3.745000,0.082807,0.845464,-0.502262,0.162633,-0.439373,...,-0.140096,0.244446,0.820950,0.147533,0.621985,-0.131419,0.058352,0.019121,-0.021084,23.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171022,120459.0,-1.497281,1.482645,-0.493921,-2.760444,0.464303,-0.557775,0.452783,0.887374,-0.349536,...,-0.130754,-0.202324,-0.769972,-0.029396,0.228976,0.052472,0.359773,0.080192,0.089803,4.00
13872,24631.0,1.164015,1.378190,-1.434273,1.676561,1.236670,-0.966499,0.565373,-0.233169,0.437633,...,-0.040802,-0.270867,-0.391236,-0.191420,-0.343198,0.762159,-0.323557,0.031666,0.079023,1.79
42175,40980.0,-1.340619,-1.508734,2.691860,-1.176688,-1.457673,0.761704,0.310423,0.147072,-0.024920,...,0.779889,0.309503,0.636165,0.389629,0.076266,0.468652,-0.208799,-0.175871,-0.131826,300.00
40862,40432.0,0.706980,-1.159027,0.696449,0.312597,-1.012896,0.400024,-0.348351,0.104590,1.074497,...,0.467611,-0.067820,-0.431506,-0.166413,-0.164330,0.046256,0.964545,-0.060093,0.053009,260.73


In [17]:
X_test

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
7762,10820.0,-1.615325,0.346875,-0.147872,-1.089688,-4.917660,2.278146,6.988377,-2.104665,2.023255,...,-0.380179,-1.111764,-0.562018,-0.215164,0.446790,-0.245641,0.570353,0.794068,-0.897788,1228.99
284080,172137.0,-3.500417,3.444865,-3.368813,-2.425066,2.376199,2.949492,0.225066,1.087063,2.889967,...,2.086398,-0.821628,-1.256698,0.311346,0.527536,0.411387,0.235665,1.872413,1.174562,4.46
165788,117665.0,-0.666631,-0.505543,-0.621111,0.113665,2.735397,3.980027,-0.492548,1.000795,0.081973,...,-0.138567,0.362108,1.173114,0.027984,0.729788,-1.269114,-0.572840,0.426121,0.206397,134.40
233924,147755.0,-0.626183,0.752950,1.972549,4.329498,1.597951,2.061596,0.195604,0.220830,-1.806989,...,0.603677,0.093246,0.587858,-0.495088,-0.066291,0.680079,0.677596,-0.075921,-0.137966,28.21
195260,130971.0,-1.396430,-0.068552,3.802986,4.941939,-0.083887,1.780350,-0.716438,0.333717,0.361013,...,0.433715,-0.151459,0.838026,0.353850,-0.074783,0.339237,0.667656,0.342663,-0.309941,3.79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89623,62652.0,-0.781636,0.759226,-0.319500,-2.482960,2.151210,3.132337,-0.056585,0.972742,0.356773,...,0.379044,-0.223842,-0.610046,-0.041744,1.024452,0.007151,0.747982,0.393802,0.186234,9.99
104282,68997.0,-1.097229,1.088926,1.627556,1.042294,-0.885308,0.397729,0.289859,0.083596,0.201596,...,-0.146965,0.245679,0.926105,-0.192013,0.090988,-0.371508,-0.296968,-0.537871,-0.118014,95.00
174712,122034.0,2.021069,0.161557,-1.589579,0.338791,0.439508,-0.648810,0.128491,-0.137285,0.125513,...,-0.112075,-0.293840,-0.745270,0.343675,0.687123,-0.291434,0.140543,-0.061957,-0.037519,0.89
43880,41688.0,1.015201,-1.467653,-0.460923,-1.833699,-1.184746,-0.962527,-0.090618,-0.226766,0.377736,...,-0.210887,-0.268775,-0.537553,-0.218859,0.083429,0.520227,0.109444,-0.017068,0.035755,207.11


In [18]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train_ns,y_train_ns)

RandomForestClassifier()

In [19]:
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[68212 17094]
 [    6   131]]
0.799866577718479
              precision    recall  f1-score   support

           0       1.00      0.80      0.89     85306
           1       0.01      0.96      0.02       137

    accuracy                           0.80     85443
   macro avg       0.50      0.88      0.45     85443
weighted avg       1.00      0.80      0.89     85443



#### Over Sampling

In [19]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [21]:
os = RandomOverSampler(0.5)
X_train_ns,y_train_ns=os.fit_resample(X_train,y_train)
print("The number of classes before fit()",Counter(y_train))
print("The number of classes after fit()",Counter(y_train_ns))



The number of classes before fit() Counter({0: 199009, 1: 355})
The number of classes after fit() Counter({0: 199009, 1: 99504})


In [22]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train_ns,y_train_ns)

RandomForestClassifier()

In [23]:
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85296    10]
 [   23   114]]
0.9996137776061234
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85306
           1       0.92      0.83      0.87       137

    accuracy                           1.00     85443
   macro avg       0.96      0.92      0.94     85443
weighted avg       1.00      1.00      1.00     85443



 #### SMOTETomek

In [20]:
from imblearn.combine import SMOTETomek

In [None]:
os = SMOTETomek(0.5)
X_train_ns,y_train_ns=os.fit_resample(X_train,y_train)
print("The number of classes before fit()",Counter(y_train))
print("The number of classes after fit()",Counter(y_train_ns))



In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train_ns,y_train_ns)

In [None]:
y_pred = classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))