<a href="https://colab.research.google.com/github/Achyuta-Harshavardhan/Handling-Imbalanced-Dataset/blob/main/Handling_Imbalanced_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold,GridSearchCV,train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from collections import Counter

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Machine Learning/Imbalanced Dataset/creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
df['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [None]:
# Independent and Dependent features
x = df.drop('Class',axis=1)
y = df.Class

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [None]:
log_class = LogisticRegression()
grid = {'C':10.0 **np.arange(-2,3),'penalty':['l1','l2']}
cv = KFold(n_splits=5,random_state=None,shuffle=False)

In [None]:
clf = GridSearchCV(log_class,grid,cv=cv,n_jobs=-1,scoring='f1_macro')
clf.fit(x_train,y_train)

In [None]:
pred = clf.predict(x_test)
print(accuracy_score(y_test,pred))
print(classification_report(y_test,pred))

0.9988062216916541
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56848
           1       0.74      0.62      0.68       114

    accuracy                           1.00     56962
   macro avg       0.87      0.81      0.84     56962
weighted avg       1.00      1.00      1.00     56962



In [None]:
classifier = RandomForestClassifier(class_weight = dict({0:1,1:100}))
classifier.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight={0: 1, 1: 100}, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [None]:
pred = classifier.predict(x_test)
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56848
           1       0.99      0.77      0.87       114

    accuracy                           1.00     56962
   macro avg       0.99      0.89      0.93     56962
weighted avg       1.00      1.00      1.00     56962



Because of the imbalanced dataset we are getting the highest accuracy. By checking the precision and recall value it is very clear that dataset is not balanced

In [None]:
y_train.value_counts()

0    227467
1       378
Name: Class, dtype: int64

# Under sampling

In [None]:
# Implementing Unersampling for Handling Imbalanced 
ns = NearMiss(0.8)
x_ns,y_ns = ns.fit_sample(x_train,y_train)

In [None]:
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_ns)))

The number of classes before fit Counter({0: 227467, 1: 378})
The number of classes after fit Counter({0: 472, 1: 378})


In [None]:
0.8*485

388.0

In [None]:
classifier.fit(x_ns,y_ns)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight={0: 1, 1: 100}, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [None]:
pred = classifier.predict(x_test)
print(confusion_matrix(y_test,pred))
print(accuracy_score(y_test,pred))
print(classification_report(y_test,pred))

[[56314   534]
 [   12   102]]
0.9904146624065167
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     56848
           1       0.16      0.89      0.27       114

    accuracy                           0.99     56962
   macro avg       0.58      0.94      0.63     56962
weighted avg       1.00      0.99      0.99     56962



# Over Sampling

In [None]:
# RandomOverSampler to handle imbalanced data
ros =  RandomOverSampler(ratio=0.5)
x_ros, y_ros = ros.fit_sample(x_train, y_train)



In [None]:
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_ros)))

The number of classes before fit Counter({0: 227467, 1: 378})
The number of classes after fit Counter({0: 227467, 1: 113733})


In [None]:
0.5*227457

113728.5

In [None]:
classifier.fit(x_ros,y_ros)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight={0: 1, 1: 100}, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [None]:
pred = classifier.predict(x_test)
print(confusion_matrix(y_test,pred))
print(accuracy_score(y_test,pred))
print(classification_report(y_test,pred))

[[56846     2]
 [   26    88]]
0.9995084442259752
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56848
           1       0.98      0.77      0.86       114

    accuracy                           1.00     56962
   macro avg       0.99      0.89      0.93     56962
weighted avg       1.00      1.00      1.00     56962



# SMOTETomek

In [None]:
# Implementing Oversampling for Handling Imbalanced 
smk = SMOTETomek(random_state=42)
x_smk,y_smk = smk.fit_sample(x_train,y_train)



In [None]:
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_smk)))

The number of classes before fit Counter({0: 227467, 1: 378})
The number of classes after fit Counter({0: 226717, 1: 226717})


In [None]:
classifier.fit(x_smk,y_smk)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight={0: 1, 1: 100}, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [None]:
pred = classifier.predict(x_test)
print(confusion_matrix(y_test,pred))
print(accuracy_score(y_test,pred))
print(classification_report(y_test,pred))

[[56841     7]
 [   20    94]]
0.9995259997893332
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56848
           1       0.93      0.82      0.87       114

    accuracy                           1.00     56962
   macro avg       0.97      0.91      0.94     56962
weighted avg       1.00      1.00      1.00     56962



# Ensemble Technique

In [None]:
easy = EasyEnsembleClassifier()
easy.fit(x_train,y_train)

In [None]:
pred = easy.predict(x_test)
print(confusion_matrix(y_test,pred))
print(accuracy_score(y_test,pred))
print(classification_report(y_test,pred))

[[54915  1933]
 [   11   103]]
0.9658719848319932
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     56848
           1       0.05      0.90      0.10       114

    accuracy                           0.97     56962
   macro avg       0.53      0.93      0.54     56962
weighted avg       1.00      0.97      0.98     56962

