#### Handling Imbalanced Dataset with Machine Learning

In [1]:
import pandas as pd
df=pd.read_csv('creditcard.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'creditcard.csv'

In [26]:
df.shape

(284807, 31)

In [30]:
df['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [31]:
#### Independent and Dependent Features
X=df.drop("Class",axis=1)
y=df.Class

#### Cross Validation Like KFOLD and Hyperpaqrameter Tuning

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import GridSearchCV

In [35]:
10.0 **np.arange(-2,3)

array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])

In [36]:
log_class=LogisticRegression()
grid={'C':10.0 **np.arange(-2,3),'penalty':['l1','l2']}
cv=KFold(n_splits=5,random_state=None,shuffle=False)

In [37]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.7)

In [38]:
clf=GridSearchCV(log_class,grid,cv=cv,n_jobs=-1,scoring='f1_macro')
clf.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'penalty': ['l1', 'l2']},
             scoring='f1_macro')

In [39]:
y_pred=clf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85251    47]
 [   41   104]]
0.9989700736163291
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85298
           1       0.69      0.72      0.70       145

    accuracy                           1.00     85443
   macro avg       0.84      0.86      0.85     85443
weighted avg       1.00      1.00      1.00     85443



In [45]:
347*100

34700

In [43]:
y_train.value_counts()

0    199017
1       347
Name: Class, dtype: int64

In [46]:
class_weight=dict({0:1,1:100})

In [47]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier(class_weight=class_weight)
classifier.fit(X_train,y_train)

RandomForestClassifier(class_weight={0: 1, 1: 100})

y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [41]:
y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85292     6]
 [   30   115]]
0.9995786664794073
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85298
           1       0.95      0.79      0.86       145

    accuracy                           1.00     85443
   macro avg       0.98      0.90      0.93     85443
weighted avg       1.00      1.00      1.00     85443



#### Under Sampling

In [50]:
from collections import Counter
Counter(y_train)

Counter({0: 199017, 1: 347})

In [51]:
from collections import Counter
from imblearn.under_sampling import NearMiss
ns=NearMiss(0.8)
X_train_ns,y_train_ns=ns.fit_sample(X_train,y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))



The number of classes before fit Counter({0: 199017, 1: 347})
The number of classes after fit Counter({0: 433, 1: 347})


In [58]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
classifier.fit(X_train_ns,y_train_ns)

RandomForestClassifier()

In [59]:
y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[64448 20850]
 [   10   135]]
0.7558606322343551
              precision    recall  f1-score   support

           0       1.00      0.76      0.86     85298
           1       0.01      0.93      0.01       145

    accuracy                           0.76     85443
   macro avg       0.50      0.84      0.44     85443
weighted avg       1.00      0.76      0.86     85443



##### Over Sampling

In [60]:
from imblearn.over_sampling import RandomOverSampler

In [64]:
os=RandomOverSampler(0.75)
X_train_ns,y_train_ns=os.fit_sample(X_train,y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))



The number of classes before fit Counter({0: 199017, 1: 347})
The number of classes after fit Counter({0: 199017, 1: 149262})


In [65]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
classifier.fit(X_train_ns,y_train_ns)

RandomForestClassifier()

In [66]:
y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85292     6]
 [   27   118]]
0.9996137776061234
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85298
           1       0.95      0.81      0.88       145

    accuracy                           1.00     85443
   macro avg       0.98      0.91      0.94     85443
weighted avg       1.00      1.00      1.00     85443



#### SMOTETomek

In [67]:
from imblearn.combine import SMOTETomek

In [69]:
os=SMOTETomek(0.75)
X_train_ns,y_train_ns=os.fit_sample(X_train,y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))



The number of classes before fit Counter({0: 199017, 1: 347})
The number of classes after fit Counter({0: 198395, 1: 148640})


In [70]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
classifier.fit(X_train_ns,y_train_ns)

RandomForestClassifier()

In [71]:
y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85286    12]
 [   23   122]]
0.9995903701883126
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85298
           1       0.91      0.84      0.87       145

    accuracy                           1.00     85443
   macro avg       0.96      0.92      0.94     85443
weighted avg       1.00      1.00      1.00     85443



#### Ensemble Techniques

In [72]:
from imblearn.ensemble import EasyEnsembleClassifier

In [73]:
easy=EasyEnsembleClassifier()
easy.(X_train,y_train)

EasyEnsembleClassifier()

In [None]:
easy.

In [74]:
y_pred=easy.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[82382  2916]
 [   14   131]]
0.9657081329073184
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     85298
           1       0.04      0.90      0.08       145

    accuracy                           0.97     85443
   macro avg       0.52      0.93      0.53     85443
weighted avg       1.00      0.97      0.98     85443

