In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV

from collections import Counter
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score

from sklearn.linear_model import RidgeClassifier

## Dataset

In [2]:
file = pd.read_csv('subset_creditcard.csv')

In [3]:
file

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5006,56650,-8.762083,2.791030,-7.682767,6.991214,-5.230695,-0.357388,-9.685621,1.749335,-4.495679,...,-0.090527,0.348590,0.051132,-0.415430,0.219665,0.330020,-0.028252,-0.156270,7.52,1
5007,57007,-1.271244,2.462675,-2.851395,2.324480,-1.372245,-0.948196,-3.065234,1.166927,-2.268771,...,0.652941,0.081931,-0.221348,-0.523582,0.224228,0.756335,0.632800,0.250187,0.01,1
5008,57027,-2.335655,2.225380,-3.379450,2.178538,-3.568264,0.316814,-1.734948,1.449139,-1.980033,...,0.785540,0.297412,0.308536,-0.598416,-0.121850,-0.491018,0.701606,0.206966,444.17,1
5009,57163,-10.363049,4.543672,-9.795898,5.508003,-6.037156,-0.133493,-11.724346,-3.198346,-4.767842,...,-2.457145,1.687257,0.977178,-0.543369,-0.289125,-0.107586,0.330642,0.163577,1.00,1


In [4]:
file['Class'].value_counts()

0    4996
1      15
Name: Class, dtype: int64

### Testando a qualidade do classificador

In [5]:
X = file[['V1','V2','V3','V4','V5']]
y = file[['Class']]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
print("TREINAMENTO:")
print("X:", X_train.shape, "y:", y_train.shape)
print("TESTE:")
print("X:", X_test.shape, "y:", y_test.shape)

TREINAMENTO:
X: (3357, 5) y: (3357, 1)
TESTE:
X: (1654, 5) y: (1654, 1)


In [8]:
y_train.value_counts()

Class
0        3348
1           9
dtype: int64

In [9]:
y_test.value_counts()

Class
0        1648
1           6
dtype: int64

In [10]:
model = Pipeline([('nor', MinMaxScaler()), ('LR', LogisticRegression())])
params = {'LR__random_state': [0]}

gs = GridSearchCV(model, params, cv=5, scoring='accuracy', refit=True)
gs.fit(X_train, y_train.values.ravel())

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('nor', MinMaxScaler()),
                                       ('LR', LogisticRegression())]),
             param_grid={'LR__random_state': [0]}, scoring='accuracy')

In [11]:
gs.cv_results_

{'mean_fit_time': array([0.01838808]),
 'std_fit_time': array([0.00286883]),
 'mean_score_time': array([0.00239897]),
 'std_score_time': array([0.00049027]),
 'param_LR__random_state': masked_array(data=[0],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'LR__random_state': 0}],
 'split0_test_score': array([0.99702381]),
 'split1_test_score': array([0.99702381]),
 'split2_test_score': array([0.99850969]),
 'split3_test_score': array([0.99701937]),
 'split4_test_score': array([0.99701937]),
 'mean_test_score': array([0.99731921]),
 'std_test_score': array([0.00059524]),
 'rank_test_score': array([1])}

In [12]:
y_pred = gs.predict(X_test)

In [13]:
print("Accuracy")
accuracy_score(y_test, y_pred)

Accuracy


0.9957678355501813

In [14]:
print("confusion matrix")
confusion_matrix(y_test,y_pred)

confusion matrix


array([[1647,    1],
       [   6,    0]], dtype=int64)

In [15]:
print("ROC_AUC")
roc_auc_score(y_test,y_pred)

ROC_AUC


0.4996966019417476

In [16]:
print("F1 Score")
f1_score(y_test,y_pred, pos_label=0)

F1 Score


0.9978794304756134

In [17]:
print("F1 Score")
f1_score(y_test,y_pred, pos_label=1)

F1 Score


0.0

In [18]:
print("F1 Score")
f1_score(y_test,y_pred, average='macro')

F1 Score


0.4989397152378067

# Técnicas de Balanceamento de Dados

## Undersampling

In [19]:
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours

### Random UnderSampling

In [20]:
print("TREINAMENTO:")
print("X:", X_train.shape, "y:", y_train.shape)
print("TESTE:")
print("X:", X_test.shape, "y:", y_test.shape)

TREINAMENTO:
X: (3357, 5) y: (3357, 1)
TESTE:
X: (1654, 5) y: (1654, 1)


In [21]:
y_train.value_counts()

Class
0        3348
1           9
dtype: int64

In [26]:
print("Reamostragem de dados usando Random UnderSampling (RUS)...")
X_RUS, y_RUS = RandomUnderSampler(random_state=42).fit_sample(X_train, y_train.values.ravel())
print("... Feito!")

print("X e Y RUS:", len(X_RUS), len(y_RUS))

Reamostragem de dados usando Random UnderSampling (RUS)...
... Feito!
X e Y RUS: 18 18


In [27]:
model = Pipeline([('nor', MinMaxScaler()), ('LR', LogisticRegression())])
params = {'LR__random_state': [0]}

gs = GridSearchCV(model, params, cv=5, scoring='accuracy', refit=True)
gs.fit(X_RUS, y_RUS)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('nor', MinMaxScaler()),
                                       ('LR', LogisticRegression())]),
             param_grid={'LR__random_state': [0]}, scoring='accuracy')

In [28]:
gs.cv_results_

{'mean_fit_time': array([0.00899239]),
 'std_fit_time': array([0.00063265]),
 'mean_score_time': array([0.00299807]),
 'std_score_time': array([0.00063188]),
 'param_LR__random_state': masked_array(data=[0],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'LR__random_state': 0}],
 'split0_test_score': array([0.75]),
 'split1_test_score': array([1.]),
 'split2_test_score': array([0.75]),
 'split3_test_score': array([1.]),
 'split4_test_score': array([0.66666667]),
 'mean_test_score': array([0.83333333]),
 'std_test_score': array([0.13944334]),
 'rank_test_score': array([1])}

In [29]:
y_pred = gs.predict(X_test)

In [30]:
print("Accuracy")
print(accuracy_score(y_test, y_pred))
print("confusion matrix")
print(confusion_matrix(y_test,y_pred))
print("ROC_AUC")
print(roc_auc_score(y_test,y_pred))

Accuracy
0.9879081015719468
confusion matrix
[[1630   18]
 [   2    4]]
ROC_AUC
0.8278721682847896


### Edited Nearest Neighbor

In [31]:
print("TREINAMENTO:")
print("X:", X_train.shape, "y:", y_train.shape)
print("TESTE:")
print("X:", X_test.shape, "y:", y_test.shape)

TREINAMENTO:
X: (3357, 5) y: (3357, 1)
TESTE:
X: (1654, 5) y: (1654, 1)


In [32]:
y_train.value_counts()

Class
0        3348
1           9
dtype: int64

In [33]:
print("Reamostragem de dados usando Edited Nearest Neighbour (ENN)...")
X_ENN, y_ENN = EditedNearestNeighbours().fit_sample(X_train, y_train.values.ravel())
print("... Feito!")
print("X e Y ENN:", len(X_ENN), len(y_ENN))

Reamostragem de dados usando Edited Nearest Neighbour (ENN)...
... Feito!
X e Y ENN: 3350 3350


In [34]:
model = Pipeline([('nor', MinMaxScaler()), ('LR', LogisticRegression())])
params = {'LR__random_state': [0]}

gs = GridSearchCV(model, params, cv=5, scoring='accuracy', refit=True)
gs.fit(X_ENN, y_ENN)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('nor', MinMaxScaler()),
                                       ('LR', LogisticRegression())]),
             param_grid={'LR__random_state': [0]}, scoring='accuracy')

In [35]:
gs.cv_results_

{'mean_fit_time': array([0.01818795]),
 'std_fit_time': array([0.00248016]),
 'mean_score_time': array([0.00299845]),
 'std_score_time': array([0.00199945]),
 'param_LR__random_state': masked_array(data=[0],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'LR__random_state': 0}],
 'split0_test_score': array([0.99850746]),
 'split1_test_score': array([0.99701493]),
 'split2_test_score': array([0.99701493]),
 'split3_test_score': array([0.99701493]),
 'split4_test_score': array([0.99701493]),
 'mean_test_score': array([0.99731343]),
 'std_test_score': array([0.00059701]),
 'rank_test_score': array([1])}

In [36]:
y_pred = gs.predict(X_test)

In [37]:
print("Accuracy")
print(accuracy_score(y_test, y_pred))
print("confusion matrix")
print(confusion_matrix(y_test,y_pred))
print("ROC_AUC")
print(roc_auc_score(y_test,y_pred))

Accuracy
0.9957678355501813
confusion matrix
[[1647    1]
 [   6    0]]
ROC_AUC
0.4996966019417476


## Oversampling

In [38]:
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler

### Random OverSampling

In [39]:
print("TREINAMENTO:")
print("X:", X_train.shape, "y:", y_train.shape)
print("TESTE:")
print("X:", X_test.shape, "y:", y_test.shape)

TREINAMENTO:
X: (3357, 5) y: (3357, 1)
TESTE:
X: (1654, 5) y: (1654, 1)


In [40]:
y_train.value_counts()

Class
0        3348
1           9
dtype: int64

In [41]:
print("Reamostragem de dados usando Random OverSampling (ROS)...")
ros = RandomOverSampler(random_state=42)
X_ROS, y_ROS = ros.fit_resample(X_train, y_train)
print("... Feito!")
print("X e Y ROS:", len(X_ROS), len(y_ROS))

Reamostragem de dados usando Random OverSampling (ROS)...
... Feito!
X e Y ROS: 6696 6696


In [42]:
y_ROS.value_counts()

Class
1        3348
0        3348
dtype: int64

In [43]:
model = Pipeline([('nor', MinMaxScaler()), ('LR', LogisticRegression())])
params = {'LR__random_state': [0]}

gs = GridSearchCV(model, params, cv=5, scoring='roc_auc', refit=True)
gs.fit(X_ROS, y_ROS.values.ravel())

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('nor', MinMaxScaler()),
                                       ('LR', LogisticRegression())]),
             param_grid={'LR__random_state': [0]}, scoring='roc_auc')

In [44]:
gs.cv_results_

{'mean_fit_time': array([0.03217907]),
 'std_fit_time': array([0.00522712]),
 'mean_score_time': array([0.00399766]),
 'std_score_time': array([0.00109545]),
 'param_LR__random_state': masked_array(data=[0],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'LR__random_state': 0}],
 'split0_test_score': array([0.99451771]),
 'split1_test_score': array([0.98658278]),
 'split2_test_score': array([0.99294782]),
 'split3_test_score': array([0.99307052]),
 'split4_test_score': array([0.99187917]),
 'mean_test_score': array([0.9917996]),
 'std_test_score': array([0.00274019]),
 'rank_test_score': array([1])}

In [45]:
y_pred = gs.predict(X_test)

In [46]:
print("Accuracy")
print(accuracy_score(y_test, y_pred))
print("confusion matrix")
print(confusion_matrix(y_test,y_pred))
print("ROC_AUC")
print(roc_auc_score(y_test,y_pred))

Accuracy
0.9474002418379686
confusion matrix
[[1562   86]
 [   1    5]]
ROC_AUC
0.890574433656958


### SMOTE

In [47]:
print("TREINAMENTO:")
print("X:", X_train.shape, "y:", y_train.shape)
print("TESTE:")
print("X:", X_test.shape, "y:", y_test.shape)

TREINAMENTO:
X: (3357, 5) y: (3357, 1)
TESTE:
X: (1654, 5) y: (1654, 1)


In [48]:
print("Reamostragem de dados usando SMOTE (SMO)...")
sm = SMOTE(random_state=42)
X_SMO, y_SMO = sm.fit_resample(X_train, y_train)
print("... Feito!")
print("X e Y SMO:", len(X_SMO), len(y_SMO))

Reamostragem de dados usando SMOTE (SMO)...
... Feito!
X e Y SMO: 6696 6696


In [49]:
y_SMO.value_counts()

Class
1        3348
0        3348
dtype: int64

In [50]:
model = Pipeline([('nor', MinMaxScaler()), ('LR', LogisticRegression())])
params = {'LR__random_state': [0]}

gs = GridSearchCV(model, params, cv=5, scoring='roc_auc', refit=True)
gs.fit(X_SMO, y_SMO.values.ravel())

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('nor', MinMaxScaler()),
                                       ('LR', LogisticRegression())]),
             param_grid={'LR__random_state': [0]}, scoring='roc_auc')

In [51]:
gs.cv_results_

{'mean_fit_time': array([0.03077946]),
 'std_fit_time': array([0.00466159]),
 'mean_score_time': array([0.00379891]),
 'std_score_time': array([0.00074946]),
 'param_LR__random_state': masked_array(data=[0],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'LR__random_state': 0}],
 'split0_test_score': array([0.99617732]),
 'split1_test_score': array([0.99119871]),
 'split2_test_score': array([0.99577226]),
 'split3_test_score': array([0.99538184]),
 'split4_test_score': array([0.99514089]),
 'mean_test_score': array([0.9947342]),
 'std_test_score': array([0.00180248]),
 'rank_test_score': array([1])}

In [52]:
y_pred = gs.predict(X_test)

In [53]:
print("Accuracy")
print(accuracy_score(y_test, y_pred))
print("confusion matrix")
print(confusion_matrix(y_test,y_pred))
print("ROC_AUC")
print(roc_auc_score(y_test,y_pred))

Accuracy
0.9637243047158404
confusion matrix
[[1589   59]
 [   1    5]]
ROC_AUC
0.8987661812297736


### ADASYN

In [54]:
print("TREINAMENTO:")
print("X:", X_train.shape, "y:", y_train.shape)
print("TESTE:")
print("X:", X_test.shape, "y:", y_test.shape)

TREINAMENTO:
X: (3357, 5) y: (3357, 1)
TESTE:
X: (1654, 5) y: (1654, 1)


In [55]:
print("Reamostragem de dados usando ADASYN (ADA)...")
ada = ADASYN(random_state=42)
X_ADA, y_ADA = ada.fit_resample(X_train, y_train)
print("... Feito!")
print("X e Y ADA:", len(X_ADA), len(y_ADA))

Reamostragem de dados usando ADASYN (ADA)...
... Feito!
X e Y ADA: 6694 6694


In [56]:
y_ADA.value_counts()

Class
0        3348
1        3346
dtype: int64

In [57]:
model = Pipeline([('nor', MinMaxScaler()), ('LR', LogisticRegression())])
params = {'LR__random_state': [0]}

gs = GridSearchCV(model, params, cv=5, scoring='roc_auc', refit=True)
gs.fit(X_ADA, y_ADA.values.ravel())

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('nor', MinMaxScaler()),
                                       ('LR', LogisticRegression())]),
             param_grid={'LR__random_state': [0]}, scoring='roc_auc')

In [58]:
gs.cv_results_

{'mean_fit_time': array([0.02678285]),
 'std_fit_time': array([0.00518877]),
 'mean_score_time': array([0.00459766]),
 'std_score_time': array([0.00119877]),
 'param_LR__random_state': masked_array(data=[0],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'LR__random_state': 0}],
 'split0_test_score': array([0.99203311]),
 'split1_test_score': array([0.99273364]),
 'split2_test_score': array([0.99317761]),
 'split3_test_score': array([0.99242799]),
 'split4_test_score': array([0.99277193]),
 'mean_test_score': array([0.99262886]),
 'std_test_score': array([0.00038153]),
 'rank_test_score': array([1])}

In [59]:
y_pred = gs.predict(X_test)

In [60]:
print("Accuracy")
print(accuracy_score(y_test, y_pred))
print("confusion matrix")
print(confusion_matrix(y_test,y_pred))
print("ROC_AUC")
print(roc_auc_score(y_test,y_pred))

Accuracy
0.9540507859733979
confusion matrix
[[1573   75]
 [   1    5]]
ROC_AUC
0.8939118122977346


#### Nota: Ao utilizar uma técnica de Oversampling você deveria garantir que os dados utilizados na validação sejam dados originais e não sintéticos. É complicado assegurar isso utilizando o pipeline do ScikitLearn. Então, você implementar sua própria solução.