In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Imbalanced datasets
- Data: [UCI Balance Scale dataset](http://archive.ics.uci.edu/ml/datasets/balance+scale)
- Reference: <https://elitedatascience.com/imbalanced-classes>

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
 
# Load the balance-scale data. Column names are supplied explicitly via
# `names`, so the first line of the file is read as data, not as a header.
df = pd.read_csv('data/balance-scale.data',
                 names=['balance', 'var1', 'var2', 'var3', 'var4'])

# Class distribution of the raw labels
print(df['balance'].value_counts())

# Transform into binary classification: label 'B' becomes the positive
# class (1) and every other label becomes 0.
df['balance'] = [1 if label == 'B' else 0 for label in df['balance']]
print(df['balance'].value_counts())

# Separate input features (X) and target variable (y)
y = df['balance']
X = df.drop('balance', axis=1)

# Train a baseline model on the imbalanced data
clf_0 = LogisticRegression().fit(X, y)

# Predict on the training set
pred_y_0 = clf_0.predict(X)

# How's the accuracy?  The true labels go first, matching the
# accuracy_score(y_true, y_pred) signature used by the later cells.
print(accuracy_score(y, pred_y_0))

# Should we be excited?  List the distinct classes the model predicts;
# a single value means it only ever predicts one class.
print(np.unique(pred_y_0))

## Oversampling

In [None]:
from sklearn.utils import resample

# Separate majority (balance == 0) and minority (balance == 1) classes
df_majority = df[df.balance == 0]
df_minority = df[df.balance == 1]

# Upsample minority class.  The target size is derived from the majority
# class instead of being hard-coded (it used to be the literal 576), so the
# two classes stay balanced even if the input data changes.
df_minority_upsampled = resample(df_minority,
                                 replace=True,                # sample with replacement
                                 n_samples=len(df_majority),  # to match majority class
                                 random_state=123)            # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
print(df_upsampled.balance.value_counts())

# Separate input features (X) and target variable (y)
y = df_upsampled.balance
X = df_upsampled.drop('balance', axis=1)

# Train model on the balanced (upsampled) data
clf_1 = LogisticRegression().fit(X, y)

# Predict on training set
pred_y_1 = clf_1.predict(X)

# Is our model still predicting just one class?
print(np.unique(pred_y_1))

# How's our accuracy?
print(accuracy_score(y, pred_y_1))
# Previously recorded output: 0.513888888889
# (presumably from an earlier run; may differ across scikit-learn versions)

## Undersampling

In [None]:
# Separate majority (balance == 0) and minority (balance == 1) classes
df_majority = df[df.balance == 0]
df_minority = df[df.balance == 1]

# Downsample majority class.  The target size is derived from the minority
# class instead of being hard-coded (it used to be the literal 49), so the
# two classes stay balanced even if the input data changes.
df_majority_downsampled = resample(df_majority,
                                   replace=False,               # sample without replacement
                                   n_samples=len(df_minority),  # to match minority class
                                   random_state=123)            # reproducible results

# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
print(df_downsampled.balance.value_counts())

# Separate input features (X) and target variable (y)
y = df_downsampled.balance
X = df_downsampled.drop('balance', axis=1)

# Train model on the balanced (downsampled) data
clf_2 = LogisticRegression().fit(X, y)

# Predict on training set
pred_y_2 = clf_2.predict(X)

# Is our model still predicting just one class?
print(np.unique(pred_y_2))

# How's our accuracy?
print(accuracy_score(y, pred_y_2))

## Area Under ROC Curve (ROC-AUC)

In [None]:
from sklearn.metrics import roc_auc_score
 
# Score all three logistic-regression models on one explicit evaluation set.
# The set is built from the downsampled frame -- the data that X and y refer
# to when the notebook is run top to bottom -- instead of reading the global
# X / y, which several other cells reassign.  That way the scores do not
# silently change when cells are executed out of order.
# Note that only clf_2 was trained on this data.
y_eval = df_downsampled.balance
X_eval = df_downsampled.drop('balance', axis=1)

# Predict class probabilities and keep only the positive class
# (second column of predict_proba's output).
prob_y_0 = clf_0.predict_proba(X_eval)[:, 1]
prob_y_1 = clf_1.predict_proba(X_eval)[:, 1]
prob_y_2 = clf_2.predict_proba(X_eval)[:, 1]

# AUROC of the baseline, upsampled and downsampled models, in that order
print(roc_auc_score(y_eval, prob_y_0))
print(roc_auc_score(y_eval, prob_y_1))
print(roc_auc_score(y_eval, prob_y_2))

## SVM

In [None]:
from sklearn.svm import SVC

# Separate input features (X) and target variable (y) from the original,
# imbalanced data
y = df.balance
X = df.drop('balance', axis=1)

# Train model.  probability=True makes scikit-learn fit probability
# estimates with an internal, randomly shuffled cross-validation, so a fixed
# random_state is needed for the AUROC below to be reproducible; 123 matches
# the seed used by the resampling cells.
clf_3 = SVC(kernel='linear',
            class_weight='balanced',  # penalize
            probability=True,
            random_state=123)         # reproducible results

clf_3.fit(X, y)

# Predict on training set
pred_y_3 = clf_3.predict(X)

# Is our model still predicting just one class?
print(np.unique(pred_y_3))

# How's our accuracy?
print(accuracy_score(y, pred_y_3))

# What about AUROC?  Keep only the positive-class probability
# (second column of predict_proba's output).
prob_y_3 = clf_3.predict_proba(X)[:, 1]
print(roc_auc_score(y, prob_y_3))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Separate input features (X) and target variable (y) from the original,
# imbalanced data
y = df.balance
X = df.drop('balance', axis=1)

# Train model.  A random forest is built with randomness, so the seed is
# fixed to make the scores below reproducible; 123 matches the seed used by
# the resampling cells.
clf_4 = RandomForestClassifier(random_state=123)
clf_4.fit(X, y)

# Predict on training set
pred_y_4 = clf_4.predict(X)

# Is our model still predicting just one class?
print(np.unique(pred_y_4))

# How's our accuracy?
print(accuracy_score(y, pred_y_4))

# What about AUROC?  Keep only the positive-class probability
# (second column of predict_proba's output).
prob_y_4 = clf_4.predict_proba(X)[:, 1]
print(roc_auc_score(y, prob_y_4))

**Note:** While these results are encouraging, every score above is computed on the training data, so the model could be overfit. You should still evaluate your model on an unseen test set before making the final decision.