In [3]:
# In this example I use Easy Ensemble which is a method of imblearn. Easy Ensemble belongs to 
# the family of ensemble methods used to address the imbalanced class problem.

import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold, train_test_split
from imblearn.ensemble import EasyEnsemble
from sklearn.ensemble import AdaBoostClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from collections import Counter

# majority class : L  /  minority class : S 
# Our goal is to maximise precision on the majority class and maximise recall on the minority
# class.

In [4]:
loc = r"C:\Users\me\Documents\datasets\yeast3.dat"

df = pd.read_csv(loc, sep=',', header=None)

columns = ['mcg', 'gvh', 'alm', 'mit', 'erl', 'pox', 'vac', 'nuc', 'target']

df.columns = columns

df.target = df["target"].str.replace('negative', 'L')
df.target = df["target"].str.replace('positive', 'S')

y = df.target

X = df.drop('target', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

y_train, y_test = np.asarray(y_train), np.asarray(y_test)

ens = EasyEnsemble()
X_train_res, y_train_res = ens.fit_sample(X_train, y_train)

y_pred_proba = np.zeros(len(y_test))

In [5]:
for idx in range(len(y_train_res)):
    clf_base = AdaBoostClassifier()
    grid = {'n_estimators': [10, 50, 100]}

    cv = KFold(X_train_res.shape[0], n_folds=5, shuffle=True, random_state=0)
    clf = GridSearchCV(clf_base, grid, cv=cv, scoring='f1_macro')
    clf.fit(X_train_res[idx], y_train_res[idx])
    y_pred_proba += list(zip(*clf.predict_proba(X_test)))[0]
    
y_pred_proba = y_pred_proba/len(y_train_res)
y_pred = (y_pred_proba > 0.5).astype(int)
y_pred = y_pred.astype('str')
y_pred[y_pred=='1'] = 'L'
y_pred[y_pred=='0'] = 'S'
y_test[y_test==' L'] = 'L'
y_test[y_test==' S'] = 'S'

print (classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          L       0.99      0.95      0.97       399
          S       0.67      0.91      0.77        47

avg / total       0.96      0.94      0.95       446

