In [8]:
# In this example I use SMOTETomek which is a method of imblearn. SMOTETomek is a hybrid method
# which uses an under sampling method (Tomek) in with an over sampling method (SMOTE).

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold, train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from imblearn.combine import SMOTETomek
from collections import Counter

# majority class : L  /  minority class : S 
# Our goal is to maximise precision on the majority class and maximise recall on the minority
# class.

In [5]:
loc = r"C:\Users\me\Documents\datasets\yeast3.dat"

df = pd.read_csv(loc, sep=',', header=None)

columns = ['mcg', 'gvh', 'alm', 'mit', 'erl', 'pox', 'vac', 'nuc', 'target']

df.columns = columns

df.target = df["target"].str.replace('negative', 'L')
df.target = df["target"].str.replace('positive', 'S')

y = df.target

X = df.drop('target', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

y_train, y_test = np.asarray(y_train), np.asarray(y_test)

os_us = SMOTETomek(ratio=0.5, k=5)
X_train_res, y_train_res = os_us.fit_sample(X_train, y_train)

print ("Distribution of class labels before resampling {}".format(Counter(y_train)))
print ("Distribution of class labels after resampling {}".format(Counter(y_train_res)))

Distribution of class labels before resampling Counter({' L': 928, ' S': 110})
Distribution of class labels after resampling Counter({' L': 924, ' S': 464})


In [6]:
def model(X_train_res, X_test, y_train_res, y_test):
    if __name__ == '__main__':
        clf_base = LogisticRegression()
        grid = {'C': 10.0 ** np.arange(-2, 3),
                'penalty': ['l1', 'l2']}

        cv = KFold(X_train.shape[0], n_folds=5, shuffle=True, random_state=0)
        clf = GridSearchCV(clf_base, grid, cv=cv, n_jobs=-1, scoring='f1_macro')

        clf.fit(X_train, y_train)

        print (classification_report(y_test, clf.predict(X_test)))

In [9]:
model(X_train_res, X_test, y_train_res, y_test)

             precision    recall  f1-score   support

          L       0.95      0.99      0.97       393
          S       0.87      0.62      0.73        53

avg / total       0.94      0.94      0.94       446

