In [8]:
# In this example I use RandomOverSampler which is a method of imblearn. RandomOverSampler 
# belongs to the family of over sampling methods used to address the imbalanced class problem.

import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold, train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.linear_model import LogisticRegression

# majority class : L  /  minority class : S 
# Our goal is to maximise precision on the majority class and maximise recall on the minority
# class.

In [4]:
loc = r"C:\Users\me\Documents\datasets\yeast3.dat"

df = pd.read_csv(loc, sep=',', header=None)

columns = ['mcg', 'gvh', 'alm', 'mit', 'erl', 'pox', 'vac', 'nuc', 'target']

df.columns = columns

df.target = df["target"].str.replace('negative', 'L')
df.target = df["target"].str.replace('positive', 'S')

y = df.target

X = df.drop('target', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

y_train, y_test = np.asarray(y_train), np.asarray(y_test)

os =  RandomOverSampler(ratio=0.5)
X_train_res, y_train_res = os.fit_sample(X_train, y_train)

print ("Distribution of class labels before resampling {}".format(Counter(y_train)))
print ("Distribution of class labels after resampling {}".format(Counter(y_train_res)))

Distribution of class labels before resampling Counter({' L': 930, ' S': 108})
Distribution of class labels after resampling Counter({' L': 930, ' S': 465})


In [5]:
def model(X_train_res, X_test, y_train_res, y_test):
    if __name__ == '__main__':
        clf_base = LogisticRegression()
        grid = {'C': 10.0 ** np.arange(-2, 3),
                'penalty': ['l1', 'l2']}

        cv = KFold(X_train.shape[0], n_folds=5, shuffle=True, random_state=0)
        clf = GridSearchCV(clf_base, grid, cv=cv, n_jobs=-1, scoring='f1_macro')

        clf.fit(X_train, y_train)

        print (classification_report(y_test, clf.predict(X_test)))

In [7]:
model(X_train_res, X_test, y_train_res, y_test)

             precision    recall  f1-score   support

          L       0.96      0.98      0.97       391
          S       0.85      0.71      0.77        55

avg / total       0.95      0.95      0.95       446

