In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, make_scorer, precision_score
from time import time

start_init = time()
path = '../input/'
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')


def predict_labels_gini(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    start = time()
    y_pred = clf.predict(features)
    end = time()

    print("Predict model in {:.4f} seconds".format(end - start))
    gini = 2 * roc_auc_score(target.values, y_pred) - 1
    print("Gini score set: {:.4f}.".format(gini))
    return gini


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    print("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    start = time()

    clf.fit(X_train, y_train)
    end = time()

    print("Trained model in {:.4f} seconds".format(end - start))
    return predict_labels_gini(clf, X_test, y_test)


def training(clfs, features, target):
    print('Training data {}'.format(features.shape))
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=42)
    bestScore = 0
    bestModel = None
    for i, clf in enumerate(clfs):
        print("{}:".format(i + 1))
        score = train_predict(clf, X_train, y_train, X_test, y_test)
        if bestScore < score:
            bestScore = score
            bestModel = clf
    return bestModel


def tuning(clf, features, target):
    params = {}

    if clf.__class__.__name__ == "LogisticRegression":
        params = {'class_weight': ['balanced'],
                  'C': [0.0003, 0.003, 0.03, 0.3]
                  }
    if clf.__class__.__name__ == "DecisionTreeClassifier":
        params = {'criterion': ['gini', 'entropy']}
    if params == {}:
        return clf
    grid_search = GridSearchCV(clf, params, scoring=make_scorer(precision_score))
    grid_search.fit(features, target)
    predict_labels_gini(grid_search, features, target)
    return grid_search


from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

target = train['target']
features = train.drop(['id', 'target'], axis=1)

clfs = [
    LogisticRegression(class_weight='balanced'),
    SGDClassifier(),
    DecisionTreeClassifier()
]
bestModel              = training(clfs, features, target)
bestModelTunned = tuning(bestModel, features, target)

# Create submission file
sub = pd.DataFrame()
sub['id'] = test['id']
test_pred = pd.DataFrame(test, columns=train.drop(['id', 'target'], axis=1).columns)
y_test_pred = bestModelTunned.predict_proba(test_pred)[:, 1]
sub['target'] = y_test_pred
sub.to_csv('benchmark.csv', float_format='%.6f', index=False)
end_init = time()
print("Finished in {:.4f} seconds".format(end_init - start_init))

Training data (595212, 57)
1:
Training a LogisticRegression using a training set size of 446409. . .
Trained model in 23.1728 seconds
Predict model in 0.0249 seconds
Gini score set: 0.1758.
2:
Training a SGDClassifier using a training set size of 446409. . .




Trained model in 0.8953 seconds
Predict model in 0.0190 seconds
Gini score set: 0.0000.
3:
Training a DecisionTreeClassifier using a training set size of 446409. . .
Trained model in 10.1964 seconds
Predict model in 0.0686 seconds
Gini score set: 0.0130.
Predict model in 0.1318 seconds
Gini score set: 0.1787.
Finished in 257.7403 seconds
