In [39]:
from __future__ import division
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, f1_score

In [40]:
#### HELPER FUNCTIONS ####

In [41]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [42]:
def create_labels(df, column):
    l = df[column].tolist()
    label_encoder = LabelEncoder()
    return label_encoder.fit_transform(l), dict(set(zip(l, label_encoder.fit_transform(l))))


def create_labels_from_list(l):
    label_encoder = LabelEncoder()
    return label_encoder.fit_transform(l), dict(set(zip(l, label_encoder.fit_transform(l))))


In [43]:
#### LOAD DATA ####

In [44]:
try:
    training_data = pd.read_csv("numerai_training_data.csv")
    tournament_data = pd.read_csv("numerai_tournament_data.csv")
except:
    try:
        training_data = pd.read_csv("numerai_training_data_limited.csv")
        tournament_data = pd.read_csv("numerai_tournament_data_limited.csv")
    except:
        print "error - pure sadness. you have no data files in this directory"
    

In [45]:
#rerunning the lines of code below will overwrite data_limited files

"""
#determining number of rows for limited dataset to upload to github (max 100 mb)
github_max = 100
file_size = 120
max_rows = int(round(github_max/file_size*data.shape[0],-3))

train_data_limited = train_data[0:max_rows]
test_data_limited = test_data[0:max_rows]

train_data_limited.to_csv('numerai_training_data_limited.csv')
test_data_limited.to_csv('numerai_tournament_data_limited.csv')
print "The first "+ str(max_rows) +" rows of the data were added to limited datasets"
"""

'\n#determining number of rows for limited dataset to upload to github (max 100 mb)\ngithub_max = 100\nfile_size = 120\nmax_rows = int(round(github_max/file_size*data.shape[0],-3))\n\ntrain_data_limited = train_data[0:max_rows]\ntest_data_limited = test_data[0:max_rows]\n\ntrain_data_limited.to_csv(\'numerai_training_data_limited.csv\')\ntest_data_limited.to_csv(\'numerai_tournament_data_limited.csv\')\nprint "The first "+ str(max_rows) +" rows of the data were added to limited datasets"\n'

In [46]:
#### EXPLORE DATA ####

In [48]:
#print out the first five rows of data set
training_data.head()


Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature42,feature43,feature44,feature45,feature46,feature47,feature48,feature49,feature50,target
0,0.360233,0.400324,0.376192,0.625575,0.578128,0.456694,0.366812,0.354266,0.498456,0.539569,...,0.538909,0.447431,0.544526,0.574966,0.382544,0.46545,0.336393,0.5776,0.716399,1.0
1,0.524941,0.562669,0.594656,0.691748,0.780364,0.507443,0.473722,0.499549,0.668819,0.289693,...,0.422837,0.174319,0.511849,0.681012,0.506451,0.428276,0.408187,0.721181,0.738231,1.0
2,0.375577,0.343138,0.442795,0.549371,0.238039,0.730104,0.38364,0.439865,0.333091,0.735542,...,0.423726,0.591998,0.482158,0.365507,0.37247,0.337441,0.368906,0.576928,0.5365,1.0
3,0.419479,0.284885,0.299801,0.466293,0.519207,0.471024,0.601611,0.44231,0.486457,0.62848,...,0.660233,0.443502,0.578453,0.544214,0.541356,0.452928,0.353136,0.591958,0.620773,1.0
4,0.563124,0.485997,0.531946,0.532138,0.47272,0.450243,0.746844,0.242323,0.767916,0.497159,...,0.526667,0.443528,0.414979,0.417109,0.544803,0.480563,0.626479,0.438063,0.562606,0.0


In [49]:
#print out the last five rows of data set
training_data.tail()

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature42,feature43,feature44,feature45,feature46,feature47,feature48,feature49,feature50,target
260761,0.494903,0.772782,0.613383,0.50151,0.630539,0.369024,0.406424,0.642597,0.707465,0.385961,...,0.531644,0.446273,0.452734,0.452141,0.450239,0.525593,0.653099,0.406829,0.400346,1.0
260762,0.599877,0.725248,0.612162,0.593145,0.567411,0.546543,0.687254,0.665792,0.551152,0.372492,...,0.448736,0.547254,0.499786,0.517084,0.532767,0.777985,0.683546,0.579801,0.370434,1.0
260763,0.426627,0.646534,0.672396,0.423245,0.363507,0.590633,0.605406,0.811512,0.407315,0.42782,...,0.390572,0.548869,0.426954,0.269059,0.484017,0.656474,0.797997,0.488521,0.093011,1.0
260764,0.345536,0.282349,0.350594,0.512924,0.555426,0.419745,0.354981,0.459965,0.510418,0.572683,...,0.735401,0.262207,0.441347,0.669045,0.475064,0.465721,0.336502,0.350719,0.68929,1.0
260765,0.356547,0.591591,0.506552,0.572069,0.506632,0.362448,0.442549,0.50759,0.559326,0.566939,...,0.542306,0.357975,0.517278,0.440438,0.436393,0.585718,0.59876,0.368111,0.492057,1.0


In [51]:
#dimensions of data
dims = training_data.shape
rows = dims[0]
cols = dims[1]
print "number of rows = "+ str(rows)
print "number of cols = "+ str(cols)

number of rows = 260766
number of cols = 51


In [52]:
#Plot out what feature1 looks like
first_column = training_data['feature1']
plt.hist(first_column, bins=50)
plt.title('Distribution of Feature1')
plt.ylabel('Frequency')

plt.show() #this command will open up a new window showing a plot. you must close this new window to continue with the script


In [53]:
#### CREATE MODEL ####

In [54]:
## set up train test split ##
df = training_data
y, class_names = create_labels(df, 'target')
X = df.drop('target', axis=1)
train_data, test_data, train_labels, test_labels = train_test_split(X,y,test_size=0.25)


## create Logistic regression classifier ##
classifier = linear_model.LogisticRegression(C=1e5) 
classifier.fit(train_data, train_labels) 
predictions = classifier.predict(test_data) 
cnf_matrix = confusion_matrix(predictions, test_labels)

In [55]:
#### MODEL EVALUATION ####

In [56]:
## F1 (accuracy) scoring ##
print f1_score(test_labels,predictions,average=None)
print f1_score(test_labels,predictions,average='weighted')


# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show() #this command will open up 2 new windows, each showing one plot. you must close these new window to continue with the script



[ 0.51  0.53]
0.518782858586
Confusion matrix, without normalization
[[16253 14990]
 [16368 17581]]
Normalized confusion matrix
[[ 0.52  0.48]
 [ 0.48  0.52]]
