In [None]:
#!/usr/bin/env python

"""
Example classifier on Numerai data using a logistic regression classifier.
To get started, install the required packages: pip install pandas numpy scikit-learn
"""

import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing, linear_model


def main():
    """Train a logistic regression on the Numerai training set and write predictions.

    Reads ``in/numerai_training_data.csv`` and ``in/numerai_tournament_data.csv``,
    fits the model on every column whose name contains ``"feature"`` against the
    ``target`` column, and writes an ``id``/``probability`` table for the
    tournament rows to ``predictions.csv`` in the working directory.
    """
    # Set seed for reproducibility
    np.random.seed(0)

    print("Loading data...")
    # Load the data from the CSV files.
    # The training file contains only training rows.
    training_data = pd.read_csv('in/numerai_training_data.csv', header=0)
    # Per the original author's note, the tournament file contains both
    # validation rows (with targets) and test rows (without targets).
    prediction_data = pd.read_csv('in/numerai_tournament_data.csv', header=0)

    # Every column whose title contains 'feature' is used as a model input.
    features = [f for f in list(training_data) if "feature" in f]
    X = training_data[features]                     # pd.DataFrame of all training features
    Y = training_data["target"]                     # pd.Series of the classes
    x_prediction = prediction_data[features]        # pd.DataFrame of all validation and test features
    ids = prediction_data["id"]
    print('\n')
    print('ids')
    print(ids)
    print('\n')
    # `duplicated` is a method: it has to be called, otherwise only the bound
    # method's repr is printed. Report how many ids repeat an earlier one.
    print('duplicated ids')
    print(ids.duplicated().sum())

    # This is your model that will learn to predict
    model = linear_model.LogisticRegression(n_jobs=-1)

    print("Training...")
    # Your model is trained on the training_data
    model.fit(X, Y)

    print("Predicting...")
    # Your trained model is now used to make predictions on the numerai_tournament_data
    # The model returns two columns: [probability of 0, probability of 1]
    # We are just interested in the probability that the target is 1.
    y_prediction = model.predict_proba(x_prediction)
    results = y_prediction[:, 1]
    # Pair ids and probabilities positionally (row i of the predictions belongs
    # to row i of the tournament data) instead of relying on an index join.
    joined = pd.DataFrame({'id': ids.values, 'probability': results})

    print("Writing predictions to predictions.csv")
    # Save the predictions out to a CSV file
    joined.to_csv("predictions.csv", index=False)
    # Now you can upload these predictions on numer.ai


# Run the example when this code is executed directly (as a script, or as a
# notebook cell, where __name__ is also '__main__').
if __name__ == '__main__':
    main()


In [None]:
# Put the enclosing project directory at the front of sys.path so that
# project-local modules can be imported from this notebook.
project = 'test_datasets'

import os.path
import sys
current_dir = os.path.abspath('./')
# str.rfind returns -1 when the project name is not part of the path; slicing
# with that value would yield a meaningless prefix of the working directory.
project_pos = current_dir.rfind(project)
if project_pos >= 0:
    # Keep everything up to and including the project folder (the +1 also
    # keeps the path separator that follows it, when there is one).
    project_dir = current_dir[:project_pos+len(project)+1]
else:
    # Not inside the project tree: fall back to the working directory itself.
    project_dir = current_dir
sys.path.insert(0, project_dir)

In [None]:
# Load both Numerai CSV files at notebook scope so the cells below can use
# them directly; the first row of each file is taken as the column header.
training_data = pd.read_csv(
    filepath_or_buffer='in/numerai_training_data.csv',
    header=0,
)
prediction_data = pd.read_csv(
    filepath_or_buffer='in/numerai_tournament_data.csv',
    header=0,
)

In [None]:
%load_ext autoreload

In [None]:
%autoreload
import numpy as np
import pandas as pd
from sklearn import preprocessing, feature_extraction, feature_selection, model_selection, metrics
import xgboost as xgb
import matplotlib.pyplot as plt
# import utils
# from classes import ML


class ML(object):
    """Small pipeline wrapper around a DataFrame and an XGBoost model.

    The instance stores the DataFrame together with the column selection and
    optional per-column transformation tables, builds the design matrix and
    target vector from it, and creates/trains the model.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Data to work on.
    x_columns : list of str or None
        Columns used as model inputs by ``get_X``.
    y_column : str or None
        Column used as the target by ``get_y``.
    convert_dict : dict or None
        Maps a column name to a callable applied to that column by ``convert``.
    drop_dict, replace_dict : dict or None
        Stored on the instance; not used by any method of this class yet.
    feature_dict : dict or None
        Maps a new column name to a callable that receives the whole
        DataFrame and returns the new column (see ``feature``).
    method : str or None
        ``'classification'`` or ``'regression'`` (see ``create_model``).
    model_params : dict or None
        Keyword arguments forwarded to the model constructor.
    *args, **kwargs
        Accepted for call compatibility and ignored.
    """

    def __init__(self, df=None, x_columns=None, y_column=None, convert_dict=None,
                 drop_dict=None, replace_dict=None, feature_dict=None,
                 method=None, model_params=None, *args, **kwargs):
        super(ML, self).__init__()
        self.df = df
        self.x_columns = x_columns
        self.y_column = y_column
        self.convert_dict = convert_dict
        self.drop_dict = drop_dict
        self.replace_dict = replace_dict
        self.feature_dict = feature_dict
        self.method = method
        # A fresh dict per instance: a mutable default argument would be one
        # object shared by every ML instance.
        self.model_params = {} if model_params is None else model_params

        # Column count minus two. NOTE(review): the 2 presumably discounts
        # non-feature columns such as the id and the target; confirm against
        # the data. Left as None when no DataFrame was supplied.
        self.n_columns = len(self.df.columns)-2 if self.df is not None else None

    def convert(self):
        """Apply each converter in ``convert_dict`` to its column, in place."""
        if self.convert_dict is not None:
            for column, func in self.convert_dict.items():
                self.df[column] = func(self.df[column])

    def feature(self):
        '''call feature building functions on columns'''
        # Same guard as convert(): without a feature table there is nothing to build.
        if self.feature_dict is not None:
            for column, func in self.feature_dict.items():
                # Each builder receives the whole frame and returns one new column.
                self.df[column] = func(self.df)

    def save_mapping(self):
        """Placeholder; not implemented."""
        pass

    def load_mapping(self):
        """Placeholder; not implemented."""
        pass

    def get_X(self, sparse=True):
        """Vectorize ``x_columns`` into ``self.X`` with a fitted DictVectorizer.

        ``sparse`` is forwarded to the vectorizer. The fitted vectorizer is
        kept as ``self.vectorizer_`` and its vocabulary as
        ``self.feature_columns_``.
        """
        self.vectorizer_ = feature_extraction.DictVectorizer(sparse=sparse)
        self.X = self.vectorizer_.fit_transform(self.df[self.x_columns].to_dict(orient='records'))
        self.feature_columns_ = self.vectorizer_.vocabulary_

    def get_y(self):
        """Store the target column as a numpy array in ``self.y``."""
        self.y = np.array(self.df[self.y_column])

    def create_model(self, method=None, model_params=None):
        """Instantiate ``self.model`` for the configured method.

        A truthy ``method`` / ``model_params`` argument overrides the value
        stored on the instance. Only ``'classification'`` creates a model
        (an ``xgb.XGBClassifier``); ``'regression'`` just prints a notice and
        any other value does nothing.
        """
        if method:
            self.method = method

        if model_params:
            self.model_params = model_params

        if self.method == 'classification':
            # Use the stored parameters, so that values passed to the
            # constructor are honoured too; an empty/None table gives the
            # classifier its defaults.
            self.model = xgb.XGBClassifier(**(self.model_params or {}))
        elif self.method == 'regression':
            print('Not implemented yet')

    def train(self):
        """Fit ``self.model`` on ``self.X`` / ``self.y`` (call get_X, get_y and create_model first)."""
        self.model.fit(self.X, self.y)

    def cross_val(self):
        """Placeholder; not implemented."""
        pass

    def pickle(self):
        """Placeholder; not implemented."""
        pass

    def unpickle(self):
        """Placeholder; not implemented."""
        pass

    def score(self):
        """Placeholder; not implemented."""
        pass



In [None]:
# Model inputs are addressed by name: feature1 .. feature<num_features - 1>.
# NOTE(review): the "- 3" presumably discounts non-feature columns in the
# training CSV; confirm against the file header.
num_features = training_data.shape[1] - 3
x_columns = ['feature' + str(idx) for idx in range(1, num_features)]
y_column = 'target'

# Transformation tables handed to the ML wrapper below.
drop_dict = dict()
replace_dict = dict()
convert_dict = {
    'created': pd.to_datetime,
    'planned_installation_time': pd.to_numeric,
    'status_type': pd.to_numeric,
}

def get_end_of_month(df):
    """Return a boolean Series marking rows whose ``created`` timestamp falls on a month's last day."""
    created = df['created']
    return created.dt.is_month_end


# Feature builders: new column name -> function of the whole DataFrame.
feature_dict = {'month_end': get_end_of_month}

# Collect the wrapper's configuration in one place, then build it.
ml_settings = dict(
    df=training_data,
    x_columns=x_columns,
    y_column=y_column,
    drop_dict=drop_dict,
    replace_dict=replace_dict,
    convert_dict=convert_dict,
    feature_dict=feature_dict,
)
ml = ML(**ml_settings)

# Materialise the design matrix (ml.X) and the target vector (ml.y).
ml.get_X()
ml.get_y()
# ml.y

In [None]:
# Instantiate the XGBoost classifier on the wrapper; no model_params are
# supplied, so the classifier keeps its default hyper-parameters.
ml.create_model(method='classification')

In [None]:
# Display the estimator created above to inspect its configuration.
ml.model

In [None]:
# Short alias for the estimator. Note that `model` is the bare classifier,
# not the ML wrapper: the DataFrame and n_columns live on `ml`.
model = ml.model

In [None]:
# Work on a copy of the wrapped DataFrame. The frame is stored on the ML
# wrapper (`ml.df`); the estimator bound to `model` has no `df` attribute.
df = ml.df.copy()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# unique index
df['id'].nunique()/len(df)

In [None]:
# Plot every feature twice: once against the rows ordered by id, and once
# against the DataFrame's own index (original row order).
# The feature columns are taken from the frame itself, so the loop can never
# ask for a column that does not exist.
feature_names = [name for name in df.columns if 'feature' in name]
df_by_id = df.sort_values(by='id')  # loop-invariant: sort once, not per feature
for feature_name in feature_names:
    print(feature_name)
    df_by_id.plot(kind='line', x='id', y=feature_name)
    # With no `x` given, DataFrame.plot uses the index for the x axis; `x`
    # itself must be a column label or position, not an Index object.
    df.plot(y=feature_name)
    plt.show()