In [None]:
#!/usr/bin/env python

"""
Example classifier on Numerai data using a logistic regression classifier.
To get started, install the required packages: pip install pandas numpy scikit-learn
"""

import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing, linear_model


def main(training_path='in/numerai_training_data.csv',
         tournament_path='in/numerai_tournament_data.csv',
         output_path='predictions.csv'):
    """Train a logistic regression on Numerai data and write predictions to CSV.

    Reads the training and tournament CSV files, fits a logistic regression on
    every column whose name contains "feature", predicts the probability that
    the target is 1 for each tournament row, and saves an id/probability table.

    Args:
        training_path: CSV with the feature columns and a "target" column.
        tournament_path: CSV with an "id" column and the same feature columns.
        output_path: Destination CSV file for the predictions.

    Returns:
        None. The predictions are written to ``output_path``.
    """
    # Set seed for reproducibility
    np.random.seed(0)

    print("Loading data...")
    # training_data contains only training data
    training_data = pd.read_csv(training_path, header=0)
    # prediction_data contains both validation data (with targets) and test
    # data (without targets)
    prediction_data = pd.read_csv(tournament_path, header=0)

    # Every column whose title contains 'feature' is used as a model input
    features = [f for f in training_data.columns if "feature" in f]
    X = training_data[features]                     # pd.DataFrame of all training features
    Y = training_data["target"]                     # pd.Series of the classes
    x_prediction = prediction_data[features]        # pd.DataFrame of all validation and test features
    ids = prediction_data["id"]
    print('\n')
    print('ids')
    print(ids)
    print('\n')
    print('duplicated ids')
    # duplicated is a method: it has to be called, otherwise only the
    # bound-method repr is printed. Report how many ids are repeated.
    print(ids.duplicated().sum())

    # This is your model that will learn to predict
    model = linear_model.LogisticRegression(n_jobs=-1)

    print("Training...")
    # Your model is trained on the training_data
    model.fit(X, Y)

    print("Predicting...")
    # Your trained model is now used to make predictions on the numerai_tournament_data
    # The model returns two columns: [probability of 0, probability of 1]
    # We are just interested in the probability that the target is 1.
    results = model.predict_proba(x_prediction)[:, 1]
    # Pair ids and probabilities by position; an index-based join would
    # silently misalign rows if the tournament frame had a non-default index.
    joined = pd.DataFrame(
        {'id': ids.values, 'probability': results},
        columns=['id', 'probability'],
    )

    print("Writing predictions to {}".format(output_path))
    # Save the predictions out to a CSV file
    joined.to_csv(output_path, index=False)
    # Now you can upload these predictions on numer.ai


# Run the example when this code is executed directly rather than imported.
if __name__ == '__main__':
    main()


In [None]:
%load_ext autoreload
import pandas as pd
import numpy as np
from sklearn import preprocessing, feature_extraction, feature_selection, model_selection, metrics
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Name of the project's root directory, searched for in the working path.
project = 'test_datasets'

import os.path
import sys

current_dir = os.path.abspath('./')
# Position of the last occurrence of the project directory name in the path.
project_idx = current_dir.rfind(project)
if project_idx == -1:
    # Not running from inside the project tree: slicing with rfind()'s -1
    # would produce a meaningless truncated prefix, so use the cwd instead.
    project_dir = current_dir
else:
    # Keep everything up to and including the project directory name
    # (plus the path separator that follows it, when there is one).
    project_dir = current_dir[:project_idx + len(project) + 1]
# Make project modules importable; skip the insert when the entry is already
# first so that re-running the cell does not pile up duplicates.
if sys.path[:1] != [project_dir]:
    sys.path.insert(0, project_dir)

In [None]:
# Load the training and tournament sets; the first row of each CSV is the header.
training_csv = 'in/numerai_training_data.csv'
tournament_csv = 'in/numerai_tournament_data.csv'
training_data = pd.read_csv(training_csv, header=0)
prediction_data = pd.read_csv(tournament_csv, header=0)

# production code

In [None]:
# Reload edited modules before running, so changes to the project code are picked up.
%autoreload
# Project-local modules; presumably importable because the project directory
# was put on sys.path in an earlier cell - TODO confirm.
from classes import ML
import utils
# Build the ML wrapper around the training frame. The column selections and
# the drop/replace/convert/feature dictionaries all come from utils.
ml = ML(df=training_data, 
           x_columns=utils.x_columns,
           y_column=utils.y_column,
           drop_dict=utils.drop_dict,
           replace_dict=utils.replace_dict,
           convert_dict=utils.convert_dict,
           feature_dict=utils.feature_dict)
# The calls below are order-dependent steps on the wrapper; what each one does
# is defined in the project's classes module, which is not visible here.
ml.feature()

ml.split_train_test()
ml.get_X()
ml.get_y()
ml.create_eval_set()
ml.create_model('classification')
ml.cross_val(metric='f1', eval_metric=['error', 'logloss'])
# ml.plot_learning_curve()
ml.train(eval_metric=['error', 'logloss'])
ml.score()
# Diagnostic plots, currently disabled:
# ml.plot_feature_importances()
# ml.plot_confusion_matrix()
# ml.plot_roc_auc()
# ml.plot_precision_recall()
# ml.plot_ks_statistic()
ml.describe()

# Save the wrapper object to disk via the project's helper
# (presumably pickles it, judging by the .pkl extension - TODO confirm).
utils.dump(ml, 'out/trained_model.pkl')

In [None]:
# Learning-curve plot from the project's ML wrapper, run in its own cell
# (the same call is commented out in the pipeline cell).
ml.plot_learning_curve()

# working env

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of ml.df
fig, ax = plt.subplots(figsize=(20, 20))
correlations = ml.df.corr()
sns.heatmap(correlations, annot=True, fmt=".2f", ax=ax)
plt.show()

In [None]:
# Distribution of every feature split by target class: one figure per feature,
# one KDE curve per class.
# The grouping does not depend on the feature, so build it once up front.
g = ml.df.groupby(ml.y_column)
for feature in ml.x_columns:
    fig, ax = plt.subplots(figsize=(10, 5))
    # Iterate over the real group keys instead of assuming the class labels
    # are exactly 0..ngroups-1 (get_group(n) raises KeyError otherwise).
    for label, group in g:
        sns.kdeplot(group[feature], ax=ax, label=str(label))
    ax.set_title(feature)
    ax.legend()
    plt.show()
    # Release the figure so a long feature list does not accumulate open figures.
    plt.close(fig)

In [None]:
# Overlay the KDE of every feature on a single axis.
fig, ax = plt.subplots(figsize=(10, 5))
for feature in ml.x_columns:
    # Label each curve explicitly so the overlaid lines can be told apart.
    sns.kdeplot(ml.df[feature], ax=ax, label=feature)
ax.set_title('Feature distributions')
ax.legend()
plt.show()

In [None]:
# IPython help lookup for sns.kdeplot (development aid; not part of the analysis).
sns.kdeplot?

In [None]:
# IPython help lookup for sns.distplot (development aid; not part of the analysis).
sns.distplot?

In [None]:
# Display the columns the wrapper uses as inputs (ml.x_columns).
ml.x_columns

In [None]:
# Regression of feature2 on feature1, fitted separately for each target class.
# jointplot cannot combine hue with kind='reg'; lmplot is the regression plot
# that supports hue. Keyword arguments are used because positional x/y is not
# accepted by every seaborn version.
sns.lmplot(x='feature1', y='feature2', data=ml.df, hue='target')
plt.show()

In [None]:
# IPython help lookup for sns.regplot (development aid; not part of the analysis).
sns.regplot?

In [None]:
# Scatter plot with a fitted regression line of feature2 against feature1.
# x and y are passed by keyword: positional x/y is rejected by newer seaborn
# releases, while the keyword form works on old and new versions alike.
sns.regplot(x='feature1', y='feature2', data=ml.df)
plt.show()

In [None]:
# Pairwise scatter plots of the feature columns for the rows labelled 0..100.
# DataFrame.plot(kind='scatter') requires explicit x= and y= columns and raises
# without them, so a scatter matrix is used to cover all selected columns.
pd.plotting.scatter_matrix(ml.df.loc[:100, ml.x_columns], figsize=(20, 20))
plt.show()

In [None]:
g = ml.df.groupby('target')
# `sns.disg` is not a seaborn attribute (the call was left half-typed) and
# raised AttributeError. Summarise the rows of target class 0 instead so the
# cell runs; swap in a plot once the intended column is decided.
g.get_group(0).describe()
# plt.show()

In [None]:
# Distribution of feature6 for each target class on one axis.
# distplot has no `groupby` argument, so the frame is grouped explicitly and
# one labelled distribution is drawn per class.
fig, ax = plt.subplots(figsize=(10, 5))
for label, group in ml.df.groupby('target'):
    sns.distplot(group['feature6'], label=str(label), ax=ax)
ax.legend(title='target')
plt.show()

In [None]:
# Distribution of feature2 over the whole frame, using distplot's default settings.
sns.distplot(ml.df['feature2'])
plt.show()

In [None]:
# KDE over feature1 and feature2 together.
# NOTE(review): passing the second vector positionally presumably relies on an
# older seaborn signature; newer releases expect x=/y= keywords - confirm
# against the seaborn version this notebook targets.
sns.kdeplot(ml.df['feature1'], ml.df['feature2'])
plt.show()

In [None]:
# Histogram-only view (kde=False) of feature1 and feature2, with one color
# supplied per column.
# NOTE(review): distplot is handed a two-column frame here; confirm it renders
# both columns as intended on the seaborn version in use.
sns.distplot(ml.df[['feature1', 'feature2']], kde=False, color=['red','green'])
plt.show()

In [None]:
# Violin plot of feature6 for each target class.
# factorplot is deprecated and absent from newer seaborn releases; violinplot
# draws the same violins directly and is available across versions.
sns.violinplot(x='target', y='feature6', data=ml.df)
plt.show()

In [None]:
# Pairwise view of three selected features, colored by target: regression
# fits off the diagonal and histograms on it.
sns.pairplot(
    ml.df,
    vars=['feature1', 'feature2', 'feature3'],
    hue='target',
    kind='reg',
    diag_kind='hist',
)
plt.show()

In [None]:
# Stand-alone show call; it only has an effect if a figure is still open
# from an earlier cell.
plt.show()

In [None]:
# Imported only to fail early with a clear error when the graphviz package
# needed for tree rendering is not installed.
import graphviz
# Draw the tree on an explicit, large axis and end with plt.show() like the
# other plotting cells.
# NOTE(review): assumes ml.model is an xgboost model - confirm in the ML class.
fig, ax = plt.subplots(figsize=(20, 10))
xgb.plot_tree(ml.model, ax=ax)
plt.show()