# Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Globals

In [None]:
# Locations of the competition input files.
FEATURES_CSV = '../input/jane-street-market-prediction/features.csv'
TRAIN_CSV = '../input/jane-street-market-prediction/train.csv'
# NOTE(review): TEST_CSV is not referenced by any other visible cell.
TEST_CSV = '../input/jane-street-market-prediction/example_test.csv'
# True  -> split the training data by date and report validation metrics.
# False -> train on every row and run the submission loop instead.
VAL = False

# Read the datasets

In [None]:
# Load the features table and the training set into DataFrames.
feat = pd.read_csv(FEATURES_CSV)
tr = pd.read_csv(TRAIN_CSV)

# Look at the data

## Features

In [None]:
# Preview the first rows of the table loaded from FEATURES_CSV.
feat.head()

In [None]:
# Report the dimensions of the features table (rows, then columns).
n_rows, n_cols = feat.shape
print('Anonymous Features: {} rows, {} columns'.format(n_rows, n_cols))

## Train

In [None]:
# Preview the first rows of the training set.
tr.head()

In [None]:
# Print a summary of the training DataFrame (index, columns, dtypes, memory).
tr.info()

In [None]:
# List every column name available in the training set.
tr.columns

# Preprocess

In [None]:
def preprocess(ds):
    """Return the rows of *ds* that are usable for training.

    Two filters are applied, in order:

    1. keep only rows whose ``weight`` is strictly positive (the other
       entries are not useful);
    2. drop every row that still contains a missing value (NAs are simply
       ignored for now rather than imputed).

    Args:
        ds: DataFrame with a ``weight`` column.

    Returns:
        A new, filtered DataFrame; the input frame is left untouched.
    """
    ds = ds.loc[ds.weight > 0]
    return ds.dropna()

# Let's ANALYZZEE!!

In [None]:
# Number of rows in the raw (not yet preprocessed) training set.
len(tr)

In [None]:
# for column in tr.columns:
#     print(column, tr[column].nunique())

In [None]:
# tr['date'].unique()

In [None]:
# tr['resp'].nunique()

In [None]:
# Fraction of training rows whose response is strictly positive, printed once
# per response column. The boolean mask is averaged directly so that every
# positive row is counted; counting distinct positive values instead would
# under-report the share whenever a value repeats.
for col in ('resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'):
    print(float((tr[col] > 0).mean()))

## ~50% of the responses are positive (> 0), i.e. cases where the trade can be taken

# Prep for Train

In [None]:
# Target columns; each one becomes a binary "response > 0" label.
resp_cols = ['resp','resp_1','resp_2','resp_3','resp_4']
features = [c for c in tr.columns if 'feature' in c]

# Per-column means of every feature except the first one, used to impute
# missing values at inference time. They are computed on the raw frame, which
# still contains NaNs, so the NaN-aware mean is required: a plain np.mean
# would yield NaN for every column that has at least one missing entry.
f_mean = np.nanmean(tr[features[1:]].values, axis=0)

tr = preprocess(tr)

# Date value that closes the first ~70% of the distinct dates left after
# preprocessing. The threshold is looked up among the actual date values so
# the split stays correct even if dates are not contiguous integers from 0.
unique_dates = np.sort(tr.date.unique())
split_num = int(unique_dates[int(0.7 * len(unique_dates))]) if len(unique_dates) else 0
# print(split_num)

if VAL:
    # Chronological split: earlier dates train, later dates validate.
    tr_tr = tr.loc[tr.date <= split_num]
    x_train = tr_tr[features]
    y_train = (tr_tr[resp_cols] > 0).astype('int').values

    te = tr.loc[tr.date > split_num]
    x_test = te[features]
    y_test = (te[resp_cols] > 0).astype('int').values
else:
    # No hold-out: fit on every preprocessed row.
    x_train = tr[features]
    y_train = (tr[resp_cols] > 0).astype('int').values


In [None]:
# Preview the first rows of the training feature matrix.
x_train.head()

In [None]:
resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4']

# Sanity-check how the label matrix is assembled: build the per-column binary
# indicators once, then show the list length, the stacked shape, the
# transposed shape and finally the transposed values.
binary_cols = [(tr[c] > 0).astype('int') for c in resp_cols]
stacked = np.stack(binary_cols)
print(len(binary_cols))
print(stacked.shape)
print(stacked.T.shape)
print(stacked.T)

In [None]:
# Preview the first rows of the training feature matrix (same view as above).
x_train.head()

In [None]:
# Number of training samples.
len(x_train)

In [None]:
# Number of label rows; should match the number of training samples.
len(y_train)

In [None]:
# Shape of the first label column (the labels derived from resp_cols[0]).
y_train[:,0].shape

# Train!!

In [None]:
# Fit one independent logistic-regression classifier per response column.
# models[k] is the classifier trained on the k-th column of y_train.
models = []

for target_idx, _ in enumerate(resp_cols):
    print('\n\n', target_idx)
    logreg = LogisticRegression(max_iter=500)
    target = y_train[:, target_idx]
    logreg.fit(x_train, target)
    models.append(logreg)

# Predict

In [None]:
if VAL:
    # Validation mode: report accuracy and confusion matrix of each
    # per-target classifier on the held-out rows.
    for i, model in enumerate(models):
        y_pred = model.predict(x_test)
        # Score with the classifier being evaluated (`model`), not with
        # whichever classifier happened to be trained last.
        print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(model.score(x_test, y_test[:,i])))
        confusion_matrix = metrics.confusion_matrix(y_test[:,i], y_pred)
        print('Confusion Matrix:',confusion_matrix)

else:
    # Submission mode: stream the test set and emit one action per batch.
    th = 0.502       # decision threshold applied to the aggregated prediction
    f = np.median    # final reducer applied to the averaged predictions

    # Imported here because the module is only needed in submission mode.
    import janestreet
    env = janestreet.make_env() # initialize the environment

    iter_test = env.iter_test() # an iterator which loops over the test set

    for (test_df, sample_prediction_df) in iter_test:
        if test_df['weight'].item() > 0:
            x_tt = test_df.loc[:, features].values

            # Impute missing values in every feature but the first with the
            # training means. np.where leaves present values untouched even
            # if an entry of f_mean is itself NaN, whereas the arithmetic
            # form `isnan(x) * f_mean` would turn them into NaN as well.
            tail = x_tt[:, 1:]
            missing = np.isnan(tail)
            if missing.any():
                x_tt[:, 1:] = np.where(missing, f_mean, tail)
            # Anything still missing (first feature, or no usable mean) -> 0.
            x_tt = np.nan_to_num(x_tt)

            # NOTE(review): predict() returns hard class labels, so this is
            # the share of classifiers voting 1, and the median of that
            # single averaged value is the value itself. A threshold of 0.502
            # looks like it was meant for probabilities - confirm whether
            # predict_proba was intended.
            pred = np.mean([model.predict(x_tt) for model in models], axis=0)
            pred = f(pred)
            sample_prediction_df.action = np.where(pred >= th, 1, 0).astype(int)
        else:
            # Zero-weight rows are never traded.
            sample_prediction_df.action = 0
        env.predict(sample_prediction_df)