In [217]:
import numpy as np 
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from tqdm import tqdm
sns.set()

In [218]:
# Read the two feature tables and the y_train table from the local data/ folder.
train, test, y_train = (
    pd.read_csv(csv_path)
    for csv_path in ("data/x_train.csv", "data/x_test.csv", "data/y_train.csv")
)

In [219]:
# Keep the (ID, DATE, STOCK) triplets of every train and test row so that the
# ID can be recovered later from (DATE, STOCK).
# Built in a single expression: the previous intermediate variable was named
# `all`, which shadowed the builtin all() for the rest of the session.
id_data = pd.concat([train, test])[['ID', 'DATE', 'STOCK']]

In [220]:
# Remove the ID column from both working frames.
train, test = (frame.drop(columns=['ID']) for frame in (train, test))



In [221]:
# Keep only the training rows that have a DATE.
train = train.dropna(subset=['DATE'])

In [222]:
# Inspect the columns from position 6 onward; this same slice is used later
# as the list of columns to aggregate.
train.columns[6:]

Index(['RET_1', 'VOLUME_1', 'RET_2', 'VOLUME_2', 'RET_3', 'VOLUME_3', 'RET_4',
       'VOLUME_4', 'RET_5', 'VOLUME_5', 'RET_6', 'VOLUME_6', 'RET_7',
       'VOLUME_7', 'RET_8', 'VOLUME_8', 'RET_9', 'VOLUME_9', 'RET_10',
       'VOLUME_10', 'RET_11', 'VOLUME_11', 'RET_12', 'VOLUME_12', 'RET_13',
       'VOLUME_13', 'RET_14', 'VOLUME_14', 'RET_15', 'VOLUME_15', 'RET_16',
       'VOLUME_16', 'RET_17', 'VOLUME_17', 'RET_18', 'VOLUME_18', 'RET_19',
       'VOLUME_19', 'RET_20', 'VOLUME_20'],
      dtype='object')

### Replace missing values with 0

In [223]:
# Fill every remaining missing value with 0.0 in both frames.
train, test = train.fillna(value=0.0), test.fillna(value=0.0)

### Add features

In [224]:
# Grouping columns for the aggregated features; the merge loops below use the
# prefixes filter[:1], filter[:2], ... of this list.
# NOTE(review): the name shadows the builtin filter(); it is kept because the
# following cells reference it.
filter = [
    'SECTOR',
    'INDUSTRY_GROUP',
    'INDUSTRY',
    'SUB_INDUSTRY',
]

def df_by(df, f, value_cols=None):
    """Average the value columns per (DATE, *f) group and prefix them with f[-1].

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'DATE', every column named in ``f`` and every value
        column.
    f : list of str
        Grouping columns used together with 'DATE'. The last entry becomes the
        prefix of the produced column names.
    value_cols : iterable of str, optional
        Columns to average. When omitted, the notebook-level
        ``train.columns[6:]`` is used, exactly as before, so existing calls
        behave the same. Passing it explicitly removes the dependency on the
        global ``train`` frame.

    Returns
    -------
    pandas.DataFrame
        One row per (DATE, *f) group: the key columns followed by one
        ``'<f[-1]>#<col>'`` column holding the group mean of each value column.
    """
    if value_cols is None:
        # Backward-compatible default: relies on the global `train` frame.
        value_cols = train.columns[6:]
    value_cols = list(value_cols)
    keys = ['DATE'] + list(f)

    grouped = df[keys + value_cols].groupby(keys).agg('mean').reset_index()

    # Rename in one pass instead of adding and dropping columns one at a time;
    # the column order (keys first, then the value columns) is unchanged.
    prefix = f[-1] + '#'
    grouped = grouped.rename(columns={c: prefix + c for c in value_cols})

    if 'RET' in grouped.columns:
        grouped = grouped.drop(['RET'], axis=1)
    return grouped
    


In [225]:
# Attach the group means to every row, one grouping depth at a time: first the
# groups keyed by DATE + filter[:1], then DATE + filter[:2], and so on.
train_m = train.copy()
for depth in range(1, len(filter) + 1):
    level = filter[:depth]
    join_keys = ['DATE'] + level
    train_m = pd.merge(
        train_m,
        df_by(train.copy(), level),
        left_on=join_keys,
        right_on=join_keys,
    )

# Same enrichment for the test frame, using means computed on the test frame.
test_m = test.copy()
for depth in range(1, len(filter) + 1):
    level = filter[:depth]
    join_keys = ['DATE'] + level
    test_m = pd.merge(
        test_m,
        df_by(test.copy(), level),
        left_on=join_keys,
        right_on=join_keys,
    )

### Feature Selection

In [226]:
def acc(features, train, y_train):
    """Cross-validated accuracy of a random forest restricted to ``features``.

    The unique values of ``train['DATE']`` are split into 4 shuffled folds, so
    all rows of a given DATE fall on the same side of a split. For each fold a
    RandomForestClassifier is fitted on the training dates; on the held-out
    dates the second column of ``predict_proba`` is turned into a boolean
    prediction by comparing it with the median of that column within the same
    DATE. The fold score is the accuracy of those predictions against
    ``train['RET']``.

    Parameters
    ----------
    features : list of str
        Columns of ``train`` used as model inputs.
    train : pandas.DataFrame
        Must contain 'DATE', 'RET' and every column named in ``features``.
    y_train : object
        Unused. Kept so existing calls keep working; the target is always read
        from ``train['RET']``.

    Returns
    -------
    float
        Mean accuracy over the folds.
    """
    # Missing inputs are replaced by 0 once, up front, instead of per fold.
    X = train[features].fillna(0)
    y = train['RET']

    # Quite a large number of trees with a low depth to limit overfitting.
    rf_params = {
        'n_estimators': 100,
        'max_depth': 2**3,
        'random_state': 0,
        'n_jobs': -1,
    }

    n_splits = 4
    dates = train['DATE'].unique()
    folds = KFold(n_splits=n_splits, random_state=0, shuffle=True).split(dates)

    scores = []
    for fit_idx, held_idx in folds:
        # Boolean row masks selecting the rows of each side of the date split.
        in_fit = train['DATE'].isin(dates[fit_idx])
        in_held = train['DATE'].isin(dates[held_idx])

        model = RandomForestClassifier(**rf_params)
        model.fit(X.loc[in_fit], y.loc[in_fit])

        proba = model.predict_proba(X.loc[in_held])[:, 1]

        # Predict True for the rows whose probability is above the median
        # probability of their own DATE.
        held = train.loc[in_held, ['DATE']].assign(pred=proba)
        date_median = held.groupby('DATE')['pred'].transform('median')
        y_pred = (held['pred'] > date_median).values

        scores.append(accuracy_score(y.loc[in_held], y_pred))

    return np.mean(scores)

In [227]:
# Start from the first n_shifts RET/VOLUME columns only (rather than all the
# shifts) to reduce noise, plus one aggregated column.
n_shifts = 5
shift_ids = range(1, n_shifts + 1)
features = ['RET_%d' % k for k in shift_ids] + ['VOLUME_%d' % k for k in shift_ids]
features += ['SECTOR#RET_1']

In [228]:
# Display the ID / DATE / STOCK table built from the concatenated train and test frames.
id_data

Unnamed: 0,ID,DATE,STOCK
0,0,0,2
1,1,0,3
2,2,0,4
3,3,0,8
4,4,0,14
...,...,...,...
198424,617019,222,5707
198425,617020,222,5710
198426,617021,222,5714
198427,617022,222,5715


In [229]:
# Nested lookup table: dict_id[DATE][STOCK] -> ID.
# Each row of id_data.values holds (ID, DATE, STOCK), in that order.
dict_id = {}
for row_id, date, stock in id_data.values:
    dict_id.setdefault(date, {})[stock] = row_id

In [230]:
# Recover each row's ID from its (DATE, STOCK) pair through dict_id, then join
# the y_train table on that ID.
row_ids = train_m.apply(lambda row: dict_id[row['DATE']][row['STOCK']], axis=1)
train_m = train_m.assign(ID=row_ids).merge(y_train, on='ID')

In [234]:
# Greedy forward selection: try each candidate column once and keep it only
# when it lifts the cross-validated accuracy by more than `min_gain` over the
# best score seen so far.
#
# 'ID' and 'RET' were appended to train_m by the cell above, so they fall
# inside columns[6:]. They are excluded from the candidates: RET is the target
# that acc() predicts (feeding it to the model leaks the label) and ID is only
# a row identifier.
all_features = [c for c in train_m.columns[6:] if c not in ('ID', 'RET')]
best_score = 0.514
min_gain = 0.002

for f in tqdm(all_features):
    if f in features:
        continue
    features.append(f)
    score = acc(features, train_m, y_train)
    print(score)
    if score > best_score + min_gain:
        best_score = score
        print("best_score", f)
    else:
        # The candidate did not help enough: remove it again.
        features.pop()
    

  0%|          | 0/202 [00:00<?, ?it/s]

In [None]:
# Show the feature list as left by the selection loop above.
features

In [None]:
# Second greedy forward-selection pass with a smaller required improvement
# (`min_gain`): each remaining candidate column is tried once and kept only
# when it lifts the cross-validated accuracy by more than `min_gain`.
#
# 'ID' and 'RET' sit inside train_m.columns[6:] because they were appended to
# train_m after the feature columns. They are excluded from the candidates:
# RET is the target that acc() predicts (feeding it to the model leaks the
# label) and ID is only a row identifier.
all_features = [c for c in train_m.columns[6:] if c not in ('ID', 'RET')]
best_score = 0.514
min_gain = 0.001

for f in tqdm(all_features):
    if f in features:
        continue
    features.append(f)
    score = acc(features, train_m, y_train)
    print(score)
    if score > best_score + min_gain:
        best_score = score
        print("best_score", f)
    else:
        # The candidate did not help enough: remove it again.
        features.pop()
    

In [None]:
# Show the feature list as left by the second selection loop.
features