In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

In [5]:
# Load the raw challenge files. X_train and y_train both carry an 'ID' column
# (see the describe() output and the printed y_train columns).
X_train = pd.read_csv("x_train.csv")
X_test = pd.read_csv("x_test.csv")
y_train = pd.read_csv("y_train.csv")

# Join features and target on the 'ID' key instead of concatenating side by side:
# this does not rely on both files sharing the same row order, and `train` ends up
# with a single 'ID' column. A left join keeps every X_train row in its original
# order; `validate` raises if an ID is duplicated on either side.
train = X_train.merge(y_train, on="ID", how="left", validate="one_to_one")
print(y_train.columns)

Index(['ID', 'RET'], dtype='object')


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training features.
train_summary = X_train.describe()
print(train_summary)

                  ID           DATE          STOCK       INDUSTRY  \
count  418595.000000  418595.000000  418595.000000  418595.000000   
mean   209297.000000     108.659002    3373.567833      37.176020   
std    120838.112303      61.891642    1533.157749      19.706505   
min         0.000000       0.000000       0.000000       0.000000   
25%    104648.500000      56.000000    2478.000000      22.000000   
50%    209297.000000     104.000000    3560.000000      43.000000   
75%    313945.500000     161.000000    4606.000000      53.000000   
max    418594.000000     223.000000    5716.000000      74.000000   

       INDUSTRY_GROUP         SECTOR   SUB_INDUSTRY          RET_1  \
count   418595.000000  418595.000000  418595.000000  416236.000000   
mean        12.697959       5.483845      90.391663       0.001383   
std          7.231701       2.410113      47.491157       0.031311   
min          0.000000       0.000000       0.000000      -0.845324   
25%          6.000000       

In [20]:
# Preview the first rows of the features, then of the target.
for frame in (X_train, y_train):
    print(frame.head())

   ID  DATE  STOCK  INDUSTRY  INDUSTRY_GROUP  SECTOR  SUB_INDUSTRY     RET_1  \
0   0     0      2        18               5       3            44 -0.015748   
1   1     0      3        43              15       6           104  0.003984   
2   2     0      4        57              20       8           142  0.000440   
3   3     0      8         1               1       1             2  0.031298   
4   4     0     14        36              12       5            92  0.027273   

   VOLUME_1     RET_2  ...    RET_16  VOLUME_16    RET_17  VOLUME_17  \
0  0.147931 -0.015504  ...  0.059459   0.630899  0.003254  -0.379412   
1       NaN -0.090580  ...  0.015413        NaN  0.003774        NaN   
2 -0.096282 -0.058896  ...  0.008964  -0.010336 -0.017612  -0.354333   
3 -0.429540  0.007756  ... -0.031769   0.012105  0.033824  -0.290178   
4 -0.847155 -0.039302  ... -0.038461  -0.277083 -0.012659   0.139086   

     RET_18  VOLUME_18    RET_19  VOLUME_19    RET_20  VOLUME_20  
0  0.008752  -0.110

In [19]:
# All rows of a single stock (STOCK == 2), ordered by DATE.
is_stock_2 = X_train["STOCK"] == 2
stock_2_history = X_train.loc[is_stock_2].sort_values(by="DATE")
print(stock_2_history)

            ID  DATE  STOCK  INDUSTRY  INDUSTRY_GROUP  SECTOR  SUB_INDUSTRY  \
0            0     0      2        18               5       3            44   
10866    10866     6      2        18               5       3            44   
13258    13258     7      2        18               5       3            44   
18512    18512    10      2        18               5       3            44   
23767    23767    14      2        18               5       3            44   
...        ...   ...    ...       ...             ...     ...           ...   
403131  403131   213      2        18               5       3            44   
405821  405821   214      2        18               5       3            44   
408288  408288   218      2        18               5       3            44   
410734  410734   220      2        18               5       3            44   
416229  416229   223      2        18               5       3            44   

           RET_1  VOLUME_1     RET_2  ...    RET_16

Data Preprocessing

In [None]:
print(X_train.columns)

# Names of the engineered ("conditional") features. No visible cell of this notebook
# defines `new_features`, so default to none instead of failing with NameError on
# Restart & Run All; a list created by an earlier feature-engineering cell is kept.
if "new_features" not in globals():
    new_features = []

# Number of RET_k / VOLUME_k shifts to keep (the data has 20 of each);
# using fewer than all of them reduces noise.
n_shifts = 5
features = [f"RET_{shift}" for shift in range(1, n_shifts + 1)]
features += [f"VOLUME_{shift}" for shift in range(1, n_shifts + 1)]
features += new_features  # The conditional features
train[features].head()

Index(['ID', 'DATE', 'STOCK', 'INDUSTRY', 'INDUSTRY_GROUP', 'SECTOR',
       'SUB_INDUSTRY', 'RET_1', 'VOLUME_1', 'RET_2', 'VOLUME_2', 'RET_3',
       'VOLUME_3', 'RET_4', 'VOLUME_4', 'RET_5', 'VOLUME_5', 'RET_6',
       'VOLUME_6', 'RET_7', 'VOLUME_7', 'RET_8', 'VOLUME_8', 'RET_9',
       'VOLUME_9', 'RET_10', 'VOLUME_10', 'RET_11', 'VOLUME_11', 'RET_12',
       'VOLUME_12', 'RET_13', 'VOLUME_13', 'RET_14', 'VOLUME_14', 'RET_15',
       'VOLUME_15', 'RET_16', 'VOLUME_16', 'RET_17', 'VOLUME_17', 'RET_18',
       'VOLUME_18', 'RET_19', 'VOLUME_19', 'RET_20', 'VOLUME_20'],
      dtype='object')


Baseline Model

In [None]:
# Model inputs: the selected feature columns and the 'RET' target.
X_train = train[features]
y_train = train["RET"]

# Quite a large number of shallow trees, to limit overfitting.
rf_params = {
    'n_estimators': 500,
    'max_depth': 2**3,
    'random_state': 0,
    'n_jobs': -1
}

train_dates = train['DATE'].unique()
test_dates = X_test['DATE'].unique()  # was `x_test`, an undefined name (NameError)

n_splits = 4
scores = []
models = []

# Split over the unique dates rather than over rows, so that all rows sharing a
# DATE fall on the same side of every train/validation split.
splits = KFold(n_splits=n_splits, random_state=0,
               shuffle=True).split(train_dates)

for i, (local_train_dates_ids, local_test_dates_ids) in enumerate(splits):
    local_train_dates = train_dates[local_train_dates_ids]
    local_test_dates = train_dates[local_test_dates_ids]

    # Boolean row masks selecting the rows that belong to each side of the split.
    local_train_ids = train['DATE'].isin(local_train_dates)
    local_test_ids = train['DATE'].isin(local_test_dates)

    X_local_train = X_train.loc[local_train_ids]
    y_local_train = y_train.loc[local_train_ids]
    X_local_test = X_train.loc[local_test_ids]
    y_local_test = y_train.loc[local_test_ids]

    # Missing values are replaced by 0 before fitting / predicting.
    X_local_train = X_local_train.fillna(0)
    X_local_test = X_local_test.fillna(0)

    model = RandomForestClassifier(**rf_params)
    model.fit(X_local_train, y_local_train)

    # Second column of predict_proba, i.e. the probability of the second class in
    # model.classes_ (assumes RET is a binary target whose positive class is last).
    y_local_pred = model.predict_proba(X_local_test)[:, 1]

    # Keep the validation rows of `train` and attach their predicted probability.
    sub = train.loc[local_test_ids].copy()
    sub['pred'] = y_local_pred
    # Within each DATE, predict True for rows whose probability is above that
    # date's median probability.
    y_local_pred = sub.groupby('DATE')['pred'].transform(lambda x: x > x.median()).values

    models.append(model)
    score = accuracy_score(y_local_test, y_local_pred)
    scores.append(score)
    print(f"Fold {i+1} - Accuracy: {score* 100:.2f}%")

# Mean accuracy over the folds (in %), with a +/- one standard deviation band.
mean_acc = np.mean(scores)*100
std_acc = np.std(scores)*100
upper = (mean_acc + std_acc)
lower = (mean_acc - std_acc)
print(f'Accuracy: {mean_acc:.2f}% [{lower:.2f} ; {upper:.2f}] (+- {std_acc:.2f})')

In [None]:
# Refit on the full training set with the same hyper-parameters (fixed seed included)
# that were cross-validated, instead of an unseeded default forest.
model = RandomForestClassifier(**rf_params)
model.fit(X_train.fillna(0), y_train)

# Predict on exactly the feature columns the model was fitted on: X_test holds every
# column of the CSV, and scikit-learn rejects inputs whose features differ from fit.
X_test_features = X_test[features].fillna(0)
y_pred = model.predict_proba(X_test_features)[:, 1]

# Within each DATE, predict True for rows whose probability is above that date's median.
sub = X_test.copy()
sub['pred'] = y_pred
y_pred = sub.groupby('DATE')['pred'].transform(
    lambda x: x > x.median()).values

# Index the submission by the 'ID' column when X_test has one, so the file follows the
# same ID/RET layout as y_train.csv; otherwise fall back to the frame's own index.
submission_index = X_test["ID"] if "ID" in X_test.columns else X_test.index
submission = pd.Series(y_pred, index=pd.Index(submission_index), name="RET")

submission.to_csv('./benchmark_qrt.csv', index=True, header=True)