In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GroupKFold, cross_val_predict
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

In [2]:
# Read the four input tables from the working directory, in this order:
# hashed features, tabular features, train ids/labels, test ids.
csv_paths = ('hashed_feature.csv', 'tabular_data.csv', 'train.csv', 'test.csv')
hashed_feature_df, tabular_data_df, train_df, test_df = map(pd.read_csv, csv_paths)

In [3]:
# Replace feature_25 with integer codes produced by LabelEncoder.
encoder = LabelEncoder()
tabular_data_df['feature_25'] = encoder.fit_transform(tabular_data_df['feature_25'])

# Everything that is not categorical and not a key column is treated as numerical.
categorical_cols = ['feature_0', 'feature_25', 'feature_43']
excluded_cols = set(categorical_cols) | {'id', 'period'}
numerical_cols = [col for col in tabular_data_df.columns if col not in excluded_cols]

# fill n/a with new categories:
# NOTE(review): 115.0 and 32.0 are presumably values that do not occur in the
# respective columns - confirm they do not collide with existing categories.
tabular_data_df['feature_0'] = tabular_data_df['feature_0'].fillna(115.0)
tabular_data_df['feature_43'] = tabular_data_df['feature_43'].fillna(32.0)

# Median imputation for the numerical columns. The medians are computed on the
# numerical columns only (not on the whole frame), so id/period/categorical
# columns are never aggregated, and numeric_only=True keeps a stray non-numeric
# column from raising on recent pandas versions.
numerical_medians = tabular_data_df[numerical_cols].median(numeric_only=True)
tabular_data_df[numerical_cols] = tabular_data_df[numerical_cols].fillna(numerical_medians)

In [4]:
# Encode hashed features as a wide table: one row per id, one column per
# feature_50 value. aggfunc=len stores how many rows fall into each
# (id, feature_50) cell, and combinations that never occur are set to 0.
hashed_feature_ohe = pd.pivot_table(
    hashed_feature_df,
    index='id',
    columns='feature_50',
    aggfunc=len,
    fill_value=0,
)

# Extra feature: row-wise total over all pivoted columns,
# i.e. the number of hashed features recorded for the id.
row_totals = hashed_feature_ohe.sum(axis=1)
hashed_feature_ohe['hashed_sum'] = row_totals

In [5]:
# Keep only rows with period <= 3.
# NOTE(review): the original note calls these the "last 3 periods" - confirm
# that low period numbers are the most recent ones.
recent_mask = tabular_data_df['period'] <= 3
tabular_data_df = tabular_data_df[recent_mask]

# Attach the hashed-feature columns by id (left join keeps every tabular row),
# then replace every remaining NaN in the combined frame with 0.
all_features = tabular_data_df.join(hashed_feature_ohe, on='id', how='left').fillna(0)

# Right joins keep every row of train_df / test_df and pull in the matching
# feature rows for its id.
train_labels = train_df.set_index('id', drop=True)
test_ids = test_df.set_index('id', drop=True)
train = all_features.join(train_labels, on='id', how='right')
test = all_features.join(test_ids, on='id', how='right').drop(columns='score')

# Model inputs: the id is a grouping key, not a feature.
train_y = train['target']
train_X = train.drop(columns=['id', 'target'])
test_X = test.drop(columns=['id'])

train_X.shape, train_y.shape, test_X.shape

((12252, 5061), (12252,), (3066, 5061))

In [8]:
# Random forest with class re-weighting for the imbalanced target.
# random_state pins the bootstrap samples and feature sub-sampling so that
# cross-validation scores and final predictions are reproducible after a
# kernel restart (the result does not depend on n_jobs).
model = RandomForestClassifier(class_weight='balanced', n_jobs=-1, verbose=1, random_state=42)

In [9]:
def cross_validation_score(model, X, y, scorer, groups, cv):
    """Cross-validate ``model`` and score it on per-id averaged predictions.

    An id can occur in several rows (one per period). For every fold the model
    is fitted on the training rows, probabilities are predicted for the
    held-out rows, and both the predictions and the true labels are averaged
    over the rows of each id before ``scorer`` is applied.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
        It is refitted in place on every fold.
    X : pandas.DataFrame
        Feature rows.
    y : pandas.Series
        Row labels, matched to ``X`` by position.
    scorer : callable
        Called as ``scorer(true_per_id, pred_per_id)`` with two Series indexed
        by id; must return a number.
    groups : pandas.Series
        Id of every row, matched to ``X`` by position.
    cv : object exposing ``split(X, y, groups)`` yielding positional
        ``(train_idx, test_idx)`` pairs.

    Returns
    -------
    numpy.ndarray
        One score per fold.
    """
    scores = []
    for train_idx, test_idx in cv.split(X, y, groups):
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        # Column 1 of predict_proba is the probability of the second class.
        fold_pred = model.predict_proba(X.iloc[test_idx])[:, 1]

        # Build the fold table from plain arrays so ids, predictions and labels
        # are matched by row position. Assigning a Series instead would align
        # on index labels and silently produce NaN (or raise on duplicate
        # labels) whenever ``y`` and ``groups`` do not share the same index.
        per_row = pd.DataFrame({
            'id': np.asarray(groups.iloc[test_idx]),
            'pred': fold_pred,
            'true': np.asarray(y.iloc[test_idx]),
        })
        per_id = per_row.groupby('id')[['true', 'pred']].mean()

        scores.append(scorer(per_id['true'], per_id['pred']))
    return np.asarray(scores)
        

In [11]:
# 5-fold cross-validation with 1000 trees; GroupKFold splits by id, so every
# row of a given id lands in the same fold.
model.set_params(n_estimators=1000)
cv = GroupKFold(n_splits=5)
score1 = cross_validation_score(
    model=model,
    X=train_X,
    y=train_y,
    scorer=roc_auc_score,
    groups=train['id'],
    cv=cv,
)
score1, score1.mean(), score1.std()
# recorded mean ROC AUC: 0.7419544399284111

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   22.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.

(array([0.74453428, 0.7144812 , 0.74778754, 0.77514976, 0.72781941]),
 0.7419544399284111,
 0.020480175136882398)

In [42]:
# Refit on the complete training set with 10000 trees; this fitted model is
# the one used for the test predictions.
model.set_params(n_estimators=10000).fit(train_X, train_y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   30.1s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:   48.0s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 10000 out of 

RandomForestClassifier(class_weight='balanced', n_estimators=10000, n_jobs=-1,
                       verbose=1)

In [43]:
# Row-level probabilities for the test set; column 1 belongs to the second
# entry of model.classes_.
test_proba = model.predict_proba(test_X)
probs = test_proba[:, 1]

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:    2.1s
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed:    2.5s
[Parallel(n_jobs=8)]: Done 6034 tasks      | elapsed:    2.9s
[Parallel(n_jobs=8)]: Done 7184 tasks      | elapsed:    3.4s
[Parallel(n_jobs=8)]: Done 8434 tasks      | elapsed:    3.8s
[Parallel(n_jobs=8)]: Done 9784 tasks      | elapsed:    4.4s
[Parallel(n_jobs=8)]: Done 10000 out of 10000 | elapsed:

In [63]:
# Pair every test row's id with its predicted probability (matched by position).
result = test[['id']].assign(score=probs)

# Final score per id = mean over all of its rows; written to result.csv.
result_mean = result.groupby('id')['score'].mean()
result_mean.to_csv('result.csv')

result_mean

id
4084    0.076933
4085    0.057633
4086    0.076333
4087    0.384833
4088    0.457833
          ...   
5101    0.197133
5102    0.452833
5103    0.209433
5104    0.098433
5105    0.030833
Name: score, Length: 1022, dtype: float64