In [2]:
import gc
import time
import pandas as pd
import numpy as np
from datetime import datetime

In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [12]:
def feature_engineering(id_list, id_idx, ctgr_cols, nmrc_cols, df):
    feature = {}
    for id in id_list:
        feature[id] = {}
    
    columns = df.columns.tolist()
    ctgr_cols_idx = [columns.index(col) for col in ctgr_cols]
    nmrc_cols_idx = [columns.index(col) for col in nmrc_cols]
    
    s = time.time()
    num = 0
    
    for i in range(df.shape[0]):
        va = df.loc[i].values
        idx = va[id_idx]
        for ctgr_idx in ctgr_cols_idx:
            for nmrc_idx in nmrc_cols_idx:
                col_name = '&'.join([columns[ctgr_idx], va[ctgr_idx],columns[nmrc_idx]])
                features[idx][col_name] = features[idx].get(col_name, 0) + va[nmrc_idx]
        num += 1
        if num%1000000==0:
            print(time.time()-s, 's')
    
    return feature
    

In [5]:
numeric_cols = ['purchase_amount', 'installments']

category_cols = ['authorized_flag', 'city_id', 'category_1',
       'category_3', 'merchant_category_id','month_lag','most_recent_sales_range',
                 'most_recent_purchases_range', 'category_4',
                 'purchase_month', 'purchase_hour_section', 'purchase_day']

id_cols = ['card_id', 'merchant_id']

train_id_list = train['card_id'].tolist()
test_id_list = test['card_id'].tolist()

In [6]:
trans = pd.read_csv('preprocess/transactions_d_pre.csv')

In [14]:
trans[category_cols].astype(str)

MemoryError: 

In [13]:
feature = feature_engineering(train_id_list, 1, category_cols, numeric_cols, trans)

TypeError: sequence item 1: expected str instance, numpy.int64 found

In [4]:
train_dict = pd.read_csv("train_dict.csv")
test_dict = pd.read_csv("test_dict.csv")
train_groupby = pd.read_csv("preprocess/train_groupby.csv")
test_groupby = pd.read_csv("preprocess/test_groupby.csv")

In [7]:
for col in train_dict.columns:
    if col in train_groupby.columns and col != 'card_id':
        del train_groupby[col]
for col in test_dict.columns:
    if col in test_groupby.columns and col != 'card_id':
        del test_groupby[col]

In [8]:
train = pd.merge(train_dict, train_groupby, how='left', on= 'card_id').fillna(0)
test = pd.merge(test_dict, test_groupby, how='left', on= 'card_id').fillna(0)

In [9]:
train.to_csv('preprocess/train_pred.csv')
test.to_csv('preprocess/test_pred.csv')

In [10]:
train.shape

(201917, 1742)

In [11]:
def get_correlation(X_test, Y_test, models):
    n_estimators = len(models.estimators_)
    prediction = np.zeros((Y_test.shape[0], n_estimators))
    predictions = pd.DataFrame({'Estimator' + str(n+1):[] for n in range(n_estimators)})
    
    for key, model in zip(predictions, models.estimators_):
        predictions[key] = model.predict(X_test)
    
    corr = predictions.corr()
    print('Avg correlations between predictors:', corr.mean().mean()-1/n_estimators)
    
    return corr

In [22]:
feature = train.columns.tolist()
feature.remove('card_id')
feature.remove( 'target')
feature_select = feature[:]

In [23]:
corr = []
for fea in feature_select:
    corr.append(abs(train[[fea, 'target']].corr().values[0][1]))

In [24]:
se = pd.Series(corr, index=feature_select).sort_values(ascending = False)

In [25]:
feature_select = se[:300].index.tolist()

In [26]:
train_head = train[['card_id'] + feature_select + ['target']]
test_head = test[['card_id'] + feature_select]

In [28]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [30]:
features = train_head.columns.tolist()
features.remove('card_id')
features.remove( 'target')

parameter_space = {
    "n_estimators": [79, 80, 81], 
    "min_samples_leaf": [29, 30, 31],
    "min_samples_split": [2, 3],
    "max_depth": [9, 10],
    "max_features": ["auto", 80]
}


In [29]:
rf = RandomForestRegressor(
    criterion='mse'
    , n_jobs = 15
    , random_state = 99)

In [33]:
gsCV = GridSearchCV(rf, parameter_space, cv = 2, scoring='neg_mean_squared_error')
gsCV.fit(train_head[features].values, train_head['target'].values)

GridSearchCV(cv=2, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=15,
           oob_score=False, random_state=99, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [79, 80, 81], 'min_samples_leaf': [29, 30, 31], 'min_samples_split': [2, 3], 'max_depth': [9, 10], 'max_features': ['auto', 80]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)