In [1]:
# Import modules

import pandas as pd
import numpy as np
import os
import sys
import random
import copy
import pickle
import time

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import colorlover as cl

src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

from sklearn.model_selection import ShuffleSplit
from scipy.stats import spearmanr
from scipy import stats
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler

import plotting_methods as pm

init_notebook_mode(connected=True)

%reload_ext autoreload
%autoreload 2

pd.options.display.float_format = '{:,.4f}'.format

In [2]:
raw_data_dir = os.path.join(os.getcwd(), os.pardir, 'data')

f = open(os.path.join(raw_data_dir, 'train.csv'), 'r')
raw_data = pd.read_csv(f)
f.close()

id_col = 'ID'
tar_col = 'target'

In [3]:
f = open(os.path.join(raw_data_dir, 'test.csv'), 'r')
raw_test_data = pd.read_csv(f)
f.close()

id_col = 'ID'
tar_col = 'target'

In [4]:
# Clean data

# Remove features with no variance
clean_data = raw_data.drop(columns = raw_data.columns[np.where(raw_data.std() == 0.0)[0]])
feat_names = [x for x in clean_data.columns if x not in [id_col, tar_col]]

In [5]:
# Transform data

trans_data = (clean_data.loc[:, feat_names + [tar_col]] + 1).apply(np.log)
trans_test_data = (raw_test_data.loc[:, feat_names + [tar_col]] + 1).apply(np.log)




Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike



In [6]:
# Split data into training and validation

train_prop = 0.8
valid_prop = 0.2
split_seed = 4

ss = ShuffleSplit(n_splits=1, train_size=train_prop, random_state=split_seed)
split_inds = [(train_index, valid_index) for train_index, valid_index in ss.split(trans_data)]

train_inds = split_inds[0][0]
valid_inds = split_inds[0][1]

train_data = trans_data.loc[train_inds, :]
valid_data = trans_data.loc[valid_inds,:]


From version 0.21, test_size will always complement train_size unless both are specified.



In [9]:
sparse_train_data = train_data.loc[:, feat_names].copy().to_sparse()
sparse_valid_data = valid_data.loc[:, feat_names].copy().to_sparse()

In [13]:
# Scale data

sc = StandardScaler(with_mean = False)
scale_train_data = pd.DataFrame(sc.fit_transform(sparse_train_data), columns = feat_names).to_sparse()
scale_valid_data = pd.DataFrame(sc.transform(sparse_train_data), columns = feat_names).to_sparse()


In [14]:
# Dimension Reduction

pca = decomposition.PCA(svd_solver = 'randomized')
res = pca.fit_transform(scale_train_data)

ex_var = pca.explained_variance_ratio_.cumsum()
num_comp_keep = np.where(np.array(ex_var <= 0.95))[0][-1]

pca_train_data = pd.DataFrame(res).iloc[:, range(num_comp_keep)]

valid_res = pca.transform(scale_valid_data)
pca_valid_data = pd.DataFrame(valid_res).iloc[:, range(num_comp_keep)]
print pca_valid_data.shape

(3567, 1513)


In [20]:
f = open('../data/train_data.p', 'w')
pickle.dump(train_data, f)
f.close()

In [16]:
f = open('../data/proc_train_data.p', 'w')
pickle.dump(pca_train_data.to_sparse(), f)
f.close()

Random Forest Regressor Results

In [26]:
f.close()

In [4]:
f = open('../data/grid_search_res2.p', 'r')
rf_grids_res = pickle.load(f)
f.close()

In [7]:
rf_grids_res.cv_results_


You are accessing a training score ('mean_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('split0_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('split1_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('split2_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True


You are accessing a training score ('std_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True



{'mean_fit_time': array([ 999.03866657,  932.82300011,  607.35933344,  308.84100008,
          56.9236668 ,   56.11433339,   40.43600011,   34.2249999 ]),
 'mean_score_time': array([ 7.2626667 ,  7.92533334,  6.9509999 ,  2.53966657,  6.56799992,
         5.88499999,  5.08100001,  3.45466669]),
 'mean_test_score': array([ 0.13760091,  0.13561375,  0.10542763,  0.07895887,  0.07022559,
         0.06792146,  0.04070787,  0.02704332]),
 'mean_train_score': array([ 0.82494748,  0.71402358,  0.31977319,  0.19624365,  0.73871847,
         0.56558665,  0.19975556,  0.10975648]),
 'param_max_features': masked_array(data = ['auto' 'auto' 'auto' 'auto' 'sqrt' 'sqrt' 'sqrt' 'sqrt'],
              mask = [False False False False False False False False],
        fill_value = ?),
 'param_min_samples_leaf': masked_array(data = [5 10 50 100 5 10 50 100],
              mask = [False False False False False False False False],
        fill_value = ?),
 'params': [{'max_features': 'auto', 'min_samples_l