In [20]:
# Import modules

import pandas as pd
import numpy as np
import os
import sys
import random
import copy
import pickle
import time

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import colorlover as cl

src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

from sklearn.model_selection import ShuffleSplit
from scipy.stats import spearmanr
from scipy import stats
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

import plotting_methods as pm

init_notebook_mode(connected=True)

%reload_ext autoreload
%autoreload 2

pd.options.display.float_format = '{:,.4f}'.format

In [6]:
raw_data_dir = os.path.join(os.getcwd(), os.pardir, 'data')

f = open(os.path.join(raw_data_dir, 'train.csv'), 'r')
raw_data = pd.read_csv(f)
f.close()

id_col = 'ID'
tar_col = 'target'

In [7]:
f = open(os.path.join(raw_data_dir, 'test.csv'), 'r')
raw_test_data = pd.read_csv(f)
f.close()

id_col = 'ID'
tar_col = 'target'

In [8]:
# Clean data

# Remove features with no variance
clean_data = raw_data.drop(columns = raw_data.columns[np.where(raw_data.std() == 0.0)[0]])
feat_names = [x for x in clean_data.columns if x not in [id_col, tar_col]]

In [9]:
# Transform data

trans_data = (clean_data.loc[:, feat_names + [tar_col]] + 1).apply(np.log)
trans_test_data = (raw_test_data.loc[:, feat_names + [tar_col]] + 1).apply(np.log)




Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike



Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike



In [10]:
# Split data into training and validation

train_prop = 0.8
valid_prop = 0.2
split_seed = 4

ss = ShuffleSplit(n_splits=1, train_size=train_prop, random_state=split_seed)
split_inds = [(train_index, valid_index) for train_index, valid_index in ss.split(trans_data)]

train_inds = split_inds[0][0]
valid_inds = split_inds[0][1]

train_data = trans_data.loc[train_inds, :]
valid_data = trans_data.loc[valid_inds,:]


From version 0.21, test_size will always complement train_size unless both are specified.



In [11]:
sparse_train_data = train_data.loc[:, feat_names].copy().to_sparse()
sparse_valid_data = valid_data.loc[:, feat_names].copy().to_sparse()

In [32]:
# Scale data

sc = StandardScaler(with_mean = False)
scale_train_data = pd.DataFrame(sc.fit_transform(sparse_train_data), columns = feat_names).to_sparse()
scale_valid_data = pd.DataFrame(sc.transform(sparse_valid_data), columns = feat_names).to_sparse()


In [33]:
# Dimension Reduction

pca = decomposition.PCA(svd_solver = 'randomized')
res = pca.fit_transform(scale_train_data)

ex_var = pca.explained_variance_ratio_.cumsum()
num_comp_keep = np.where(np.array(ex_var <= 0.95))[0][-1]

pca_train_data = pd.DataFrame(res).iloc[:, range(num_comp_keep)]

valid_res = pca.transform(scale_valid_data)
pca_valid_data = pd.DataFrame(valid_res).iloc[:, range(num_comp_keep)]
print pca_valid_data.shape

(446, 1513)


In [14]:
f = open('../data/train_data.p', 'w')
pickle.dump(train_data, f)
f.close()

In [15]:
f = open('../data/proc_train_data.p', 'w')
pickle.dump(pca_train_data.to_sparse(), f)
f.close()

In [34]:
f = open('../data/valid_data.p', 'w')
pickle.dump(valid_data, f)
f.close()

In [35]:
f = open('../data/proc_valid_data.p', 'w')
pickle.dump(pca_valid_data.to_sparse(), f)
f.close()

Random Forest Regressor Results

In [26]:
f.close()

In [2]:
f = open('../data/grid_search_res2.p', 'r')
rf_grids_res = pickle.load(f)
f.close()

In [22]:
params = rf_grids_res.best_params_
params

{'max_features': 'auto', 'min_samples_leaf': 5}

In [36]:
f = open('../data/valid_res_rf.p', 'r')
valid_res_rf = pickle.load(f)
f.close()

In [40]:
from sklearn.metrics import mean_squared_error, mean_squared_log_error

np.sqrt(mean_squared_error(valid_data['target'], valid_res_rf['valid_res']))

1.6051672398048538