In [10]:
# Import modules

import pandas as pd
import numpy as np
import os
import sys
import random
import copy
import pickle
import time

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import colorlover as cl

src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

from sklearn.model_selection import ShuffleSplit
from scipy.stats import spearmanr
from scipy import stats
from sklearn import decomposition

import plotting_methods as pm

init_notebook_mode(connected=True)

%reload_ext autoreload
%autoreload 2

pd.options.display.float_format = '{:,.4f}'.format

In [2]:
raw_data_dir = os.path.join(os.getcwd(), os.pardir, 'data')

f = open(os.path.join(raw_data_dir, 'train.csv'), 'r')
raw_data = pd.read_csv(f)
f.close()

id_col = 'ID'
tar_col = 'target'

In [3]:
f = open(os.path.join(raw_data_dir, 'test.csv'), 'r')
raw_test_data = pd.read_csv(f)
f.close()

id_col = 'ID'
tar_col = 'target'

In [4]:
# Clean data

# Remove features with no variance
clean_data = raw_data.drop(columns = raw_data.columns[np.where(raw_data.std() == 0.0)[0]])
feat_names = [x for x in clean_data.columns if x not in [id_col, tar_col]]

In [5]:
# Transform data

trans_data = (clean_data.loc[:, feat_names + [tar_col]] + 1).apply(np.log)
trans_test_data = (raw_test_data.loc[:, feat_names + [tar_col]] + 1).apply(np.log)




Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike



Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike



In [6]:
# Split data into training and validation

train_prop = 0.8
valid_prop = 0.2
split_seed = 4

ss = ShuffleSplit(n_splits=1, train_size=train_prop, random_state=split_seed)
split_inds = [(train_index, valid_index) for train_index, valid_index in ss.split(trans_data)]

train_inds = split_inds[0][0]
valid_inds = split_inds[0][1]

train_data = trans_data.loc[train_inds, :]
valid_data = trans_data.loc[valid_inds,:]


From version 0.21, test_size will always complement train_size unless both are specified.



In [7]:
# Dimension Reduction

sparse_train_data = train_data.loc[:, feat_names].copy().to_sparse()
sparse_valid_data = valid_data.loc[:, feat_names].copy().to_sparse()

pca = decomposition.PCA(svd_solver = 'randomized')
res = pca.fit_transform(sparse_train_data)

ex_var = pca.explained_variance_ratio_.cumsum()
num_comp_keep = np.where(np.array(ex_var <= 0.95))[0][-1]

pca_train_data = pd.DataFrame(res).iloc[:, range(num_comp_keep)]

valid_res = pca.transform(sparse_valid_data)
pca_valid_data = pd.DataFrame(valid_res).iloc[:, range(num_comp_keep)]
print pca_valid_data.shape

(446, 1121)


In [8]:
f = open('../data/train_data.p', 'w')
pickle.dump(train_data, f)
f.close()

In [9]:
f = open('../data/proc_train_data.p', 'w')
pickle.dump(pca_train_data.to_sparse(), f)
f.close()

In [27]:
# Train model and perform gridsearch on hyperparameters

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

max_features = ['auto', 'sqrt']

max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

min_samples_split = [2, 5, 10]

min_samples_leaf = [1, 2, 4]

bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'min_samples_leaf': min_samples_leaf}

rf = RandomForestRegressor(n_estimators = 200, max_features = 'auto', min_samples_leaf = 1)

df_train = pca_train_data.iloc[range(pca_train_data.shape[0] * 2 / 3),:]
y = train_data[tar_col].iloc[range(pca_train_data.shape[0] * 2 / 3)]

df_test = pca_train_data.iloc[range((pca_train_data.shape[0] * 2 / 3),  pca_train_data.shape[0]),:]

t0 = time.time()
rf.fit(df_train, y)
t1 = time.time()
print t1 - t0

t0 = time.time()
pred = rf.predict(df_test)
t1 = time.time()
print t1 - t0

'''
rf = RandomForestRegressor()
#rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
#                               n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
grid_search = GridSearchCV(estimator = rf, param_grid = random_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(pca_train_data, train_data[tar_col])
'''

345.569790125
0.0981910228729


'\nrf = RandomForestRegressor()\n#rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, \n#                               n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)\ngrid_search = GridSearchCV(estimator = rf, param_grid = random_grid, \n                          cv = 3, n_jobs = -1, verbose = 2)\n\ngrid_search.fit(pca_train_data, train_data[tar_col])\n'

In [17]:
pca_train_data.shape[0]

3567

In [22]:
n = np.random.random(pca_train_data.shape)

In [23]:
n.shape

(3567, 1121)