# Import / Preprocessing

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn import preprocessing

In [2]:
# Load the training and evaluation CSV files from the working directory
training, evaluation = (
    pd.read_csv(csvPath) for csvPath in ("./train.csv", "./test.csv")
)

In [3]:
# Categorical feature columns are named cat1 .. cat116 (116 in total)
catCols = ["cat{}".format(i) for i in range(1, 117)]

# Continuous feature columns are named cont1 .. cont14 (14 in total)
contCols = ["cont{}".format(i) for i in range(1, 15)]

In [4]:
# Map every categorical column name to a LabelEncoder fitted on the values of
# that column in BOTH files, so a category that appears in only one of them
# still receives a code.
leDict = {
    colName: preprocessing.LabelEncoder().fit(
        pd.concat([training[colName], evaluation[colName]])
    )
    for colName in catCols
}

In [5]:
# Replace the categorical columns of both frames with their integer codes,
# using the encoder stored in leDict under each column's name.
# NOTE: run once per kernel -- a second run would feed the already-encoded
# integers to encoders that were fitted on the raw labels.
for frame in (training, evaluation):
    frame[catCols] = frame[catCols].apply(
        lambda col: leDict[col.name].transform(col)
    )

In [6]:
def splitDF(df, random_state=None):
    """Randomly split ``df`` row-wise into two halves.

    Rows are assigned by *position*, so the split is correct whatever the
    index of ``df`` looks like (non-default or non-unique labels included).
    Both halves keep the row order of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    random_state : int or None, optional
        Seed for a private ``numpy.random.RandomState`` so the split can be
        reproduced. With ``None`` (the default) the global NumPy random state
        is used.

    Returns
    -------
    (testing, training) : tuple of pandas.DataFrame
        Mind the order: the testing half comes FIRST. ``training`` holds
        ``ceil(n / 2)`` rows and ``testing`` the remaining ``floor(n / 2)``.
    """
    numRows = df.shape[0]

    # create a random permutation of the row positions
    if random_state is None:
        indices = np.random.permutation(numRows)
    else:
        indices = np.random.RandomState(random_state).permutation(numRows)

    # the first half of the permutation (rounded up) goes to training
    numTraining = (numRows + 1) // 2

    # positional boolean mask: True for rows drawn into the training half.
    # (Comparing index labels against positions only works for a default
    # RangeIndex, hence the mask.)
    inTraining = np.zeros(numRows, dtype=bool)
    inTraining[indices[:numTraining]] = True

    training = df[inTraining]
    testing = df[~inTraining]

    # double check shape
    print(testing.shape)
    print(training.shape)

    return (testing, training)

In [7]:
# Make a training and testing set.
# splitDF returns (testing, training) -- in that order -- so unpack accordingly.
test, train = splitDF(training.drop("id", axis=1))

(94159, 131)
(94159, 131)


In [17]:
from joblib import Parallel, delayed
import multiprocessing
import sklearn.ensemble as em

# Candidate ensemble regressors, each built with its default hyper-parameters
models = [
    Regressor()
    for Regressor in (
        em.AdaBoostRegressor,
        em.BaggingRegressor,
        em.ExtraTreesRegressor,
        em.GradientBoostingRegressor,
        em.RandomForestRegressor,
    )
]
num_cores = multiprocessing.cpu_count()

# Stack the two halves back into a single frame
allData = pd.concat([train, test])

# Target vector and feature matrix (every column except the target)
y = allData["loss"]
X = allData.drop(["loss"], axis=1)

# Fitting all candidates in parallel is currently disabled:
# fittedModels = Parallel(n_jobs=num_cores)(delayed(model.fit)(X, y) for model in models)

## Gradient boost with PCA

In [18]:
from sklearn.decomposition import PCA

In [23]:
# Feature matrix for the evaluation set. "loss" is dropped too (when present):
# the submission cell stores the predictions in evaluation["loss"], so without
# this a re-run of this cell would leak that column into the features.
X_t = evaluation.drop(["id", "loss"], axis=1, errors="ignore")

In [41]:
# Learn a 65-component PCA projection from the training features X, then apply
# it to both the training and the evaluation feature matrices.
# NOTE(review): the model cells that follow fit and predict on X / X_t, not on
# these projections.
pca = PCA(n_components=65, svd_solver='full').fit(X)
X_PCA, X_t_PCA = (pca.transform(features) for features in (X, X_t))
print(X_PCA.shape, X_t_PCA.shape, sep="\n")

(188318, 65)
(125546, 65)


In [45]:
# Gradient boosting with 5000 boosting stages (all other hyper-parameters at
# their defaults), trained on the label-encoded feature matrix X.
m = em.GradientBoostingRegressor(n_estimators=5000)
m.fit(X=X, y=y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=5000,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

In [46]:
# Predicted loss for every row of the evaluation feature matrix
y_hat = m.predict(X=X_t)

In [47]:
# Store the predictions alongside their ids (adds a "loss" column to
# `evaluation` in place) and write the two-column submission file.
evaluation["loss"] = y_hat
evaluation.loc[:, ["id", "loss"]].to_csv("./submission11.csv", index=False)

In [27]:
# NOTE(review): stale output -- this cell's execution count (In [27]) predates
# the PCA cell (In [41], n_components=65), so the 129-column shape recorded
# below does not match the (125546, 65) that cell prints. Re-run to refresh.
X_t_PCA.shape

(125546, 129)

In [29]:
# Shape of the evaluation feature matrix: (rows, feature columns)
X_t.shape

(125546, 130)

## TODOs
- Run XGBoost
- Read about effective dimensionality reduction
- Read about feature scaling for boosting algorithms