# Import / Preprocessing

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn import preprocessing

In [2]:
# Read in the datas
training = pd.read_csv("./train.csv")
evaluation = pd.read_csv("./test.csv")

In [3]:
# There are 116 categorical columns of the form: cat1-cat116
catCols = ["cat" + str(x) for x in range(1, 117)]

# There are 14 continuous columns of the form cont1-cont14
contCols = ["cont" + str(x) for x in range(1,15)]

In [4]:
# Create label encoder model dictionary and populate dict with label encoding models
leDict = {}
for catCol in catCols:
    leDict[catCol] = preprocessing.LabelEncoder().fit(pd.concat([training[catCol], evaluation[catCol]]))

In [5]:
# Apply encoding models to categorical variables
training[catCols] = training[catCols].apply(lambda x: leDict[x.name].transform(x))
evaluation[catCols] = evaluation[catCols].apply(lambda x: leDict[x.name].transform(x))

In [6]:
def splitDF(df):
    # create and random permutation of indices
    indices = np.random.permutation(df.shape[0])

    # split permutation in two
    numInd = len(indices)
    trainingInds = indices[:math.ceil(numInd/2)]

    # split df using permutations
    inds = df.index.isin(trainingInds)
    training = df[inds]
    testing = df[~inds]

    # double check shape
    print(testing.shape)
    print(training.shape)
    
    return (testing, training)

In [7]:
# Make a training and testing set
train, test = splitDF(training.drop("id", axis=1))

(94159, 131)
(94159, 131)


# All the Boosting

In [8]:
import sklearn.ensemble as em

models = [em.AdaBoostRegressor(), em.BaggingRegressor(), em.ExtraTreesRegressor(), em.GradientBoostingRegressor(), em.RandomForestRegressor()]

In [9]:
from joblib import Parallel, delayed
import multiprocessing
import sklearn.ensemble as em

models = [em.AdaBoostRegressor(), em.BaggingRegressor(), em.ExtraTreesRegressor(), em.GradientBoostingRegressor(), em.RandomForestRegressor()]
num_cores = multiprocessing.cpu_count()

# prep params
X = train.drop(["loss"], axis=1)
y = train["loss"]
X_t = test.drop(["loss"], axis=1)
y_t = test["loss"] 

# Fit all of the models
fittedModels = Parallel(n_jobs=num_cores)(delayed(model.fit)(X, y) for model in models)

  **self._backend_args)


In [10]:
# Apply models to training data:
y_hats_train = [m.predict(X) for m in fittedModels]


# Apply models to testing data
y_hats_test = [m.predict(X_t) for m in fittedModels]

In [21]:
# Make Training Ensemble DF from model predictions:
ensembleTrain = pd.DataFrame(y_hats_train).transpose()
ensembleTrain["y"] = y.reset_index().drop("index", axis=1)

# Make Testing Ensemble DF from model predictions:
ensembleTest = pd.DataFrame(y_hats_test).transpose()

In [22]:
# Fit model to training Ensemble DF
ensMod = em.RandomForestRegressor()
ensMod.fit(ensembleTrain.drop("y", axis=1), ensembleTrain["y"])


# Apply ensemble model to testing ensemble DF
y_hat_final = ensMod.predict(ensembleTest)

In [23]:
def score(y, y_hat):
    return abs(y - y_hat).mean()

In [25]:
score(y_hat_final, y_t)

1330.3040535689634

## Thoughts
Sooo, my ensembling score is only 1330, which I think is worse than the gradient boosting score (1200ish)
- Does this mean my ensembling method is not working?
- Could my result mean that ensembling makes my code less liable to overfitting?
- **Submission w/ all of this ensembling gave me a score of 1302, which is not an improvement on my best (RF) of 1242**

# Ensembling All Training Data

In [26]:
from joblib import Parallel, delayed
import multiprocessing
import sklearn.ensemble as em

models = [em.AdaBoostRegressor(), em.BaggingRegressor(), em.ExtraTreesRegressor(), em.GradientBoostingRegressor(), em.RandomForestRegressor()]
num_cores = multiprocessing.cpu_count()

allData = pd.concat([train, test])

# prep params
X = allData.drop(["loss"], axis=1)
y = allData["loss"]

# Fit all of the models
fittedModels = Parallel(n_jobs=num_cores)(delayed(model.fit)(X, y) for model in models)

  **self._backend_args)


In [27]:
# Apply models to training data:
y_hats_train = [m.predict(X) for m in fittedModels]

In [29]:
# Apply models to eval data
X_t = evaluationation.drop("id", axis=1)

y_hats_eval = [m.predict(X_t) for m in fittedModels]

In [28]:
# Make Training Ensemble DF from model predictions:
ensembleTrain = pd.DataFrame(y_hats_train).transpose()
ensembleTrain["y"] = y.reset_index().drop("index", axis=1)

In [30]:
# Make Testing Ensemble DF from model predictions:
ensembleEval = pd.DataFrame(y_hats_eval).transpose()

In [35]:
# Apply ensemble model to eval ensemble DF
y_hat_final = ensMod.predict(ensembleEval)

In [36]:
y_hat_final.shape

(125546,)

In [37]:
evaluation["loss"] = y_hat_final

In [38]:
evaluation[["id","loss"]].to_csv("./submission4.csv", index=False)