In [1]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LassoCV, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, mean_squared_error
from modelling_functions import *

## Processing for Models

In [2]:
#### Read Data files
housing, housing_features, feat_labels, dict_dictonary = read_and_clean(filepath = "../data/clean_train.csv")

  housing.col, id_dictonary = to_numeric(housing, col, 'SalePrice')


In [3]:
housing.head()

Unnamed: 0,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,60,2,65.0,8450,1,1,0,1,1,0,...,0,0,3,2,0,2,2008,1,2,12.247694
1,20,2,80.0,9600,1,1,0,1,1,0,...,0,0,3,2,0,5,2007,1,2,12.109011
2,60,2,68.0,11250,1,1,1,1,1,0,...,0,0,3,2,0,9,2008,1,2,12.317167
3,70,2,60.0,9550,1,1,1,1,1,0,...,0,0,3,2,0,2,2006,1,1,11.849398
4,60,2,84.0,14260,1,1,1,1,1,0,...,0,0,3,2,0,12,2008,1,2,12.429216


In [4]:
htest_id, htest_features, htest_labels, htest_dictonary = read_and_clean(filepath = "../data/clean_test.csv", test = True, dictonary = dict_dictonary)

  housing.col, id_dictonary = to_numeric_test(housing, col, dictonary[col])


In [5]:
housing_features.lotarea = np.sqrt(housing_features.lotarea)
htest_features.lotarea = np.sqrt(htest_features.lotarea)

In [7]:
htrain, htest, ptrain, ptest = train_test_split(housing_features, housing.saleprice, test_size = 0.33)

## RandomForestRegressor

In [None]:
#### CAUTION
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 100, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 200, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 20, 30, 40]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 8, 16, 32, 64]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=0, n_jobs = -1)
# Fit the random search model
rf_random.fit(htrain, ptrain)

In [None]:
rf_random.best_params_

In [None]:
clf = RandomForestRegressor(n_estimators=4555, 
                            random_state=9, 
                            n_jobs=-1, 
                            min_samples_split = 2, 
                            min_samples_leaf = 1, 
                            max_features = 'sqrt', 
                            max_depth = 162,
                            bootstrap = False)
#sfm = SelectFromModel(clf, threshold = 0.01)
#sfm.fit(htrain, ptrain)
# ## Not Finished
# # Measure Feature Importance
#feature_selected = []
#for feature_list_index in sfm.get_support(indices=True):
#     feature_selected.append(feat_labels[feature_list_index])
#proxy = feature_selected
#trimmed = ['x1stflrsf', 'x2ndflrsf', 'garagecars', 'overallcond', 'saleprice', 'Unnamed: 0', 'bsmtfinsf1']
#testing = testing[proxy]
#housing_features = housing_features[trimmed_features]
#print(feature_selected)

In [None]:
clf.fit(htrain, ptrain)

clf.score(htest, ptest)

In [None]:
nice = clf.predict(htest)
mean_squared_error(nice, ptest)

In [None]:
np.sqrt(mean_squared_error(nice, ptest))

In [None]:
pred = np.exp(clf.predict(htest_features))
Submission(htest_id, pred)

## Lasso

#### Feature Pass Through

In [8]:
alp = optimize_penalty(htrain, ptrain, model=Lasso, min_=1e-5, max_=1, step_=1000, random=False, riter=100)

In [9]:
alp.best_params_

{'alpha': 1e-05}

In [39]:
lasso = LinearRegression()
lasso.fit(htrain, ptrain)

print(lasso.score(htrain, ptrain))
print(lasso.score(htest, ptest))
pred = lasso.predict(htest)
rmsle(ptest.values,pred)
#print(selected_coefs)

0.8933945455717149
0.8903081938287074


0.010010373211258273

In [22]:
pred = np.exp(lasso.predict(htest_features))
Submission(htest_id, pred)

## Kaggle Scoring Metric

In [38]:
def rmsle(real, predicted):
    sum=0.0
    for x in range(len(predicted)):
        if predicted[x]<0 or real[x]<0: #check for negative values
            continue
        p = np.log(predicted[x]+1)
        r = np.log(real[x]+1)
        sum = sum + (p - r)**2
    return (sum/len(predicted))**0.5

## KFolds Cross Validation

In [None]:
from itertools import combinations
import random
combos = combinations(housing_features.columns,3)

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from itertools import combinations
kf = KFold(n_splits=10,shuffle=True,random_state=0)
y = housing.saleprice
for i in random.sample(combos,k=int(len(combos)/1000)):           
    X = housing_features.loc[:,i]
    alphas = []
    R2 = []
    MSE = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        lm = optimize_penalty(X_train, y_train, model=Lasso, min_=1e-5, max_=1, step_=10, random=False, riter=100)
        alphas.append(lm.best_params_['alpha'])
        R2.append(lm.score(X_test, y_test))        
        MSE.append(mean_squared_error(y_test,lm.predict(X_test)))
    print('feat: ', i,'\nR2: ',round(np.mean(R2),4),'MSE: ',round(np.mean(MSE),4))

## Step Forwards

In [52]:
range(len(housing_features.columns.values))
X = housing_features
y = housing.saleprice

In [155]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
def run_kfolds(feat,splits):
    kf = KFold(n_splits=splits,shuffle=True,random_state=0)
    X = housing_features.loc[:,feat]
    MSE, kgs = [], []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        lm = optimize_penalty(X_train, y_train, model=Lasso, min_=1e-5, max_=1, step_=10, random=False, riter=100)
        kgs.append(rmsle(y_test.values, lm.predict(X_test)))
    return lm

In [54]:
total_features = list(enumerate(housing_features.columns.values))
idx = [idx[0] for idx in total_features]
names = [idx[1] for idx in total_features]
feat = idx[:10]
run_kfolds(X.iloc[:,feat].columns.values, 5)

0.02592096308852797

In [64]:
mask = np.ones(len(idx),dtype=bool)


array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

In [107]:
kgs = []
for i in range(len(feat)):
    mask = np.ones(len(feat),dtype=bool)
    mask[i] = False
    kgs.append(run_kfolds(X.iloc[:,feat[mask]].columns.values, 5))
    print("{:.4E}".format(kgs[-1]))

1.9999E-02
2.0509E-02
2.0025E-02
1.9998E-02
2.0058E-02
2.0003E-02
2.0006E-02
2.0017E-02
2.0004E-02
2.7877E-02


In [170]:
import sklearn
sorted(sklearn.metrics.SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
X_train, X_test, y_train, y_test = train_test_split(housing_features, housing.saleprice, test_size = 0.33)
sfs1 = sfs(clf,k_features=5,forward=True,
           floating=False,verbose=2,
           scoring='r2',cv=5)

sfs1.fit(X_train.values,y_train)




[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  79 out of  79 | elapsed:  2.1min finished

[2019-07-24 22:13:18] Features: 1/5 -- score: 0.6635602931313859[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  78 out of  78 | elapsed:  2.1min finished

[2019-07-24 22:15:24] Features: 2/5 -- score: 0.7311148384809913[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s


In [150]:
total_features = list(enumerate(housing_features.columns.values))
X = housing_features
idx = [idx[0] for idx in total_features]
names = [idx[1] for idx in total_features]
feat = np.asarray(idx[:10])
splits = 2
best = 1
j = 0
while (j+10) < 79:
    print('Iter: ',j,'\tTrying Features: ',feat)
    kg_b = run_kfolds(X.iloc[:,feat].columns.values, splits)
    if kg_b < best:
        best = kg_b
        best_feat = feat
        print('KG new best: {:.4E}'.format(best))
    else:    
        print('KG now: {:.4E}'.format(kg_b))
    kgs = []
    for i in range(len(feat)):
        mask = np.ones(len(feat),dtype=bool)
        mask[i] = False
        kgs.append(run_kfolds(X.iloc[:,feat[mask]].columns.values, splits))
        #print("{:.4E}".format(kgs[-1]))
    remove = np.argmax(kgs)
    print('Removing feature idx: ', feat[remove])
    feat = feat = np.delete(feat,remove)
    feat = np.append(feat, j+10)
    j += 1
    print('**'*40)

Iter:  0 	Trying Features:  [0 1 2 3 4 5 6 7 8 9]
KG new best: 2.5757E-02
Removing feature idx:  1
********************************************************************************
Iter:  1 	Trying Features:  [ 0  2  3  4  5  6  7  8  9 10]
KG now: 2.7108E-02
Removing feature idx:  3
********************************************************************************
Iter:  2 	Trying Features:  [ 0  2  4  5  6  7  8  9 10 11]
KG new best: 1.9827E-02
Removing feature idx:  11
********************************************************************************
Iter:  3 	Trying Features:  [ 0  2  4  5  6  7  8  9 10 12]
KG now: 2.7099E-02
Removing feature idx:  2
********************************************************************************
Iter:  4 	Trying Features:  [ 0  4  5  6  7  8  9 10 12 13]
KG now: 2.8278E-02
Removing feature idx:  6
********************************************************************************
Iter:  5 	Trying Features:  [ 0  4  5  7  8  9 10 12 13 14]
KG now: 2.8618

KeyboardInterrupt: 

In [149]:
run_kfolds(X.iloc[:,feat].columns.values, splits)

0.025352026878293438

In [None]:
pred = np.exp(model.predict(htest_features.iloc[:,feat]))
pred

In [None]:
Submission(htest_id, pred)

In [None]:
raw = pd.read_csv('../data/Raw Data/test.csv').Id
bin = pd.read_csv('../data/clean_test.csv').Id
cle = pd.read_csv('../data/clean_test_beforebin.csv').Id

check = pd.DataFrame({'raw':raw,'bin':bin,'cle':cle})