# This notebook treats the number of columns to drop (dropcolumns) as a hyperparameter.
# dropcolumns is the number of columns to drop from the bottom of the feature importance list generated by the random forest model, i.e. the least important features.
# For each of several random states it finds the dropcolumns value that gives the lowest test RMSLE, then averages those values. This average is then used to create the final model in another notebook.

In [1]:
import os
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_log_error, mean_squared_error
import matplotlib.pyplot as plt
import math
import warnings
import pyautogui
import matplotlib.pyplot as plt
from IPython.display import clear_output
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
warnings.filterwarnings('ignore')

# Functions

In [2]:
def runregression(X_train, y_train, X_test, y_test, dropcolumns):
    """Fit an ordinary least-squares model and report how well it does.

    The targets are treated as log10 values: both the predictions and the
    targets are raised back with ``10 ** value`` before the RMSLE is computed.

    Args:
        X_train, y_train: features and log10 targets used to fit the model.
        X_test, y_test: held-out features and log10 targets used for scoring.
        dropcolumns: passed through unchanged so the caller can keep it next
            to the scores it produced.

    Returns:
        Tuple ``(score, trainRMSLE, testRMSLE, dropcolumns)`` where ``score``
        is ``model.score`` on the test split (computed on the log10 targets).
    """
    fitted = LinearRegression().fit(X_train, y_train)

    def rmsle(features, logtargets):
        # Undo the log10 transform on both sides before measuring the error.
        estimated = np.power(10, fitted.predict(features))
        actual = np.power(10, logtargets)
        return math.sqrt(mean_squared_log_error(actual, estimated))

    score = fitted.score(X_test, y_test)
    testRMSLE = rmsle(X_test, y_test)
    trainRMSLE = rmsle(X_train, y_train)

    return score, trainRMSLE, testRMSLE, dropcolumns

In [3]:
def prepdata(randomstate, dropcolumns):
    """Load the cleaned data, drop the least important features and split it.

    Reads ``data/traincleaned.csv``, ``data/testcleaned.csv`` and
    ``data/feature importance.csv``. Train and test rows are concatenated
    before one-hot encoding so both end up with the same dummy columns.

    Args:
        randomstate: ``random_state`` forwarded to ``train_test_split``.
        dropcolumns: how many features to remove, counted from the
            lowest-importance end of the feature importance list.

    Returns:
        Tuple ``(inputs, testinputs, prices, X_train, X_test, y_train, y_test)``.
        ``inputs``/``testinputs`` are the full train/test feature matrices as
        numpy arrays, ``prices`` is log10 of ``SalePrice``, and the remaining
        four are an 80/20 shuffled split of ``inputs``/``prices``.
    """
    train = pd.read_csv('data/traincleaned.csv')
    testinputs = pd.read_csv('data/testcleaned.csv')
    featureimportance = pd.read_csv('data/feature importance.csv')

    # Targets are modelled on a log10 scale; taken from the same frame as the
    # features so the file is only read once.
    prices = np.log10(np.array(train['SalePrice']))

    featureimportance = featureimportance.rename(columns={'0': 'importance', '1': 'feature'})
    featureimportance = featureimportance.drop(columns=['Unnamed: 0'])
    featureimportance = featureimportance.sort_values(by='importance')

    # Ascending sort puts the least important features first.
    leastimportant = featureimportance['feature'].iloc[:dropcolumns].tolist()

    alldata = pd.concat([train.drop(columns=['SalePrice']), testinputs])
    alldata.set_index('Id', inplace=True)
    alldata = alldata.fillna(0)
    alldata = pd.get_dummies(alldata)
    alldata = alldata.drop(columns=['Unnamed: 0'])

    # Report features that are not present (or were already dropped) instead
    # of hiding every possible error behind a bare ``except``, and remove the
    # rest with a single drop call.
    remaining = set(alldata.columns)
    todrop = []
    for column in leastimportant:
        if column in remaining:
            remaining.remove(column)
            todrop.append(column)
        else:
            print(f'Couldnt drop column {column}')
    alldata = alldata.drop(columns=todrop)

    # Label-based slices on the Id index: Ids up to 1460 are the training
    # rows, Ids from 1461 on are the test rows.
    inputs = alldata.loc[0:1460]
    testinputs = alldata.loc[1461:]

    X_train, X_test, y_train, y_test = train_test_split(
        inputs, prices, random_state=randomstate, shuffle=True, test_size=.20)

    testinputs = testinputs.values
    inputs = inputs.values
    X_train = X_train.values
    X_test = X_test.values
    return inputs, testinputs, prices, X_train, X_test, y_train, y_test

In [4]:
def runspecialregression(X_train, y_train, X_test, y_test, dropcolumns, regtype, alpha):
    """Fit a regularised linear model of the given type and report its scores.

    The targets are treated as log10 values: both the predictions and the
    targets are raised back with ``10 ** value`` before the RMSLE is computed.

    Args:
        X_train, y_train: features and log10 targets used to fit the model.
        X_test, y_test: held-out features and log10 targets used for scoring.
        dropcolumns: passed through unchanged so the caller can keep it next
            to the scores it produced.
        regtype: estimator class that accepts an ``alpha`` keyword
            (called as ``regtype(alpha=alpha)``).
        alpha: regularisation strength handed to ``regtype``.

    Returns:
        Tuple ``(score, trainRMSLE, testRMSLE, dropcolumns)`` where ``score``
        is ``model.score`` on the test split (computed on the log10 targets).

    Raises:
        Any exception raised while computing the RMSLE. These used to be
        swallowed, which only deferred the failure to an unrelated
        ``UnboundLocalError`` at the ``return`` statement.
    """
    model = regtype(alpha=alpha).fit(X_train, y_train)
    score = model.score(X_test, y_test)

    def rmsle(features, logtargets):
        # Undo the log10 transform on both sides before measuring the error.
        estimated = np.power(10, model.predict(features))
        actual = np.power(10, logtargets)
        return math.sqrt(mean_squared_log_error(actual, estimated))

    testRMSLE = rmsle(X_test, y_test)
    trainRMSLE = rmsle(X_train, y_train)

    return score, trainRMSLE, testRMSLE, dropcolumns

# Normal Linear Regression

In [5]:
#Find optimal dropcolumns for plain linear regression

#number of randomstates to average over
randomstates = 6
randomstatestart = 30

totalbestdropcolumn = 0

for randomstate in range(randomstatestart, randomstatestart + randomstates):

    # Start from infinity so the best candidate is always recorded, even if
    # every test RMSLE happens to be 1 or larger.
    besttest = math.inf
    besttestcol = 0

    # Candidate numbers of columns to drop: 160, 162, ..., 248.
    for dropcolumns in range(160, 250, 2):

        inputs, testinputs, prices, X_train, X_test, y_train, y_test = prepdata(randomstate, dropcolumns)

        score, trainRMSLE, testRMSLE, dropcolumns = runregression(X_train, y_train, X_test, y_test, dropcolumns)

        # Keep the candidate with the lowest RMSLE on the held-out split.
        if testRMSLE < besttest:
            besttest = testRMSLE
            besttestcol = dropcolumns

    totalbestdropcolumn += besttestcol

    print(f'randomstate = {randomstate}, besttestcol = {besttestcol}')

# Average of the per-random-state winners, rounded to a whole column count.
averagedropcolumns = int(round(totalbestdropcolumn / randomstates, 0))
print(f'average dropcolumns = {averagedropcolumns}')

randomstate = 10, besttestcol = 204
randomstate = 11, besttestcol = 194
randomstate = 12, besttestcol = 194
randomstate = 13, besttestcol = 194
randomstate = 14, besttestcol = 164
randomstate = 15, besttestcol = 244
average dropcolumns = 199


In [6]:
#Feed columns (using drop columns) into the model to produce predictions

# Only inputs, testinputs and prices are used below; they do not depend on
# the random state, which only affects the train/test split.
inputs, testinputs, prices, X_train, X_test, y_train, y_test = prepdata(randomstate, averagedropcolumns)

model = LinearRegression()
model.fit(inputs, prices)

# Both metrics are in-sample: the model is scored on the data it was fit on.
score = round(model.score(inputs, prices), 3)

# prices are log10 values, so undo the transform before computing the RMSLE.
predictions = np.power(10, model.predict(inputs))
pricesunlogged = np.power(10, prices)

RMSLE = round(math.sqrt(mean_squared_log_error(pricesunlogged, predictions)), 4)

predictions = np.power(10, model.predict(testinputs))

print(f'R2: {score}, RMSLE: {RMSLE}, dropcolumns: {averagedropcolumns}')

submittest = pd.read_csv('data/testcleaned.csv')
# Explicit copy so the new column is not assigned onto a slice of the frame.
submittest = submittest[['Id']].copy()
submittest['SalePrice'] = predictions
# Make sure the output folder exists before writing the submission.
os.makedirs('Submissions', exist_ok=True)
submittest.to_csv('Submissions/Linear.csv', index=False)

submittest.head()

R2: 0.937, RMSLE: 0.1001, dropcolumns: 199


Unnamed: 0,Id,SalePrice
0,1461,109944.546366
1,1462,162373.961748
2,1463,183270.376161
3,1464,200217.436496
4,1465,192750.877668


# Ridge Regression

In [7]:
#Find optimal dropcolumns for ridge regression

# Model settings are the same for every candidate, so set them once.
# alpha stays a module-level name because the prediction cell reuses it.
regtype = Ridge
alpha = 0.1

totalbestdropcolumn = 0

for randomstate in range(randomstatestart, randomstatestart + randomstates):

    # Start from infinity so the best candidate is always recorded, even if
    # every test RMSLE happens to be 1 or larger.
    besttest = math.inf
    besttestcol = 0

    # Candidate numbers of columns to drop: 160, 162, ..., 248.
    for dropcolumns in range(160, 250, 2):

        inputs, testinputs, prices, X_train, X_test, y_train, y_test = prepdata(randomstate, dropcolumns)

        try:
            score, trainRMSLE, testRMSLE, dropcolumns = runspecialregression(X_train, y_train, X_test, y_test, dropcolumns, regtype, alpha)
        except Exception as e:
            # Best effort: report the failure and move on to the next candidate.
            print(e)
            continue

        # Keep the candidate with the lowest RMSLE on the held-out split.
        if testRMSLE < besttest:
            besttest = testRMSLE
            besttestcol = dropcolumns

    # NOTE: if every candidate failed, besttestcol is still 0 and pulls the
    # average down.
    totalbestdropcolumn += besttestcol

    print(f'randomstate = {randomstate}, besttestcol = {besttestcol}')
# Average of the per-random-state winners, rounded to a whole column count.
averagedropcolumns = int(round(totalbestdropcolumn / randomstates, 0))
print(f'average dropcolumns = {averagedropcolumns}')

randomstate = 30, besttestcol = 198
randomstate = 31, besttestcol = 194
randomstate = 32, besttestcol = 180
randomstate = 33, besttestcol = 206
randomstate = 34, besttestcol = 208
randomstate = 35, besttestcol = 244
average dropcolumns = 205


In [8]:
#Feed columns (using drop columns) into the model to produce predictions

# Only inputs, testinputs and prices are used below; they do not depend on
# the random state, which only affects the train/test split.
inputs, testinputs, prices, X_train, X_test, y_train, y_test = prepdata(randomstate, averagedropcolumns)

# alpha is the value left over from the ridge dropcolumns search.
model = Ridge(alpha=alpha)
model.fit(inputs, prices)

# Both metrics are in-sample: the model is scored on the data it was fit on.
score = round(model.score(inputs, prices), 3)

# prices are log10 values, so undo the transform before computing the RMSLE.
predictions = np.power(10, model.predict(inputs))
pricesunlogged = np.power(10, prices)

RMSLE = round(math.sqrt(mean_squared_log_error(pricesunlogged, predictions)), 4)

predictions = np.power(10, model.predict(testinputs))

print(f'R2: {score}, RMSLE: {RMSLE}, dropcolumns: {averagedropcolumns}')

submittest = pd.read_csv('data/testcleaned.csv')
# Explicit copy so the new column is not assigned onto a slice of the frame.
submittest = submittest[['Id']].copy()
submittest['SalePrice'] = predictions
# Make sure the output folder exists before writing the submission.
os.makedirs('Submissions', exist_ok=True)
submittest.to_csv('Submissions/Ridge.csv', index=False)

submittest.head()

R2: 0.935, RMSLE: 0.1022, dropcolumns: 205


Unnamed: 0,Id,SalePrice
0,1461,108768.30972
1,1462,161807.652874
2,1463,182792.182848
3,1464,201789.058189
4,1465,180771.703681


# Lasso Regression

In [9]:
#Find optimal dropcolumns for lasso regression

# Model settings are the same for every candidate, so set them once.
# alpha stays a module-level name because the prediction cell reuses it.
# NOTE(review): scikit-learn advises against Lasso with alpha = 0 (it is then
# plain least squares, better solved by LinearRegression); the value is kept
# so the recorded results stay reproducible.
regtype = Lasso
alpha = 0.0

totalbestdropcolumn = 0

for randomstate in range(randomstatestart, randomstatestart + randomstates):

    # Start from infinity so the best candidate is always recorded, even if
    # every test RMSLE happens to be 1 or larger.
    besttest = math.inf
    besttestcol = 0

    # Candidate numbers of columns to drop: 160, 162, ..., 248.
    for dropcolumns in range(160, 250, 2):

        inputs, testinputs, prices, X_train, X_test, y_train, y_test = prepdata(randomstate, dropcolumns)

        try:
            score, trainRMSLE, testRMSLE, dropcolumns = runspecialregression(X_train, y_train, X_test, y_test, dropcolumns, regtype, alpha)
        except Exception as e:
            # Best effort: report the failure and move on to the next candidate.
            print(e)
            continue

        # Keep the candidate with the lowest RMSLE on the held-out split.
        if testRMSLE < besttest:
            besttest = testRMSLE
            besttestcol = dropcolumns

    # NOTE: if every candidate failed, besttestcol is still 0 and pulls the
    # average down.
    totalbestdropcolumn += besttestcol

    print(f'randomstate = {randomstate}, besttestcol = {besttestcol}')
# Average of the per-random-state winners, rounded to a whole column count.
averagedropcolumns = int(round(totalbestdropcolumn / randomstates, 0))
print(f'average dropcolumns = {averagedropcolumns}')

randomstate = 30, besttestcol = 198
randomstate = 31, besttestcol = 194
randomstate = 32, besttestcol = 178
randomstate = 33, besttestcol = 218
randomstate = 34, besttestcol = 240
randomstate = 35, besttestcol = 244
average dropcolumns = 212


In [10]:
#Feed columns (using drop columns) into the model to produce predictions

# Only inputs, testinputs and prices are used below; they do not depend on
# the random state, which only affects the train/test split.
inputs, testinputs, prices, X_train, X_test, y_train, y_test = prepdata(randomstate, averagedropcolumns)

# alpha is the value left over from the lasso dropcolumns search.
model = Lasso(alpha=alpha)
model.fit(inputs, prices)

# Both metrics are in-sample: the model is scored on the data it was fit on.
score = round(model.score(inputs, prices), 3)

# prices are log10 values, so undo the transform before computing the RMSLE.
predictions = np.power(10, model.predict(inputs))
pricesunlogged = np.power(10, prices)

RMSLE = round(math.sqrt(mean_squared_log_error(pricesunlogged, predictions)), 4)

predictions = np.power(10, model.predict(testinputs))

print(f'R2: {score}, RMSLE: {RMSLE}, dropcolumns: {averagedropcolumns}')

submittest = pd.read_csv('data/testcleaned.csv')
# Explicit copy so the new column is not assigned onto a slice of the frame.
submittest = submittest[['Id']].copy()
submittest['SalePrice'] = predictions
# Make sure the output folder exists before writing the submission.
os.makedirs('Submissions', exist_ok=True)
submittest.to_csv('Submissions/Lasso.csv', index=False)

submittest.head()

R2: 0.931, RMSLE: 0.1051, dropcolumns: 212


Unnamed: 0,Id,SalePrice
0,1461,115412.420013
1,1462,162953.982324
2,1463,179419.305736
3,1464,200548.979141
4,1465,181965.447929


# ElasticNet Regression

In [11]:
#Find optimal dropcolumns for elastic net regression

# Model settings are the same for every candidate, so set them once.
# alpha stays a module-level name because the prediction cell reuses it.
regtype = ElasticNet
alpha = 0.1

totalbestdropcolumn = 0

for randomstate in range(randomstatestart, randomstatestart + randomstates):

    # Start from infinity so the best candidate is always recorded, even if
    # every test RMSLE happens to be 1 or larger.
    besttest = math.inf
    besttestcol = 0

    # Candidate numbers of columns to drop: 160, 162, ..., 248.
    for dropcolumns in range(160, 250, 2):

        inputs, testinputs, prices, X_train, X_test, y_train, y_test = prepdata(randomstate, dropcolumns)

        try:
            score, trainRMSLE, testRMSLE, dropcolumns = runspecialregression(X_train, y_train, X_test, y_test, dropcolumns, regtype, alpha)
        except Exception as e:
            # Best effort: report the failure and move on to the next candidate.
            print(e)
            continue

        # Keep the candidate with the lowest RMSLE on the held-out split.
        if testRMSLE < besttest:
            besttest = testRMSLE
            besttestcol = dropcolumns

    # NOTE: if every candidate failed, besttestcol is still 0 and pulls the
    # average down.
    totalbestdropcolumn += besttestcol

    print(f'randomstate = {randomstate}, besttestcol = {besttestcol}')
# Average of the per-random-state winners, rounded to a whole column count.
averagedropcolumns = int(round(totalbestdropcolumn / randomstates, 0))
print(f'average dropcolumns = {averagedropcolumns}')

randomstate = 30, besttestcol = 232
randomstate = 31, besttestcol = 160
randomstate = 32, besttestcol = 160
randomstate = 33, besttestcol = 190
randomstate = 34, besttestcol = 190
randomstate = 35, besttestcol = 190
average dropcolumns = 187


In [12]:
#Feed columns (using drop columns) into the model to produce predictions

# Only inputs, testinputs and prices are used below; they do not depend on
# the random state, which only affects the train/test split.
inputs, testinputs, prices, X_train, X_test, y_train, y_test = prepdata(randomstate, averagedropcolumns)

# alpha is the value left over from the elastic net dropcolumns search.
model = ElasticNet(alpha=alpha)
model.fit(inputs, prices)

# Both metrics are in-sample: the model is scored on the data it was fit on.
score = round(model.score(inputs, prices), 3)

# prices are log10 values, so undo the transform before computing the RMSLE.
predictions = np.power(10, model.predict(inputs))
pricesunlogged = np.power(10, prices)

RMSLE = round(math.sqrt(mean_squared_log_error(pricesunlogged, predictions)), 4)

predictions = np.power(10, model.predict(testinputs))

print(f'R2: {score}, RMSLE: {RMSLE}, dropcolumns: {averagedropcolumns}')

submittest = pd.read_csv('data/testcleaned.csv')
# Explicit copy so the new column is not assigned onto a slice of the frame.
submittest = submittest[['Id']].copy()
submittest['SalePrice'] = predictions
# Make sure the output folder exists before writing the submission.
os.makedirs('Submissions', exist_ok=True)
submittest.to_csv('Submissions/ElasticNet.csv', index=False)

submittest.head()

R2: 0.897, RMSLE: 0.1285, dropcolumns: 187


Unnamed: 0,Id,SalePrice
0,1461,130067.229436
1,1462,163099.031237
2,1463,180400.211703
3,1464,192508.704985
4,1465,189745.869975


# Scratch

In [13]:
#Kaggle scores
#0.12363, R2 Score = .94, RMSLE = .0977, no scaling, drop columns = 150
#.12054, R2: 0.9361934187245556, RMSLE: 0.10093248123296446, dropcolumns: 200
#.12057, R2: 0.9299809808208424, RMSLE: 0.10573194685650467, dropcolumns: 225, with all feature engineering
#.11940, R2: 0.9375345605388429, RMSLE: 0.09986609478879857, dropcolumns: 196, normal
#.11940, alpha: 0.0, RMSLE: 0.09954994348817671, R2: 0.9379294381208503, dropcolumns: 178, LASSO
#.11930, alpha: 0.0, RMSLE: 0.10620373708852855, R2: 0.9293547242432479, dropcolumns: 220, LASSO (using the finder thing)
#.11788, alpha: 0.1, RMSLE: 0.10047125196219843, R2: 0.9367752278000487, dropcolumns: 192, RIDGE
#.11764, alpha: 0.1, RMSLE: 0.10087057830058703, R2: 0.9362716458560815, dropcolumns: 199, RIDGE
#.11748, R2: 0.931127307306459, RMSLE: 0.10486287970080205, dropcolumns: 217, NORMAL
#.11741, alpha: 0.0, RMSLE: 0.09982164053839784, R2: 0.937590166070204, dropcolumns: 186, ELASTIC NET
#.11718, alpha: 0.0, RMSLE: 0.10151567301251134, R2: 0.9354539338776001, dropcolumns: 204 LASSO

In [14]:
# #SVR dropcolumns
# for randomstate in range(2):
#     bestRMSLE = 1
#     bestdropcolumns = 0
#     inputs, testinputs, prices, X_train, X_test, y_train, y_test = prepdataSVR(randomstate)
    
#     for dropcolumns in range(0, 200, 10):
        
#         featureimportance = pd.read_csv('feature importance.csv')

#         featureimportance = featureimportance.rename(columns={'0': 'importance', '1':'feature'})
#         featureimportance = featureimportance.drop(columns=['Unnamed: 0'])
#         featureimportance = featureimportance.sort_values(by='importance')
#         featureimportance = featureimportance[0:dropcolumns]
#         featureimportance = featureimportance['feature'].tolist()
        
#         for column in featureimportance:
#             try:
#                 inputs = inputs.drop(columns=column)
#                 testinputs = testinputs.drop(columns=column)
#                 X_train = X_train.drop(columns=column)
#                 X_test = X_test.drop(columns=column)
#             except:
#                 print(f'Couldnt drop column {column}')
        
# #         X_train = X_train.values
# #         X_test = X_test.values
# #         testinputs = testinputs.values
# #         inputs = inputs.values     
        
#         clf = SVR(kernel='poly', gamma='auto', C=200, degree=6, epsilon=100, coef0=2)
#         clf.fit(X_train, y_train) 
        
#         testpredictions = clf.predict(X_test)

#         testpredictions[testpredictions < y_train.min()/2] = y_train.min()

#         testRMSLE = math.sqrt(mean_squared_log_error(y_test, testpredictions))
        
#         if testRMSLE < bestRMSLE:
#             bestRMSLE = testRMSLE
#             bestdropcolumns = dropcolumns
#     print(randomstate, bestdropcolumns, bestRMSLE)

# def prepdataSVR(randomstate):
#     inputs = pd.read_csv('traincleaned.csv')
#     testinputs = pd.read_csv('testcleaned.csv')
#     featureimportance = pd.read_csv('feature importance.csv')

#     inputs = inputs.drop(columns=['SalePrice'])
#     alldata = pd.concat([inputs, testinputs])
#     alldata.set_index('Id', inplace=True)
#     alldata = alldata.fillna(0)
#     alldata = pd.get_dummies(alldata)
#     alldata = alldata.drop(columns=['Unnamed: 0'])

#     inputs = alldata.loc[0:1460,]
#     testinputs = alldata.loc[1461:]

#     numericalcolumns = []
#     for column in inputs.columns:
#         if set(inputs[column].tolist()) != {0, 1}:
#             numericalcolumns.append(column)
            
    

#     prices = pd.read_csv('traincleaned.csv')
#     prices = prices['SalePrice']
#     prices = np.array(prices)

#     X_train, X_test, y_train, y_test = train_test_split(inputs, prices, random_state=randomstate, shuffle=True, test_size=.20)

#     for column in numericalcolumns:
#         X_scaler = StandardScaler().fit(X_train[column].values.reshape(-1,1))
#         X_train[column] = X_scaler.fit_transform(X_train[column].values.reshape(-1,1))
#         X_test[column] = X_scaler.fit_transform(X_test[column].values.reshape(-1,1))
#         testinputs[column] = X_scaler.fit_transform(testinputs[column].values.reshape(-1,1))
#         inputs[column] = X_scaler.fit_transform(inputs[column].values.reshape(-1,1))
    
#     return inputs, testinputs, prices, X_train, X_test, y_train, y_test
