In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import mathis as mt
import math
from sklearn.linear_model import LassoCV, RidgeCV, LinearRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
# Load the modified train / test CSVs.
train = pd.read_csv("datasets/modified_train.csv")
test = pd.read_csv("datasets/modified_test.csv")

# Drop bookkeeping columns that are not predictors.
# NOTE(review): 'Unnamed: 0' looks like a saved index and 'training' like a
# train/test marker - confirm against the notebook that wrote these files.
train = train.drop(columns = ['Unnamed: 0', 'training'])
test = test.drop(columns = ['Unnamed: 0', 'training'])

# Stack both frames row-wise. DataFrame.append was removed in pandas 2.0;
# pd.concat with default arguments produces the same result.
house = pd.concat([train, test])

In [3]:
def quick_lr(columns):
    """Fit a plain linear regression on a subset of ``train`` and score it.

    Parameters
    ----------
    columns : list of str
        Column labels of the module-level ``train`` frame to use as predictors.
        The target is always ``train['sale_price']``.

    Returns
    -------
    tuple
        ``(r2, rmse)``, both computed on the hold-out part of the split.
    """
    X = train[columns]
    y = train['sale_price']

    # Fixed seed: every feature subset is evaluated on the same split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    # Take the square root explicitly: the ``squared`` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    return r2_score(y_test, y_pred), rmse

In [4]:
def colinearity_count(df, columns, threshold = .5):
    """Count, per column, how many of the other listed columns it correlates with.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data; only the labels in ``columns`` are read.
    columns : iterable of str
        Column labels to compare against each other.
    threshold : float, default 0.5
        A pair counts when its correlation is strictly greater than this value.
        The comparison is signed, so strongly negative correlations are not
        counted.

    Returns
    -------
    dict
        Maps each label in ``columns`` to the number of other listed columns
        whose correlation with it exceeds ``threshold``.
    """
    counts = {key: 0 for key in columns}
    if not counts:
        return counts

    # Correlate only the requested columns. Pairwise values are the same as in
    # the full-frame matrix, but unrelated (possibly non-numeric) columns of
    # ``df`` can no longer make DataFrame.corr fail or do needless work.
    corr = df[list(counts)].corr()

    for row in columns:
        for col in columns:
            if row != col and corr.loc[row, col] > threshold:
                counts[row] += 1

    return counts

# Modeling

***Test Models***

In [5]:
# Candidate predictors: every column of the combined frame except the target.
# NOTE(review): this keeps 'id' (it is read from `test` in the submission
# cell), so the row identifier is fed to the models as a feature - confirm
# that is intended.
x_list = list(house.columns)
x_list.remove('sale_price')

y = train['sale_price']
X = train[x_list]

# Hold-out split used to compare the regularised models; seed fixed for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [6]:
# Learn the scaling from the training split only, then apply it to both splits.
sc = StandardScaler().fit(X_train)
Z_train = sc.transform(X_train)
Z_test = sc.transform(X_test)

# 1000 log-spaced penalty strengths from 10**0 up to 10**100.
# NOTE(review): the upper end is far beyond any useful penalty - confirm the
# intended range.
r_alphas = np.logspace(start=0, stop=100, num=1000)

# 5-fold cross-validated lasso over that grid, scored on the hold-out split.
lasso_cv = LassoCV(alphas=r_alphas, random_state=42, cv=5, max_iter = 10000)
lasso_cv.fit(Z_train, y_train)
lasso_pred = lasso_cv.predict(Z_test)

In [7]:
# Fresh scaler fitted on the training split only (same inputs as the lasso cell).
sc = StandardScaler().fit(X_train)
Z_train2 = sc.transform(X_train)
Z_test2 = sc.transform(X_test)

# 1000 log-spaced penalty strengths from 10**0 up to 10**100.
r_alphas = np.logspace(start=0, stop=100, num=1000)

# 5-fold cross-validated ridge, selecting alpha by (negated) mean squared error.
ridge_cv = RidgeCV(alphas=r_alphas, scoring="neg_mean_squared_error", cv=5)
ridge_cv.fit(Z_train2, y_train)
ridge_pred = ridge_cv.predict(Z_test2)

In [8]:
# Pick candidate features with the project-local `mathis` helper.
# NOTE(review): judging by the displayed result, this presumably returns the
# columns whose correlation with 'sale_price' is above ~0.5 - confirm against
# mathis.corrs_selection.
best_corrs = mt.corrs_selection(train, 'sale_price')
best_corrs

Unnamed: 0,overall_qual,year_built,year_remod/add,mas_vnr_area,exter_qual,bsmt_qual,total_bsmt_sf,1st_flr_sf,gr_liv_area,full_bath,kitchen_qual,tot_rms_abv_grd,fireplace_qu,garage_yr_blt,garage_finish,garage_cars,garage_area,foundation_PConc
0,0.810548,0.585223,0.559321,0.503621,0.714711,0.619759,0.655701,0.643099,0.713283,0.542345,0.698426,0.503016,0.537636,0.568093,0.565849,0.653993,0.654364,0.542314


In [9]:
# For each selected feature, count how many of the other selected features it
# correlates with above the default threshold.
counts = colinearity_count(df=train, columns=best_corrs.columns.tolist())

In [10]:
# Lasso: R^2 on the training split, RMSE on the hold-out split.
# The root is taken explicitly because mean_squared_error's `squared` keyword
# was deprecated in scikit-learn 1.4 and later removed.
lasso_cv.score(Z_train,y_train), mean_squared_error(y_test, lasso_pred) ** 0.5

(0.9321633937005478, 24577.652585376145)

In [11]:
# Ridge: R^2 on the training split, RMSE on the hold-out split.
# The root is taken explicitly because mean_squared_error's `squared` keyword
# was deprecated in scikit-learn 1.4 and later removed.
ridge_cv.score(Z_train2,y_train), mean_squared_error(y_test, ridge_pred) ** 0.5

(0.9346830370861068, 25038.74948855554)

In [12]:
# Baseline: ordinary least squares on the correlation-selected features only.
quick_lr(columns=best_corrs.columns.tolist())

(0.8611679778139132, 29926.7794449836)

In [13]:
# Features that exceed the correlation threshold with more than three of the
# other selected features.
quick_lr([feature for feature, n_partners in counts.items() if n_partners > 3])

(0.8317891247574051, 32941.410686420255)

In [14]:
# The complementary set: features that exceed the correlation threshold with
# fewer than four of the other selected features.
quick_lr([feature for feature, n_partners in counts.items() if n_partners < 4])

(0.6948960767937642, 44364.8537576636)

***Kaggle submission***

In [15]:
# Refit on the full training frame for the submission: the scaler is learned
# from `train` and then applied unchanged to `test`.
sc = StandardScaler().fit(train[x_list])
Z_train = sc.transform(train[x_list])
Z_test = sc.transform(test[x_list])

# Same penalty grid and CV settings as the hold-out experiment.
r_alphas = np.logspace(start=0, stop=100, num=1000)
lasso_cv = LassoCV(alphas=r_alphas, random_state=42, cv=5, max_iter = 10000)
lasso_cv.fit(Z_train, y)

# Keep the ids next to the predictions for the submission file.
test_preds = lasso_cv.predict(Z_test)
test_ids = test['id']

In [16]:
# Two-column submission frame: test ids alongside the predicted prices.
kaggle_submission = pd.DataFrame({'Id': test_ids, 'SalePrice': test_preds})

In [17]:
# Preview the first five rows (the default for head) before writing the file.
kaggle_submission.head()

Unnamed: 0,Id,SalePrice
0,2658,135547.345824
1,2718,154943.062667
2,2414,230483.684384
3,1989,112894.736296
4,625,176565.127608


In [18]:
# Write the submission without the frame index, so the file holds only the
# 'Id' and 'SalePrice' columns.
kaggle_submission.to_csv(path_or_buf='./datasets/kaggle_submission_lasso-out.csv', index=False)

***Analysis***

Kaggle submission scores by model variant:
   - LASSO no buckets no outliers: 26734.00198
   - LASSO no buckets: 29754.67899
   - LASSO with outliers: 32135.00957
   - LASSO without outliers: 32414.92745 (30734.27885 with better buckets)
   - Ridge no buckets: 32813.94436
   - Ridge with outliers: 32593.92522
   - Ridge without outliers: 33994.54079
   - Just best corrs: 34950.18614
   - Min collinearity: 51789.76432