In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import mathis as mt
import math
from sklearn.linear_model import LassoCV, RidgeCV, LinearRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
train = pd.read_csv("datasets/modified_train.csv")
test = pd.read_csv("datasets/modified_test.csv")

train = train.drop(columns = ['Unnamed: 0', 'training'])
test = test.drop(columns = ['Unnamed: 0', 'training'])

house = train.append(test)

In [3]:
def quick_lr(columns):
    X = train[columns]
    y = train['sale_price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    return r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred, squared = False)

In [4]:
def colinearity_count(df, columns, threshold = .5):
    corr = df.corr()
    counts = {key: 0 for key in columns}
    
    for row in columns:
        for col in columns:
            if corr[row][col] > threshold and row != col:
                counts[row] += 1
                
    return counts

In [5]:
x_list = house.columns.tolist()
x_list.remove('sale_price')
X = train[x_list]
y = train['sale_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [6]:
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)
r_alphas = np.logspace(0,5,100)
lasso_cv = LassoCV(alphas=r_alphas, random_state=42, cv=5)
lasso_cv.fit(Z_train,y_train);
lasso_pred = lasso_cv.predict(Z_test)

In [7]:
sc = StandardScaler()
Z_train2 = sc.fit_transform(X_train)
Z_test2 = sc.transform(X_test)
r_alphas = np.logspace(0,5,100)
ridge_cv = RidgeCV(alphas=r_alphas,scoring="neg_mean_squared_error", cv=5)
ridge_cv.fit(Z_train2,y_train);
ridge_pred = ridge_cv.predict(Z_test2)

In [8]:
coefs = lasso_cv.coef_
lasso_weights = {}
cols = train.columns.tolist()
for i in range(0, len(coefs)):
    lasso_weights[cols[i]] = coefs[i]

In [9]:
best_corrs = mt.corrs_selection(train, 'sale_price')
best_corrs

Unnamed: 0,overall_qual,year_built,year_remod/add,mas_vnr_area,exter_qual,bsmt_qual,total_bsmt_sf,1st_flr_sf,gr_liv_area,full_bath,kitchen_qual,tot_rms_abv_grd,fireplace_qu,garage_yr_blt,garage_finish,garage_cars,garage_area,foundation_PConc
0,0.800207,0.571849,0.55037,0.503579,0.712146,0.612188,0.629303,0.618486,0.697038,0.537969,0.692336,0.504014,0.538925,0.556146,0.557839,0.647781,0.649897,0.529047


In [10]:
counts = colinearity_count(train, best_corrs.columns.tolist())

In [11]:
ridge_cv.score(Z_train2,y_train), mean_squared_error(y_test, ridge_pred, squared = False)

(0.8728699378548094, 27039.79598883738)

In [12]:
lasso_cv.score(Z_train,y_train), mean_squared_error(y_test, lasso_pred, squared = False)

(0.8725758411296889, 26596.73956630602)

In [13]:
quick_lr(best_corrs.columns.tolist())

(0.8635942472140041, 28940.19169706193)

In [14]:
quick_lr([key for key, val in counts.items() if val > 3])

(0.8360487767562901, 31727.97764247045)

In [15]:
quick_lr([key for key, val in counts.items() if val < 4])

(0.677733691411466, 44482.840562171936)

In [16]:
sub_list = [key for key, val in counts.items() if val < 4]
lr = LinearRegression()
lr.fit(train[sub_list], y)
test_preds = lr.predict(test[sub_list])

In [17]:
X_sub = test[x_list]
sc = StandardScaler()
Z_sub = sc.fit_transform(X_sub)
test_ids = test['id']
test_preds = ridge_cv.predict(Z_sub)

In [18]:
kaggle_submission = {
    'Id': test_ids,
    'SalePrice': test_preds
}

kaggle_submission = pd.DataFrame(kaggle_submission)

In [19]:
kaggle_submission.head(5)

Unnamed: 0,Id,SalePrice
0,2658,115035.022075
1,2718,163144.650297
2,2414,228229.000388
3,1989,109464.154162
4,625,192117.620313


In [20]:
kaggle_submission.to_csv('./datasets/kaggle_submission_ridge-out.csv', index=False)

Submissions:
   - Lasso with outliers: 32135.00957
   - LASSO without outliers: 32414.92745
   - Ridge with outliers: 32593.92522
   - Just best corrs: 34950.18614
   - Min colinearity: 51789.76432