In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import mathis as mt
import math
from sklearn.linear_model import LassoCV, RidgeCV, LinearRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
# Load the previously saved train/test splits.
train = pd.read_csv("datasets/modified_train.csv")
test = pd.read_csv("datasets/modified_test.csv")

# 'Unnamed: 0' is the name read_csv gives a header-less first column
# (e.g. an index written by to_csv); it is not a feature, so drop it.
train = train.drop(columns=['Unnamed: 0'])
test = test.drop(columns=['Unnamed: 0'])

# Stack both splits so every transformation below is applied to them together.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
# (row labels are kept as-is because ignore_index is left at its default).
house = pd.concat([train, test])

# Data Engineering

***Ordinals to numbers***

In [3]:
# Shared grade scale for the quality/condition columns listed in ord_cats.
# Cell values are compared as lower-cased strings, so a missing value
# (NaN -> 'nan') scores 0; any label not listed here raises KeyError.
ord_dict = {
    'ex': 5,
    'gd': 4,
    'ta': 3,
    'fa': 2,
    'po': 1,
    'nan': 0,
}
ord_cats = [
    'exter_qual', 'exter_cond',
    'bsmt_qual', 'bsmt_cond',
    'heating_qc', 'kitchen_qual', 'fireplace_qu',
    'garage_qual', 'garage_cond',
    'pool_qc',
]


def _grade_to_score(value):
    """Return the ord_dict score for one raw cell value (case-insensitive)."""
    return ord_dict[str(value).lower()]


for col in ord_cats:
    house[col] = house[col].map(_grade_to_score)

In [4]:
# Scale for bsmt_exposure. Both a missing value ('nan') and the string '0'
# score 0; any other unlisted label raises KeyError.
ord_dict = {
    'gd': 4,
    'av': 3,
    'mn': 2,
    'no': 1,
    'nan': 0,
    '0': 0,
}


def _exposure_to_score(value):
    """Return the ord_dict score for one raw bsmt_exposure value (case-insensitive)."""
    return ord_dict[str(value).lower()]


house['bsmt_exposure'] = house['bsmt_exposure'].map(_exposure_to_score)

In [5]:
# Scale for garage_finish; a missing value ('nan') scores 0 and any
# unlisted label raises KeyError.
ord_dict = {
    'fin': 3,
    'rfn': 2,
    'unf': 1,
    'nan': 0,
}


def _garage_finish_to_score(value):
    """Return the ord_dict score for one raw garage_finish value (case-insensitive)."""
    return ord_dict[str(value).lower()]


house['garage_finish'] = house['garage_finish'].map(_garage_finish_to_score)

In [6]:
# Scale shared by both basement finish-type columns; a missing value ('nan')
# scores 0 and any unlisted label raises KeyError.
ord_dict = {
    'glq': 6,
    'alq': 5,
    'blq': 4,
    'rec': 3,
    'lwq': 2,
    'unf': 1,
    'nan': 0,
}


def _bsmt_fin_to_score(value):
    """Return the ord_dict score for one raw basement finish-type value (case-insensitive)."""
    return ord_dict[str(value).lower()]


# Same mapping applied to each of the two columns, first type_1 then type_2.
for fin_col in ('bsmt_fin_type_1', 'bsmt_fin_type_2'):
    house[fin_col] = house[fin_col].map(_bsmt_fin_to_score)

In [7]:
# Scale for the functional column. There is no 'nan' entry here, so a missing
# value (or any unlisted label) raises KeyError instead of scoring 0.
ord_dict = {
    'typ': 7,
    'min1': 6,
    'min2': 5,
    'mod': 4,
    'maj1': 3,
    'maj2': 2,
    'sev': 1,
    'sal': 0,
}


def _functional_to_score(value):
    """Return the ord_dict score for one raw functional value (case-insensitive)."""
    return ord_dict[str(value).lower()]


house['functional'] = house['functional'].map(_functional_to_score)

- The second pairplot was created at this step.

***Determine features to bucket together***

In [8]:
# Only the training rows (training == 1) are used to choose buckets.
training_rows = house.loc[house['training'] == 1]

# Both helpers come from the project's `mathis` module.
# NOTE(review): `buckets` is later used via .keys() as a set of column names;
# its values presumably describe how levels are grouped — confirm in mathis.
candidates = mt.feature_bucket_candidates(training_rows)
buckets = mt.greedy_bucket_selection(candidates)

In [9]:
# One-hot encode the bucket columns on a throwaway copy and inspect the result
# against sale_price.
# NOTE(review): this encodes `train` as loaded at the top of the notebook, not
# the ordinal-encoded training rows of `house` — confirm that is intended.
# NOTE(review): corrs_selection presumably reports features by their
# correlation with the target column — confirm in mathis.
all_dummies = pd.get_dummies(
    train,
    columns=list(buckets.keys()),
    drop_first=True,
    dummy_na=True,
)
mt.corrs_selection(all_dummies, 'sale_price')

Unnamed: 0,overall_qual,year_built,year_remod/add,mas_vnr_area,total_bsmt_sf,1st_flr_sf,gr_liv_area,full_bath,tot_rms_abv_grd,garage_yr_blt,garage_cars,garage_area,foundation_PConc
0,0.800207,0.571849,0.55037,0.503579,0.629303,0.618486,0.697038,0.537969,0.504014,0.556146,0.647781,0.649897,0.529047


In [10]:
# Apply the selected buckets to the combined (train + test) frame.
# NOTE(review): map_buckets is defined in the project's `mathis` module;
# presumably it regroups category levels per `buckets` — confirm there.
house = mt.map_buckets(house, buckets)

***Dummy categoricals***

In [11]:
# Shape before one-hot encoding, for comparison with the shape shown after it.
house.shape

(2929, 82)

In [12]:
# One-hot encode every bucket column on the combined frame. The first level of
# each column is dropped and missing values get their own indicator column.
# This overwrites `house`, so the cell is meant to run once per kernel session.
house = pd.get_dummies(
    house,
    columns=list(buckets.keys()),
    drop_first=True,
    dummy_na=True,
)

In [13]:
# Shape after one-hot encoding; compare with the shape shown before encoding.
house.shape

(2929, 148)

***Remove all outliers***

In [14]:
# Columns that are skipped by the outlier filter below, even if
# mt.outlier_dict reports values for them.
irrelevant = [
    'ms_sub_class', 'overall_qual', 'overall_cond', 'central_air',
    'bsmt_full_bath', 'bsmt_half_bath', 'full_bath', 'half_bath',
    'bedroom_abv_gr', 'kitchen_abv_gr', 'tot_rms_abv_grd',
    'fireplaces', 'garage_cars',
]
train = house.loc[house['training'] == 1]

# NOTE(review): outlier_dict comes from the project's `mathis` module; it is
# used here as a mapping of column name -> values to remove — confirm there.
outliers = mt.outlier_dict(train)

for col in outliers:
    if col in irrelevant:
        continue
    # Drop every training row whose value in this column equals a flagged value.
    for flagged in outliers[col]:
        train = train[train[col] != flagged]

In [15]:
# `train` already holds the outlier-filtered training rows from the
# outlier-removal cell. Re-slicing it from `house` here would silently undo
# that filtering before saving, so only the test rows are taken from `house`.
test = house[house['training'] == 0]

# The index is written as an unnamed first column; the loading cell at the top
# of the notebook drops it again as 'Unnamed: 0'.
# NOTE(review): these are the same paths the notebook reads from, so a second
# run starts from already-engineered data — consider separate output files.
train.to_csv("datasets/modified_train.csv")
test.to_csv("datasets/modified_test.csv")