In [87]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats
import missingno as msno
import re
import math

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
# Load the training and hold-out test CSVs from the local datasets/ folder.
tr, tr_test = pd.read_csv("datasets/train.csv"), pd.read_csv("datasets/test.csv")

***Data Cleaning***

In [3]:
def convert_to_snake_case(df):
    """Rename the columns of ``df`` to snake_case, modifying ``df`` in place.

    For every column name: insert a space between a lowercase letter and the
    uppercase letter directly following it, replace spaces with underscores,
    then lowercase everything (e.g. "MS SubClass" -> "ms_sub_class").

    The return value is whatever ``DataFrame.rename(..., inplace = True)``
    yields; callers should rely on the in-place mutation, not on the result.
    """
    boundary = re.compile(r"([a-z]{1})([A-Z]{1})")

    renamed = {}
    for name in df.columns:
        spaced = boundary.sub(r"\1 \2", name)
        renamed[name] = spaced.replace(" ", "_").lower()

    return df.rename(columns = renamed, inplace = True)

In [72]:
def zero_counts(df):
    """Count how many times the value 0 appears in each column of ``df``.

    Returns a dict mapping column name -> number of zeros. Columns in which
    0 never occurs are omitted. The lookup is a plain dict ``.get(0)`` on the
    column's value counts, so any key that compares equal to 0 (such as 0.0)
    is matched.
    """
    counts = {}
    for col in df.columns:
        # Compute the value counts once per column (the original did it twice).
        n_zero = df[col].value_counts().to_dict().get(0)
        if n_zero is not None and n_zero > 0:
            counts[col] = n_zero
    return counts

In [71]:
def neg_counts(df):
    """Count the negative entries in each column of ``df``.

    Only distinct values whose exact type is ``float`` or ``int`` are
    considered. Returns a dict mapping column name -> number of negative
    entries; columns without any are left out.
    """
    result = {}
    for col in df.columns:
        total = 0
        for value, freq in df[col].value_counts().items():
            # Skip anything that is not a plain float/int (strings, bools, ...).
            if type(value) not in [float, int]:
                continue
            if value < 0:
                total += freq
        if total > 0:
            result[col] = total
    return result

In [5]:
def yes_no_to_bin(word):
    """Map a yes/no-style value to 1/0, leaving anything else untouched.

    The value is stringified and lowercased before matching, so the match is
    case-insensitive. 'yes', 'true', 'y', 't' give 1; 'no', 'false', 'n', 'f'
    give 0; every other input is returned unchanged.
    """
    binary_lookup = {
        'yes': 1, 'true': 1, 'y': 1, 't': 1,
        'no': 0, 'false': 0, 'n': 0, 'f': 0,
    }
    return binary_lookup.get(str(word).lower(), word)

In [6]:
def get_corr_above_or_below(df, percentage):
    """Collect strongly correlated column pairs of ``df``.

    Parameters
    ----------
    df : DataFrame
        Only its numeric/bool columns take part in the correlation; other
        columns are ignored so that string columns do not make ``corr()``
        raise on pandas versions that no longer drop them silently.
    percentage : float
        Threshold on the absolute correlation (e.g. ``.5``).

    Returns
    -------
    dict
        ``{col: {row: corr}}`` for every pair whose correlation is
        ``>= percentage`` or ``<= -percentage``. Each pair is reported once,
        with ``row`` being a column that comes before ``col`` in the
        correlation matrix; the diagonal is never included.
    """
    numeric = df.select_dtypes(include = ["number", "bool"])
    corr = numeric.corr()
    names = corr.columns.tolist()

    keep_corrs = {}
    for position, col in enumerate(names):
        # Only look at columns preceding `col`: the matrix is symmetric, so
        # this visits each unordered pair exactly once and skips the diagonal.
        for row in names[:position]:
            value = corr.loc[col, row]
            if abs(value) >= percentage:
                keep_corrs.setdefault(col, {})[row] = value

    return keep_corrs

In [17]:
def outlier_dict(df, deviation = 3):
    """Locate z-score outliers in the float64/int64 columns of ``df``.

    Parameters
    ----------
    df : DataFrame
    deviation : number, default 3
        A value is an outlier when the absolute value of its z-score within
        its column is strictly greater than this.

    Returns
    -------
    dict
        ``{column name: [row positions]}`` where the positions are 0-based
        positional (``iloc``) indices in ascending order. Columns without
        outliers are omitted.
    """
    outlier_locations = {}
    for col, dtype in df.dtypes.to_dict().items():
        if dtype not in [np.float64, np.int64]:
            continue

        # credit to: https://stackoverflow.com/questions/23199796/detect-and-exclude-outliers-in-pandas-data-frame
        is_outlier = np.abs(stats.zscore(df[col])) > deviation

        # Read positions straight off the boolean mask instead of going
        # through index labels: label lookups do not return a single integer
        # when the index contains duplicate labels.
        positions = np.flatnonzero(np.asarray(is_outlier)).tolist()
        if positions:
            outlier_locations[col] = positions

    return outlier_locations

In [8]:
def outlier_dict_to_df(dictionary, original_df):
    """Materialise an outlier dictionary as a DataFrame of the offending rows.

    Parameters
    ----------
    dictionary : dict
        ``{column name: [row positions]}``; the positions are used with
        ``original_df.iloc``.
    original_df : DataFrame
        Frame the positions refer to.

    Returns
    -------
    DataFrame
        One row per (column, position) entry, holding the values of that row
        of ``original_df`` plus an ``outlier_column_name`` column naming the
        column the entry was listed under. The result has a fresh 0..n-1
        index. A row listed under several columns appears once per column.
    """
    out_cols = original_df.columns.tolist() + ["outlier_column_name"]

    # Collect plain dict records and build the frame once; growing a
    # DataFrame row by row is quadratic and DataFrame.append no longer
    # exists in pandas 2.x.
    records = []
    for col, indices in dictionary.items():
        for index in indices:
            row = original_df.iloc[index].to_dict()
            row['outlier_column_name'] = col
            records.append(row)

    return pd.DataFrame(records, columns = out_cols)

In [56]:
def corrs_selection(df, col, threshold = .5, greater_than = True):
    """Select the columns whose correlation with ``col`` passes a threshold.

    Parameters
    ----------
    df : DataFrame
        Only its numeric/bool columns take part in the correlation; other
        columns are ignored so that string columns do not make ``corr()``
        raise on pandas versions that no longer drop them silently.
    col : str
        Column to correlate against; must be one of the numeric columns.
    threshold : float, default .5
    greater_than : bool, default True
        Keep correlations strictly greater than ``threshold`` when True,
        strictly less than ``threshold`` when False.

    Returns
    -------
    DataFrame
        A single row (index ``0``) with one column per selected feature,
        holding its correlation with ``col``. With ``greater_than = True``
        and a threshold below 1 this normally includes ``col`` itself.
    """
    numeric = df.select_dtypes(include = ["number", "bool"])
    sale_corr = numeric.corr().to_dict()[col]

    if greater_than:
        sale_corr = {key: val for key, val in sale_corr.items() if val > threshold}
    else:
        sale_corr = {key: val for key, val in sale_corr.items() if val < threshold}

    return pd.DataFrame(sale_corr, index = [0])

In [9]:
# Normalise the column names of both frames in place.
for frame in (tr, tr_test):
    convert_to_snake_case(frame)

In [10]:
# Minimum number of non-null values a column needs in order to be kept (90% of the rows).
thresh = int(len(tr) * 0.9)

In [11]:
# Drop every column that has fewer than `thresh` non-null values.
tr.dropna(axis = "columns", thresh = thresh, inplace = True)

In [12]:
# Drop every remaining row that still contains at least one null.
tr.dropna(axis = "index", inplace = True)

In [13]:
# Preview the first rows after the column/row null filtering above.
tr.head()

Unnamed: 0,id,pid,ms_sub_class,ms_zoning,lot_area,street,lot_shape,land_contour,utilities,lot_config,...,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,misc_val,mo_sold,yr_sold,sale_type,sale_price
0,109,533352170,60,RL,13517,Pave,IR1,Lvl,AllPub,CulDSac,...,44,0,0,0,0,0,3,2010,WD,130500
1,544,531379050,60,RL,11492,Pave,IR1,Lvl,AllPub,CulDSac,...,74,0,0,0,0,0,4,2009,WD,220000
2,153,535304180,20,RL,7922,Pave,Reg,Lvl,AllPub,Inside,...,52,0,0,0,0,0,1,2010,WD,109000
3,318,916386060,60,RL,9802,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,0,4,2010,WD,174000
4,255,906425045,50,RL,14235,Pave,IR1,Lvl,AllPub,Inside,...,59,0,0,0,0,0,3,2010,WD,138500


In [14]:
# Apply yes_no_to_bin to every cell: yes/no-style values become 1/0, all other
# values pass through unchanged.
# NOTE(review): the match is on the stringified, lowercased cell, so single-letter
# codes such as 'N', 'Y', 'T' or 'F' in any column are converted too — confirm
# that no categorical column uses those letters with a different meaning.
tr = tr.applymap(yes_no_to_bin)

In [77]:
# Column pairs whose correlation is >= .5 or <= -.5, laid out as a table
# (outer dict keys become the table's columns, inner keys its rows).
c = get_corr_above_or_below(tr, percentage = .5)
all_good_corrs = pd.DataFrame(data = c)
all_good_corrs

Unnamed: 0,year_built,year_remod/add,bsmt_unf_sf,total_bsmt_sf,1st_flr_sf,gr_liv_area,bsmt_full_bath,full_bath,half_bath,bedroom_abv_gr,tot_rms_abv_grd,garage_yr_blt,garage_cars,garage_area,yr_sold,sale_price
overall_qual,0.59925,0.57162,,0.514039,,0.580137,,0.547817,,,,0.578604,0.585067,0.546118,,0.793771
year_built,,0.649266,,,,,,0.503852,,,,0.836035,0.532475,,,0.556128
bsmt_fin_sf_1,,,-0.522595,0.529482,,,0.6312,,,,,,,,,
total_bsmt_sf,,,,,0.894239,,,,,,,,,0.532053,,0.624425
1st_flr_sf,,,,,,0.551089,,,,,,,,0.521692,,0.625741
2nd_flr_sf,,,,,,0.640179,,,0.637801,0.510678,0.576359,,,,,
gr_liv_area,,,,,,,,0.618391,,,0.814297,,0.517337,0.513397,,0.707227
full_bath,,,,,,,,,,,0.513092,0.510417,0.528035,,,0.551247
bedroom_abv_gr,,,,,,,,,,,0.637672,,,,,
year_remod/add,,,,,,,,,,,,0.68497,,,,0.537782


In [78]:
# Features correlating with sale_price above the default threshold; the
# target's correlation with itself is removed.
sale_corrs = corrs_selection(tr, col = 'sale_price').drop(columns = ['sale_price'])
sale_corrs

Unnamed: 0,overall_qual,year_built,year_remod/add,mas_vnr_area,total_bsmt_sf,1st_flr_sf,gr_liv_area,full_bath,tot_rms_abv_grd,garage_yr_blt,garage_cars,garage_area
0,0.793771,0.556128,0.537782,0.504453,0.624425,0.625741,0.707227,0.551247,0.536018,0.536071,0.653573,0.646123


In [90]:
# Collect the (col, row) pairs of sale-price features that also have an entry
# in the pairwise correlation table, i.e. features correlated with each other.
colinears = []
sale_cols = list(sale_corrs.columns)
all_colls = list(all_good_corrs.columns)
all_rows = list(all_good_corrs[all_colls[0]].index)

for col in sale_cols:
    if col not in all_colls:
        continue
    for row in sale_cols:
        if row not in all_rows or col == row:
            continue
        # A NaN cell means the pair did not meet the table's threshold.
        if not math.isnan(all_good_corrs.loc[row, col]):
            colinears.append((col, row))

In [96]:
# Tally how many collinear pairs each feature takes part in.
colinear_counts = {}
for a, b in colinears:
    colinear_counts[a] = colinear_counts.get(a, 0) + 1
    colinear_counts[b] = colinear_counts.get(b, 0) + 1

# Split the features: more than 4 pairings counts as "high", the rest as "low".
high_colinear_counts = []
low_colinear_counts = []

for key, val in colinear_counts.items():
    (high_colinear_counts if val > 4 else low_colinear_counts).append(key)

In [97]:
# Baseline model: use every feature selected by its correlation with sale_price.
# NOTE(review): this cell originally read `good_corrs`, a name that is never
# defined in the notebook (NameError on Restart & Run All). `sale_corrs` — the
# sale_price correlation table with the target column dropped — appears to be
# the intended frame; confirm the score below still matches after re-running.
x_list = sale_corrs.columns.tolist()
X = tr[x_list]
y = tr['sale_price']
lr = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [98]:
# Fit on the training split and report R^2 on the held-out split.
y_pred = lr.fit(X_train, y_train).predict(X_test)
r2_score(y_true = y_test, y_pred = y_pred)

0.7417782682173144

In [112]:
# Same model, restricted to the features in the "high" collinearity group.
x_list = high_colinear_counts
X, y = tr[x_list], tr['sale_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
r2_score(y_true = y_test, y_pred = y_pred)

0.74517917014157

In [101]:
# Same model, restricted to the features in the "low" collinearity group.
x_list = low_colinear_counts
X, y = tr[x_list], tr['sale_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
r2_score(y_true = y_test, y_pred = y_pred)

0.5850487697198524

In [102]:
# Show which features landed in the "high" collinearity group (more than 4 pairings).
high_colinear_counts

['year_built',
 'overall_qual',
 'gr_liv_area',
 'full_bath',
 'garage_yr_blt',
 'garage_cars',
 'garage_area']