# 1 Preprocessing

In [265]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [266]:
df_train = pd.read_csv("./data/train.csv.gz")
df_test = pd.read_csv("./data/test.csv.gz")

In [286]:
CORR_THRESHOLD = 0.4 # vars should have at least this correlation to be considered as predictor
var_corr = df_train.corr()["SalePrice"].abs().sort_values(ascending=False) # get correlation of each var with dependent var (SalePrice) 
predictor_names = var_corr[(var_corr>CORR_THRESHOLD) & (var_corr<1)].index # get predictor names above threshold

In [287]:
train_data = df_train[list(predictor_names) + ["SalePrice"]]
test_data  = df_test[list(predictor_names)]

In [288]:
for dataset in [train_data, test_data]: 
    dataset["hasGarage"] = (dataset["GrLivArea"]>0).astype('int') # create boolean variables indicating if garage/masonry exists
    dataset["hasMasonry"] = (dataset["MasVnrArea"]>0).astype('int')
    dataset.fillna(0,inplace=True) # fill NaN (garageArea is NaN when there is no garage)

In [289]:
X = train_data.drop("SalePrice",axis=1)
y = train_data['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

reg = LinearRegression()
reg.fit(X_train,y_train)
reg.score(X_test,y_test)

0.8012434687880913

In [291]:
df_submission = df_test[["Id"]]
df_submission["SalePrice"] = pred = reg.predict(test_data)
df_submission.to_csv("./submissions/submission_test.csv.gz",index=False)