In [47]:
# Import all the necessary libraries
import numpy as np 
import pandas as pd 
import datetime as d

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import matplotlib.pyplot as plt
import seaborn as sns

# import warnings
# warnings.filterwarnings("ignore")

In [48]:
# Load train.csv; the path is relative to the notebook's working directory.
df_original = pd.read_csv("train.csv")
# Show the first five rows as a quick check that the file was parsed.
df_original.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [49]:
# The floor-area columns start with a digit; give them names that are easier
# to reference.
floor_area_aliases = {"1stFlrSF": "lvl1sf", "2ndFlrSF": "lvl2sf"}
df_original = df_original.rename(columns=floor_area_aliases)

# Predictor columns used by the model; SalePrice is the regression target.
feature_columns = [
    "LotArea",
    "YearBuilt",
    "lvl1sf",
    "lvl2sf",
    "FullBath",
    "BedroomAbvGr",
    "TotRmsAbvGrd",
    "HouseStyle",
]
df_features = df_original[feature_columns]
df_output = df_original["SalePrice"]

In [50]:
# Preview the first five rows of the selected predictors.
df_features.head(n=5)

Unnamed: 0,LotArea,YearBuilt,lvl1sf,lvl2sf,FullBath,BedroomAbvGr,TotRmsAbvGrd,HouseStyle
0,8450,2003,856,854,2,3,8,2Story
1,9600,1976,1262,0,2,3,6,1Story
2,11250,2001,920,866,2,3,6,2Story
3,9550,1915,961,756,1,3,7,2Story
4,14260,2000,1145,1053,2,4,9,2Story


In [51]:
# Replace YearBuilt with FlatAge, the building's age in years.
# NOTE: the age is measured against the year in which this cell is executed,
# so the raw FlatAge values shift by a constant when the notebook is re-run in
# a later year.
current_year = d.datetime.now().year

# .assign() returns a new frame rather than writing a column into df_features
# in place. df_features was produced by selecting columns from df_original,
# and in-place column assignment on such a selection is the pattern that
# raises pandas' SettingWithCopyWarning.
df_features = (
    df_features
    .assign(FlatAge=lambda frame: current_year - frame["YearBuilt"])
    .drop(columns=["YearBuilt"])
)

In [52]:
# Columns routed to the numeric branch of the preprocessing step.
numerical_features = [
    "LotArea",
    "FlatAge",
    "lvl1sf",
    "lvl2sf",
    "FullBath",
    "BedroomAbvGr",
    "TotRmsAbvGrd",
]
# Columns routed to the categorical (one-hot) branch of the preprocessing step.
categorical_features = [
    "HouseStyle",
]

In [53]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones, dropping the first level of each categorical feature.
numeric_scaler = StandardScaler()
category_encoder = OneHotEncoder(drop="first")
preprocess = make_column_transformer(
    (numeric_scaler, numerical_features),
    (category_encoder, categorical_features),
)

# Full model: preprocessing followed by an ordinary linear regression.
steps = [
    ("pre_process", preprocess),
    ("Linear_reg", LinearRegression()),
]
model = Pipeline(steps)

In [54]:
# Hold-out evaluation (no cross-validation): 70% of the rows are used for
# training and 30% for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    df_features, df_output, test_size=0.3, random_state=42
)

# Fit the preprocessing + regression pipeline on the training split only.
model.fit(X_train, y_train)
# Predictions for the held-out rows.
y_pred = model.predict(X_test)
# Score once and reuse the value. For a pipeline ending in LinearRegression,
# .score() is the coefficient of determination (R^2), not a classification
# accuracy, even though the message below calls it "accuracy".
test_score = model.score(X_test, y_test)
print("The accuracy of the model is: {0:0.3f} ".format(test_score))

The accuracy of the model is: 0.739 


In [71]:
# 5-fold cross-validated evaluation of the pipeline on the full data set.
# Each fold is scored with R^2; cv_result holds the mean over the folds.
fold_scores = cross_val_score(model, df_features, df_output, scoring="r2", cv=5)
cv_result = fold_scores.mean()
print("The CV accuracy of the model is: {0:0.3f} ".format(cv_result))

The CV accuracy of the model is: 0.693 
