In [422]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import  SGDRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

In [423]:
# Mount Google Drive so that files under /content/drive can be read by later cells.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [424]:
# Read the training CSV from the mounted Drive folder and preview the first 20 rows.
TRAIN_CSV = '/content/drive/MyDrive/Housing Price prediction/train.csv'
df = pd.read_csv(TRAIN_CSV)
df.head(20)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [425]:
# Print the dtype of every column to see which are numeric (int64/float64) and which are object-typed.
print(df.dtypes)

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object


In [426]:
# Show the (rows, columns) size of the loaded training frame.
df.shape

(1460, 81)

In [427]:
# Separate the regression target from the input features.
TARGET_COLUMN = 'SalePrice'
Y = df[TARGET_COLUMN]

# 'Id' is dropped together with the target so it is not used as a feature.
X = df.drop(columns=['Id', TARGET_COLUMN])

In [428]:
# Partition the feature columns by dtype: int64/float64 columns are treated as
# numerical, object/category columns as categorical.
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object', 'category']

num_features = list(X.select_dtypes(include=numeric_dtypes).columns)
cat_features = list(X.select_dtypes(include=categorical_dtypes).columns)

In [429]:
# num_transformer - numerical branch: replace NaNs with the column mean,
# then standardize the values.
num_steps = [
    ('num_imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
num_transformer = Pipeline(steps=num_steps)

In [430]:
# cat_transformer - categorical branch: replace NaNs with the column mode
# (most frequent value), then one-hot encode.
cat_steps = [
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore': categories not seen during fitting (e.g. ones that
    # only occur in the validation data) are ignored instead of raising an error.
    ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore')),
]
cat_transformer = Pipeline(steps=cat_steps)

In [431]:
# Combined preprocessor: categorical columns go through cat_transformer
# (impute + one-hot), numerical columns through num_transformer (impute + scale).
column_routes = [
    ('cat_f', cat_transformer, cat_features),
    ('num_f', num_transformer, num_features),
]
main_preprocessor = ColumnTransformer(transformers=column_routes)

In [432]:
# Hold out 20% of the rows for validation; the remaining 80% are used for training.
# random_state pins the shuffle so the split is reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=33,
)

In [433]:
# Show the (rows, columns) size of the training partition produced by the split.
X_train.shape

(1168, 79)

In [434]:
# main training

# Fit the preprocessor on the training features only. The transformed matrix is
# stored under a new name so the raw X_train / Y_train objects are not
# overwritten and this cell can be re-run without errors.
X_train_prepared = main_preprocessor.fit_transform(X_train)

# The target is standardized too; the fitted scaler is reused later to transform
# the validation target and to map predictions back to the original scale.
scaler_train_test = StandardScaler()
Y_train_scaled = scaler_train_test.fit_transform(Y_train.values.reshape(-1, 1)).ravel()

# SGDRegressor with an L2 penalty and regularization strength alpha=5
model = SGDRegressor(penalty='l2', alpha=5, random_state=1)
model.fit(X_train_prepared, Y_train_scaled)

# Predict once and reuse the result for both metrics.
# Both metrics are computed on the standardized target scale.
Y_train_pred = model.predict(X_train_prepared)
mse_train = mean_squared_error(Y_train_scaled, Y_train_pred)
r2_train = r2_score(Y_train_scaled, Y_train_pred)

print(f'main training mse: {mse_train}')
print(f'main training R2: {r2_train}')


main training mse: 0.31772549986713433
main training R2: 0.6822745001328656


In [435]:
# validation

# Apply the already-fitted preprocessor and target scaler (transform only, no
# re-fitting). Results get new names so X_val / Y_val are not overwritten and
# this cell can be re-run without errors.
X_val_prepared = main_preprocessor.transform(X_val)
Y_val_scaled = scaler_train_test.transform(Y_val.values.reshape(-1, 1)).ravel()

# validation-set prediction
Y_val_pred = model.predict(X_val_prepared)

# MSE and R^2 on the standardized target scale
mse_val = mean_squared_error(Y_val_scaled, Y_val_pred)
r2_val = r2_score(Y_val_scaled, Y_val_pred)


In [436]:
 # mse
# Training vs. validation MSE, one value per line.
print(
    f'main training mse: {mse_train}',
    f'val mse: {mse_val}',
    sep='\n',
)


main training mse: 0.31772549986713433
val mse: 0.34774308041443797


In [437]:
# R^2 on the training split vs. the held-out validation split
print(f'main training R2: {r2_train}')
# r2_val is a single score computed on the validation split (not an average and
# not a test-set score), so it is labelled accordingly.
print(f'val R2: {r2_val}')

main training R2: 0.6822745001328656
Avarage test R2: 0.6442062294577551


In [438]:
#### TESTING
# Read the test CSV from the mounted Drive folder and report its (rows, columns) size.
TEST_CSV = '/content/drive/MyDrive/Housing Price prediction/test.csv'
X_test = pd.read_csv(TEST_CSV)
X_test.shape



(1459, 80)

In [439]:
# Preview the first 20 rows of the test features.
X_test.head(20)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
5,1466,60,RL,75.0,10000,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,Normal
6,1467,20,RL,,7980,Pave,,IR1,Lvl,AllPub,...,0,0,,GdPrv,Shed,500,3,2010,WD,Normal
7,1468,60,RL,63.0,8402,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,5,2010,WD,Normal
8,1469,20,RL,85.0,10176,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2010,WD,Normal
9,1470,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,4,2010,WD,Normal


In [440]:
# Remove the 'Id' column, which was also excluded from the training features.
# errors='ignore' keeps the cell re-runnable: if 'Id' has already been dropped,
# the call is a no-op instead of raising KeyError.
X_test = X_test.drop(columns=['Id'], errors='ignore')

In [441]:
# The preprocessor was built from the num_features / cat_features lists derived
# from the training data, so those lists are left untouched here rather than
# being re-derived from the test frame. Instead, check that the test frame
# provides every column the preprocessor was configured with.
missing_in_test = [col for col in cat_features + num_features if col not in X_test.columns]
assert not missing_in_test, f'test set is missing training features: {missing_in_test}'


In [442]:
# testing

# Transform the test features with the preprocessor fitted on the training data.
# The result gets its own name so X_test stays a DataFrame and this cell can be
# re-run without errors.
X_test_prepared = main_preprocessor.transform(X_test)

# test-set prediction (still on the standardized target scale)
Y_test_pred = model.predict(X_test_prepared)


In [443]:
# Map the standardized predictions back to the original target scale using the
# scaler that was fitted on the training target.
# NOTE: Y_test_pred is rebound to the rescaled values, so running this cell a
# second time would apply the inverse transform twice.
scaled_predictions = Y_test_pred.reshape(-1, 1)
Y_test_pred = scaler_train_test.inverse_transform(scaled_predictions)

Y_test_pred[:20]

array([[131111.6319779 ],
       [141065.48236353],
       [185827.72622761],
       [196134.74276822],
       [180142.09362589],
       [181141.85868014],
       [186237.73771416],
       [174087.99293826],
       [185400.09127454],
       [134607.16326965],
       [194704.32836228],
       [129476.31513628],
       [128192.69925698],
       [160787.37610302],
       [124568.25796278],
       [256101.89528055],
       [221825.08137294],
       [245404.60979149],
       [236143.70956228],
       [323932.66623564]])