# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.metrics import r2_score, mean_squared_error

# Load Data and Data Cleaning

In [None]:
# Load the cleaned dataset. The raw-string prefix keeps the Windows-style backslashes
# literal, avoiding the invalid "\d" / "\c" escape-sequence warnings of a plain string.
# 'Unnamed: 0' is dropped — presumably a leftover index column from an earlier to_csv; confirm.
df = pd.read_csv(r'..\datasets\cleaned_df_2.csv').drop(columns='Unnamed: 0')

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

The data has already been cleaned during the EDA in 03_EDA_Part2.

# Model Prep: Creating features matrix and target vector

## Preprocessing

Our models will include the following features

`area_cost`

`flat_type_1 ROOM`

`flat_type_1 ROOM` * (`Tranc_YearMonth_ord`)

`flat_type_2 ROOM`

`flat_type_2 ROOM` * (`flat_model_DBSS`)

`flat_type_3 ROOM` (Which we will drop to avoid collinearity)

`flat_type_3 ROOM` * (`flat_model_DBSS`, `flat_model_Premium Apartment`, `flat_model_Terrace`)

`flat_type_4 ROOM`

`flat_type_4 ROOM` * (`Hawker_Within_2km`, `flat_model_Model A2`, `flat_model_Simplified`, `flat_model_Adjoined flat`, `flat_model_DBSS`, `flat_model_Terrace`,`flat_model_Premium Apartment Loft`, `flat_model_Type S1`)

`flat_type_5 ROOM`

`flat_type_5 ROOM` * (`Hawker_Within_2km`, `flat_model_Type S2`, `flat_model_Premium Apartment Loft`, `flat_model_DBSS`, `flat_model_Improved-Maisonette`, `flat_model_Standard`, `flat_model_Adjoined flat`)

`flat_type_EXECUTIVE`

`flat_type_EXECUTIVE` * (`Hawker_Within_2km`, `flat_model_Premium Maisonette`, `flat_model_Adjoined flat`)

`flat_type_MULTI-GENERATION`

Flat types will be relative to 3 ROOM flats that are not DBSS, Premium or Terrace houses.
Planning Area will be relative to affordable areas.

In [None]:
# Start the feature matrix from the one-hot encoded `area_cost` and `flat_type` columns.
X = pd.get_dummies(
    df[['area_cost', 'flat_type']],
    columns=['area_cost', 'flat_type'],
)

In [None]:
# Full frame with `flat_type` and `flat_model` one-hot encoded; used as the lookup
# table for the interaction products built further down.
flat_dummies = pd.get_dummies(df, columns=['flat_type', 'flat_model'])

In [None]:
# (flat-type dummy, partner column) pairs whose products become interaction features.
# Grouped by flat type for readability; the flattened order matches the feature list above.
interaction_terms_list = [
    (flat_type, partner)
    for flat_type, partners in {
        'flat_type_1 ROOM': [
            'Tranc_YearMonth_ord',
        ],
        'flat_type_2 ROOM': [
            'flat_model_DBSS',
        ],
        'flat_type_3 ROOM': [
            'flat_model_DBSS',
            'flat_model_Premium Apartment',
            'flat_model_Terrace',
        ],
        'flat_type_4 ROOM': [
            'Hawker_Within_2km',
            'flat_model_Model A2',
            'flat_model_Simplified',
            'flat_model_Adjoined flat',
            'flat_model_DBSS',
            'flat_model_Terrace',
            'flat_model_Premium Apartment Loft',
            'flat_model_Type S1',
        ],
        'flat_type_5 ROOM': [
            'Hawker_Within_2km',
            'flat_model_Type S2',
            'flat_model_Premium Apartment Loft',
            'flat_model_DBSS',
            'flat_model_Improved-Maisonette',
            'flat_model_Standard',
            'flat_model_Adjoined flat',
        ],
        'flat_type_EXECUTIVE': [
            'Hawker_Within_2km',
            'flat_model_Premium Maisonette',
            'flat_model_Adjoined flat',
        ],
    }.items()
    for partner in partners
]

In [None]:
# Append one column per pair: the element-wise product of the two source columns,
# named "<left>*<right>".
for left, right in interaction_terms_list:
    X[f"{left}*{right}"] = flat_dummies[left] * flat_dummies[right]

In [None]:
#dropping the 3 ROOM
# Drop the baseline dummy levels (3 ROOM flats, affordable areas).
# NOTE(review): 'flat_type_1 ROOM' is removed here too, although the feature list
# in the preprocessing notes keeps it as a main effect — confirm this is intended.
X = X.drop(['flat_type_1 ROOM', 'flat_type_3 ROOM', 'area_cost_affordable'], axis=1)

In [None]:
# Display the assembled feature matrix (dummies + interaction terms, baselines removed).
X

In [None]:
# Target vector: resale price multiplied by 100,000.
# NOTE(review): presumably `resale_price` was stored in units of 100k upstream — confirm.
y = df['resale_price'].mul(100_000)

# Model Prep: Train/Test split

Since our training set is large, we will use 0.8 of the data as our training size.

In [None]:
# 80/20 split with a fixed seed so every model is evaluated on the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

We will use the same random_state to make both models comparable.

We will also standardize the features for lasso and ridge; the scaler is fit on the training split only and then applied to both splits.

In [None]:
# Learn the scaling statistics from the training split only, then apply the same
# transform to both splits. After this cell X_train / X_test are NumPy arrays,
# so column names must be taken from `X.columns`.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Instantiate models

In [None]:
# Plain (unregularised) linear regression, used as the comparison point for lasso and ridge.
lr = LinearRegression()

In [None]:
# Search 500 candidate alphas by cross-validation; `fit` returns the estimator itself,
# so the fitted model can be bound in a single statement.
optimal_lasso = LassoCV(n_alphas=500, max_iter=2000).fit(X_train, y_train)

In [None]:
# Search 100 evenly spaced alphas in [0.1, 10] by cross-validation.
optimal_ridge = RidgeCV(alphas=np.linspace(0.1, 10, num=100)).fit(X_train, y_train)

In [None]:
# Baseline "prediction": the mean of y repeated once per observation in y.
null_model = np.full(len(y), y.mean())

Our null model will be the mean resale price.

# X Cross Validation

We'll use RMSE (i.e. closer to 0, better fit)

In [None]:
# Null model
np.sqrt(mean_squared_error(y,null_model))

In [None]:
# scikit-learn reports the negated RMSE; flip the sign to get positive errors.
lr_scores_rmse = -cross_val_score(
    estimator=lr,
    X=X_train,
    y=y_train,
    scoring='neg_root_mean_squared_error',
)
lr_scores_rmse.mean()

In [None]:
# Cross-validated RMSE of the LassoCV estimator (alpha is re-tuned inside each fold).
lasso_scores_rmse = -cross_val_score(
    estimator=optimal_lasso,
    X=X_train,
    y=y_train,
    scoring='neg_root_mean_squared_error',
)
lasso_scores_rmse.mean()

In [None]:
# Cross-validated RMSE of the RidgeCV estimator (alpha is re-tuned inside each fold).
ridge_scores_rmse = -cross_val_score(
    estimator=optimal_ridge,
    X=X_train,
    y=y_train,
    scoring='neg_root_mean_squared_error',
)
ridge_scores_rmse.mean()

All our regression models for X features turned out really well relative to the null model.

We will now check whether the models overfit by comparing their train and test errors.

In [None]:
# Refit a plain Lasso at the alpha selected by LassoCV.
# NOTE(review): LassoCV above ran with max_iter=2000 while this refit uses the
# Lasso default — confirm it converges.
lasso = Lasso(alpha=optimal_lasso.alpha_)
lasso.fit(X=X_train, y=y_train)

In [None]:
# Refit a plain Ridge at the alpha selected by RidgeCV.
ridge = Ridge(alpha=optimal_ridge.alpha_)
ridge.fit(X=X_train, y=y_train)

In [None]:
# Cross-validated RMSE of the tuned lasso on each split.
# Fixes: this cell previously scored `ridge` and printed `ridge_*` variables that are
# only defined in the following cell (NameError on a fresh Restart & Run All).
# NOTE(review): cross_val_score refits a clone on folds of the given data, so the
# "Test RMSE" is a CV score within the test split, not the hold-out error of the
# model fitted on the training split.
lasso_train_rmse = -cross_val_score(lasso, X_train, y_train, scoring='neg_root_mean_squared_error')
lasso_test_rmse = -cross_val_score(lasso, X_test, y_test, scoring='neg_root_mean_squared_error')

print(f'Train RMSE = {lasso_train_rmse.mean()}, Test RMSE = {lasso_test_rmse.mean()}')

In [None]:
# Cross-validated RMSE of the tuned ridge, computed separately within each split.
ridge_train_rmse = -cross_val_score(
    estimator=ridge, X=X_train, y=y_train, scoring='neg_root_mean_squared_error'
)
ridge_test_rmse = -cross_val_score(
    estimator=ridge, X=X_test, y=y_test, scoring='neg_root_mean_squared_error'
)

print(f'Train RMSE = {ridge_train_rmse.mean()}, Test RMSE = {ridge_test_rmse.mean()}')

Let's check out how linear regression compares to lasso and ridge.

In [None]:
# Cross-validated RMSE of plain linear regression, computed separately within each split.
lr_train_rmse = -cross_val_score(
    estimator=lr, X=X_train, y=y_train, scoring='neg_root_mean_squared_error'
)
lr_test_rmse = -cross_val_score(
    estimator=lr, X=X_test, y=y_test, scoring='neg_root_mean_squared_error'
)

print(f'Train RMSE = {lr_train_rmse.mean()}, Test RMSE = {lr_test_rmse.mean()}')

# X coef distribution

In [None]:
# Fit the linear regression on the scaled training split so its coefficients can be inspected.
lr.fit(X=X_train, y=y_train)

In [None]:
# Intercept of the fitted linear regression.
lr.intercept_

In [None]:
# Linear-regression coefficients labelled with the feature names
# (X_train is a bare array after scaling, so labels come from `X.columns`).
pd.Series(data=lr.coef_, index=X.columns)

In [None]:
# Intercept of the fitted ridge model.
ridge.intercept_

In [None]:
# Ridge coefficients labelled with the feature names. This cell follows
# `ridge.intercept_` but previously repeated the linear-regression coefficients.
pd.Series(ridge.coef_, index=X.columns)

In [None]:
# Bar chart of the ten lowest (most negative) ridge coefficients.
(
    pd.Series(ridge.coef_, index=X.columns)
    .sort_values()
    .head(10)
    .plot.barh(figsize=(15, 7))
)
plt.show()

In [None]:
# Bar chart of the ten highest ridge coefficients.
(
    pd.Series(ridge.coef_, index=X.columns)
    .sort_values(ascending=False)
    .head(10)
    .plot.barh(figsize=(15, 7))
)
plt.show()

In [None]:
# Bar chart of the ten highest lasso coefficients.
(
    pd.Series(lasso.coef_, index=X.columns)
    .sort_values(ascending=False)
    .head(10)
    .plot.barh(figsize=(15, 7))
)
plt.show()

In [None]:
# Bar chart of the ten lowest (most negative) lasso coefficients.
(
    pd.Series(lasso.coef_, index=X.columns)
    .sort_values()
    .head(10)
    .plot.barh(figsize=(15, 7))
)
plt.show()

In [None]:
# Hold-out predictions from the ridge model fitted on the training split.
pred = ridge.predict(X=X_test)

In [None]:
# Residuals: actual minus predicted price on the hold-out split.
residual = y_test.sub(pred)

In [None]:
# Distribution of the hold-out residuals.
plt.hist(x=residual, bins=100)
plt.show()

In [None]:
# Residuals against predictions, with a zero reference line.
plt.scatter(pred, residual)
plt.axhline(y=0, color='orange')
plt.show()

# Second Kaggle Submission

In [None]:
# Load the Kaggle test set (no target column is read from it in this notebook).
df_validation = pd.read_csv(filepath_or_buffer=r'..\datasets\test.csv')

In [None]:
# Look up the name of the validation column at position 41 (0-based).
# NOTE(review): presumably the `postal` column referred to in the note below — confirm.
df_validation.columns[41]

Since we're not using postal, we can skip this step.

In [None]:
# Submission model: refit on the full training dataset (no hold-out split).
# This rebinds X_train / y_train from the modelling section above; y_train is the
# unscaled `resale_price` here.
X_train = df.loc[:, ['full_flat_type', 'planning_area', 'block_type']]
X_validation = df_validation.loc[:, ['full_flat_type', 'planning_area', 'block_type']]

y_train = df.loc[:, 'resale_price']

In [None]:
# One-hot encode the categorical features for the submission model.
#
# The training frame defines the design matrix (baseline level of each feature
# dropped). The validation frame is encoded WITHOUT drop_first and then reindexed
# to the training columns: encoding it independently with drop_first=True could
# drop a different baseline level than the training frame did and produce a
# different column set/order. After the reindex:
#   * levels absent from the validation data become all-zero columns,
#   * levels unseen in training (and the training baselines) are discarded,
#   * column order matches X_train exactly.
categorical_features = ['full_flat_type', 'planning_area', 'block_type']

X_train = pd.get_dummies(X_train, columns=categorical_features, drop_first=True)

X_validation = (
    pd.get_dummies(X_validation, columns=categorical_features)
    .reindex(columns=X_train.columns, fill_value=0)
)

In [None]:
# (rows, columns) of the encoded training matrix.
X_train.shape

In [None]:
# (rows, columns) of the encoded validation matrix, for comparison with X_train.
X_validation.shape

In [None]:
# Training columns that have no counterpart in the validation matrix.
missing_columns = list(X_train.columns.difference(X_validation.columns))

We will add those columns to `X_validation`, filled with zeros.

In [None]:
# Pair each missing column with its position in the training matrix: (position, name).
missing_features = [
    (X_train.columns.get_loc(name), name)
    for name in missing_columns
]

In [None]:
# Show the (position, name) pairs of the columns missing from the validation matrix.
missing_features

In [None]:
# Align the validation matrix with the training design matrix in one step:
# columns missing from validation are added as all-zero columns, columns that exist
# only in validation are dropped, and the column order is made identical to X_train.
# Unlike inserting the missing columns one by one at their training positions, this
# does not depend on insertion order and is safe to re-run.
X_validation = X_validation.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Re-check the validation shape; the column count should now equal X_train's.
X_validation.shape

In [None]:
# Fresh linear regression for the submission model (rebinds `lr` from the section above).
lr = LinearRegression()

In [None]:
# Fit on the full, one-hot encoded training data.
lr.fit(X=X_train, y=y_train)

In [None]:
# Inspect the raw predictions (same units as `resale_price` in df).
lr.predict(X=X_validation)

In [None]:
# Id / prediction table in the model's own target units (same units as `resale_price` in df).
# Built from a dict of columns: passing a list of two length-N sequences makes pandas
# treat them as two ROWS, which conflicts with the two column names and raises ValueError.
final_predicts = pd.DataFrame({
    'Id': df_validation['id'],
    'Predicted': lr.predict(X_validation),
})

In [None]:
# Submission table: validation ids plus predictions multiplied by 100,000.
final_predicts = (
    pd.DataFrame({'Id': df_validation['id']})
    .assign(Predicted=lr.predict(X_validation) * 100_000)
)

In [None]:
# Display the `id` column of the validation set.
df_validation['id']

In [None]:
# Write the submission file. index=False keeps the pandas row index out of the CSV,
# so the file contains only the 'Id' and 'Predicted' columns (no extra unnamed column).
final_predicts.to_csv(r'..\datasets\predictions.csv', index=False)