<a href="https://colab.research.google.com/github/Bhar8at/Regression-Techniques/blob/main/quantrecruitmentround2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error , r2_score
import pandas as pd
import numpy as np
import plotly.figure_factory as ff


In [None]:
# Load the Boston Housing data and discard every row that has a missing value.
df = pd.read_csv('/Boston Housing.csv').dropna()

# Predicting the property-tax rate per \$10,000 from the given data set
### Predictor variables are
• crim: Per capita crime rate by town.

• zn: Proportion of large residential lots (over 25,000 sq. ft.).

• indus: Proportion of non-retail business acres per town.

• chas: Binary variable indicating whether the property is near the Charles River (1
for yes, 0 for no).

• nox : Concentration of nitrogen oxides in the air.

• rm: Average number of rooms per dwelling.

• age: Proportion of old owner-occupied units built before 1940.

• dis: Weighted distances to Boston employment centers.

• rad: Index of accessibility to radial highways.

In [None]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [None]:
# Range of the target variable, shown as (largest, smallest).
df.tax.max(), df.tax.min()

(711, 187)

In [None]:
# Splitting the data into training (80%) and testing (20%) sets.
# 'tax' is the target; every other data column is used as a predictor.
# random_state is fixed so that the split -- and therefore every metric
# reported below -- is reproducible across kernel restarts.
Z_train, Z_test, y_train, y_test = train_test_split(
    df[['crim', 'zn', 'indus',
        'chas', 'nox', 'rm',
        'age', 'dis', 'rad',
        'ptratio', 'b', 'lstat',
        'medv']],
    df['tax'],
    test_size=0.2,
    random_state=42)



# OLS Model

In [None]:
# OLS Model: fit ordinary least squares on the training split and
# predict the tax value for the held-out test split.
lm = LinearRegression()
lm.fit(Z_train, y_train)
Yhat = lm.predict(Z_test)

# Evaluating the model.
# Compare the predicted distribution with the actual values of the SAME
# held-out rows (y_test); plotting against the full df['tax'] column would
# mix training rows into the "Actual" curve.
hist_data = [Yhat, y_test]
group_labels = ['Predicted', 'Actual']
fig = ff.create_distplot(hist_data, group_labels, show_hist=False, curve_type='normal', bin_size=.2)
fig.show()

# Goodness-of-fit metrics on the test split.
r_squared = r2_score(y_test, Yhat)
mse = mean_squared_error(y_test, Yhat)
rmse = np.sqrt(mse)

print("\n")
print(f'R-squared: {r_squared:.4f}')
print(f'Mean Squared Error (MSE): {mse:.4f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')




R-squared: 0.8139
Mean Squared Error (MSE): 5183.9741
Root Mean Squared Error (RMSE): 71.9998


# Inferences
- The $R^2$ of about 0.81 is reasonably close to 1, so the model explains most of the variance in the property-tax values.
- However, since the tax values range from 187 to 711, an RMSE of about 72 is still a sizeable error.
---



# Ridge Model

In [None]:
# Ridge regression with a small, hand-picked penalty (alpha = 0.1).
RidgeModel = Ridge(alpha=0.1)
RidgeModel.fit(Z_train, y_train)
Yhat = RidgeModel.predict(Z_test)

# Compare the predicted distribution with the actual values of the SAME
# held-out rows (y_test) rather than the full df['tax'] column.
hist_data = [Yhat, y_test]
group_labels = ['Predicted', 'Actual']
fig = ff.create_distplot(hist_data, group_labels, show_hist=False, curve_type='normal', bin_size=.2)
fig.show()

# Goodness-of-fit metrics on the test split.
r_squared = r2_score(y_test, Yhat)
mse = mean_squared_error(y_test, Yhat)
rmse = np.sqrt(mse)

print(f'R-squared: {r_squared:.4f}')
print(f'Mean Squared Error (MSE): {mse:.4f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')


R-squared: 0.8139
Mean Squared Error (MSE): 5184.4329
Root Mean Squared Error (RMSE): 72.0030


# Inferences
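- The RMSE is essentially unchanged compared with the OLS model.
  - This is probably because there is no strong multicollinearity among the predictor variables, so the ridge penalty has little to correct.
  - It could also be because the chosen alpha is not ideal: with alpha = 0.1 the penalty is so small that Ridge behaves almost exactly like OLS.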



# Improving the Alpha Parameter Using Grid Search

In [None]:
# Ten evenly spaced candidate values between 8 and 9 (both ends included).
np.linspace(8, 9, num=10)

array([8.        , 8.11111111, 8.22222222, 8.33333333, 8.44444444,
       8.55555556, 8.66666667, 8.77777778, 8.88888889, 9.        ])

In [None]:
# Candidate penalties: 100 values spaced logarithmically from 1e-4 to 1e4.
param_grid = {'alpha': np.logspace(-4, 4, 100)}

# 5-fold cross-validated search over alpha on the training split only,
# scored by (negated) mean squared error.
ridge = Ridge()
grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid,
                           scoring='neg_mean_squared_error', cv=5)

grid_search.fit(Z_train, y_train)

best_alpha = grid_search.best_params_['alpha']
best_model = grid_search.best_estimator_

# Evaluate the selected model on the held-out test split.
Yhat = best_model.predict(Z_test)

r_squared = r2_score(y_test, Yhat)
mse = mean_squared_error(y_test, Yhat)
rmse = np.sqrt(mse)


print(f'Optimal alpha: {best_alpha:.4f}')
print(f'R-squared: {r_squared:.4f}')
print(f'Mean Squared Error (MSE): {mse:.4f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')

# Compare the predicted distribution with the actual values of the SAME
# held-out rows (y_test) rather than the full df['tax'] column.
hist_data = [Yhat, y_test]
group_labels = ['Predicted', 'Actual']
fig = ff.create_distplot(hist_data, group_labels, show_hist=False, curve_type='normal', bin_size=.2)
fig.show()

Optimal alpha: 8.4975
R-squared: 0.8145
Mean Squared Error (MSE): 5168.6896
Root Mean Squared Error (RMSE): 71.8936


# Results
- Using the grid-searched alpha slightly reduced the RMSE (71.89 vs. 72.00) and marginally improved the $R^2$ (0.8145 vs. 0.8139).

----

In [None]:
# Ridge regression with a manually chosen, stronger penalty (alpha = 40).
# NOTE: this alpha was picked by looking at test-set scores, so the metrics
# below are an optimistic estimate of out-of-sample performance.
RidgeModel = Ridge(alpha=40)
RidgeModel.fit(Z_train, y_train)
Yhat = RidgeModel.predict(Z_test)

# Compare the predicted distribution with the actual values of the SAME
# held-out rows (y_test) rather than the full df['tax'] column.
hist_data = [Yhat, y_test]
group_labels = ['Predicted', 'Actual']
fig = ff.create_distplot(hist_data, group_labels, show_hist=False, curve_type='normal', bin_size=.2)
fig.show()

# Goodness-of-fit metrics on the test split.
r_squared = r2_score(y_test, Yhat)
mse = mean_squared_error(y_test, Yhat)
rmse = np.sqrt(mse)

print(f'R-squared: {r_squared:.4f}')
print(f'Mean Squared Error (MSE): {mse:.4f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')

R-squared: 0.8155
Mean Squared Error (MSE): 5141.7276
Root Mean Squared Error (RMSE): 71.7058


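- However, on manually trying out different values, I found that alpha = 40 gives the best scores on the test set.
- This is probably because GridSearchCV selected alpha = 8.4975 as the best value under cross-validation on the training data, whereas alpha = 40 happens to suit this particular test split. Note that choosing alpha by its test-set score makes the reported test metrics optimistic.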



# Lasso Model

In [None]:
# Lasso Model: L1-regularised linear regression with scikit-learn's
# default penalty strength, fitted on the training split.
lm = Lasso()
lm.fit(Z_train, y_train)
Yhat = lm.predict(Z_test)

# Evaluating the model.
# Compare the predicted distribution with the actual values of the SAME
# held-out rows (y_test) rather than the full df['tax'] column.
hist_data = [Yhat, y_test]
group_labels = ['Predicted', 'Actual']
fig = ff.create_distplot(hist_data, group_labels, show_hist=False, curve_type='normal', bin_size=.2)
fig.show()

# Goodness-of-fit metrics on the test split.
r_squared = r2_score(y_test, Yhat)
mse = mean_squared_error(y_test, Yhat)
rmse = np.sqrt(mse)

print("\n")
print(f'R-squared: {r_squared:.4f}')
print(f'Mean Squared Error (MSE): {mse:.4f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')



R-squared: 0.8157
Mean Squared Error (MSE): 5136.0554
Root Mean Squared Error (RMSE): 71.6663


- A possible reason why Lasso slightly outperforms Ridge here is that its L1 penalty guards against overfitting more aggressively (it can shrink some coefficients to exactly zero), giving a more generalized model that performs better on the testing data. The difference is small, however (RMSE 71.67 vs. 71.71).