# Multiple Linear Regression

In this exercise, we will use multiple linear regression to predict median house values in Californian districts, given a number of features from these districts. We then train a Random Forest regressor on the same data and compare the two models.

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Loading the dataset

In [None]:
# Read the housing data from a CSV file in the current working directory.
dataset = pd.read_csv('housing.csv')

## Inspect the Data Structure

In [None]:
# Preview the first rows to see which columns are available.
dataset.head()

In [None]:
# Print a per-column summary; useful for spotting columns with missing values.
dataset.info()

In [None]:
# Count how many rows fall into each ocean_proximity category.
dataset["ocean_proximity"].value_counts()

In [None]:
# Summary statistics for the dataset's columns.
dataset.describe()

In [None]:
import matplotlib.pyplot as plt
# Plot 50-bin histograms of the dataset's columns on one large figure.
dataset.hist(bins=50, figsize=(20,15))
plt.show()

## Data cleaning

In [None]:
dataset.info()

# Take a sample of the rows that contain at least one missing value.
incomplete_rows = dataset[dataset.isnull().any(axis=1)].head()

# A bare expression in the middle of a cell is not rendered, so print it.
print(incomplete_rows)

# In this case we can do one of the following:

#dataset.dropna(subset=["total_bedrooms"])    # option 1 remove rows with Null or NAN
#dataset.drop("total_bedrooms", axis=1)       # option 2 remove full column

median = dataset["total_bedrooms"].median()  # option 3: fill gaps with the column median

# Assign the filled column back rather than calling fillna(..., inplace=True)
# on the column selection: that chained in-place form is deprecated in recent
# pandas and leaves `dataset` unchanged when Copy-on-Write is enabled.
dataset["total_bedrooms"] = dataset["total_bedrooms"].fillna(median)

# Print the summary again to confirm the effect of the fill.
dataset.info()


## Handling Text and Categorical Attributes

In [None]:
# Double brackets select a one-column DataFrame (not a Series); this frame is
# what gets handed to the encoder further down.
ocean_prox_cat = dataset[["ocean_proximity"]]
ocean_prox_cat.head(10)

In [None]:
# List the distinct category labels found in the column.
ocean_prox_cat["ocean_proximity"].unique()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Demonstrate one-hot encoding on the categorical column alone:
# learn the categories first, then encode the same frame.
cat_encoder = OneHotEncoder()
ocean_prox_cat_1hot = cat_encoder.fit(ocean_prox_cat).transform(ocean_prox_cat)
ocean_prox_cat_1hot

In [None]:
# Categories the encoder learned during fitting.
# NOTE(review): presumably listed in the same order as the one-hot columns — confirm against the scikit-learn docs.
cat_encoder.categories_

## Select dependent and independent variables

In [None]:
# Columns used as model inputs; 'ocean_proximity' is the only one that is
# one-hot encoded later in the notebook.
feature_cols = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'ocean_proximity',
]

# Independent variables (features) and dependent variable (target).
X = dataset[feature_cols]
y = dataset['median_house_value']

## Apply one hot encoder to data

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column and pass every other column through.
cat_attribs = ["ocean_proximity"]
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), cat_attribs)],
    remainder='passthrough',
    # Always produce a dense result. Without this the transformer may return a
    # SciPy sparse matrix depending on the data's density, and np.array() on a
    # sparse matrix does not yield a usable 2-D numeric array.
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Multiple Linear Regression model on the Training set

---



In [None]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary linear regression on the training split only.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

## Inspecting the intercept (constant) and the coefficients (slopes)

In [None]:
# Pull the fitted parameters off the model first, then report them.
intercept = regressor.intercept_
coefficient = regressor.coef_

print("Intercept: {}".format(intercept))
print("coefficient: {}".format(coefficient))

## Predicting the Test set results

In [None]:
# Predict the target for the held-out test features and show the raw predictions.
y_pred = regressor.predict(X_test)
print(y_pred)

## Evaluating the model with R2, MAE, MAPE, MSE and RMSE

In [None]:
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score,
)

# Score the current predictions against the held-out targets.
test_r2 = r2_score(y_test, y_pred)
test_mae = mean_absolute_error(y_test, y_pred)
test_mape = mean_absolute_percentage_error(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)  # RMSE is the square root of the MSE

# One metric per line, in the order R2, MAE, MAPE, MSE, RMSE.
print(
    "R2: {}".format(test_r2),
    "MAE: {}".format(test_mae),
    "MAPE: {}".format(test_mape),
    "MSE: {}".format(test_mse),
    "RMSE: {}".format(test_rmse),
    sep="\n",
)

## Visualize predicted vs. actual values

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Compare only the first 50 test samples so individual points stay readable.
y_pred_s = y_pred[:50]
y_test_s = y_test[:50]
# Integer sample positions 0..n-1. linspace(0, n, n) produced fractional
# positions ending at n, which do not correspond to sample indices.
x = np.arange(len(y_pred_s))

plt.scatter(x, y_pred_s, label='Predicted')
plt.scatter(x, y_test_s, label='Actual')
plt.xlabel("Test sample index")
plt.ylabel("Median House Value")
plt.legend()
plt.title("Comparison between predicted and actual data LR model")
plt.show()

## Training the Random Forest Regression model on the Training set

In [None]:

from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest on the same training split (fixed seed).
# NOTE: this rebinds `regressor`, so the linear model fitted earlier is no
# longer reachable under that name; later cells use the random forest.
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train, y_train)

## Predicting the Test set results with the Random Forest Regression model

In [None]:
# Predict with the model currently bound to `regressor`; this overwrites the
# earlier `y_pred` values.
y_pred = regressor.predict(X_test)
print(y_pred)

## Evaluating the Random Forest Regression model with R2, MAE, MAPE, MSE and RMSE

In [None]:
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score,
)

# Same metrics as before, computed on the current `y_pred`.
test_r2 = r2_score(y_test, y_pred)
test_mae = mean_absolute_error(y_test, y_pred)
test_mape = mean_absolute_percentage_error(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)  # RMSE is the square root of the MSE

# Emit the report as one newline-joined string: one metric per line.
print("\n".join([
    "R2: {}".format(test_r2),
    "MAE: {}".format(test_mae),
    "MAPE: {}".format(test_mape),
    "MSE: {}".format(test_mse),
    "RMSE: {}".format(test_rmse),
]))

## Visualize predicted vs. actual values for the Random Forest Regression model

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Compare only the first 50 test samples so individual points stay readable.
y_pred_s = y_pred[:50]
y_test_s = y_test[:50]
# Integer sample positions 0..n-1. linspace(0, n, n) produced fractional
# positions ending at n, which do not correspond to sample indices.
x = np.arange(len(y_pred_s))

plt.scatter(x, y_pred_s, label='Predicted')
plt.scatter(x, y_test_s, label='Actual')
plt.xlabel("Test sample index")
plt.ylabel("Median House Value")
plt.legend()
plt.title("Comparison between predicted and actual data with RF model")
plt.show()

## Test with new data points

In [None]:
# A single made-up district, using the same column names as the training features.
house_data = {
    'longitude': [-122],
    'latitude': [38],
    'housing_median_age': [40],
    'total_rooms': [880],
    'total_bedrooms': [129],
    'population': [322],
    'households': [126],
    'median_income': [8.3],
    'ocean_proximity': ['NEAR BAY'],
}

house_df = pd.DataFrame(house_data)
print(house_df)

# Reuse the already fitted transformer (transform, not fit_transform) so the
# new row is encoded exactly like the training data.
house_df_transformed = np.array(ct.transform(house_df))
print(house_df_transformed)

# `regressor` is whichever model was fitted last in the notebook.
house_pred = regressor.predict(house_df_transformed)
print(house_pred)