## Understanding the Housing Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the California housing dataset bundled with scikit-learn.
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

In [None]:
# List the fields available on the loaded dataset object.
housing.keys()

In [None]:
# Print the description text that ships with the dataset.
print(housing.DESCR)

In [None]:
# Raw feature array (loaded into a DataFrame further down).
housing.data

In [None]:
# Target values (stored as the Price column further down).
housing.target

In [None]:
# Names of the feature columns.
housing.feature_names

In [None]:
# Name(s) of the target variable.
housing.target_names

## Preparing the Dataset

In [11]:
# Load the raw feature array into a DataFrame (columns get default integer labels).
dataset = pd.DataFrame(housing.data)

In [12]:
# Take the column labels from the dataset itself instead of retyping them,
# so they cannot drift from the column order of housing.data.
dataset.columns = housing.feature_names

In [13]:
# Work on a copy so the feature-only frame `dataset` stays unchanged.
df = dataset.copy()

In [None]:
# Preview the first rows of the feature columns.
df.head()

In [15]:
# Append the target values as a new Price column.
df['Price'] = housing.target

In [None]:
# Preview again, now including the Price column.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for each column.
df.describe()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

## Exploratory Data Analysis

In [None]:
# Pairwise correlation between all columns, including Price.
df.corr()

In [22]:
# DO NOT RUN (very heavy)

# import seaborn as sns
# sns.pairplot(dataset)

## Analyzing The Correlated Features

In [None]:
# Positive Correlation
# Scatter of Price against MedInc with a fitted regression line.

import seaborn as sns
sns.regplot(x="MedInc", y="Price", data=df)

In [None]:
# Negative Correlation
# Scatter of Price against Population with a fitted regression line.
# NOTE(review): confirm the strength of this relationship against df.corr().

import seaborn as sns
sns.regplot(x="Population", y="Price", data=df)

In [None]:
# No Correlation
# Scatter of Price against Latitude with a fitted regression line.
# NOTE(review): confirm the "no correlation" label against df.corr().

import seaborn as sns
sns.regplot(x="Latitude", y="Price", data=df)

## Splitting Features into Independent and Dependent Categories

In [28]:
# x - independent features (every column except the target)
# y - dependent target (the Price column)
# Selecting by name keeps the split correct even if the column order changes.

x = df.drop(columns="Price")
y = df["Price"]

In [None]:
# Preview the feature frame.
x.head()

In [None]:
# Preview the target series.
y.head()

In [31]:
# Train/test split: hold out 30% of the rows for testing.
# The fixed random_state makes the split reproducible.

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=49,
)

In [None]:
# Preview the training features.
x_train.head()

In [None]:
# Preview the test features.
x_test.head()

In [None]:
# Preview the training targets.
y_train.head()

In [None]:
# Preview the test targets.
y_test.head()

## Standardizing the Dataset

In [37]:
# Create the (not yet fitted) scaler used to standardize the features.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [38]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so the test data does not influence the fit.
# NOTE: x_train / x_test are rebound to the scaled arrays, so re-running this
# cell would refit the scaler on already-scaled data.
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [39]:
import pickle

# Persist the fitted scaler so the same transformation can be reapplied later.
# The context manager closes the file even if pickling fails.
with open('scaling.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Inspect the scaled training features.
x_train

In [None]:
# Inspect the scaled test features.
x_test

## Model Training

In [43]:
# Create an (unfitted) ordinary least squares linear regression model.
from sklearn.linear_model import LinearRegression
regression = LinearRegression()

In [None]:
# Fit the model on the scaled training features and training targets.
regression.fit(x_train, y_train)

In [None]:
# Learned coefficients, one per (scaled) feature.
regression.coef_

In [None]:
# Learned intercept term.
regression.intercept_

In [None]:
# Settings the estimator was constructed with (not the learned coefficients).
regression.get_params()

In [48]:
# Predict the target for the scaled test features.
reg_pred = regression.predict(x_test)

In [None]:
# Inspect the predicted values.
reg_pred

## Assumptions

In [None]:
# Scatter plot of the actual test targets (x-axis) against the predictions (y-axis).
plt.scatter(y_test, reg_pred)

In [52]:
# Residuals (errors): actual target minus predicted value.
residuals = y_test - reg_pred

In [None]:
# Inspect the residuals.
residuals

In [None]:
# Plot the distribution of the residuals as a kernel density estimate.
# Relies on seaborn having been imported as `sns` in an earlier cell.
sns.displot(residuals, kind="kde")

In [None]:
# Scatter plot of the predictions (x-axis) against the residuals (y-axis).
plt.scatter(reg_pred, residuals)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Report the test-set errors: mean absolute error first, then mean squared error.
for error_metric in (mean_absolute_error, mean_squared_error):
    print(error_metric(y_test, reg_pred))

## R Square and Adjusted R Square

In [None]:
# R^2 (coefficient of determination) of the predictions on the test set.
from sklearn.metrics import r2_score
score = r2_score(y_test, reg_pred)
score

In [None]:
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is the number
# of test samples and p the number of features.
n_samples = len(y_test)
n_features = x_test.shape[1]
adjusted_r2 = 1 - (1 - score) * (n_samples - 1) / (n_samples - n_features - 1)
adjusted_r2

## New Data Prediction

In [None]:
# First row of the raw features, reshaped into a single-sample 2-D array.
housing.data[0].reshape(1, -1)

In [None]:
# Transformation of new data.
# The scaler was fitted on a DataFrame, so pass the row with the same column
# names; a bare array makes scikit-learn warn that X has no valid feature names.
new_row = pd.DataFrame(housing.data[:1], columns=housing.feature_names)
Transformed_New_Data = scaler.transform(new_row)

In [None]:
# Predict the target for the scaled single-row input.
regression.predict(Transformed_New_Data)

In [None]:
# Actual targets for the first rows, to compare with the prediction for row 0.
y.head()

## Pickling the Model for Deployment

In [66]:
import pickle

# Persist the trained regression model for deployment.
# The context manager closes the file even if pickling fails.
with open('regmodel.pkl', 'wb') as model_file:
    pickle.dump(regression, model_file)

In [67]:
# Reload the saved model; the context manager closes the file after reading.
# Only unpickle files you trust: pickle can run arbitrary code while loading.
with open('regmodel.pkl', 'rb') as model_file:
    pickle_model = pickle.load(model_file)

In [None]:
# Predict with the reloaded model on the same scaled single-row input.
pickle_model.predict(Transformed_New_Data)