# Here we start our ML project on the California Housing dataset

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly_express as px

In [None]:
# Load the raw housing table. The column names are assigned by hand in the
# following cell, so the file is read as header-less; otherwise pandas would
# consume the first data record as the header and that row would be lost.
# NOTE(review): assumes the .data file has no header line — confirm.
raw_df = pd.read_csv('cal_housing[1].data', header=None)

In [None]:
# Names for the nine fields of the table, listed in positional order.
columns = [
    'Longitude', 'Latitude', 'housingMedianAge',
    'totalRooms', 'totalBedrooms', 'population',
    'households', 'medianIncome', 'medianHouseValue',
]
raw_df.columns = columns

In [None]:
# Peek at the first rows to confirm the column names line up with the values.
raw_df.head()

In [None]:
# Column dtypes and non-null counts for the whole frame.
raw_df.info()

In [None]:
# Summary statistics for every numeric column.
raw_df.describe()

In [None]:
# One 50-bin histogram per numeric column, drawn on a large shared canvas.
raw_df.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable.
train_val_df, test_df = train_test_split(raw_df, random_state=42, test_size=0.2)

In [None]:
# Row/column counts of the train+validation pool and of the held-out test set.
train_val_df.shape,test_df.shape

In [None]:
# Split the remaining pool again: 20% of it becomes the validation set.
train_df, val_df = train_test_split(train_val_df, random_state=42, test_size=0.2)

In [None]:
# Final sizes of the three partitions (train, validation, test).
train_df.shape,val_df.shape,test_df.shape

In [None]:
# The label column is kept as a one-element list; every other column of
# raw_df is treated as a model input.
target_col = ["medianHouseValue"]
input_cols = [name for name in raw_df.columns if name not in target_col]

In [None]:
# Independent copies of the feature and label columns for the training and
# validation partitions, so later edits do not touch train_df / val_df.
train_input, train_target = train_df[input_cols].copy(), train_df[target_col].copy()
val_input, val_target = val_df[input_cols].copy(), val_df[target_col].copy()

In [None]:
# Histogram of median income over four explicit bins; the edges stop at 6.0.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6.]
sns.histplot(x='medianIncome', data=raw_df, bins=income_bin_edges)
plt.show()

In [None]:
# Plot every row by its coordinates; partial transparency makes dense areas visible.
sns.scatterplot(x='Longitude', y='Latitude', data=raw_df, alpha=0.7)
plt.show()

In [None]:
# Display the full frame (the notebook renders its default repr of it).
raw_df

In [None]:
# Interactive scatter of the rows by coordinates: marker size scales with
# population / 100 and colour encodes the median house value.
# The former `fig=figsize=(8,10)` line was dropped: it only bound two tuples,
# and `fig` was overwritten immediately, so it never affected the plot.
fig = px.scatter(
    raw_df,
    x="Longitude",
    y="Latitude",
    size=raw_df['population']/100,
    color='medianHouseValue',
    color_continuous_scale='jet',
    opacity=0.5
)
fig.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
corr_df = raw_df.select_dtypes(include=np.number)
corr_matrix = corr_df.corr()
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='Reds')
plt.show()

Some basic Feature Engineering 

In [None]:
# Derive three ratio columns on the full frame: rooms and population per
# household, and the share of rooms that are bedrooms.
raw_df['rooms_per_household'] = raw_df['totalRooms'].div(raw_df['households'])
raw_df['bedroom_per_room'] = raw_df['totalBedrooms'].div(raw_df['totalRooms'])
raw_df['popu_per_household'] = raw_df['population'].div(raw_df['households'])

In [None]:
# Recompute the correlation matrix now that the ratio columns exist on raw_df.
corr_mat2=raw_df.corr()
corr_mat2

In [None]:
# Rank every column by its correlation with the label, strongest positive first.
corr_mat2['medianHouseValue'].sort_values(ascending=False)

Add the new features to our train, validation, and test data

In [None]:
def add_ratio_features(df):
    """Return a new frame equal to ``df`` plus three derived ratio columns.

    Appended columns, in this order:
      * ``rooms_per_household``  = totalRooms / households
      * ``popu_per_household``   = population / households
      * ``bedroom_per_room``     = totalBedrooms / totalRooms

    ``df`` itself is not modified, so the function can be re-run safely and
    can be applied to frames that are slices of another frame.
    """
    return df.assign(
        rooms_per_household=df['totalRooms'] / df['households'],
        popu_per_household=df['population'] / df['households'],
        bedroom_per_room=df['totalBedrooms'] / df['totalRooms'],
    )


# Apply the same feature definitions to every partition so the columns
# (and their order) cannot drift apart between train, validation and test.
train_input = add_ratio_features(train_input)
val_input = add_ratio_features(val_input)
test_df = add_ratio_features(test_df)

In [None]:
# Check dtypes and non-null counts of the training inputs, including the new ratio columns.
train_input.info()

Since we already have a clean dataset, there is no need for imputation.

In [None]:
# Plot every row on an interactive base map using its latitude/longitude.
fig = px.scatter_mapbox(
    raw_df,
    lat='Latitude',
    lon='Longitude',
    zoom=5,
    mapbox_style="carto-positron",
)
fig.show()

In [None]:
# Write the frame (including the ratio columns added to raw_df earlier) to disk as CSV, without the index.
# NOTE(review): the output path has no file extension — confirm that 'raw_df' (rather than 'raw_df.csv') is intended.
raw_df.to_csv('raw_df',index=False)

Starting with Models

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Baseline model: ordinary linear regression fitted on the training inputs/labels.
lin_reg=LinearRegression()
lin_reg.fit(train_input,train_target)

In [None]:
# Predictions on the same rows the model was fitted on (used for the training error below).
lin_train_pred=lin_reg.predict(train_input)

In [None]:
from sklearn.metrics import mean_squared_error

# Training-set RMSE of the linear model. Arguments are passed in the
# documented (y_true, y_pred) order, matching the random-forest cell.
lin_mse_train = mean_squared_error(train_target, lin_train_pred)
lin_rmse_train = np.sqrt(lin_mse_train)
lin_rmse_train

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fixed seed (same value as the data splits) so the fitted tree, and every
# score derived from it, is repeatable across kernel restarts.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(train_input, train_target)
tree_pred = tree_reg.predict(train_input)

In [None]:
# Training-set RMSE of the decision tree, with arguments in the documented
# (y_true, y_pred) order.
tree_mse = mean_squared_error(train_target, tree_pred)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
# Validation-set RMSE of the decision tree: predict on rows the tree has not
# seen, then score with arguments in the documented (y_true, y_pred) order.
tree_val_pred = tree_reg.predict(val_input)
tree_val_mse = mean_squared_error(val_target, tree_val_pred)
tree_val_rmse = np.sqrt(tree_val_mse)
tree_val_rmse

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the tree on the training data. The scorer
# returns negated MSE, so the sign is flipped before taking the square root.
scores = cross_val_score(
    tree_reg, train_input, train_target,
    cv=10, scoring="neg_mean_squared_error",
)
tree_rmse_score = np.sqrt(-scores)

In [None]:
def display_scores(scores):
    """Print ``scores`` followed by its mean and standard deviation.

    ``scores`` must expose ``.mean()`` and ``.std()`` (e.g. a NumPy array).
    Nothing is returned; the three lines go to standard output.
    """
    report_lines = (
        ("Scores:", scores),
        ("Mean", scores.mean()),
        ("Standard Deviation:", scores.std()),
    )
    for label, value in report_lines:
        print(label, value)

In [None]:
# Cross-validate the linear model exactly like the tree so the two summaries
# are comparable. Previously the raw training predictions were passed to
# display_scores, which summarised predicted values rather than errors.
lin_scores = cross_val_score(lin_reg, train_input, train_target,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_score = np.sqrt(-lin_scores)

display_scores(lin_rmse_score)
display_scores(tree_rmse_score)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fixed seed so the forest (and the RMSE below) is repeatable across runs.
forest_reg = RandomForestRegressor(random_state=42)
# train_target is a one-column DataFrame; flatten it to 1-D, the label shape
# the forest expects (a column vector triggers a DataConversionWarning).
forest_reg.fit(train_input, train_target.values.ravel())
forest_train_pred = forest_reg.predict(train_input)
# Training-set RMSE of the forest.
forest_mse = mean_squared_error(train_target, forest_train_pred)
forest_rmse = np.sqrt(forest_mse)

In [None]:
# Show the forest's training-set RMSE computed in the previous cell.
forest_rmse

In [None]:
import joblib
# Serialise the fitted forest to disk, then load it back into a separate variable.
joblib.dump(forest_reg,"my_model.pkl")
my_model_loaded=joblib.load("my_model.pkl")

In [None]:
# Print the reloaded estimator's text representation.
print(my_model_loaded)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# from joblib import dump,load

# joblib.dump(forest_reg,'random_forest_model.pkl')
# print("Model saved successfully")

Model saved successfully


In [None]:
# import joblib 
# loaded_model=joblib.load('random_forest_model.pkl')
# print("Model loaded successfully")

Model loaded successfully
