In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df=pd.read_csv('../data/processed/clean_data.csv')

In [3]:
df.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122,37,41,880,129,322,126,8,452600,0,0,0,1,0
1,-122,37,21,7099,1106,2401,1138,8,358500,0,0,0,1,0
2,-122,37,52,1467,190,496,177,7,352100,0,0,0,1,0
3,-122,37,52,1274,235,558,219,5,341300,0,0,0,1,0
4,-122,37,52,1627,280,565,259,3,342200,0,0,0,1,0


In [4]:
X=df.drop("median_house_value", axis=1)
y=df["median_house_value"]

In [5]:
X.shape

(20433, 13)

In [6]:
y.shape

(20433,)

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test =train_test_split(X,y, test_size=0.2)
X_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
15215,-117,33,12,2924,433,1193,394,6,1,0,0,0,0
6504,-118,34,40,1983,298,853,271,5,1,0,0,0,0
2628,-124,40,38,2220,426,1041,401,2,1,0,0,0,0
6434,-118,34,37,778,205,850,198,2,1,0,0,0,0
2716,-115,32,35,1185,202,615,191,4,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1306,-121,38,36,1141,234,562,213,2,0,1,0,0,0
211,-122,37,52,3424,690,2273,685,3,0,0,0,1,0
770,-122,37,28,1784,311,735,278,4,0,0,0,1,0
8060,-118,33,36,2000,343,956,352,5,1,0,0,0,0


In [9]:
X_train.shape

(16346, 13)

In [10]:
y_train.shape

(16346,)

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score 
from sklearn.metrics import root_mean_squared_error
model=LinearRegression()


In [12]:
model.fit(X_train,y_train)
y_train_predict=model.predict(X_train)
y_test_prediction=model.predict(X_test)


## Model performance

In [13]:

#r2_score
train_r2_score=r2_score(y_train_predict,y_train)
test_r2_score=r2_score(y_test_prediction,y_test)
print(f"train_r2_score:{train_r2_score}, test_r2_score: {test_r2_score}")

train_r2_score:0.404346094856345, test_r2_score: 0.3938892819787322


In [14]:
#Root mean squred error 
train_rmse=root_mean_squared_error(y_train_predict,y_train)
test_rmse=root_mean_squared_error(y_test_prediction,y_test)
print(f"train_rmse:{train_rmse},  test_rmse: {test_rmse}")

train_rmse:70403.09905436312,  test_rmse: 72428.3029981733


In [15]:
model.score(X_test,y_test)

0.6117640328857219

### Preprocessing


In [16]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train_S=scaler.fit_transform(X_train)
X_test_S=scaler.fit_transform(X_test)
model.fit(X_train_S, y_train)


In [17]:
y_train_predict_s=model.predict(X_train_S)
y_test_prediction_s=model.predict(X_test_S)

In [18]:
train_r2_score=r2_score(y_train_predict_s,y_train)
test_r2_score=r2_score(y_test_prediction_s,y_test)

In [19]:
print(f"train_r2_score:{train_r2_score},  test_r2_score: {test_r2_score}")

train_r2_score:0.40454903270601295,  test_r2_score: 6.534461860496776e-11


In [20]:
#Root mean squred error 
train_rmse=float(root_mean_squared_error(y_train_predict_s,y_train))
test_rmse=float(root_mean_squared_error(y_test_prediction_s,y_test))

In [21]:
print(f"train_rmse:{train_rmse},  test_rmse: {test_rmse}")

train_rmse:70403.79844739512,  test_rmse: 950139667406980.4


In [22]:
model.score(X_test_S,y_test)

-6.681193457555315e+19

### Random forest Classifier

In [25]:
from sklearn.ensemble import RandomForestRegressor
forest=RandomForestRegressor()


In [26]:
forest.fit(X_train_S, y_train)
    

In [27]:
y_train_predict_s=forest.predict(X_train_S)
y_test_prediction_s=forest.predict(X_test_S)

In [28]:
train_r2_score=r2_score(y_train_predict_s,y_train)
test_r2_score=r2_score(y_test_prediction_s,y_test)
print(f"train_r2_score:{train_r2_score},  test_r2_score: {test_r2_score}")

train_r2_score:0.9565154083787223,  test_r2_score: 0.6419082265641818


In [29]:
forest.score(X_test_S,y_test)

0.723592994088245

### Hyperparameter Tuning

### Grid Search CV

In [30]:
from sklearn.model_selection import GridSearchCV

param_grid={
    'n_estimators':[100,150, 200],
    'max_depth':[None,10,20,30],
    'min_samples_split':[5,10,20]
}

grid_search=GridSearchCV(
    estimator=forest,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error'
)



In [None]:
grid_search.fit(X_train_S, y_train)

In [117]:
grid_search.best_params_

{'max_depth': None, 'min_samples_split': 10, 'n_estimators': 200}

In [118]:
grid_search.best_estimator_.score(X_test_S, y_test)

0.7398987930018291

In [119]:
grid_search=GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5
)

In [120]:
grid_search.fit(X_train_S, y_train)

ValueError: Invalid parameter 'max_depth' for estimator LinearRegression(). Valid parameters are: ['copy_X', 'fit_intercept', 'n_jobs', 'positive'].