In [155]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [100]:
df = pd.read_csv("housing price prediction.csv")

In [101]:
df.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100


In [102]:
df.shape

(20640, 10)

In [103]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   ocean_proximity     20640 non-null  object 
 9   median_house_value  20640 non-null  int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 1.6+ MB


# Using pandas to clean dataset

In [104]:
new_df = df.copy()

In [105]:
new_df.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100


In [106]:
new_df["longitude"] = new_df["longitude"].astype("str")
new_df["longitude"] = new_df["longitude"].str.extract('-(\d+).')
new_df["longitude"] = new_df["longitude"].astype("int")

In [107]:
new_df.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,122,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,122,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,122,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100


In [108]:
new_df["longitude"].unique()

array([122, 121, 119, 120, 124, 123, 118, 115, 116, 114, 117])

In [110]:
new_df["total_bedrooms"].unique()

array([ 129., 1106.,  190., ..., 3008., 1857., 1052.])

In [111]:
new_df.dtypes

longitude               int32
latitude              float64
housing_median_age      int64
total_rooms             int64
total_bedrooms        float64
population              int64
households              int64
median_income         float64
ocean_proximity        object
median_house_value      int64
dtype: object

In [112]:
new_df["total_bedrooms"] = new_df["total_bedrooms"].fillna(0).astype(int)

In [126]:
new_df["ocean_proximity"] = new_df["ocean_proximity"].str.replace("<1H OCEAN","H OCEAN")

In [127]:
new_df["ocean_proximity"].unique()

array(['NEAR BAY', 'H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [128]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  int32  
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20640 non-null  int32  
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   ocean_proximity     20640 non-null  object 
 9   median_house_value  20640 non-null  int64  
dtypes: float64(2), int32(2), int64(5), object(1)
memory usage: 1.4+ MB


# Now data is cleaned

# Training model

In [129]:
df = new_df.copy()

In [130]:
df.head(4)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,122,37.88,41,880,129,322,126,8.3252,NEAR BAY,452600
1,122,37.86,21,7099,1106,2401,1138,8.3014,NEAR BAY,358500
2,122,37.85,52,1467,190,496,177,7.2574,NEAR BAY,352100
3,122,37.85,52,1274,235,558,219,5.6431,NEAR BAY,341300


In [144]:
x = df.drop(["median_house_value","ocean_proximity"],axis=1)

In [145]:
x.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,122,37.88,41,880,129,322,126,8.3252
1,122,37.86,21,7099,1106,2401,1138,8.3014
2,122,37.85,52,1467,190,496,177,7.2574


In [146]:
y = df["median_house_value"]

In [147]:
y.head(3)

0    452600
1    358500
2    352100
Name: median_house_value, dtype: int64

In [148]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20)

In [149]:
x_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
20421,118,34.17,17,4668,628,1917,624,8.1397
20307,119,34.15,18,2509,688,3129,677,2.6098
1479,122,37.96,20,1143,346,578,298,2.2411
13519,117,34.2,5,9269,1605,4916,1519,4.4367
3217,119,36.33,47,1059,268,693,241,1.3882


In [150]:
lr = LinearRegression()

In [151]:
lr.fit(x_train,y_train)

In [152]:
pred = lr.predict(x_test)

In [159]:
print("MSE: ",mean_squared_error(y_test,pred))

MSE:  5105343869.972781


In [160]:
print("Square root of MSE: ",np.sqrt(mean_squared_error(y_test,pred)))

Square root of MSE:  71451.68906312
