In [76]:
#Import Necessary Libraries:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### 1. Load Data
#### Read the “housing.csv” file from the folder into the program.
#### Print first few rows of this data.
#### Extract input(X) and output(Y) data from the dataset. It will be implemented after EDA. In Section 4. Split the data set.

In [77]:
df_house=pd.read_excel("1553768847_housing.xlsx")

In [78]:
print(df_house.head())

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                  41          880           129.0   
1    -122.22     37.86                  21         7099          1106.0   
2    -122.24     37.85                  52         1467           190.0   
3    -122.25     37.85                  52         1274           235.0   
4    -122.25     37.85                  52         1627           280.0   

   population  households  median_income ocean_proximity  median_house_value  
0         322         126         8.3252        NEAR BAY              452600  
1        2401        1138         8.3014        NEAR BAY              358500  
2         496         177         7.2574        NEAR BAY              352100  
3         558         219         5.6431        NEAR BAY              341300  
4         565         259         3.8462        NEAR BAY              342200  


In [79]:
print(df_house.tail())

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
20635    -121.09     39.48                  25         1665           374.0   
20636    -121.21     39.49                  18          697           150.0   
20637    -121.22     39.43                  17         2254           485.0   
20638    -121.32     39.43                  18         1860           409.0   
20639    -121.24     39.37                  16         2785           616.0   

       population  households  median_income ocean_proximity  \
20635         845         330         1.5603          INLAND   
20636         356         114         2.5568          INLAND   
20637        1007         433         1.7000          INLAND   
20638         741         349         1.8672          INLAND   
20639        1387         530         2.3886          INLAND   

       median_house_value  
20635               78100  
20636               77100  
20637               92300  
20638               84700  


In [80]:
print("The number of rows and colums are {} and also called shape of the matrix".format(df_house.shape))
print("Columns names are \n {}".format(df_house.columns))

The number of rows and colums are (20640, 10) and also called shape of the matrix
Columns names are 
 Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity', 'median_house_value'],
      dtype='object')


### 2. Handle missing values
#### Fill the missing values with the mean of the respective column.

In [81]:
# Let's check if there are any missing values or null
df_house.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64

In [82]:
df_house.total_bedrooms

0         129.0
1        1106.0
2         190.0
3         235.0
4         280.0
          ...  
20635     374.0
20636     150.0
20637     485.0
20638     409.0
20639     616.0
Name: total_bedrooms, Length: 20640, dtype: float64

In [83]:
# We see that there are 207 null values in Column total_bedrooms. 
# So,We replace the null values with the mean and check for nulls again.
df_house.total_bedrooms=df_house.total_bedrooms.fillna(df_house.total_bedrooms.mean())

In [84]:
df_house.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
median_house_value    0
dtype: int64

### 3. Encode categorical data :
#### Convert categorical column in the dataset to numerical data

In [85]:
print(df_house.dtypes)

longitude             float64
latitude              float64
housing_median_age      int64
total_rooms             int64
total_bedrooms        float64
population              int64
households              int64
median_income         float64
ocean_proximity        object
median_house_value      int64
dtype: object


In [86]:
df_house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   ocean_proximity     20640 non-null  object 
 9   median_house_value  20640 non-null  int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 1.6+ MB


In [87]:
df_house.ocean_proximity.nunique()

5

In [88]:
df_house.ocean_proximity.unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [89]:
print(df_house["ocean_proximity"].value_counts())

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64


In [90]:
# One of the column is a categorical feature (ocean_proximity)
# Label encode for categorical feature (ocean_proximity)
le = LabelEncoder()
df_house['ocean_proximity']=le.fit_transform(df_house['ocean_proximity'])

In [91]:
df_house.ocean_proximity.unique()

array([3, 0, 1, 4, 2])

In [92]:
df_house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   ocean_proximity     20640 non-null  int64  
 9   median_house_value  20640 non-null  int64  
dtypes: float64(4), int64(6)
memory usage: 1.6 MB


### 4. Split the dataset :
#### Split the data into 80% training dataset and 20% test dataset.

In [93]:
df_house.shape

(20640, 10)

In [94]:
# Extracting input(X) as X_Features and output(Y) as 
X_Features=['longitude','latitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income','ocean_proximity']
Y_Target=['median_house_value']
X=df_house[X_Features]
Y=df_house[Y_Target]

In [95]:
X.shape, Y.shape

((20640, 9), (20640, 1))

In [96]:
df_house = df_house.drop("median_house_value",axis=1)

In [97]:
df_house.shape

(20640, 9)

In [98]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=1)

In [99]:
print (x_train.shape, y_train.shape)
print (x_test.shape, y_test.shape)

(16512, 9) (16512, 1)
(4128, 9) (4128, 1)


In [100]:
print("x_train shape {} and size {}".format(x_train.shape,x_train.size))
print("x_test shape {} and size {}".format(x_test.shape,x_test.size))
print("y_train shape {} and size {}".format(y_train.shape,y_train.size))
print("y_test shape {} and size {}".format(y_test.shape,y_test.size))

x_train shape (16512, 9) and size 148608
x_test shape (4128, 9) and size 37152
y_train shape (16512, 1) and size 16512
y_test shape (4128, 1) and size 4128


In [101]:
x_train.info(),x_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 15961 to 235
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  int64  
 3   total_rooms         16512 non-null  int64  
 4   total_bedrooms      16512 non-null  float64
 5   population          16512 non-null  int64  
 6   households          16512 non-null  int64  
 7   median_income       16512 non-null  float64
 8   ocean_proximity     16512 non-null  int64  
dtypes: float64(4), int64(5)
memory usage: 1.3 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4128 entries, 4712 to 11878
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           4128 non-null   float64
 1   latitude            4128 non-null   flo

(None, None)

### 5. Standardize data :
#### Standardize training and test datasets

In [102]:
# Feature scaling is to bring all the independent variables in a dataset into
# same scale, to avoid any variable dominating  the model. Here we will not 
# transform the dependent variables.

In [103]:
# Get column names first
names = df_house.columns
# Create the Scaler object
scaler = StandardScaler()
# Fit your data on the scaler object
scaled_df = scaler.fit_transform(df_house)
scaled_df = pd.DataFrame(scaled_df, columns=names)
scaled_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-1.327835,1.052548,0.982143,-0.804819,-0.975228,-0.974429,-0.977033,2.344766,1.291089
1,-1.322844,1.043185,-0.607019,2.04589,1.355088,0.861439,1.669961,2.332238,1.291089
2,-1.332827,1.038503,1.856182,-0.535746,-0.829732,-0.820777,-0.843637,1.782699,1.291089
3,-1.337818,1.038503,1.856182,-0.624215,-0.722399,-0.766028,-0.733781,0.932968,1.291089
4,-1.337818,1.038503,1.856182,-0.462404,-0.615066,-0.759847,-0.629157,-0.012881,1.291089


### 6. Perform Linear Regression :
#### Perform Linear Regression on training data.
#### Predict output for test dataset using the fitted model.
#### Print root mean squared error (RMSE) from Linear Regression.

In [104]:
# We apply Linear Regresssion because median_income and median_house_value has a linear relationship.
linreg=LinearRegression()
linreg.fit(x_train,y_train)

LinearRegression()

In [105]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
y_predict = linreg.predict(x_test)

In [106]:
print(sqrt(mean_squared_error(y_test,y_predict)))
print((r2_score(y_test,y_predict)))

69888.7939155867
0.6276223517950272


### 7. Perform Linear Regression with one independent variable :
#### Extract just the median_income column from the independent variables (from X_train and X_test).
#### Perform Linear Regression to predict housing values based on median_income.
#### Predict output for test dataset using the fitted model.
#### Plot the fitted model for training data as well as for test data to check if the fitted model satisfies the test data.

In [107]:
x_train_Income=x_train[['median_income']]
x_test_Income=x_test[['median_income']]
print(x_train_Income.shape)
print(y_train.shape)

(16512, 1)
(16512, 1)


In [108]:
linreg=LinearRegression()
linreg.fit(x_train_Income,y_train)

LinearRegression()

In [109]:
y_predict = linreg.predict(x_test_Income)

In [110]:
#print intercept and coefficient of the linear equation
print(linreg.intercept_, linreg.coef_)

[44721.83362107] [[42055.4573838]]


In [112]:
print(sqrt(mean_squared_error(y_test,y_predict)))
print((r2_score(y_test,y_predict)))

83228.17849797675
0.4719083593446771
