# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Pandas Version

In [2]:
# Display the pandas version in use for this run.
pd.__version__

'2.1.0'

### Loading dataset

In [3]:
# Read the housing CSV directly from its raw GitHub URL into a DataFrame.
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'
df = pd.read_csv(url)

### Data Exploration

In [4]:
# Preview the first rows to check the column names and value formats.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(20640, 10)

In [6]:
# Count the missing entries in every column.
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

- Only the `total_bedrooms` column has null values (207 of them).

### Unique values per column

In [7]:
# Number of distinct values in each column.
df.nunique()

longitude               844
latitude                862
housing_median_age       52
total_rooms            5926
total_bedrooms         1923
population             3888
households             1815
median_income         12928
median_house_value     3842
ocean_proximity           5
dtype: int64

### Summary statistics

In [8]:
# include='all' also summarises the non-numeric ocean_proximity column
# (count / unique / top / freq) alongside the numeric statistics.
df.describe(include='all')

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0,20640
unique,,,,,,,,,,5
top,,,,,,,,,,<1H OCEAN
freq,,,,,,,,,,9136
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909,
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874,
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0,
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0,
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0,
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0,


### Mean house value for houses with ocean proximity `NEAR BAY`

In [9]:
# Average median_house_value over the rows whose ocean_proximity is 'NEAR BAY',
# selecting the rows and the column in a single .loc call.
df.loc[df['ocean_proximity'] == 'NEAR BAY', 'median_house_value'].mean()

259212.31179039303

#### Mean of `total_bedrooms` (computed before filling the missing values)

In [10]:
# NOTE: despite its name, this holds the mean of the total_bedrooms column.
# Series.mean() skips NaN, so it is computed over the non-null rows only.
total_room_mean = df['total_bedrooms'].mean()

### Filling null values

In [11]:
# Replace the missing total_bedrooms entries with the mean computed above.
df['total_bedrooms'] = df['total_bedrooms'].fillna(total_room_mean)

#### Printing result

In [12]:
# Column mean after imputation; compare with the pre-fill mean printed in the
# next cell (mean-filling should leave it unchanged up to float rounding).
print(f'Mean for total bedrooms after filing null values {df.total_bedrooms.mean()}')

Mean for total bedrooms after filing null values 537.8705525375617


In [13]:
# Mean captured before the nulls were filled (stored in total_room_mean).
print(f'Mean for total bedrooms  before filing null values {total_room_mean}')

Mean for total bedrooms  before filing null values 537.8705525375618


### Question 7

In [14]:
# List the distinct ocean_proximity categories.
df['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [15]:
# Keep only the rows whose ocean_proximity is 'ISLAND'.
df_island = df.loc[df['ocean_proximity'] == 'ISLAND']

In [16]:
# (rows, columns) of the ISLAND subset.
df_island.shape

(5, 10)

In [17]:
# Feature matrix as a NumPy array: one row per ISLAND record, columns in the
# order listed below.
X = df_island[
    [
        'housing_median_age',
        'total_rooms',
        'total_bedrooms',
    ]
].to_numpy()
X

array([[  27., 1675.,  521.],
       [  52., 2359.,  591.],
       [  52., 2127.,  512.],
       [  52.,  996.,  264.],
       [  29.,  716.,  214.]])

In [18]:
# Transpose of X: features along the rows, records along the columns.
X.transpose()

array([[  27.,   52.,   52.,   52.,   29.],
       [1675., 2359., 2127.,  996.,  716.],
       [ 521.,  591.,  512.,  264.,  214.]])

In [19]:
# Gram matrix XᵀX (features × features) of the feature matrix.
XTX = X.T @ X
XTX

array([[9.6820000e+03, 3.5105300e+05, 9.1357000e+04],
       [3.5105300e+05, 1.4399307e+07, 3.7720360e+06],
       [9.1357000e+04, 3.7720360e+06, 9.9835800e+05]])

In [20]:
# Hand-specified target vector; its five entries pair with the five rows of X.
y = [950, 1300, 800, 1000, 1300]

In [21]:
# Explicit inverse of XᵀX, used in the normal-equation step that follows.
inv_XTX = np.linalg.inv(XTX)

In [22]:
# Normal equation: w = (XᵀX)⁻¹ · Xᵀ · y, evaluated left to right.
w = inv_XTX.dot(X.T).dot(y)

In [23]:
# Resulting weight vector: one coefficient per feature column of X.
w

array([23.12330961, -1.48124183,  5.69922946])