## Loading the Data

In [7]:

%pip install pandas
import pandas as pd

data = pd.read_csv("housing.csv")

data.head(2)

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-2.1.1-cp39-cp39-macosx_11_0_arm64.whl (10.9 MB)
[K     |████████████████████████████████| 10.9 MB 2.1 MB/s eta 0:00:01    |██████▉                         | 2.3 MB 2.1 MB/s eta 0:00:05
Collecting pytz>=2020.1
  Downloading pytz-2023.3.post1-py2.py3-none-any.whl (502 kB)
[K     |████████████████████████████████| 502 kB 19.9 MB/s eta 0:00:01
[?25hCollecting tzdata>=2022.1
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[K     |████████████████████████████████| 341 kB 21.4 MB/s eta 0:00:01
[?25hCollecting numpy>=1.22.4
  Downloading numpy-1.26.0-cp39-cp39-macosx_11_0_arm64.whl (14.0 MB)
[K     |████████████████████████████████| 14.0 MB 22.9 MB/s eta 0:00:01
Installing collected packages: tzdata, pytz, numpy, pandas
Successfully installed numpy-1.26.0 pandas-2.1.1 pytz-2023.3.post1 tzdata-2023.3
You should consider upgrading via the '/Library/Developer/Comman

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY


In [8]:
print(len(data))

data.columns

20640


Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

### Pre-Processing

In [9]:
# calulcating null values from each column

print(data.isna().sum())

# Removing null values rows as they are very less compared to total length of the dataset

data = data.dropna(axis='rows')

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64


In [4]:
### Import the data into Postgres database

import pandas as pd
from sqlalchemy import create_engine

def dfToDatabase(data):

    # Define your database connection string
    db_connection_str = 'postgresql://postgres:postgres@localhost:5432/postgres'

    # Create a database connection
    db_connection = create_engine(db_connection_str)

    table_name = "housing"
    data.to_sql(table_name, db_connection, if_exists='replace', index=False)

    db_connection.dispose()
    
dfToDatabase(data)

In [11]:
%pip install sklearn

# Convert the ocean_proximity into numberic values

from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()

data['ocean_proximity_category'] = labelEncoder.fit_transform(data['ocean_proximity'])

Defaulting to user installation because normal site-packages is not writeable
Collecting sklearn
  Downloading sklearn-0.0.post9.tar.gz (3.6 kB)
[31m    ERROR: Command errored out with exit status 1:
     command: /Library/Developer/CommandLineTools/usr/bin/python3 -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'/private/var/folders/5j/f6qclvf555d323h75x4l2v0w0000gn/T/pip-install-n__1lqnf/sklearn_83cf7c9c70764f57b351423a610a5d4a/setup.py'"'"'; __file__='"'"'/private/var/folders/5j/f6qclvf555d323h75x4l2v0w0000gn/T/pip-install-n__1lqnf/sklearn_83cf7c9c70764f57b351423a610a5d4a/setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base /private/var/folders/5j/f6qclvf555d323h75x4l2v0w0000gn/T/pip-pip-egg-info-mc41g0jy
         cwd: /private/va

In [12]:
print(data['ocean_proximity'].unique())
print(data['ocean_proximity_category'].unique())

['NEAR BAY' '<1H OCEAN' 'INLAND' 'NEAR OCEAN' 'ISLAND']
[3 0 1 4 2]


In [13]:
# Splitting the data

from sklearn.model_selection import train_test_split

X= data[['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_category']]

y = data['median_house_value']

X_train, y_train, X_test, y_test = train_test_split(X, y, random_state=104, train_size=0.8)

## Data Exploration

### Data Exploration

In [14]:
print("Number of rows: ", data.shape[0])
print("Number of columns: ", data.shape[1])

Number of rows:  20433
Number of columns:  11


In [15]:
# Data types of all columns

data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20433 entries, 0 to 20639
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   longitude                 20433 non-null  float64
 1   latitude                  20433 non-null  float64
 2   housing_median_age        20433 non-null  float64
 3   total_rooms               20433 non-null  float64
 4   total_bedrooms            20433 non-null  float64
 5   population                20433 non-null  float64
 6   households                20433 non-null  float64
 7   median_income             20433 non-null  float64
 8   median_house_value        20433 non-null  float64
 9   ocean_proximity           20433 non-null  object 
 10  ocean_proximity_category  20433 non-null  int64  
dtypes: float64(9), int64(1), object(1)
memory usage: 1.9+ MB


### Descriptive Statistics



In [67]:
print("Mean value of the Median House: ", data['median_house_value'].mean())
print("Median value of the Median House: ", data['median_house_value'].median())
print("Standard Deviation value of the Median House: ", data['median_house_value'].std())

Mean value of the Median House:  206864.41315519012
Median value of the Median House:  179700.0
Standard Deviation value of the Median House:  115435.66709858434


In [64]:
# Range of Houses

### Data Visualization

### Map Plot

In [68]:
import plotly.express as px

px.scatter(data, x='latitude', y='longitude', color='median_house_value')




From the above map it suggests a correlation between housing preferences with higher latitude and longitude. However,
it's important to consider other factors influencing the housing choices and further analysis may need to dettermine cause


In [69]:
px.pie(data, values='median_house_value', names='ocean_proximity')


As we can see from the above piechart we can clearly see the difference between 1H OCEAN and other places 50% of the data is from 1H OCEAN. At the same time we have 41% data from 1H ocean so it does have significant impact on the visualizaiton

In [70]:
px.scatter(data, x='ocean_proximity', y='median_income', title='Relationship with Income and House Value', color='median_house_value')

### Correlation Matrix

In [71]:
import plotly.graph_objects as go
import pandas as pd


correlation_matrix = data.corr()


fig = go.Figure(go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale='Viridis',
    colorbar=dict(title='Correlation')
))


fig.update_layout(
    title='Correlation Matrix Heatmap',
)


fig.show()


### Building Neural Network


In [17]:
%pip install torch

import torch 

X = data.drop(['ocean_proximity', 'ocean_proximity_category'], axis=1)
y = data['ocean_proximity_category']

print("X shape: ", X.shape)
print("y shape: ", y.shape)

X_tensor = torch.tensor(X.values, dtype=torch.float32)

y_tensor = torch.tensor(y.values, dtype=torch.float32)

print("X_tensor shape: ", X_tensor.shape)
print("y_tensor shape: ", y_tensor.shape)

Defaulting to user installation because normal site-packages is not writeable
Collecting torch
  Downloading torch-2.0.1-cp39-none-macosx_11_0_arm64.whl (55.8 MB)
[K     |████████████████████████████████| 55.8 MB 644 kB/s eta 0:00:011
[?25hCollecting filelock
  Downloading filelock-3.12.4-py3-none-any.whl (11 kB)
Collecting networkx
  Downloading networkx-3.1-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 16.2 MB/s eta 0:00:01
[?25hCollecting sympy
  Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
[K     |████████████████████████████████| 5.7 MB 21.2 MB/s eta 0:00:01
[?25hCollecting typing-extensions
  Downloading typing_extensions-4.8.0-py3-none-any.whl (31 kB)
Collecting jinja2
  Using cached Jinja2-3.1.2-py3-none-any.whl (133 kB)
Collecting MarkupSafe>=2.0
  Downloading MarkupSafe-2.1.3-cp39-cp39-macosx_10_9_universal2.whl (17 kB)
Collecting mpmath>=0.19
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
[K     |████████████████████████████████|

In [19]:
import torch.nn as nn
import torch.nn.functional as F

class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(9, 32)
        self.fc2 = nn.Linear(32, 64)
        self.fc3 = nn.Linear(64, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 32)
        self.fc6 = nn.Linear(32, 16)
        self.fc7 = nn.Linear(16, 8)
        self.fc8 = nn.Linear(8, 4)
        self.fc9 = nn.Linear(4, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = F.relu(self.fc6(x))
        x = F.relu(self.fc7(x))
        x = F.relu(self.fc8(x))
        x = self.fc9(x)

        return F.log_softmax(x, dim=1)

model = NeuralNetwork()
print(model)

NeuralNetwork(
  (fc1): Linear(in_features=9, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=128, bias=True)
  (fc4): Linear(in_features=128, out_features=64, bias=True)
  (fc5): Linear(in_features=64, out_features=32, bias=True)
  (fc6): Linear(in_features=32, out_features=16, bias=True)
  (fc7): Linear(in_features=16, out_features=8, bias=True)
  (fc8): Linear(in_features=8, out_features=4, bias=True)
  (fc9): Linear(in_features=4, out_features=1, bias=True)
)


In [41]:
import torch.optim as optim

# use stochastic gradient descent as optimizer
optimizer = optim.SGD(model.parameters(), lr=0.001)

EPOCHS = 3

for epoch in range(EPOCHS):

    for data, target in zip(X_tensor, y_tensor):

        optimizer.zero_grad()

        output = model(data.view(1, -1))

        target_reshaped = target.view(1, -1)

        loss = F.mse_loss(output, target_reshaped)

        # add L2 Regularization
        l2Reg = None
        for param in model.parameters():
            if l2Reg is None:
                l2Reg = param.norm(2)
            else:
                l2Reg = l2Reg + param.norm(2)
        
        loss = loss + 0.001 * l2Reg
        
        loss.backward()

        optimizer.step()

    
    print(f"Loss: {loss: .4f}")

Loss:  1.0000
Loss:  1.0000
Loss:  1.0000


In [31]:
print(output.shape)
print(target.shape)

torch.Size([1, 1])
torch.Size([])
