## Loading the Data

In [83]:
import pandas as pd

# Read the housing CSV from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer="housing.csv")

# Show the first two rows as a quick sanity check of the load.
data.iloc[:2]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY


In [84]:
# Number of rows in the freshly loaded frame (first element of the shape tuple).
print(data.shape[0])

# Column labels, displayed as the cell's output.
data.columns

20640


Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

### Pre-Processing

In [85]:
# Count the missing values in each column
missing_per_column = data.isna().sum()
print(missing_per_column)

# Drop every row that contains a missing value; such rows are few
# compared to the total length of the dataset.
data = data.dropna(axis=0)

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64


In [86]:
# Encode the ocean_proximity strings as integer class codes

from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()

# Learn the set of categories first, then map every row to its integer code.
labelEncoder.fit(data['ocean_proximity'])
data['ocean_proximity_category'] = labelEncoder.transform(data['ocean_proximity'])

In [87]:
# Print the distinct values of the original column and of its encoded
# counterpart, in that order, so the two lists can be compared.
for column in ('ocean_proximity', 'ocean_proximity_category'):
    print(data[column].unique())

['NEAR BAY' '<1H OCEAN' 'INLAND' 'NEAR OCEAN' 'ISLAND']
[3 0 1 4 2]


In [88]:
# Split the data into training and test sets.

from sklearn.model_selection import train_test_split

# Features: the nine numeric columns (including median_house_value).
X = data[['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value']]

# Target: the integer-encoded ocean_proximity class.
y = data['ocean_proximity_category']

# train_test_split returns the train/test pair of each input in turn, i.e.
# (X_train, X_test, y_train, y_test) for inputs (X, y). The previous unpacking
# order (X_train, y_train, X_test, y_test) bound test features to y_train and
# training labels to X_test.
# 80% of the rows go to the training set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=104, train_size=0.8)

## Data Exploration

### Data Exploration

In [89]:
# Unpack the (rows, columns) shape of the cleaned frame and report each part.
row_count, column_count = data.shape
print("Number of rows: ", row_count)
print("Number of columns: ", column_count)

Number of rows:  20433
Number of columns:  11


In [90]:
# Data types of all columns: print a summary of the frame listing the index
# range, each column's non-null count and dtype, and the memory usage.

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20433 entries, 0 to 20639
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   longitude                 20433 non-null  float64
 1   latitude                  20433 non-null  float64
 2   housing_median_age        20433 non-null  float64
 3   total_rooms               20433 non-null  float64
 4   total_bedrooms            20433 non-null  float64
 5   population                20433 non-null  float64
 6   households                20433 non-null  float64
 7   median_income             20433 non-null  float64
 8   median_house_value        20433 non-null  float64
 9   ocean_proximity           20433 non-null  object 
 10  ocean_proximity_category  20433 non-null  int64  
dtypes: float64(9), int64(1), object(1)
memory usage: 1.9+ MB


### Descriptive Statistics



In [91]:
# Summary statistics of the median_house_value column: mean, median and
# standard deviation.
house_values = data['median_house_value']
print("Mean value of the Median House: ", house_values.mean())
print("Median value of the Median House: ", house_values.median())
print("Standard Deviation value of the Median House: ", house_values.std())

Mean value of the Median House:  206864.41315519012
Median value of the Median House:  179700.0
Standard Deviation value of the Median House:  115435.66709858434


In [64]:
# Range of Houses (TODO: not implemented yet; this cell computes nothing)

### Data Visualization

### Map Plot

In [144]:
import plotly.express as px

# Plot every record by its coordinates. Longitude goes on the x-axis and
# latitude on the y-axis so the scatter is oriented like a conventional map
# (east-west horizontal, north-south vertical); the previous axis assignment
# drew the map transposed.
px.scatter(data, x='longitude', y='latitude')



The map above suggests a correlation between housing preferences and higher latitude and longitude. However,
it is important to consider other factors influencing housing choices, and further analysis may be needed to determine the cause.


In [125]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Features compared against median_house_value, listed in grid order
# (left to right, top to bottom) as (column name, subplot title).
feature_panels = [
    ("total_bedrooms", "Total Bedrooms vs. Median House Value"),
    ("households", "Households vs. Median House Value"),
    ("total_rooms", "Total Rooms vs. Median House Value"),
    ("population", "Population vs. Median House Value"),
]

# Create a 2x2 grid with one title per panel. Previously only two titles were
# supplied for four panels, which left the bottom row untitled.
fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=[title for _, title in feature_panels],
)

# Add one marker-only scatter per feature; divmod maps the list position to a
# zero-based (row, col) pair, which plotly expects one-based.
for position, (feature, _) in enumerate(feature_panels):
    grid_row, grid_col = divmod(position, 2)
    trace = go.Scatter(
        x=data[feature],
        y=data["median_house_value"],
        mode="markers",
        name=feature,  # label the legend entry with the feature column
    )
    fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

# Update layout
fig.update_layout(title="Relationships between Various Features and Median House Value")

# Show the plot
fig.show()


In [131]:
# Bar chart of median_house_value per ocean_proximity category, coloured by category.
# NOTE(review): the un-aggregated frame is passed in, and px.bar appears to draw
# one mark per row and stack them, so each bar's height would be the SUM of
# median_house_value over that category's rows (driven largely by how many rows
# the category has) rather than a typical house value. Confirm against the
# plotly docs before drawing conclusions from the bar heights.
px.bar(data, x='ocean_proximity', y='median_house_value', color='ocean_proximity')


As the bar graph above shows, there is a clear difference between <1H OCEAN and the other categories: most people prefer places in the <1H OCEAN category.