<a href="https://colab.research.google.com/github/aryansinghal10/BCA/blob/main/Aryan_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and Data Frame


In [None]:
import pandas as pd
import numpy as np
# ------------------------------------------------------------------------------
# IMPORTANT
# The dataset comes from the 1990 California census. Each row describes the
# houses found in one California district, together with summary statistics
# about them taken from that census.
# ------------------------------------------------------------------------------

# When running in Google Colab, upload housing.csv so it is available at this path.
housing_csv_path = "/content/housing.csv"
df = pd.read_csv(housing_csv_path)

FileNotFoundError: ignored

# Get Data Ready for ML

Using dummies (preferred)

In [None]:
# One-hot encode the categorical ocean_proximity column into indicator columns.
# The guard makes the cell safe to re-run: after the first run the original
# column no longer exists, and calling get_dummies on it again would raise a
# KeyError.
if 'ocean_proximity' in df.columns:
    df = pd.get_dummies(df, columns = ['ocean_proximity'])

Using replace (another way)

In [None]:
# Alternative encoding, intentionally left disabled: map each ocean_proximity
# label to an integer code instead of creating indicator columns.
# NOTE(review): as written, replace() is called on the whole frame rather than
# on the ocean_proximity column only.
# pd.unique(df['ocean_proximity'])
# df.replace(to_replace = ['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'], value = [0, 1, 2, 3, 4], inplace = True)

Show data frame

In [None]:
# Display the current contents of df as the cell output.
df

# Cleaning Data



In [None]:
# Count the missing values in every column to see which ones need cleaning
df.isna().sum()

In [None]:
# Replace null values in total_bedrooms with the mean of that column.
# The filled Series is assigned back to the frame rather than calling
# fillna(..., inplace=True) on df['total_bedrooms']: that form operates on an
# intermediate object, which pandas warns about and which leaves df unchanged
# when Copy-on-Write is enabled.
df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].mean())

# Check if the null values have been replaced
df.isnull().sum()

# Histograms

For the age of houses, we can see that they mostly lie between 15 - 35 years.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Histogram of the housing_median_age column (median house age, in years)
sns.set()

title = 'Median Age of a House in California Within a Block (Lower Number is a Newer Building)'

# Work with an explicit figure/axes pair of the requested size
fig, ax = plt.subplots(figsize=(15, 10))
ax.hist(df['housing_median_age'], color = 'lightseagreen', alpha = 0.9)

# Axis labels and title
ax.set_xlabel('House Age (in years)', size = 16)
ax.set_ylabel('Count', size = 16)
ax.set_title(title, size = 18)

# The title string is also used as the output file name
fig.savefig(title, dpi = 200)

plt.show()

We can see that housing blocks with a population between 3 and 2,133 are the most common.

In [None]:
# Population residing in a housing block
# The code to this histogram was shared by one of the cohort leaders.

from matplotlib.ticker import AutoMinorLocator

population = df['population']

# Bin width is the 20th percentile of the population column, truncated to an
# integer. A larger quantile would give fewer, wider bars and less visible
# variability.
bin_width = int(population.quantile(0.2))
bin_edges = np.arange(min(population), max(population) + bin_width, bin_width)

fig, ax = plt.subplots(1, figsize=(31,10))
counts, bins, patches = ax.hist(population, bins=bin_edges)

# Minor ticks split each major-tick interval in two; drawing the minor grid in
# white gives the centering grid between the bars.
ax.xaxis.set_minor_locator(AutoMinorLocator(2))
ax.grid(which='minor', color='white', lw = 0.5)

# One tick per bar, placed at the bin midpoint and labelled "low to high".
# Values are truncated to integers to keep the labels short and readable.
xticks = []
xticks_labels = []
for left_edge, right_edge in zip(bins[:-1], bins[1:]):
    xticks.append(int((right_edge + left_edge)/2))
    xticks_labels.append("{:d}\nto\n{:d}".format(int(left_edge), int(right_edge)))
plt.xticks(xticks, labels = xticks_labels)

# Hide the tick marks themselves; only the labels remain
ax.tick_params(axis='x', which='both', length=0)

title = 'Total Number of People Residing Within a Block'
ax.set_xlabel('Population in Block', size = 24)
ax.set_ylabel('Count', size = 24)
ax.set_title(title, size = 26)

# The title string is also used as the output file name
fig.savefig(title, dpi = 200)

We can see that median house values most commonly fall between 50,000 and 200,000 US dollars.

In [None]:
# Histogram of the median_house_value column
sns.set()

title = 'Median House Value for Households Within a Block in California'

# Explicit figure/axes pair of the requested size
fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(df['median_house_value'], color = 'coral', alpha = 0.9)

ax.set_xlabel('House Value (Measured in US Dollars)', size=16)
ax.set_ylabel('Count', size = 16)
ax.set_title(title, size = 20)

# The title string is also used as the output file name
fig.savefig(title, dpi = 200)

plt.show()

In [None]:
# Summary statistics for the columns of df
df.describe()

In [None]:
# Column names, dtypes and non-null counts of df
df.info()

# Scatter Plots

In [None]:
# Pairwise correlation between the columns of df, returned as a square table
# with one row and one column per input column
df.corr()

In [None]:
# Scatter every block by its geographic coordinates
sns.scatterplot(x = 'longitude', y = 'latitude', data = df)

In [None]:
# Scatter every block by longitude/latitude; marker size and colour both
# encode median_house_value.
title = 'Observing Relationship Between Location and Median House Value'

fig, ax = plt.subplots(figsize = (15,10))
sns.scatterplot(x=df['longitude'], y=df['latitude'],
                size=df['median_house_value'],
                sizes=(100, 400),
                # seaborn ignores `palette` unless a hue variable is assigned,
                # so hue is set to the same column as size.
                hue=df['median_house_value'],
                palette='Blues_r',
                ax=ax)

# The title is set through the Axes. The previous `plt.title=(title)` was an
# assignment, not a call: it drew no title and replaced pyplot's title
# function with a string for the rest of the session.
ax.set_title(title)

# The title string is also used as the output file name
fig.savefig(title, dpi = 200)
plt.show()

In [None]:
# Scatter plot with a fitted regression line: median_income against
# median_house_value.
title = 'Median Income Relation to Median House Value'

fig, ax = plt.subplots(figsize = (10, 6))

# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally.
sns.regplot(x=df['median_income'], y=df['median_house_value'], ax=ax)

# The title is set through the Axes. The previous `plt.title=(title)` was an
# assignment, not a call: it drew no title and replaced pyplot's title
# function with a string for the rest of the session.
ax.set_title(title)

# The title string is also used as the output file name
fig.savefig(title, dpi = 200)
plt.show()

# Split Data into X and y

In [None]:
# Set X: keep every column except the target, median_house_value
feature_columns = [col for col in df.columns if col != 'median_house_value']
X = df[feature_columns]
X.head()

In [None]:
# Set y: the target column, selected by name.
# Selecting by label instead of the positional index 8 keeps y tied to
# median_house_value even if the column order changes, and matches how X is
# built (every column except median_house_value).
y = df['median_house_value']
y.head()

# Building the Model

We will be using Regression Analysis for this dataset



In [None]:
# Import the LinearRegression estimator and create an instance with its
# default parameters (nothing is fitted yet)
from sklearn.linear_model import LinearRegression
model = LinearRegression()

What type is the variable `model`?

In [None]:
# Show the class of the object stored in model
type(model)

Call `model.get_params()` to view the parameters of your model. These are the model defaults, which may be changed later.

In [None]:
# Show the parameter settings of the estimator
model.get_params()

Fit model to the data.

In [None]:
# Fit the regression on all of X and y
# NOTE(review): no train/test split is made in the cells shown here
model.fit(X, y)

View the weights of your model, also known as the multipliers, or coefficients, by entering `model.coef_` in the cell below.

In [None]:
# Fitted coefficients of the linear model
# NOTE(review): presumably one value per column of X, in column order — confirm
model.coef_

# Scoring the Model

Now score the model

In [None]:
# Score the model on the same X and y it was fitted on, so this is an
# in-sample score, not an estimate of performance on unseen data
model.score(X, y)

# Making Predictions

In [None]:
# Predictions for the first five rows of X
model.predict(X.head(5))

In [None]:
# Actual target values for the same first five rows, for comparison
y.head(5)