<a href="https://colab.research.google.com/github/ElarizT/Machine_Learning_Projects/blob/main/US_Housing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
import os
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Open the csv file
df = pd.read_csv('US_Housing.csv')

df.head()

Unnamed: 0,Zip Code,Price,Beds,Baths,Living Space,Address,City,State,Zip Code Population,Zip Code Density,County,Median Household Income,Latitude,Longitude
0,10013,3999000.0,2,3,1967,74 GRAND ST APT 3,New York,New York,29563,20967.9,New York,370046.0,40.72001,-74.00472
1,10013,3999000.0,2,3,1967,74 GRAND ST APT 3,New York,New York,29563,20967.9,New York,370046.0,40.72001,-74.00472
2,10014,1650000.0,1,1,718,140 CHARLES ST APT 4D,New York,New York,29815,23740.9,New York,249880.0,40.73407,-74.00601
3,10014,760000.0,3,2,1538,38 JONES ST,New York,New York,29815,23740.9,New York,249880.0,40.73407,-74.00601
4,10014,1100000.0,1,1,600,81 BEDFORD ST APT 3F,New York,New York,29815,23740.9,New York,249880.0,40.73407,-74.00601


In [3]:
# Checking how many empty cells there are on each column
df.isnull().sum()

Unnamed: 0,0
Zip Code,0
Price,0
Beds,0
Baths,0
Living Space,0
Address,0
City,0
State,0
Zip Code Population,0
Zip Code Density,0


In [4]:
# Drop all empty rows
df = df.dropna()

In [5]:
# Check it again empty sum of cells to be sure
df.isnull().sum()

Unnamed: 0,0
Zip Code,0
Price,0
Beds,0
Baths,0
Living Space,0
Address,0
City,0
State,0
Zip Code Population,0
Zip Code Density,0


In [6]:
# Also drop duplicates just to be sure
df = df.drop_duplicates()

In [10]:
# Convert all columns from object to numeric
for column in df.columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])

df.head()

Unnamed: 0,Zip Code,Price,Beds,Baths,Living Space,Address,City,State,Zip Code Population,Zip Code Density,County,Median Household Income,Latitude,Longitude,State_encoded
0,10013,3999000.0,2,3,1967,33466,150,18,29563,20967.9,57,370046.0,40.72001,-74.00472,18
2,10014,1650000.0,1,1,718,5888,150,18,29815,23740.9,57,249880.0,40.73407,-74.00601,18
3,10014,760000.0,3,2,1538,20577,150,18,29815,23740.9,57,249880.0,40.73407,-74.00601,18
4,10014,1100000.0,1,1,600,35129,150,18,29815,23740.9,57,249880.0,40.73407,-74.00601,18
5,10017,764900.0,1,1,643,6596,150,18,15514,20107.7,57,188289.0,40.75235,-73.9726,18


In [57]:
# Create new columns to improve correlation with target column
df['BB'] = df['Beds'] + df['Baths']

city_center = (40.7128, -74.0060)  # Example: New York City coordinates
df['distance_to_center'] = np.sqrt(
    (df['Latitude'] - city_center[0])**2 +
    (df['Longitude'] - city_center[1])**2
)

df.head()

Unnamed: 0,Zip Code,Price,Beds,Baths,Living Space,Address,City,State,Zip Code Population,Zip Code Density,...,Median Household Income,Latitude,Longitude,State_encoded,BB,Income_per_LivingSpace,location_index,space_density_ratio,income_density_ratio,distance_to_center
0,10013,3999000.0,2,3,1967,33466,150,18,29563,20967.9,...,370046.0,40.72001,-74.00472,18,5,188.127097,-3013.472938,0.09381,17.648215,0.007323
2,10014,1650000.0,1,1,718,5888,150,18,29815,23740.9,...,249880.0,40.73407,-74.00601,18,2,348.022284,-3014.565992,0.030243,10.525296,0.02127
3,10014,760000.0,3,2,1538,20577,150,18,29815,23740.9,...,249880.0,40.73407,-74.00601,18,5,162.470741,-3014.565992,0.064783,10.525296,0.02127
4,10014,1100000.0,1,1,600,35129,150,18,29815,23740.9,...,249880.0,40.73407,-74.00601,18,2,416.466667,-3014.565992,0.025273,10.525296,0.02127
5,10017,764900.0,1,1,643,6596,150,18,15514,20107.7,...,188289.0,40.75235,-73.9726,18,2,292.828927,-3014.557286,0.031978,9.364025,0.051766


In [24]:
# Check correlation of target with each column/features
df_corr = df.corr()

df_corr_sorted = df_corr['Price'].sort_values(ascending=False)
print(df_corr_sorted)

Price                      1.000000
Living Space               0.519409
Baths                      0.461261
Median Household Income    0.416939
BB                         0.400180
Beds                       0.276238
distance_to_center         0.182737
Zip Code Density           0.153071
Zip Code                   0.112880
City                       0.081482
County                     0.030518
space_density_ratio        0.024960
income_density_ratio       0.022915
Income_per_LivingSpace    -0.013370
Address                   -0.028187
Zip Code Population       -0.042330
Latitude                  -0.059568
location_index            -0.100542
State                     -0.150874
State_encoded             -0.150874
Longitude                 -0.170149
Name: Price, dtype: float64


In [51]:
# Define features and target
features = df[['Median Household Income', 'Baths', 'distance_to_center', 'Living Space']]

X = features
y = df['Price']

In [52]:
# Split date to test and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

In [53]:
# Build a Random Forest Regressor model
model = RandomForestRegressor()
model.fit(X_train, y_train)

In [54]:
# Check the score for knowing accuracy
y_pred = model.predict(X_test)

print(r2_score(y_test, y_pred))

0.661882000670106


In [55]:
# Save the model for later usage
import joblib

# Save the trained model
joblib.dump(model, 'model_5.pkl')

['model_5.pkl']

In [56]:
# Load the model
loaded_data = joblib.load('model_5.pkl')

In [61]:
# Question inputs to find out criterias
question1 = input('What is the median household income?: ')
question2 = input('How many baths there are?:')
question3 = input('What is the distance to the center?:')
question4 = input('How big is living space?:')

# Adding criterias and question accordingly
new_data = ({
    'Median Household Income': [question1],
    'Baths': [question2],
    'distance_to_center': [question3],
    'Living Space': [question4]
})

# Converting them to new dataframe and printing predicted price
new_df = pd.DataFrame(new_data)

predictions = loaded_data.predict(new_df)
print(f"Prediction of the price is {predictions}")

What is the median household income?: 140000
How many baths there are?:3
What is the distance to the center?:0.4
How big is living space?:2400
Prediction of the price is [1104851.89]
