In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
apartment_data = pd.read_csv('apartment_data.csv')

In [3]:
apartment_data.shape

(99188, 9)

# Filtering Outliers

In [4]:
# Calculate the first quartile (Q1) and third quartile (Q3) of the price data
Q1 = apartment_data['price'].quantile(0.25)
Q3 = apartment_data['price'].quantile(0.75)

# Calculate the interquartile range (IQR)
IQR = Q3 - Q1

# Define the lower and upper bounds to filter out outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out prices outside of the IQR range
filtered_apartment_data = apartment_data[(apartment_data['price'] >= lower_bound) & (apartment_data['price'] <= upper_bound)]

print("Original DataFrame:")
display(apartment_data)
print("\nFiltered DataFrame (prices within the IQR range):")
filtered_apartment_data

Original DataFrame:


Unnamed: 0,bathrooms,bedrooms,price,square_feet,cityname,state,latitude,longitude,region
0,1.0,1.0,2195.0,542,Redondo Beach,CA,33.8520,-118.3759,West
1,1.5,3.0,1250.0,1500,Newport News,VA,37.0867,-76.4941,South
2,2.0,3.0,1395.0,1650,Raleigh,NC,35.8230,-78.6438,South
3,1.0,2.0,1600.0,820,Vacaville,CA,38.3622,-121.9712,West
4,1.0,1.0,975.0,624,Albuquerque,NM,35.1038,-106.6110,West
...,...,...,...,...,...,...,...,...,...
99183,1.0,1.0,780.0,605,Houston,TX,29.6151,-95.1998,South
99184,2.0,2.0,813.0,921,Jacksonville,FL,30.2254,-81.7579,South
99185,1.0,1.0,1325.0,650,San Diego,CA,32.7379,-117.0914,West
99186,1.0,1.0,931.0,701,Huntersville,NC,35.4158,-80.8451,South



Filtered DataFrame (prices within the IQR range):


Unnamed: 0,bathrooms,bedrooms,price,square_feet,cityname,state,latitude,longitude,region
0,1.0,1.0,2195.0,542,Redondo Beach,CA,33.8520,-118.3759,West
1,1.5,3.0,1250.0,1500,Newport News,VA,37.0867,-76.4941,South
2,2.0,3.0,1395.0,1650,Raleigh,NC,35.8230,-78.6438,South
3,1.0,2.0,1600.0,820,Vacaville,CA,38.3622,-121.9712,West
4,1.0,1.0,975.0,624,Albuquerque,NM,35.1038,-106.6110,West
...,...,...,...,...,...,...,...,...,...
99183,1.0,1.0,780.0,605,Houston,TX,29.6151,-95.1998,South
99184,2.0,2.0,813.0,921,Jacksonville,FL,30.2254,-81.7579,South
99185,1.0,1.0,1325.0,650,San Diego,CA,32.7379,-117.0914,West
99186,1.0,1.0,931.0,701,Huntersville,NC,35.4158,-80.8451,South


In [5]:
exclude_bathrooms = [4.0, 4.5, 5.0, 6.0]

filtered_apartment_data = filtered_apartment_data[~filtered_apartment_data['bathrooms'].isin(exclude_bathrooms)]

In [6]:
filtered_apartment_data['bathrooms'].value_counts()

bathrooms
1.0    55008
2.0    33903
1.5     3032
2.5     1763
3.0      633
3.5      149
Name: count, dtype: int64

In [7]:
exclude_bedrooms = [6.0, 8.0]

filtered_apartment_data = filtered_apartment_data[~filtered_apartment_data['bedrooms'].isin(exclude_bedrooms)]

In [8]:
filtered_apartment_data['bedrooms'].value_counts()

bedrooms
2.0    43441
1.0    39377
3.0     9693
4.0     1190
0.0      628
5.0      146
Name: count, dtype: int64

In [9]:
# Calculate the first quartile (Q1) and third quartile (Q3) of the price data
Q1 = apartment_data['square_feet'].quantile(0.25)
Q3 = apartment_data['square_feet'].quantile(0.75)

# Calculate the interquartile range (IQR)
IQR = Q3 - Q1

# Define the lower and upper bounds to filter out outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out prices outside of the IQR range
filtered_apartment_data = filtered_apartment_data[(filtered_apartment_data['square_feet'] >= lower_bound) & (filtered_apartment_data['square_feet'] <= upper_bound)]

print("\nFiltered DataFrame (square_feet within the IQR range):")
filtered_apartment_data


Filtered DataFrame (square_feet within the IQR range):


Unnamed: 0,bathrooms,bedrooms,price,square_feet,cityname,state,latitude,longitude,region
0,1.0,1.0,2195.0,542,Redondo Beach,CA,33.8520,-118.3759,West
1,1.5,3.0,1250.0,1500,Newport News,VA,37.0867,-76.4941,South
2,2.0,3.0,1395.0,1650,Raleigh,NC,35.8230,-78.6438,South
3,1.0,2.0,1600.0,820,Vacaville,CA,38.3622,-121.9712,West
4,1.0,1.0,975.0,624,Albuquerque,NM,35.1038,-106.6110,West
...,...,...,...,...,...,...,...,...,...
99183,1.0,1.0,780.0,605,Houston,TX,29.6151,-95.1998,South
99184,2.0,2.0,813.0,921,Jacksonville,FL,30.2254,-81.7579,South
99185,1.0,1.0,1325.0,650,San Diego,CA,32.7379,-117.0914,West
99186,1.0,1.0,931.0,701,Huntersville,NC,35.4158,-80.8451,South


In [10]:
filtered_apartment_data.dtypes

bathrooms      float64
bedrooms       float64
price          float64
square_feet      int64
cityname        object
state           object
latitude       float64
longitude      float64
region          object
dtype: object

In [11]:
filtered_apartment_data.to_csv('filtered_apartment_data.csv', index=False)

# Machine Learning

### Perform Train Test Split

In [12]:
features = filtered_apartment_data.drop(columns = ['price', 'cityname', 'state'])
target = filtered_apartment_data['price']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state=0)

In [14]:
#Separate numerical and categorical columns

X_train_cat= X_train.select_dtypes(include=object)
X_train_num= X_train.select_dtypes(include=np.number)

X_test_cat= X_test.select_dtypes(include=object)
X_test_num= X_test.select_dtypes(include=np.number)

In [15]:
#Transform Numerical Columns (TRAIN)

transformer = MinMaxScaler().fit(X_train_num)
X_train_num_norm = transformer.transform(X_train_num)
X_train_num_scale = pd.DataFrame(X_train_num_norm, index = X_train_num.index, columns=X_train_num.columns).reset_index(drop=True)
X_train_num_scale.head()

Unnamed: 0,bathrooms,bedrooms,square_feet,latitude,longitude
0,0.0,0.2,0.494397,0.320618,0.45513
1,0.4,0.4,0.517469,0.344341,0.766547
2,0.4,0.4,0.451549,0.37957,0.499961
3,0.0,0.2,0.477917,0.437866,0.936717
4,0.4,0.6,0.774555,0.228482,0.704778


In [16]:
#Transform Numerical Columns (TEST)

transformer = MinMaxScaler().fit(X_test_num)
X_test_num_norm = transformer.transform(X_test_num)
X_test_num_scale = pd.DataFrame(X_test_num_norm, index = X_test_num.index, columns=X_test_num.columns).reset_index(drop=True)
X_test_num_scale.head()

Unnamed: 0,bathrooms,bedrooms,square_feet,latitude,longitude
0,0.4,0.6,0.702667,0.301459,0.517074
1,0.0,0.2,0.273333,0.50078,0.695387
2,0.0,0.2,0.462667,0.461378,0.594107
3,0.0,0.4,0.346,0.462191,0.594635
4,0.4,0.4,0.909333,0.211486,0.701517


In [17]:
#Encode Categorical Columns (TRAIN)

encoded_X_train_cat = pd.get_dummies(X_train_cat)
encoded_X_train_cat.head()

Unnamed: 0,region_Midwest,region_Northeast,region_South,region_West
44189,False,False,False,True
79092,False,False,True,False
56787,False,False,False,True
45247,False,True,False,False
51233,False,False,True,False


In [18]:
#Encode Categorical Columns (TEST)

encoded_X_test_cat = pd.get_dummies(X_test_cat)
encoded_X_test_cat.head()

Unnamed: 0,region_Midwest,region_Northeast,region_South,region_West
87822,False,False,False,True
78608,True,False,False,False
21372,False,False,False,True
76256,False,False,False,True
84480,False,False,True,False


## KNN Regressor

In [19]:
from sklearn.neighbors import KNeighborsRegressor

In [20]:
#Numericals Scaled

knn = KNeighborsRegressor(n_neighbors=10)

knn.fit(X_train_num_scale, y_train)

knn.score(X_test_num_scale, y_test)

0.46784914336399674

In [21]:
#Not Scaled with no n_neighbors (default)

knn = KNeighborsRegressor()

knn.fit(X_train_num, y_train)

knn.score(X_test_num, y_test)

0.6124592944204625

In [22]:
#Not Scaled with n_neighbors at 10

knn = KNeighborsRegressor(n_neighbors=10)

knn.fit(X_train_num, y_train)

knn.score(X_test_num, y_test)

0.5856695762369845

## Linear Regression

In [23]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

In [24]:
lin_reg.fit(X_train_num_scale, y_train)

In [25]:
#Numericals Scaled

pred = lin_reg.predict(X_test_num_scale)

print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", mean_squared_error(y_test, pred, squared=False))
print("R2 score", lin_reg.score(X_test_num_scale, y_test))

MAE 381.3846625889636
RMSE 484.52746055883154
R2 score 0.1253887917283475




In [26]:
lin_reg.fit(X_train_num, y_train)

In [27]:
#Not Scaled

pred = lin_reg.predict(X_test_num)

print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", mean_squared_error(y_test, pred, squared=False))
print("R2 score", lin_reg.score(X_test_num, y_test))

MAE 380.94235578936133
RMSE 484.2012799494534
R2 score 0.12656596006411625




## Bagging and Pasting

In [28]:
bagging_reg = BaggingRegressor(DecisionTreeRegressor(max_depth=20),
                               n_estimators=100,
                               max_samples = 1000)

In [29]:
bagging_reg.fit(X_train_num_scale, y_train)

In [30]:
#Numericals Scaled

pred = bagging_reg.predict(X_test_num_scale)

print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", mean_squared_error(y_test, pred, squared=False))
print("R2 score", lin_reg.score(X_test_num_scale, y_test))

MAE 263.56678128768164
RMSE 352.0681867328382
R2 score -10.040618135155222




In [31]:
bagging_reg.fit(X_train_num, y_train)

In [32]:
#Not Scaled

pred = bagging_reg.predict(X_test_num)

print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", mean_squared_error(y_test, pred, squared=False))
print("R2 score", lin_reg.score(X_test_num, y_test))

MAE 216.10903125062012
RMSE 288.58916196741035
R2 score 0.12656596006411625




## Random Forest

In [33]:
forest = RandomForestRegressor(n_estimators=100,
                             max_depth=20)

In [35]:
forest.fit(X_train_num_scale, y_train)

In [36]:
#Numericals Scaled

pred = forest.predict(X_test_num_scale)

print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", mean_squared_error(y_test, pred, squared=False))
print("R2 score", forest.score(X_test_num_scale, y_test))

MAE 295.8851810544434
RMSE 410.42601082815054




R2 score 0.37245048227777056


In [37]:
forest.fit(X_train_num, y_train)

In [38]:
#Not Scaled

pred = forest.predict(X_test_num)

print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", mean_squared_error(y_test, pred, squared=False))
print("R2 score", lin_reg.score(X_test_num, y_test))

MAE 137.33583109841112
RMSE 210.31877871094107
R2 score 0.12656596006411625




## AdaBoost

In [39]:
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=20),
                            n_estimators=100)

In [40]:
ada_reg.fit(X_train_num_scale, y_train)

In [41]:
#Numericals Scaled

pred = ada_reg.predict(X_test_num_scale)

print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", ada_reg.score(X_test_num_scale, y_test))

MAE 283.70685463076023
RMSE 390.42016056598374




R2 score 0.4321381163988298


In [42]:
ada_reg.fit(X_train_num, y_train)

In [43]:
#Not Scaled

pred = ada_reg.predict(X_test_num)

print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", mean_squared_error(y_test, pred, squared=False))
print("R2 score", lin_reg.score(X_test_num, y_test))

MAE 148.22105329427058
RMSE 215.87633942162324
R2 score 0.12656596006411625




## Gradient Boosting

In [44]:
gb_reg = GradientBoostingRegressor(max_depth=20,
                                   n_estimators=100)

In [45]:
gb_reg.fit(X_train_num_scale, y_train)

In [46]:
#Numericals Scaled

pred = gb_reg.predict(X_test_num_scale)

print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", gb_reg.score(X_test_num_scale, y_test))

MAE 334.2346838172417
RMSE 460.66329799654966




R2 score 0.20942063938230615


In [47]:
gb_reg.fit(X_train_num, y_train)

In [48]:
#Not Scaled

pred = gb_reg.predict(X_test_num)

print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", mean_squared_error(y_test, pred, squared=False))
print("R2 score", lin_reg.score(X_test_num, y_test))

MAE 136.52177693524706
RMSE 230.8478075743162
R2 score 0.12656596006411625




## Decision Tree

In [49]:
tree = DecisionTreeRegressor(max_depth=10)

In [50]:
tree.fit(X_train_num_scale, y_train)

In [51]:
#Numericals Scaled

pred = tree.predict(X_test_num_scale)

print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", tree.score(X_test_num_scale, y_test))

MAE 311.9328569859695
RMSE 430.31512720146526
R2 score 0.31015507431002354




In [52]:
tree.fit(X_train_num, y_train)

In [53]:
#Not Scaled

pred = tree.predict(X_test_num)

print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", mean_squared_error(y_test, pred, squared=False))
print("R2 score", lin_reg.score(X_test_num, y_test))

MAE 213.59333835455038
RMSE 290.4147313572046
R2 score 0.12656596006411625




In [54]:
#Numericals Scaled

tree_importance = {feature : importance for feature, importance in zip(X_train_num_scale.columns, tree.feature_importances_)}
tree_importance           

{'bathrooms': 0.04394622310081813,
 'bedrooms': 0.007218034698323014,
 'square_feet': 0.17893075761200075,
 'latitude': 0.24312224552623257,
 'longitude': 0.5267827390626255}

In [55]:
#Not Scaled

tree_importance = {feature : importance for feature, importance in zip(X_train_num_scale.columns, tree.feature_importances_)}
tree_importance     

{'bathrooms': 0.04394622310081813,
 'bedrooms': 0.007218034698323014,
 'square_feet': 0.17893075761200075,
 'latitude': 0.24312224552623257,
 'longitude': 0.5267827390626255}

# Hyperparameter Tuning

## Grid Search

In [None]:
grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['distance'],
    'algorithm': ['brute'],
    'p': [1, 2],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

#Best Model
#{'algorithm': 'brute', 'metric': 'manhattan', 'n_neighbors': 11, 'p': 1, 'weights': 'distance'}

#Initial K_score 0.6126916912414175 (no hyperparameter tuning)

#RESULTS of Hyperparameter Tuning
#MAE 165.46497933283416
#RMSE 276.1745440035609
#R2 score 0.7158513883473436

In [None]:
knn = KNeighborsRegressor()

In [None]:
from sklearn.model_selection import GridSearchCV

model = GridSearchCV(knn, grid, cv=5, scoring='neg_mean_absolute_error')

In [None]:
%%time

# model.fit(X_train_num, y_train)

CPU times: user 50min 2s, sys: 31.5 s, total: 50min 34s
Wall time: 7min 27s


In [None]:
# model.best_params_

{'algorithm': 'brute',
 'metric': 'manhattan',
 'n_neighbors': 11,
 'p': 1,
 'weights': 'distance'}

In [None]:
# best_model = model.best_estimator_

In [None]:
# pred = best_model.predict(X_test_num)

# print("MAE", mean_absolute_error(pred, y_test))
# print("RMSE", mean_squared_error(pred, y_test, squared=False))
# print("R2 score", best_model.score(X_test_num, y_test))

MAE 165.46497933283416
RMSE 276.1745440035609
R2 score 0.7158513883473436


## Price Classification

In [56]:
filtered_apartment_data['price'].describe()

count    92468.000000
mean      1391.748226
std        523.041696
min        100.000000
25%        995.000000
50%       1305.000000
75%       1695.000000
max       2966.000000
Name: price, dtype: float64

In [57]:
# Discretizing client age into 9 groups
# takes continuous variables and turns them in discreet variables by binning them
bins = [filtered_apartment_data['price'].min(), 1305.00, 1695.00, filtered_apartment_data['price'].max()]
labels = ['Low', 'Medium', 'High']
# create new column using the data above
filtered_apartment_data['rent_price_range'] = pd.cut(filtered_apartment_data['price'], bins=bins, labels=labels, include_lowest=True)
filtered_apartment_data.rent_price_range.value_counts()

rent_price_range
Low       46258
Medium    23346
High      22864
Name: count, dtype: int64

In [58]:
filtered_apartment_data = pd.get_dummies(filtered_apartment_data, columns=['region'])

In [59]:
features = filtered_apartment_data.drop(columns = ['price', 'cityname', 'state', 'rent_price_range'])
target = filtered_apartment_data['rent_price_range']

In [60]:
features

Unnamed: 0,bathrooms,bedrooms,square_feet,latitude,longitude,region_Midwest,region_Northeast,region_South,region_West
0,1.0,1.0,542,33.8520,-118.3759,False,False,False,True
1,1.5,3.0,1500,37.0867,-76.4941,False,False,True,False
2,2.0,3.0,1650,35.8230,-78.6438,False,False,True,False
3,1.0,2.0,820,38.3622,-121.9712,False,False,False,True
4,1.0,1.0,624,35.1038,-106.6110,False,False,False,True
...,...,...,...,...,...,...,...,...,...
99183,1.0,1.0,605,29.6151,-95.1998,False,False,True,False
99184,2.0,2.0,921,30.2254,-81.7579,False,False,True,False
99185,1.0,1.0,650,32.7379,-117.0914,False,False,False,True
99186,1.0,1.0,701,35.4158,-80.8451,False,False,True,False


In [61]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state=0)


In [62]:
forest = RandomForestClassifier(n_estimators=100)

forest.fit(X_train, y_train)

In [63]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
forest_pred = forest.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, forest_pred))
print("F1 Score:", f1_score(y_test, forest_pred, average='weighted'))
print("Confusion Matrix:\n", confusion_matrix(y_test, forest_pred))


Accuracy: 0.840705093543852
F1 Score: 0.8398945253685806
Confusion Matrix:
 [[3808  158  533]
 [ 109 8466  731]
 [ 563  852 3274]]
