In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, DecisionTreeClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
apartment_data = pd.read_csv('apartment_data.csv')

In [3]:
apartment_data.shape

(99188, 9)

# Filtering Outliers

In [4]:
# Calculate the first quartile (Q1) and third quartile (Q3) of the price data
Q1 = apartment_data['price'].quantile(0.25)
Q3 = apartment_data['price'].quantile(0.75)

# Calculate the interquartile range (IQR)
IQR = Q3 - Q1

# Define the lower and upper bounds to filter out outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out prices outside of the IQR range
filtered_apartment_data = apartment_data[(apartment_data['price'] >= lower_bound) & (apartment_data['price'] <= upper_bound)]

print("Original DataFrame:")
display(apartment_data)
print("\nFiltered DataFrame (prices within the IQR range):")
filtered_apartment_data

Original DataFrame:


Unnamed: 0,bathrooms,bedrooms,price,square_feet,cityname,state,latitude,longitude,region
0,1.0,1.0,2195.0,542,Redondo Beach,CA,33.8520,-118.3759,West
1,1.5,3.0,1250.0,1500,Newport News,VA,37.0867,-76.4941,South
2,2.0,3.0,1395.0,1650,Raleigh,NC,35.8230,-78.6438,South
3,1.0,2.0,1600.0,820,Vacaville,CA,38.3622,-121.9712,West
4,1.0,1.0,975.0,624,Albuquerque,NM,35.1038,-106.6110,West
...,...,...,...,...,...,...,...,...,...
99183,1.0,1.0,780.0,605,Houston,TX,29.6151,-95.1998,South
99184,2.0,2.0,813.0,921,Jacksonville,FL,30.2254,-81.7579,South
99185,1.0,1.0,1325.0,650,San Diego,CA,32.7379,-117.0914,West
99186,1.0,1.0,931.0,701,Huntersville,NC,35.4158,-80.8451,South



Filtered DataFrame (prices within the IQR range):


Unnamed: 0,bathrooms,bedrooms,price,square_feet,cityname,state,latitude,longitude,region
0,1.0,1.0,2195.0,542,Redondo Beach,CA,33.8520,-118.3759,West
1,1.5,3.0,1250.0,1500,Newport News,VA,37.0867,-76.4941,South
2,2.0,3.0,1395.0,1650,Raleigh,NC,35.8230,-78.6438,South
3,1.0,2.0,1600.0,820,Vacaville,CA,38.3622,-121.9712,West
4,1.0,1.0,975.0,624,Albuquerque,NM,35.1038,-106.6110,West
...,...,...,...,...,...,...,...,...,...
99183,1.0,1.0,780.0,605,Houston,TX,29.6151,-95.1998,South
99184,2.0,2.0,813.0,921,Jacksonville,FL,30.2254,-81.7579,South
99185,1.0,1.0,1325.0,650,San Diego,CA,32.7379,-117.0914,West
99186,1.0,1.0,931.0,701,Huntersville,NC,35.4158,-80.8451,South


In [5]:
exclude_bathrooms = [4.0, 4.5, 5.0, 6.0]

filtered_apartment_data = filtered_apartment_data[~filtered_apartment_data['bathrooms'].isin(exclude_bathrooms)]

In [6]:
filtered_apartment_data['bathrooms'].value_counts()

bathrooms
1.0    55008
2.0    33903
1.5     3032
2.5     1763
3.0      633
3.5      149
Name: count, dtype: int64

In [7]:
exclude_bedrooms = [6.0, 8.0]

filtered_apartment_data = filtered_apartment_data[~filtered_apartment_data['bedrooms'].isin(exclude_bedrooms)]

In [8]:
filtered_apartment_data['bedrooms'].value_counts()

bedrooms
2.0    43441
1.0    39377
3.0     9693
4.0     1190
0.0      628
5.0      146
Name: count, dtype: int64

In [9]:
# Calculate the first quartile (Q1) and third quartile (Q3) of the price data
Q1 = apartment_data['square_feet'].quantile(0.25)
Q3 = apartment_data['square_feet'].quantile(0.75)

# Calculate the interquartile range (IQR)
IQR = Q3 - Q1

# Define the lower and upper bounds to filter out outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out prices outside of the IQR range
filtered_apartment_data = filtered_apartment_data[(filtered_apartment_data['square_feet'] >= lower_bound) & (filtered_apartment_data['square_feet'] <= upper_bound)]

print("\nFiltered DataFrame (square_feet within the IQR range):")
filtered_apartment_data


Filtered DataFrame (square_feet within the IQR range):


Unnamed: 0,bathrooms,bedrooms,price,square_feet,cityname,state,latitude,longitude,region
0,1.0,1.0,2195.0,542,Redondo Beach,CA,33.8520,-118.3759,West
1,1.5,3.0,1250.0,1500,Newport News,VA,37.0867,-76.4941,South
2,2.0,3.0,1395.0,1650,Raleigh,NC,35.8230,-78.6438,South
3,1.0,2.0,1600.0,820,Vacaville,CA,38.3622,-121.9712,West
4,1.0,1.0,975.0,624,Albuquerque,NM,35.1038,-106.6110,West
...,...,...,...,...,...,...,...,...,...
99183,1.0,1.0,780.0,605,Houston,TX,29.6151,-95.1998,South
99184,2.0,2.0,813.0,921,Jacksonville,FL,30.2254,-81.7579,South
99185,1.0,1.0,1325.0,650,San Diego,CA,32.7379,-117.0914,West
99186,1.0,1.0,931.0,701,Huntersville,NC,35.4158,-80.8451,South


In [10]:
filtered_apartment_data.dtypes

bathrooms      float64
bedrooms       float64
price          float64
square_feet      int64
cityname        object
state           object
latitude       float64
longitude      float64
region          object
dtype: object

# Machine Learning

### Perform Train Test Split

In [11]:
features = apartment_data.drop(columns = ['price', 'cityname', 'state'])
target = apartment_data['price']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state=0)

In [13]:
#Separate numerical and categorical columns

X_train_cat= X_train.select_dtypes(include=object)
X_train_num= X_train.select_dtypes(include=np.number)

X_test_cat= X_test.select_dtypes(include=object)
X_test_num= X_test.select_dtypes(include=np.number)

In [14]:
#Transform Numerical Columns (TRAIN)

transformer = MinMaxScaler().fit(X_train_num)
X_train_num_norm = transformer.transform(X_train_num)
X_train_num_scale = pd.DataFrame(X_train_num_norm, index = X_train_num.index, columns=X_train_num.columns).reset_index(drop=True)
X_train_num_scale.head()

Unnamed: 0,bathrooms,bedrooms,square_feet,latitude,longitude
0,0.0,0.111111,0.051123,0.316856,0.46267
1,0.125,0.222222,0.126545,0.289129,0.465944
2,0.0,0.222222,0.062474,0.276208,0.863663
3,0.125,0.222222,0.060624,0.290905,0.465043
4,0.125,0.222222,0.098209,0.173805,0.847692


In [15]:
#Transform Numerical Columns (TEST)

transformer = MinMaxScaler().fit(X_test_num)
X_test_num_norm = transformer.transform(X_test_num)
X_test_num_scale = pd.DataFrame(X_test_num_norm, index = X_test_num.index, columns=X_test_num.columns).reset_index(drop=True)
X_test_num_scale.head()

Unnamed: 0,bathrooms,bedrooms,square_feet,latitude,longitude
0,0.0,0.166667,0.038643,0.255318,0.535566
1,0.2,0.5,0.164552,0.370516,0.478541
2,0.2,0.333333,0.114074,0.362282,0.889514
3,0.2,0.5,0.146015,0.45731,0.805625
4,0.0,0.166667,0.044204,0.543398,0.78517


In [16]:
#Encode Categorical Columns (TRAIN)

encoded_X_train_cat = pd.get_dummies(X_train_cat)
encoded_X_train_cat.head()

Unnamed: 0,region_Midwest,region_Northeast,region_South,region_West
56854,False,False,False,True
3997,False,False,False,True
85239,False,False,True,False
82697,False,False,False,True
33681,False,False,True,False


In [17]:
#Encode Categorical Columns (TEST)

encoded_X_test_cat = pd.get_dummies(X_test_cat)
encoded_X_test_cat.head()

Unnamed: 0,region_Midwest,region_Northeast,region_South,region_West
54841,False,False,False,True
44135,False,False,False,True
23266,False,False,True,False
3108,True,False,False,False
3780,True,False,False,False


## KNN Regressor

In [18]:
from sklearn.neighbors import KNeighborsRegressor

In [19]:
knn = KNeighborsRegressor(n_neighbors=10)

knn.fit(X_train_num_scale, y_train)

knn.score(X_test_num_scale, y_test)

-0.07285383300445436

In [20]:
knn = KNeighborsRegressor(n_neighbors=10)

knn.fit(X_train_num, y_train)

knn.score(X_test_num, y_test)

0.49564329749167857

In [21]:
#From 49.5%

## Linear Regression

In [22]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

In [23]:
lin_reg.fit(X_train_num_scale, y_train)

In [24]:
lin_reg.fit(X_train_num, y_train)

In [25]:
pred = lin_reg.predict(X_test_num_scale)

print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", mean_squared_error(y_test, pred, squared=False))
print("R2 score", lin_reg.score(X_test_num, y_test))

MAE 2484.32505422429
RMSE 2626.194820833969
R2 score 0.19115184065090307




In [26]:
pred = lin_reg.predict(X_test_num)

print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", mean_squared_error(y_test, pred, squared=False))
print("R2 score", lin_reg.score(X_test_num, y_test))

MAE 493.9943100669866
RMSE 767.4198681935671
R2 score 0.19115184065090307




## Bagging and Pasting

In [27]:
bagging_reg = BaggingRegressor(DecisionTreeRegressor(max_depth=20),
                               n_estimators=100,
                               max_samples = 1000)

In [28]:
bagging_reg.fit(X_train_num_scale, y_train)

In [29]:
bagging_reg.fit(X_train_num, y_train)

In [30]:
pred = bagging_reg.predict(X_test_num_scale)

print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", mean_squared_error(y_test, pred, squared=False))
print("R2 score", lin_reg.score(X_test_num_scale, y_test))

MAE 663.113699465672
RMSE 911.0148906913554
R2 score -8.472286059963375




In [31]:
pred = bagging_reg.predict(X_test_num)

print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", mean_squared_error(y_test, pred, squared=False))
print("R2 score", lin_reg.score(X_test_num, y_test))

MAE 274.3932828557562
RMSE 524.8976277220972
R2 score 0.19115184065090307




## Random Forest

In [32]:
forest = RandomForestRegressor(n_estimators=100,
                             max_depth=20)

In [33]:
forest.fit(X_train_num_scale, y_train)

In [34]:
forest.fit(X_train_num, y_train)

In [35]:
pred = forest.predict(X_test_num_scale)

print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", mean_squared_error(y_test, pred, squared=False))
print("R2 score", forest.score(X_test_num_scale, y_test))

MAE 518.1192027086064
RMSE 873.4225720247715




R2 score -0.04773088605158482


In [36]:
pred = forest.predict(X_test_num)

print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", mean_squared_error(y_test, pred, squared=False))
print("R2 score", lin_reg.score(X_test_num, y_test))

MAE 181.96792794088702
RMSE 425.18987924569336
R2 score 0.19115184065090307




## AdaBoost

In [37]:
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=20),
                            n_estimators=100)

In [38]:
ada_reg.fit(X_train_num_scale, y_train)

In [39]:
ada_reg.fit(X_train_num, y_train)

In [40]:
pred = ada_reg.predict(X_test_num_scale)

print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", ada_reg.score(X_test_num_scale, y_test))

MAE 560.0225330174412
RMSE 861.9276375597657




R2 score -0.02033441932268687


In [41]:
pred = ada_reg.predict(X_test_num_scale)

print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", mean_squared_error(y_test, pred, squared=False))
print("R2 score", lin_reg.score(X_test_num, y_test))

MAE 560.0225330174412
RMSE 861.9276375597657
R2 score 0.19115184065090307




## Gradient Boosting

In [42]:
gb_reg = GradientBoostingRegressor(max_depth=20,
                                   n_estimators=100)

In [43]:
gb_reg.fit(X_train_num_scale, y_train)

In [44]:
gb_reg.fit(X_train_num, y_train)

In [45]:
pred = gb_reg.predict(X_test_num_scale)

print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", gb_reg.score(X_test_num_scale, y_test))

MAE 1086.214244164797
RMSE 1261.301042063284
R2 score -1.1849345551110586




In [46]:
pred = gb_reg.predict(X_test_num_scale)

print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", mean_squared_error(y_test, pred, squared=False))
print("R2 score", lin_reg.score(X_test_num, y_test))

MAE 1086.214244164797
RMSE 1261.301042063284
R2 score 0.19115184065090307




## Decision Tree

In [47]:
tree = DecisionTreeRegressor(max_depth=10)

In [48]:
tree.fit(X_train_num_scale, y_train)

In [49]:
tree.fit(X_train_num, y_train)

In [50]:
pred = tree.predict(X_test_num_scale)

print("MAE", mean_absolute_error(pred, y_test))
print("RMSE", mean_squared_error(pred, y_test, squared=False))
print("R2 score", tree.score(X_test_num_scale, y_test))

MAE 2389.7767421111
RMSE 2497.566589009985
R2 score -7.567124458342672




In [51]:
pred = tree.predict(X_test_num_scale)

print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", mean_squared_error(y_test, pred, squared=False))
print("R2 score", lin_reg.score(X_test_num, y_test))

MAE 2389.7767421111
RMSE 2497.566589009985
R2 score 0.19115184065090307




In [52]:
tree_importance = {feature : importance for feature, importance in zip(X_train_num_scale.columns, tree.feature_importances_)}
tree_importance           

{'bathrooms': 0.1678478566594723,
 'bedrooms': 0.025707161878496757,
 'square_feet': 0.24440713776915754,
 'latitude': 0.17726023373860883,
 'longitude': 0.38477760995426463}

### Price Classification

In [72]:
filtered_apartment_data['price'].describe()

count    92468.000000
mean      1391.748226
std        523.041696
min        100.000000
25%        995.000000
50%       1305.000000
75%       1695.000000
max       2966.000000
Name: price, dtype: float64

In [73]:
# Discretizing client age into 9 groups
# takes continuous variables and turns them in discreet variables by binning them
bins = [filtered_apartment_data['price'].min(), 1305.00, 1695.00, filtered_apartment_data['price'].max()]
labels = ['Low', 'Medium', 'High']
# create new column using the data above
filtered_apartment_data['rent_price_range'] = pd.cut(filtered_apartment_data['price'], bins=bins, labels=labels, include_lowest=True)
filtered_apartment_data.rent_price_range.value_counts()

rent_price_range
Low       46258
Medium    23346
High      22864
Name: count, dtype: int64

In [74]:
filtered_apartment_data

Unnamed: 0,bathrooms,bedrooms,price,square_feet,cityname,state,latitude,longitude,region,rent_price_range
0,1.0,1.0,2195.0,542,Redondo Beach,CA,33.8520,-118.3759,West,High
1,1.5,3.0,1250.0,1500,Newport News,VA,37.0867,-76.4941,South,Low
2,2.0,3.0,1395.0,1650,Raleigh,NC,35.8230,-78.6438,South,Medium
3,1.0,2.0,1600.0,820,Vacaville,CA,38.3622,-121.9712,West,Medium
4,1.0,1.0,975.0,624,Albuquerque,NM,35.1038,-106.6110,West,Low
...,...,...,...,...,...,...,...,...,...,...
99183,1.0,1.0,780.0,605,Houston,TX,29.6151,-95.1998,South,Low
99184,2.0,2.0,813.0,921,Jacksonville,FL,30.2254,-81.7579,South,Low
99185,1.0,1.0,1325.0,650,San Diego,CA,32.7379,-117.0914,West,Medium
99186,1.0,1.0,931.0,701,Huntersville,NC,35.4158,-80.8451,South,Low


In [77]:
filtered_apartment_data = pd.get_dummies(filtered_apartment_data, columns=['region'])

In [78]:
features = filtered_apartment_data.drop(columns = ['price', 'cityname', 'state', 'rent_price_range'])
target = filtered_apartment_data['rent_price_range']

In [79]:
features

Unnamed: 0,bathrooms,bedrooms,square_feet,latitude,longitude,region_Midwest,region_Northeast,region_South,region_West
0,1.0,1.0,542,33.8520,-118.3759,False,False,False,True
1,1.5,3.0,1500,37.0867,-76.4941,False,False,True,False
2,2.0,3.0,1650,35.8230,-78.6438,False,False,True,False
3,1.0,2.0,820,38.3622,-121.9712,False,False,False,True
4,1.0,1.0,624,35.1038,-106.6110,False,False,False,True
...,...,...,...,...,...,...,...,...,...
99183,1.0,1.0,605,29.6151,-95.1998,False,False,True,False
99184,2.0,2.0,921,30.2254,-81.7579,False,False,True,False
99185,1.0,1.0,650,32.7379,-117.0914,False,False,False,True
99186,1.0,1.0,701,35.4158,-80.8451,False,False,True,False


In [80]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state=0)

In [81]:
forest = RandomForestClassifier(n_estimators=100)

forest.fit(X_train, y_train)

In [82]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
forest_pred = forest.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, forest_pred))
print("F1 Score:", f1_score(y_test, forest_pred, average='weighted'))
print("Confusion Matrix:\n", confusion_matrix(y_test, forest_pred))


Accuracy: 0.8410295230885693
F1 Score: 0.8401375241263545
Confusion Matrix:
 [[3807  160  532]
 [ 109 8479  718]
 [ 583  838 3268]]
