In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings("ignore")

In [5]:
# uncomment if using colab
# ! git clone https://github.com/Daniel-Tran3/CSE_151A_Project.git

fatal: destination path 'CSE_151A_Project' already exists and is not an empty directory.


In [9]:
original_df = pd.read_csv('apartments_for_rent_classified_10K_utf.csv')

# uncomment the following if using colab
# original_df = pd.read_csv('CSE_151A_Project/apartments_for_rent_classified_10K_utf.csv')

In [31]:
df = original_df[["amenities", "bathrooms", "bedrooms", "fee", "price", "price_type", "square_feet", "cityname", "state", "time"]]

In [11]:
df.head(5)

Unnamed: 0,amenities,bathrooms,bedrooms,fee,price,price_type,square_feet,cityname,state,time
0,,,0.0,No,790,Monthly,101,Washington,DC,1577359415
1,,,1.0,No,425,Monthly,106,Evansville,IN,1577017063
2,,1.0,0.0,No,1390,Monthly,107,Arlington,VA,1577359410
3,,1.0,0.0,No,925,Monthly,116,Seattle,WA,1576667743
4,,,0.0,No,880,Monthly,125,Arlington,VA,1577359401


In [21]:
df.isna().mean()

amenities      0.354871
bathrooms      0.003401
bedrooms       0.000700
fee            0.000000
price          0.000000
price_type     0.000000
square_feet    0.000000
cityname       0.007702
state          0.007702
time           0.000000
dtype: float64

## Convert timestamp to datetime objects

In [13]:
df['time'] = pd.to_datetime(df['time'], unit='s')

In [14]:
df.head(2)

Unnamed: 0,amenities,bathrooms,bedrooms,fee,price,price_type,square_feet,cityname,state,time
0,,,0.0,No,790,Monthly,101,Washington,DC,2019-12-26 11:23:35
1,,,1.0,No,425,Monthly,106,Evansville,IN,2019-12-22 12:17:43


## Normalizing Price and Square Feet


Convert any prices measured in Weeks to Months by converting to price per day then price per year then price per month (roughly 4.348 multiplier).

There is one abnormally high price with 'weekly' price_type. The price for 'week' is 1560, and digging into the dataset, we found that it is a duplicate of index 15. The listing is for the same place except it's on another listing website. This duplicate will be dropped in favor of the other one.

There is also one record with 'Monthly|Weekely' price_type with a price of 275.0, and the price is likely to be on a weekly basis.

In [32]:
df[df['price'] == 1560][:2]

Unnamed: 0,amenities,bathrooms,bedrooms,fee,price,price_type,square_feet,cityname,state,time
15,"AC,Basketball,Cable or Satellite,Gym,Internet ...",1.0,1.0,No,1560,Weekly,200,New Bern,NC,1576618076
16,"AC,Basketball,Cable or Satellite,Gym,Internet ...",1.0,1.0,No,1560,Monthly,200,New Bern,NC,1576406273


In [33]:
# Drop the wrong record?
# df = df.drop([16,17])

df = df.drop([15,16])

In [34]:
df[df['price_type'] == 'Monthly|Weekly']

Unnamed: 0,amenities,bathrooms,bedrooms,fee,price,price_type,square_feet,cityname,state,time
235,"Cable or Satellite,Pool,Refrigerator,Storage,TV",,,No,275,Monthly|Weekly,300,Lakeland,FL,1574891000


In [35]:
for i in df.index:
  if (df['price_type'][i] == "Weekly") or (df['price_type'][i] == "Monthly/Weekly"):
    df.loc[i, 'price'] = df['price'][i] / 7 * 365.25 / 12

df['price'] = (df['price'] - df['price'].min()) / (df['price'].max() - df['price'].min())

In [36]:
print(df['price'].min())
print(df['price'].max())

0.0
1.0


Since we have standardized the units of the price, we can drop the price_type column.

In [37]:
df = df.drop(columns=['price_type'])

## Normalize the square_feet using Min Max method

In [38]:
df['square_feet'] = (df['square_feet'] - df['square_feet'].min()) / (df['square_feet'].max() - df['square_feet'].min())

In [39]:
print(df['square_feet'].min())
print(df['square_feet'].max())

0.0
1.0


## Impute NaN values using median

In [40]:
df['bathrooms'] = df['bathrooms'].replace({np.nan: df['bathrooms'].median()})

In [41]:
df['bedrooms'] = df['bedrooms'].replace({np.nan: df['bedrooms'].median()})

## Preprocessing on the amenities

In [42]:
df['amenities'] = np.array(df['amenities'].str.strip().str.split(","))

In [43]:
df

Unnamed: 0,amenities,bathrooms,bedrooms,fee,price,square_feet,cityname,state,time
0,,1.0,0.0,No,0.011281,0.000000,Washington,DC,1577359415
1,,1.0,1.0,No,0.004302,0.000125,Evansville,IN,1577017063
2,,1.0,0.0,No,0.022753,0.000150,Arlington,VA,1577359410
3,,1.0,0.0,No,0.013862,0.000376,Seattle,WA,1576667743
4,,1.0,0.0,No,0.013002,0.000602,Arlington,VA,1577359401
...,...,...,...,...,...,...,...,...,...
9995,,4.0,5.0,No,0.110899,0.155367,Edina,MN,1575112975
9996,,8.0,6.0,No,0.474187,0.215920,Montecito,CA,1577360419
9997,,8.5,6.0,No,0.206501,0.281135,Potomac,MD,1577360560
9998,"[Basketball, Cable or Satellite, Doorman, Hot ...",1.0,1.0,No,0.087763,1.000000,New York,NY,1577362186


In [None]:
df['amenities'] = df['amenities'].replace({np.nan: 'None'})

In [None]:
df.head(5)

Unnamed: 0,amenities,bathrooms,bedrooms,fee,price,square_feet,cityname,state,time
0,,1.0,0.0,No,0.011281,0.0,Washington,DC,2019-12-26 11:23:35
1,,1.0,1.0,No,0.004302,0.000125,Evansville,IN,2019-12-22 12:17:43
2,,1.0,0.0,No,0.022753,0.00015,Arlington,VA,2019-12-26 11:23:30
3,,1.0,0.0,No,0.013862,0.000376,Seattle,WA,2019-12-18 11:15:43
4,,1.0,0.0,No,0.013002,0.000602,Arlington,VA,2019-12-26 11:23:21


## One-hot Encoding States and City Names
States and city names are nominal categorical features. To deal with this, we will be creating one-hot encoded fields to reflect each data points location.

In [44]:
all_states = df['state'].unique()
all_states

array(['DC', 'IN', 'VA', 'WA', 'NY', 'CA', 'AZ', 'TX', 'GA', 'NC', 'FL',
       nan, 'AL', 'MD', 'CO', 'NM', 'IL', 'TN', 'AK', 'MA', 'NJ', 'OR',
       'DE', 'PA', 'IA', 'SC', 'MN', 'MI', 'KY', 'WI', 'OH', 'CT', 'RI',
       'NV', 'UT', 'MO', 'OK', 'NH', 'NE', 'LA', 'ND', 'AR', 'KS', 'ID',
       'HI', 'MT', 'VT', 'SD', 'WV', 'MS', 'ME', 'WY'], dtype=object)

In [45]:
display(df['state'].value_counts())

display(df['cityname'].value_counts())

TX    1737
CA     955
WA     519
NC     436
MD     424
NJ     383
GA     372
FL     339
OH     321
CO     318
WI     302
IL     282
MO     239
IN     239
MN     221
VA     205
OR     197
PA     183
IA     179
OK     178
MI     176
MA     167
AZ     126
NV     121
ND     113
NE     105
CT      98
TN      92
UT      84
KS      83
DC      80
SC      77
NY      71
NH      70
SD      66
LA      66
AL      56
AR      56
AK      44
KY      40
ID      21
VT      16
NM      14
HI      12
RI      11
MS       9
MT       7
DE       5
WV       3
ME       2
WY       1
Name: state, dtype: int64

Austin           523
Dallas           216
Houston          186
San Antonio      182
Los Angeles      165
                ... 
Keizer             1
Keyser             1
Pompano Beach      1
Kaysville          1
Bella Vista        1
Name: cityname, Length: 1573, dtype: int64

In [46]:
df = pd.get_dummies(df, columns=['state', 'cityname'], dummy_na=True)

In [47]:
df.head()

Unnamed: 0,amenities,bathrooms,bedrooms,fee,price,square_feet,time,state_AK,state_AL,state_AR,...,cityname_York,cityname_Yorktown,cityname_Yorkville,cityname_Youngstown,cityname_Youngsville,cityname_Ypsilanti,cityname_Yuba City,cityname_Yukon,cityname_Zachary,cityname_nan
0,,1.0,0.0,No,0.011281,0.0,1577359415,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,,1.0,1.0,No,0.004302,0.000125,1577017063,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,,1.0,0.0,No,0.022753,0.00015,1577359410,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,,1.0,0.0,No,0.013862,0.000376,1576667743,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,,1.0,0.0,No,0.013002,0.000602,1577359401,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# extract unknown cities and states
unknown_city = df[df['cityname_nan'] == 1]
unknown_state = df[df['state_nan'] == 1]
unknown_state

# find if there is intersection between unknown cities and states
print(unknown_city.shape[0])
print(unknown_state.shape[0])
len(unknown_city.index.intersection(unknown_state.index))

77
77


77

In [49]:
df['fee'].unique()

array(['No'], dtype=object)

In [50]:
final_cleaned = df.drop(columns=['fee'])
final_cleaned

Unnamed: 0,amenities,bathrooms,bedrooms,price,square_feet,time,state_AK,state_AL,state_AR,state_AZ,...,cityname_York,cityname_Yorktown,cityname_Yorkville,cityname_Youngstown,cityname_Youngsville,cityname_Ypsilanti,cityname_Yuba City,cityname_Yukon,cityname_Zachary,cityname_nan
0,,1.0,0.0,0.011281,0.000000,1577359415,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,,1.0,1.0,0.004302,0.000125,1577017063,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,,1.0,0.0,0.022753,0.000150,1577359410,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,,1.0,0.0,0.013862,0.000376,1576667743,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,,1.0,0.0,0.013002,0.000602,1577359401,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,,4.0,5.0,0.110899,0.155367,1575112975,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,,8.0,6.0,0.474187,0.215920,1577360419,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,,8.5,6.0,0.206501,0.281135,1577360560,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,"[Basketball, Cable or Satellite, Doorman, Hot ...",1.0,1.0,0.087763,1.000000,1577362186,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Model 1

In [52]:
from keras.optimizers import SGD
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(final_cleaned.drop(['price'], axis = 1), final_cleaned.price, test_size = 0.2, random_state = 42)

def buildHPmodel(hp):
  model = Sequential ([
      Dense(32, activation = 'relu', input_dim = final_cleaned.shape[1]),
      Dense(
          units = hp.Int("units", min_value = 32, max_value = 512, step = 32),
          activation = hp.Choice("activation", ["relu", "tanh"]),
      ),
      Dense(1, activation = 'sigmoid')
  ])
  learning_rate = hp.Float("lr", min_value = 1e-4, max_value = 1e-2, sampling = "log")
  model.compile(optimizer = SGD(learning_rate = learning_rate), loss = 'binary_crossentropy', metrics = ['accuracy'])
  return model

In [53]:
pip install keras_tuner

Collecting keras_tuner
  Downloading keras_tuner-1.4.6-py3-none-any.whl (128 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/128.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m122.9/128.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.9/128.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras_tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras_tuner
Successfully installed keras_tuner-1.4.6 kt-legacy-1.0.5


In [54]:
import keras_tuner

In [55]:
hp = keras_tuner.HyperParameters()

In [56]:
tuner = keras_tuner.GridSearch(
    hypermodel = buildHPmodel,
    objective = "accuracy",
    max_trials = 20,
    seed = 15,
    executions_per_trial = 5,
    tune_new_entries = True,
    allow_new_entries = True,
    max_consecutive_failed_trials = 3
)

In [57]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state = 0)

In [None]:
# TODO:
# Encode X_train amentities. Now it's string which can't be trained.

In [60]:
tuner.search(X_train.astype('float'), y_train, epochs = 2, validation_data = (X_val, y_val))

ValueError: setting an array element with a sequence.

In [None]:
tuner.results_summary()