In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the raw listings export; `original_df` is kept untouched as the reference copy.
raw_csv_path = 'apartments_for_rent_classified_10K_utf.csv'
original_df = pd.read_csv(raw_csv_path)

In [3]:
# Columns kept for the analysis; the remaining columns of the raw export are not used here.
selected_columns = [
    "amenities", "bathrooms", "bedrooms", "fee", "price",
    "price_type", "square_feet", "cityname", "state", "time",
]

# .copy() makes `df` an independent frame. Without it, `df` is a slice of
# `original_df`, and the column assignments in later cells (df['time'] = ...,
# df.loc[...] = ...) are chained assignments on that slice, which pandas flags with
# SettingWithCopyWarning (silenced by the warnings filter in the import cell).
df = original_df[selected_columns].copy()

In [4]:
# Preview the first five rows of the selected columns (5 is head()'s default).
df.head()

Unnamed: 0,amenities,bathrooms,bedrooms,fee,price,price_type,square_feet,cityname,state,time
0,,,0.0,No,790,Monthly,101,Washington,DC,1577359415
1,,,1.0,No,425,Monthly,106,Evansville,IN,1577017063
2,,1.0,0.0,No,1390,Monthly,107,Arlington,VA,1577359410
3,,1.0,0.0,No,925,Monthly,116,Seattle,WA,1576667743
4,,,0.0,No,880,Monthly,125,Arlington,VA,1577359401


In [5]:
# Fraction of missing values per column (mean of the boolean NaN mask).
missing_share = df.isna().mean()
missing_share

amenities      0.3549
bathrooms      0.0034
bedrooms       0.0007
fee            0.0000
price          0.0000
price_type     0.0000
square_feet    0.0000
cityname       0.0077
state          0.0077
time           0.0000
dtype: float64

## Convert timestamp to datetime objects

In [6]:
# The raw column holds timestamps expressed in seconds; convert them to pandas datetimes.
epoch_seconds = df['time']
df['time'] = pd.to_datetime(epoch_seconds, unit='s')

In [7]:
# Check that `time` now displays as a datetime.
df.head(n=2)

Unnamed: 0,amenities,bathrooms,bedrooms,fee,price,price_type,square_feet,cityname,state,time
0,,,0.0,No,790,Monthly,101,Washington,DC,2019-12-26 11:23:35
1,,,1.0,No,425,Monthly,106,Evansville,IN,2019-12-22 12:17:43


## Normalizing Price and Square Feet


Convert any prices measured in Weeks to Months by converting to price per day then price per year then price per month (roughly 4.348 multiplier).

There is one abnormally high price with 'weekly' price_type. The price for 'week' is 1560, and digging into the dataset, we found that it is a duplicate of index 15. The listing is for the same place except it's on another listing website. This duplicate will be dropped in favor of the other one.

There is also one record with a 'Monthly|Weekly' price_type and a price of 275.0; that price is most likely a monthly rate, so it is left unconverted.

In [8]:
# First two listings priced at exactly 1560 (the suspected duplicate pair).
price_is_1560 = df['price'] == 1560
df[price_is_1560][:2]

Unnamed: 0,amenities,bathrooms,bedrooms,fee,price,price_type,square_feet,cityname,state,time
15,"AC,Basketball,Cable or Satellite,Gym,Internet ...",1.0,1.0,No,1560,Weekly,200,New Bern,NC,2019-12-17 21:27:56
16,"AC,Basketball,Cable or Satellite,Gym,Internet ...",1.0,1.0,No,1560,Monthly,200,New Bern,NC,2019-12-15 10:37:53


In [9]:
# Remove the duplicate listing rows by index label.
# NOTE(review): the preview of price == 1560 shows index 15 as the 'Weekly' row and
# index 16 as the 'Monthly' row - confirm that 16 and 17 are the rows meant to go.
duplicate_row_labels = [16, 17]
df = df.drop(index=duplicate_row_labels)

In [10]:
# Listings whose price_type is the combined 'Monthly|Weekly' value.
is_monthly_weekly = df['price_type'] == 'Monthly|Weekly'
df[is_monthly_weekly]

Unnamed: 0,amenities,bathrooms,bedrooms,fee,price,price_type,square_feet,cityname,state,time
235,"Cable or Satellite,Pool,Refrigerator,Storage,TV",,,No,275,Monthly|Weekly,300,Lakeland,FL,2019-11-27 21:43:20


In [11]:
# Work in floats so converted weekly prices are not written into an integer column.
df['price'] = df['price'].astype(float)

# Convert weekly prices to monthly ones: per day (/ 7), per year (* 365.25),
# per month (/ 12) - a multiplier of roughly 4.348.
# Only plain 'Weekly' listings are converted. The 'Monthly|Weekly' listing is treated
# as already monthly; the previous loop also tested for "Monthly/Weekly", which is not
# how that value is spelled in the data, so that test never selected it.
is_weekly = df['price_type'] == "Weekly"
df.loc[is_weekly, 'price'] = df.loc[is_weekly, 'price'] / 7 * 365.25 / 12

# Min-max scale the (now uniformly monthly) price into [0, 1].
price_min, price_max = df['price'].min(), df['price'].max()
df['price'] = (df['price'] - price_min) / (price_max - price_min)

In [12]:
# Sanity check: after scaling, the minimum and maximum are printed on separate lines.
scaled_price = df['price']
print(scaled_price.min(), scaled_price.max(), sep="\n")

0.0
1.0


Since we have standardized the units of the price, we can drop the price_type column.

In [13]:
# price_type is no longer needed once the prices share one unit.
df = df.drop(columns='price_type')

## Normalize the square_feet using Min Max method

In [14]:
# Min-max scale square_feet into [0, 1].
sqft = df['square_feet']
sqft_min, sqft_max = sqft.min(), sqft.max()
df['square_feet'] = (sqft - sqft_min) / (sqft_max - sqft_min)

In [15]:
# Sanity check: after scaling, the minimum and maximum are printed on separate lines.
scaled_sqft = df['square_feet']
print(scaled_sqft.min(), scaled_sqft.max(), sep="\n")

0.0
1.0


## Impute NaN values using median

In [16]:
# Fill missing bathroom counts with the column median.
bathrooms_median = df['bathrooms'].median()
df['bathrooms'] = df['bathrooms'].fillna(bathrooms_median)

In [17]:
# Fill missing bedroom counts with the column median.
bedrooms_median = df['bedrooms'].median()
df['bedrooms'] = df['bedrooms'].fillna(bedrooms_median)

## Preprocessing on the amenities

In [18]:
# Turn the comma-separated amenities string into a list per listing.
# Only the outer whitespace of the whole string is stripped; missing values stay NaN.
amenity_lists = df['amenities'].str.strip().str.split(",")
df['amenities'] = amenity_lists

In [19]:
# Inspect the frame after amenities were split into lists (pandas truncates the display).
df

Unnamed: 0,amenities,bathrooms,bedrooms,fee,price,square_feet,cityname,state,time
0,,1.0,0.0,No,0.011281,0.000000,Washington,DC,2019-12-26 11:23:35
1,,1.0,1.0,No,0.004302,0.000125,Evansville,IN,2019-12-22 12:17:43
2,,1.0,0.0,No,0.022753,0.000150,Arlington,VA,2019-12-26 11:23:30
3,,1.0,0.0,No,0.013862,0.000376,Seattle,WA,2019-12-18 11:15:43
4,,1.0,0.0,No,0.013002,0.000602,Arlington,VA,2019-12-26 11:23:21
...,...,...,...,...,...,...,...,...,...
9995,,4.0,5.0,No,0.110899,0.155367,Edina,MN,2019-11-30 11:22:55
9996,,8.0,6.0,No,0.474187,0.215920,Montecito,CA,2019-12-26 11:40:19
9997,,8.5,6.0,No,0.206501,0.281135,Potomac,MD,2019-12-26 11:42:40
9998,"[Basketball, Cable or Satellite, Doorman, Hot ...",1.0,1.0,No,0.087763,1.000000,New York,NY,2019-12-26 12:09:46


In [20]:
# Listings without amenities get the placeholder string 'None'.
# Note that the filled cells are strings while all other cells are lists.
df['amenities'] = df['amenities'].fillna('None')

In [21]:
# Preview the first five rows after the amenities clean-up (5 is head()'s default).
df.head()

Unnamed: 0,amenities,bathrooms,bedrooms,fee,price,square_feet,cityname,state,time
0,,1.0,0.0,No,0.011281,0.0,Washington,DC,2019-12-26 11:23:35
1,,1.0,1.0,No,0.004302,0.000125,Evansville,IN,2019-12-22 12:17:43
2,,1.0,0.0,No,0.022753,0.00015,Arlington,VA,2019-12-26 11:23:30
3,,1.0,0.0,No,0.013862,0.000376,Seattle,WA,2019-12-18 11:15:43
4,,1.0,0.0,No,0.013002,0.000602,Arlington,VA,2019-12-26 11:23:21


## Assign a unique value to each state and city
A unique integer code is assigned to each city and each state; missing values (np.nan) are given the code 0.

In [22]:
# Distinct state values in order of first appearance (NaN is included as its own entry).
state_column = df['state']
all_states = state_column.unique()
all_states

array(['DC', 'IN', 'VA', 'WA', 'NY', 'CA', 'AZ', 'NC', 'GA', 'FL', nan,
       'AL', 'MD', 'CO', 'NM', 'IL', 'TN', 'AK', 'MA', 'NJ', 'OR', 'DE',
       'PA', 'TX', 'IA', 'SC', 'MN', 'MI', 'KY', 'WI', 'OH', 'CT', 'RI',
       'NV', 'UT', 'MO', 'OK', 'NH', 'NE', 'LA', 'ND', 'AR', 'KS', 'ID',
       'HI', 'MT', 'VT', 'SD', 'WV', 'MS', 'ME', 'WY'], dtype=object)

In [23]:
# Map every distinct state value to its position in `all_states`.
rep_vals = {state: code for code, state in enumerate(all_states)}

In [24]:
# Show the state -> code mapping as first built from the order of appearance.
rep_vals

{'DC': 0,
 'IN': 1,
 'VA': 2,
 'WA': 3,
 'NY': 4,
 'CA': 5,
 'AZ': 6,
 'NC': 7,
 'GA': 8,
 'FL': 9,
 nan: 10,
 'AL': 11,
 'MD': 12,
 'CO': 13,
 'NM': 14,
 'IL': 15,
 'TN': 16,
 'AK': 17,
 'MA': 18,
 'NJ': 19,
 'OR': 20,
 'DE': 21,
 'PA': 22,
 'TX': 23,
 'IA': 24,
 'SC': 25,
 'MN': 26,
 'MI': 27,
 'KY': 28,
 'WI': 29,
 'OH': 30,
 'CT': 31,
 'RI': 32,
 'NV': 33,
 'UT': 34,
 'MO': 35,
 'OK': 36,
 'NH': 37,
 'NE': 38,
 'LA': 39,
 'ND': 40,
 'AR': 41,
 'KS': 42,
 'ID': 43,
 'HI': 44,
 'MT': 45,
 'VT': 46,
 'SD': 47,
 'WV': 48,
 'MS': 49,
 'ME': 50,
 'WY': 51}

In [25]:
# Reserve code 0 for missing states by swapping codes with whichever state holds 0.
# The keys are looked up instead of hard-coding 'DC' and 10, so the swap stays correct
# if the row order or the set of states changes. Re-running the cell is a no-op.
nan_key = next((key for key in rep_vals if pd.isna(key)), np.nan)
# Code freed by the NaN entry; if there is no NaN entry, use the next unused code.
freed_code = rep_vals.get(nan_key, len(rep_vals))
if freed_code != 0:
    zero_key = next(key for key, code in rep_vals.items() if code == 0)
    rep_vals[zero_key] = freed_code
rep_vals[nan_key] = 0

In [26]:
# Show the mapping again to confirm that NaN now maps to 0.
rep_vals

{'DC': 10,
 'IN': 1,
 'VA': 2,
 'WA': 3,
 'NY': 4,
 'CA': 5,
 'AZ': 6,
 'NC': 7,
 'GA': 8,
 'FL': 9,
 nan: 0,
 'AL': 11,
 'MD': 12,
 'CO': 13,
 'NM': 14,
 'IL': 15,
 'TN': 16,
 'AK': 17,
 'MA': 18,
 'NJ': 19,
 'OR': 20,
 'DE': 21,
 'PA': 22,
 'TX': 23,
 'IA': 24,
 'SC': 25,
 'MN': 26,
 'MI': 27,
 'KY': 28,
 'WI': 29,
 'OH': 30,
 'CT': 31,
 'RI': 32,
 'NV': 33,
 'UT': 34,
 'MO': 35,
 'OK': 36,
 'NH': 37,
 'NE': 38,
 'LA': 39,
 'ND': 40,
 'AR': 41,
 'KS': 42,
 'ID': 43,
 'HI': 44,
 'MT': 45,
 'VT': 46,
 'SD': 47,
 'WV': 48,
 'MS': 49,
 'ME': 50,
 'WY': 51}

In [27]:
# Encode each state (and NaN) with its integer code from `rep_vals`.
state_codes = df['state'].replace(rep_vals)
df['state_num'] = state_codes

In [28]:
# Preview the first five rows with the new state_num column (5 is head()'s default).
df.head()

Unnamed: 0,amenities,bathrooms,bedrooms,fee,price,square_feet,cityname,state,time,state_num
0,,1.0,0.0,No,0.011281,0.0,Washington,DC,2019-12-26 11:23:35,10
1,,1.0,1.0,No,0.004302,0.000125,Evansville,IN,2019-12-22 12:17:43,1
2,,1.0,0.0,No,0.022753,0.00015,Arlington,VA,2019-12-26 11:23:30,2
3,,1.0,0.0,No,0.013862,0.000376,Seattle,WA,2019-12-18 11:15:43,3
4,,1.0,0.0,No,0.013002,0.000602,Arlington,VA,2019-12-26 11:23:21,2


In [29]:
# Distinct city names in order of first appearance (NaN is included as its own entry).
city_column = df['cityname']
all_cities = city_column.unique()

In [30]:
# Map every distinct city value to its position in `all_cities`.
rep_city_vals = {city: code for code, city in enumerate(all_cities)}

In [31]:
# Show the city -> code mapping as first built from the order of appearance.
rep_city_vals

{'Washington': 0,
 'Evansville': 1,
 'Arlington': 2,
 'Seattle': 3,
 'Manhattan': 4,
 'Venice': 5,
 'San Francisco': 6,
 'Tucson': 7,
 'New Bern': 8,
 'Vallejo': 9,
 'Marietta': 10,
 'Charlotte': 11,
 'Tallahassee': 12,
 nan: 13,
 'Phoenix': 14,
 'Riverview': 15,
 'Lafayette': 16,
 'Glen Burnie': 17,
 'Littleton': 18,
 'Waldorf': 19,
 'Aurora': 20,
 'Manassas': 21,
 'Decatur': 22,
 'Tampa': 23,
 'Albuquerque': 24,
 'Chicago': 25,
 'Gastonia': 26,
 'Wimauma': 27,
 'Memphis': 28,
 'Bellingham': 29,
 'Anchorage': 30,
 'Pacifica': 31,
 'Marina Del Rey': 32,
 'Studio City': 33,
 'Moorpark': 34,
 'Torrance': 35,
 'Sacramento': 36,
 'Lake Balboa': 37,
 'North Hollywood': 38,
 'Denver': 39,
 'Brentwood': 40,
 'San Rafael': 41,
 'North Potomac': 42,
 'Andover': 43,
 'Oakland': 44,
 'San Pablo': 45,
 'Alexandria': 46,
 'Thousand Oaks': 47,
 'Burke': 48,
 'Bellevue': 49,
 'Wilton': 50,
 'Woodland Hills': 51,
 'Miami': 52,
 'Hyattsville': 53,
 'Westminster': 54,
 'Arvada': 55,
 'Seal Beach': 56,
 

In [32]:
# Reserve code 0 for missing cities by swapping codes with whichever city holds 0.
# The keys are looked up instead of hard-coding 'Washington' and 13, so the swap stays
# correct if the row order or the set of cities changes. Re-running the cell is a no-op.
nan_city_key = next((key for key in rep_city_vals if pd.isna(key)), np.nan)
# Code freed by the NaN entry; if there is no NaN entry, use the next unused code.
freed_city_code = rep_city_vals.get(nan_city_key, len(rep_city_vals))
if freed_city_code != 0:
    zero_city_key = next(key for key, code in rep_city_vals.items() if code == 0)
    rep_city_vals[zero_city_key] = freed_city_code
rep_city_vals[nan_city_key] = 0

In [33]:
# Show the mapping again to confirm that NaN now maps to 0.
rep_city_vals

{'Washington': 13,
 'Evansville': 1,
 'Arlington': 2,
 'Seattle': 3,
 'Manhattan': 4,
 'Venice': 5,
 'San Francisco': 6,
 'Tucson': 7,
 'New Bern': 8,
 'Vallejo': 9,
 'Marietta': 10,
 'Charlotte': 11,
 'Tallahassee': 12,
 nan: 0,
 'Phoenix': 14,
 'Riverview': 15,
 'Lafayette': 16,
 'Glen Burnie': 17,
 'Littleton': 18,
 'Waldorf': 19,
 'Aurora': 20,
 'Manassas': 21,
 'Decatur': 22,
 'Tampa': 23,
 'Albuquerque': 24,
 'Chicago': 25,
 'Gastonia': 26,
 'Wimauma': 27,
 'Memphis': 28,
 'Bellingham': 29,
 'Anchorage': 30,
 'Pacifica': 31,
 'Marina Del Rey': 32,
 'Studio City': 33,
 'Moorpark': 34,
 'Torrance': 35,
 'Sacramento': 36,
 'Lake Balboa': 37,
 'North Hollywood': 38,
 'Denver': 39,
 'Brentwood': 40,
 'San Rafael': 41,
 'North Potomac': 42,
 'Andover': 43,
 'Oakland': 44,
 'San Pablo': 45,
 'Alexandria': 46,
 'Thousand Oaks': 47,
 'Burke': 48,
 'Bellevue': 49,
 'Wilton': 50,
 'Woodland Hills': 51,
 'Miami': 52,
 'Hyattsville': 53,
 'Westminster': 54,
 'Arvada': 55,
 'Seal Beach': 56,
 

In [34]:
# Encode each city (and NaN) with its integer code from `rep_city_vals`.
city_codes = df['cityname'].replace(rep_city_vals)
df['city_num'] = city_codes

In [35]:
# Preview the first five rows with the new city_num column (5 is head()'s default).
df.head()

Unnamed: 0,amenities,bathrooms,bedrooms,fee,price,square_feet,cityname,state,time,state_num,city_num
0,,1.0,0.0,No,0.011281,0.0,Washington,DC,2019-12-26 11:23:35,10,13
1,,1.0,1.0,No,0.004302,0.000125,Evansville,IN,2019-12-22 12:17:43,1,1
2,,1.0,0.0,No,0.022753,0.00015,Arlington,VA,2019-12-26 11:23:30,2,2
3,,1.0,0.0,No,0.013862,0.000376,Seattle,WA,2019-12-18 11:15:43,3,3
4,,1.0,0.0,No,0.013002,0.000602,Arlington,VA,2019-12-26 11:23:21,2,2


In [36]:
# List the distinct values of `fee` to see whether the column carries any information.
fee_values = df['fee'].unique()
fee_values

array(['No'], dtype=object)

In [37]:
# Build the cleaned frame without the `fee` column and display it.
final_cleaned = df.drop(columns='fee')
final_cleaned

Unnamed: 0,amenities,bathrooms,bedrooms,price,square_feet,cityname,state,time,state_num,city_num
0,,1.0,0.0,0.011281,0.000000,Washington,DC,2019-12-26 11:23:35,10,13
1,,1.0,1.0,0.004302,0.000125,Evansville,IN,2019-12-22 12:17:43,1,1
2,,1.0,0.0,0.022753,0.000150,Arlington,VA,2019-12-26 11:23:30,2,2
3,,1.0,0.0,0.013862,0.000376,Seattle,WA,2019-12-18 11:15:43,3,3
4,,1.0,0.0,0.013002,0.000602,Arlington,VA,2019-12-26 11:23:21,2,2
...,...,...,...,...,...,...,...,...,...,...
9995,,4.0,5.0,0.110899,0.155367,Edina,MN,2019-11-30 11:22:55,26,628
9996,,8.0,6.0,0.474187,0.215920,Montecito,CA,2019-12-26 11:40:19,5,1534
9997,,8.5,6.0,0.206501,0.281135,Potomac,MD,2019-12-26 11:42:40,12,80
9998,"[Basketball, Cable or Satellite, Doorman, Hot ...",1.0,1.0,0.087763,1.000000,New York,NY,2019-12-26 12:09:46,4,752
