# Initial Data Preparation

In [1]:
import os
import sys
import googlemaps
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 

# Reload imported modules automatically before each cell executes, so edits
# to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures directly in the notebook output.
%matplotlib inline 

# Setting up directories
# The project root is the parent of the directory the kernel starts in.
# Resolve it only once per kernel: this cell finishes with os.chdir(ROOT_DIR),
# so recomputing it from os.getcwd() on a re-run would climb one directory
# higher every time the cell is executed.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
SRC_DIR = os.path.join(ROOT_DIR, 'src')
DATA_DIR = os.path.join(ROOT_DIR, "data")

# Make the project packages importable; skip entries that are already present
# so repeated runs do not pile up duplicates on sys.path.
for _project_path in (ROOT_DIR, SRC_DIR):
    if _project_path not in sys.path:
        sys.path.append(_project_path)

# Work from the project root so relative paths used later resolve against it.
os.chdir(ROOT_DIR)

# Custom Modules
from d00_utils import read_write
from d02_intermediate import zip_code
from d02_intermediate import intial_preprocessing

# Raw inputs sit in the "01_raw" folder under DATA_DIR; the listings CSV is read from there.
RAW_DATA_DIR = os.path.join(DATA_DIR, "01_raw")
LISTINGS_DATA_PATH = os.path.join(RAW_DATA_DIR, "listings_3.csv")

### Exploring Data

In [2]:
# Read the raw listings CSV into a DataFrame and preview the first three rows.
listings = pd.read_csv(LISTINGS_DATA_PATH)
listings.head(3)

Unnamed: 0,url,address,neighborhood,rent,beds,baths,flexs
0,https://www.renthop.com/listings/21-west-end-a...,"21 West End Avenue, Apt 1213","Lincoln Square, Upper West Side, Upper Manhatt...","$2,973",Studio,1_Bath,
1,https://www.renthop.com/listings/70-west-37th-...,"70 West 37th Street, Apt 1012","Murray Hill, Midtown Manhattan, Manhattan","$3,646",Studio,1_Bath,
2,https://www.renthop.com/listings/205-knickerbo...,"205 Knickerbocker Avenue, Apt ...","Bushwick, Northern Brooklyn, Brooklyn","$2,850",1_Bed,1_Bath,


In [3]:
# Column dtypes and non-null counts of the raw data (every column loads as object).
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   url           200 non-null    object
 1   address       200 non-null    object
 2   neighborhood  200 non-null    object
 3   rent          200 non-null    object
 4   beds          200 non-null    object
 5   baths         200 non-null    object
 6   flexs         23 non-null     object
dtypes: object(7)
memory usage: 11.1+ KB


In [4]:
# Summary of the raw columns; they are all object dtype here, so the summary
# reports count / unique / top / freq rather than numeric statistics.
listings.describe()

Unnamed: 0,url,address,neighborhood,rent,beds,baths,flexs
count,200,200,200,200,200,200,23
unique,179,163,56,128,6,5,4
top,https://www.renthop.com/listings/420-lexington...,West 37th Street,"Financial District, Downtown Manhattan, Manhattan","$3,500",1_Bed,1_Bath,/_Flex_3_
freq,2,3,24,6,74,159,12


### Formatting Features

In [5]:
# Run the project's feature fixers one after another: beds, baths, flexs, rent.
# Each helper receives the frame returned by the previous one.
# NOTE(review): the saved outputs of the later head()/info() cells still show
# `rent` as "$"-prefixed strings with object dtype after this step - confirm
# that fix_rent_feature does what is intended.
listings = (
    listings
    .pipe(intial_preprocessing.fix_beds_feature)
    .pipe(intial_preprocessing.fix_baths_feature)
    .pipe(intial_preprocessing.fix_flex_feature)
    .pipe(intial_preprocessing.fix_rent_feature)
)

### Getting Zip codes from Google Maps

In [6]:
import warnings

# Look up a zip code for every listing by applying the project's
# zip_code.get_zips helper to each row (Google Maps, per this section's heading).
listings['zip'] = listings.apply(zip_code.get_zips, axis=1)

# Surface failed lookups immediately instead of discovering an empty column
# only when inspecting or saving the data in a later cell.
zip_missing_count = int(listings['zip'].isna().sum())
if zip_missing_count:
    warnings.warn(
        f"{zip_missing_count} of {len(listings)} listings have no zip code after the lookup"
    )

....................................................... 100%


### Exploring Changes

In [7]:
# Preview the first three rows after the formatting and zip steps.
listings.head(3)

Unnamed: 0,url,address,neighborhood,rent,beds,baths,flexs,zip
0,https://www.renthop.com/listings/21-west-end-a...,"21 West End Avenue, Apt 1213","Lincoln Square, Upper West Side, Upper Manhatt...","$2,973",0,1,0,
1,https://www.renthop.com/listings/70-west-37th-...,"70 West 37th Street, Apt 1012","Murray Hill, Midtown Manhattan, Manhattan","$3,646",0,1,0,
2,https://www.renthop.com/listings/205-knickerbo...,"205 Knickerbocker Avenue, Apt ...","Bushwick, Northern Brooklyn, Brooklyn","$2,850",1,1,0,


In [8]:
# Re-check dtypes and non-null counts after the changes (including the new zip column).
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   url           200 non-null    object 
 1   address       200 non-null    object 
 2   neighborhood  200 non-null    object 
 3   rent          200 non-null    object 
 4   beds          200 non-null    int64  
 5   baths         200 non-null    int64  
 6   flexs         200 non-null    int64  
 7   zip           0 non-null      float64
dtypes: float64(1), int64(3), object(4)
memory usage: 12.6+ KB


In [11]:
# Summary statistics after the changes; only the numeric columns are summarised.
listings.describe()

Unnamed: 0,beds,baths,flexs,zip
count,200.0,200.0,200.0,0.0
mean,1.455,1.225,0.32,
std,1.026275,0.495584,0.933788,
min,0.0,1.0,0.0,
25%,1.0,1.0,0.0,
50%,1.0,1.0,0.0,
75%,2.0,1.0,0.0,
max,5.0,4.0,4.0,


### Saving data

In [10]:
# Destination sub-directory for the intermediate outputs
SUB_DIR = '02_intermediate'
read_write.create_data_sub_dir(SUB_DIR)

# 1) Every listing, including rows whose zip is null
read_write.save_data_file(SUB_DIR, 'listings_interm_with_null_zips_3.csv', listings)

# 2) Only the listings that have a zip value
rows_with_zip = listings['zip'].notnull()
read_write.save_data_file(SUB_DIR, 'listings_interm_with_zips_3.csv', listings[rows_with_zip])

Dir already exist.
File saved.
File saved.
