# Initial Data Preparation

In [1]:
import os
import sys
import googlemaps
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 

%load_ext autoreload
%autoreload 2
%matplotlib inline 

# Setting up directories
# This cell ends with os.chdir(ROOT_DIR), so deriving ROOT_DIR from
# os.getcwd() again on a re-run would climb one directory higher each time.
# Compute it only once per kernel session to keep the cell re-runnable.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = os.path.join(os.getcwd(), '..')
SRC_DIR = os.path.join(ROOT_DIR, 'src')
DATA_DIR = os.path.join(ROOT_DIR, "data")

# Make the project root and src/ importable without piling up duplicate
# sys.path entries when the cell is executed more than once.
for _import_dir in (ROOT_DIR, SRC_DIR):
    if _import_dir not in sys.path:
        sys.path.append(_import_dir)
os.chdir(ROOT_DIR)

In [2]:
# Custom Modules
from d00_utils import read_write
from d02_intermediate import zip_code
from d02_intermediate import intial_preprocessing

# Raw scrape location: <DATA_DIR>/01_raw/listings_3.csv
RAW_SUB_DIR = "01_raw"
LISTINGS_FILE_NAME = 'listings_3.csv'
RAW_DATA_DIR = os.path.join(DATA_DIR, RAW_SUB_DIR)
LISTINGS_DATA_PATH = os.path.join(RAW_DATA_DIR, LISTINGS_FILE_NAME)

### Sneak peek at the data

In [3]:
# Load the raw listings CSV and preview its first three rows
listings = pd.read_csv(LISTINGS_DATA_PATH)
listings.head(3)

Unnamed: 0,url,address,neighborhood,rent,beds,baths,flexs
0,https://www.renthop.com/listings/420-west-42nd...,"420 West 42nd Street, Apt 22D","Hell's Kitchen, Midtown Manhattan, Manhattan","$4,875",2_Bed,2_Bath,/_Flex_3_
1,https://www.renthop.com/listings/330-west-39th...,"330 West 39th Street, Apt 23E","Garment District, Midtown Manhattan, Manhattan","$5,890",2_Bed,2_Bath,
2,https://www.renthop.com/listings/47th-avenue/5...,47th Avenue,"Hunters Point, Long Island City, Northwestern ...","$3,350",1_Bed,1_Bath,


In [4]:
# Column dtypes and non-null counts of the freshly loaded listings
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   url           200 non-null    object
 1   address       200 non-null    object
 2   neighborhood  200 non-null    object
 3   rent          200 non-null    object
 4   beds          200 non-null    object
 5   baths         200 non-null    object
 6   flexs         29 non-null     object
dtypes: object(7)
memory usage: 11.1+ KB


In [5]:
# Summary of the loaded columns (count / unique / top / freq)
listings.describe()

Unnamed: 0,url,address,neighborhood,rent,beds,baths,flexs
count,200,200,200,200,200,200,29
unique,182,161,60,134,5,8,4
top,https://www.renthop.com/listings/30-74-30th-st...,West 37th Street,"Hell's Kitchen, Midtown Manhattan, Manhattan","$1,800",1_Bed,1_Bath,/_Flex_3_
freq,2,7,36,6,84,156,12


### Formatting Features

In [6]:
def fix_rent_feature(listings):
    """
    Convert the ``rent`` column of a listings frame from text to integers.

    Rent values such as ``"$4,875"`` have the ``$`` sign and the ``,``
    thousands separators removed and are then cast to ``int64``. If the
    column already has an integer dtype the frame is returned untouched, so
    the function can be called repeatedly.

    Parameters
    ----------
    listings : pandas.DataFrame
        Frame with a ``rent`` column. The column is replaced in place.

    Returns
    -------
    pandas.DataFrame
        The same ``listings`` object, with ``rent`` as an integer column.

    Raises
    ------
    KeyError
        If ``listings`` has no ``rent`` column.
    ValueError
        If a rent value is not a whole number once ``$`` and ``,`` are
        removed (for example a missing value or free text).
    """
    rent = listings['rent']

    # Guard on the column that is actually converted. Checking another
    # column's dtype would silently skip the rent conversion as soon as that
    # other column had been made numeric.
    if pd.api.types.is_integer_dtype(rent):
        return listings

    # Removing '$' and ',' from the strings before converting to int
    cleaned = (
        rent.map(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )

    try:
        # Explicit int64: plain 'int' maps to a platform-dependent width.
        converted = cleaned.astype('int64')
    except (ValueError, TypeError) as exc:
        # Fail loudly instead of printing and returning None, which would
        # make `listings = fix_rent_feature(listings)` discard the frame.
        raise ValueError(f"Could not convert 'rent' to integers: {exc}") from exc

    listings['rent'] = converted
    return listings

In [7]:
# Same conversion that fix_rent_feature performs, applied inline:
# drop '$' and ',' from each rent value, then cast the column to integers.
rent_as_text = listings['rent'].map(str)
rent_digits = rent_as_text.map(lambda text: text.replace('$', '').replace(',', ''))
listings['rent'] = rent_digits.astype('int')


In [8]:
# Run the notebook-local fix_rent_feature on the listings and rebind the result
listings = fix_rent_feature(listings)

In [9]:
# Display the frame to check the rent column after the conversion
listings

Unnamed: 0,url,address,neighborhood,rent,beds,baths,flexs
0,https://www.renthop.com/listings/420-west-42nd...,"420 West 42nd Street, Apt 22D","Hell's Kitchen, Midtown Manhattan, Manhattan",4875,2_Bed,2_Bath,/_Flex_3_
1,https://www.renthop.com/listings/330-west-39th...,"330 West 39th Street, Apt 23E","Garment District, Midtown Manhattan, Manhattan",5890,2_Bed,2_Bath,
2,https://www.renthop.com/listings/47th-avenue/5...,47th Avenue,"Hunters Point, Long Island City, Northwestern ...",3350,1_Bed,1_Bath,
3,https://www.renthop.com/listings/west-37th-str...,West 37th Street,"Hell's Kitchen, Midtown Manhattan, Manhattan",3571,1_Bed,1_Bath,
4,https://www.renthop.com/listings/west-42nd-str...,West 42nd Street,"Hell's Kitchen, Midtown Manhattan, Manhattan",3295,1_Bed,1_Bath,
...,...,...,...,...,...,...,...
195,https://www.renthop.com/listings/john-street/3...,John Street,"Financial District, Downtown Manhattan, Manhattan",3199,1_Bed,1_Bath,
196,https://www.renthop.com/listings/222-east-34th...,"222 East 34th Street, Apt 1624...","Rose Hill, Kips Bay, Midtown Manhattan, Manhattan",3700,1_Bed,1_Bath,
197,https://www.renthop.com/listings/west-51st-and...,West 51st and 10th ave,"Hell's Kitchen, Midtown Manhattan, Manhattan",2595,1_Bed,1_Bath,
198,https://www.renthop.com/listings/134-haven-ave...,"134 Haven Avenue, Apt 3H","Washington Heights, Upper Manhattan, Manhattan",2525,2_Bed,1_Bath,


In [10]:
# Run the fixers from intial_preprocessing one after another, in this order:
# beds, baths, flexs, rent. Each call receives the frame returned by the
# previous one.
for fix_feature in (
    intial_preprocessing.fix_beds_feature,
    intial_preprocessing.fix_baths_feature,
    intial_preprocessing.fix_flex_feature,
    intial_preprocessing.fix_rent_feature,
):
    listings = fix_feature(listings)

### Getting Zip codes from Google Maps

In [11]:
# Fill a new `zip` column by applying zip_code.get_zips to every row.
# NOTE(review): in the run recorded below, `zip` ends up with 0 non-null
# values, so no lookup produced a zip code — presumably the Google Maps
# request failed; verify the API key and get_zips before relying on the output.
listings['zip'] = listings.apply(zip_code.get_zips, axis=1) 

....................................................... 100%


### Exploring Changes

In [12]:
# Preview the first three rows after the feature fixes and the zip lookup
listings.head(3)

Unnamed: 0,url,address,neighborhood,rent,beds,baths,flexs,zip
0,https://www.renthop.com/listings/420-west-42nd...,"420 West 42nd Street, Apt 22D","Hell's Kitchen, Midtown Manhattan, Manhattan",4875,2,2.0,3,
1,https://www.renthop.com/listings/330-west-39th...,"330 West 39th Street, Apt 23E","Garment District, Midtown Manhattan, Manhattan",5890,2,2.0,0,
2,https://www.renthop.com/listings/47th-avenue/5...,47th Avenue,"Hunters Point, Long Island City, Northwestern ...",3350,1,1.0,0,


In [13]:
# Re-check dtypes and non-null counts after preprocessing
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   url           200 non-null    object 
 1   address       200 non-null    object 
 2   neighborhood  200 non-null    object 
 3   rent          200 non-null    int32  
 4   beds          200 non-null    int64  
 5   baths         200 non-null    float64
 6   flexs         200 non-null    int64  
 7   zip           0 non-null      float64
dtypes: float64(2), int32(1), int64(2), object(3)
memory usage: 11.8+ KB


In [14]:
# Summary statistics of the numeric columns after preprocessing
listings.describe()

Unnamed: 0,rent,beds,baths,flexs,zip
count,200.0,200.0,200.0,200.0,0.0
mean,3967.6,1.435,1.2625,0.39,
std,2768.128223,1.0054,0.553512,1.001456,
min,1550.0,0.0,1.0,0.0,
25%,2695.0,1.0,1.0,0.0,
50%,3462.5,1.0,1.0,0.0,
75%,4603.75,2.0,1.0,0.0,
max,25000.0,4.0,4.0,4.0,


### Saving data

In [15]:
SUB_DIR = '02_intermediate'
read_write.create_data_sub_dir(SUB_DIR)

# Every listing, including rows whose zip is null
read_write.save_data_file(SUB_DIR, 'listings_interm_with_null_zips_3.csv', listings)

# Only the listings whose zip is not null
has_zip = listings['zip'].notnull()
read_write.save_data_file(SUB_DIR, 'listings_interm_with_zips_3.csv', listings[has_zip])

Dir already exist.
File saved.
File saved.
