# Initial Data Preparation

In [1]:
import os
import sys
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 

%load_ext autoreload
%autoreload 2
%matplotlib inline 

# Setting up directories, all derived from the directory the kernel starts in.
# NOTE: ROOT_DIR is computed from the current working directory and the cell then
# changes into it, so running this cell a second time climbs one more level.
ROOT_DIR = os.path.join(os.getcwd(), os.pardir)
SRC_DIR = os.path.join(ROOT_DIR, 'src')
DATA_DIR = os.path.join(ROOT_DIR, 'data')

# Make ROOT_DIR and SRC_DIR importable, then work from ROOT_DIR
for extra_path in (ROOT_DIR, SRC_DIR):
    sys.path.append(extra_path)
os.chdir(ROOT_DIR)

# Custom Modules
from d01_data import scraping_renthope
from d01_data import format_scraped_data

# Paths to the '01_raw' folder under DATA_DIR and to the listings CSV inside it
RAW_DATA_DIR = os.path.join(DATA_DIR,"01_raw")
LISTINGS_DATA_PATH = os.path.join(RAW_DATA_DIR, 'listings_2.csv')

In [3]:
# Load the listings CSV and preview the first five rows
listings = pd.read_csv(LISTINGS_DATA_PATH)
listings.head(5)

Unnamed: 0,url,address,neighborhood,rent,beds,baths,flexs
0,https://www.renthop.com/listings/626-1st-avenu...,"626 1st Avenue, Apt W17M","Murray Hill, Midtown Manhattan, Manhattan","$3,408",Studio,1_Bath,
1,https://www.renthop.com/listings/east-20s/b208...,East 20's,"Kips Bay, Midtown Manhattan, Manhattan","$3,500",1_Bed,1_Bath,/_Flex_2_
2,https://www.renthop.com/listings/20-exchange-p...,"20 Exchange Place, Apt 2601","Financial District, Downtown Manhattan, Manhattan","$3,673",1_Bed,1_Bath,
3,https://www.renthop.com/listings/queens-st/na/...,Queens St,"Long Island City, Northwestern Queens, Queens","$2,529",Studio,1_Bath,
4,https://www.renthop.com/listings/first-avenue/...,First Avenue,"Alphabet City, East Village, Downtown Manhatta...","$3,880",2_Bed,1_Bath,


In [4]:
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   url           20000 non-null  object
 1   address       19988 non-null  object
 2   neighborhood  20000 non-null  object
 3   rent          20000 non-null  object
 4   beds          20000 non-null  object
 5   baths         20000 non-null  object
 6   flexs         2392 non-null   object
dtypes: object(7)
memory usage: 1.1+ MB


In [5]:
listings.describe()

Unnamed: 0,url,address,neighborhood,rent,beds,baths,flexs
count,20000,19988,20000,20000,20000,20000,2392
unique,18511,9686,252,2637,18,13,15
top,https://www.renthop.com/listings/fresh-pond-rd...,Wall Street,"Hell's Kitchen, Midtown Manhattan, Manhattan","$2,500",1_Bed,1_Bath,/_Flex_2_
freq,4,162,1733,236,7691,15253,1101


### Fixing Beds Feature

In [6]:
listings['beds'].unique()

array(['Studio', '1_Bed', '2_Bed', '3_Bed', '4_Bed', 'Loft', 'Room',
       '5_Bed', '_1_Bed', '_2_Bed', '6_Bed', '_5_Bed', '_Studio', '8_Bed',
       '_3_Bed', '_Room', '_4_Bed', '7_Bed'], dtype=object)

We can see a few duplicate categories, e.g. '5_Bed' and '\_5_Bed' are the same.

In [7]:
# Sanity check on dummy values: dropping one leading '_' should make
# both spellings of a category identical

dumy_strs = pd.Series(["_x_Bed", "x_Bed"])
dumy_strs.map(lambda label: label[1:] if label[:1] == '_' else label)

0    x_Bed
1    x_Bed
dtype: object

In [8]:
# Removing the starting '_' so that e.g. '_5_Bed' and '5_Bed' become one category
def _drop_leading_underscore(label):
    """Return `label` without its first character when that character is '_'."""
    if label.startswith('_'):
        return label[1:]
    return label


listings['beds'] = listings['beds'].map(_drop_leading_underscore)

In [9]:
listings['beds'].unique()

array(['Studio', '1_Bed', '2_Bed', '3_Bed', '4_Bed', 'Loft', 'Room',
       '5_Bed', '6_Bed', '8_Bed', '7_Bed'], dtype=object)

In [10]:
# Further fixing - drop the '_Bed' suffix and set the other categories
# (Studio, Loft, Room) to 0 beds

def _beds_label_to_count(label):
    """Apply the bed-label substitutions in order and return the resulting text."""
    for old, new in (('_Bed', ''), ('Studio', '0'), ('Loft', '0'), ('Room', '0')):
        label = label.replace(old, new)
    return label


listings['beds'] = listings['beds'].map(_beds_label_to_count)

In [11]:
# Converting to numeric

listings['beds'] = pd.to_numeric(listings['beds'])
listings['beds'].unique()

array([0, 1, 2, 3, 4, 5, 6, 8, 7], dtype=int64)

### Fixing baths feature

In [12]:
listings['baths'].unique()

array(['1_Bath', '2_Bath', '1.5_Bath', '3_Bath', '_2_Bath', '_1_Bath',
       '4_Bath', '2.5_Bath', '3.5_Bath', '5_Bath', '4.5_Bath',
       '_1.5_Bath', '6_Bath'], dtype=object)

In [13]:
def _bath_label_to_number_text(label):
    """Drop one leading '_' (if present) and the '_Bath' suffix, e.g. '_1.5_Bath' -> '1.5'."""
    if label.startswith('_'):
        label = label[1:]
    return label.replace('_Bath', '')


listings['baths'] = listings['baths'].map(_bath_label_to_number_text)

In [14]:
# Converting to numeric

listings['baths'] = pd.to_numeric(listings['baths'])
listings['baths'].unique()

array([1. , 2. , 1.5, 3. , 4. , 2.5, 3.5, 5. , 4.5, 6. ])

### Fixing Flexes 

In [15]:
listings['flexs'].unique()

array([nan, '/_Flex_2_', '/_Flex_1_', '/_Flex_3_', '/_Flex_4_',
       '/_Flex_5_', '_/_Flex_4_', '/_Flex_4__', '_/_Flex_2_',
       '_/_Flex_3_', '/_Flex_6_', '/_Flex_9_', '/_Flex_5__', '/_Flex_2__',
       '/_Flex_3__', '_/_Flex_1_'], dtype=object)

In [16]:
# listings[listings['flexs'].notnull()]['flexs']

In [17]:
# Normalise the prefix so every value starts with '_Flex_': drop an optional
# leading '_' together with the '/' that follows it
# ('_/_Flex_4_' -> '_Flex_4_', '/_Flex_4__' -> '_Flex_4__').
# This used to be done in two passes, the second of which tested endswith('_')
# but sliced x[1:], so it removed the leading '/' by accident and would eat a
# further character every time the cell was re-run. The anchored pattern makes
# the intent explicit and leaves already-cleaned values untouched.
# Trailing underscores are left in place here; rows without a flex stay NaN.
listings['flexs'] = listings['flexs'].str.replace(r'^_?/', '', regex=True)

In [18]:
listings['flexs'].unique()

array([nan, '_Flex_2_', '_Flex_1_', '_Flex_3_', '_Flex_4_', '_Flex_5_',
       '_Flex_4__', '_Flex_6_', '_Flex_9_', '_Flex_5__', '_Flex_2__',
       '_Flex_3__'], dtype=object)

In [19]:
# Drop the '_Flex_' token and any leftover underscores so only the number remains.
# Only non-null rows are mapped; index alignment on assignment keeps the rest NaN.
has_flex = listings['flexs'].notnull()
listings['flexs'] = listings.loc[has_flex, 'flexs'].map(
    lambda label: label.replace('_Flex_', '').replace('_', '')
)

In [20]:
listings['flexs'].unique()

array([nan, '2', '1', '3', '4', '5', '6', '9'], dtype=object)

In [21]:
# Converting to numeric

listings['flexs'] = pd.to_numeric(listings['flexs'])
listings['flexs'].unique()

array([nan,  2.,  1.,  3.,  4.,  5.,  6.,  9.])

NaN here means the listing has no flex.

In [22]:
# Replace the missing flex values with 0
listings['flexs'] = listings['flexs'].fillna(0)


In [23]:
listings['flexs'].unique()

array([0., 2., 1., 3., 4., 5., 6., 9.])

In [24]:
listings['flexs'].value_counts()

0.0    17608
2.0     1107
3.0      752
4.0      262
1.0      235
5.0       32
6.0        3
9.0        1
Name: flexs, dtype: int64

### Fixing Rent

In [25]:
def _rent_to_digits(value):
    """Return `value` as text with every '$' and ',' removed, e.g. '$3,408' -> '3408'."""
    return str(value).translate(str.maketrans('', '', '$,'))


listings['rent'] = listings['rent'].map(_rent_to_digits).astype('int')

In [26]:
listings.head(3)

Unnamed: 0,url,address,neighborhood,rent,beds,baths,flexs
0,https://www.renthop.com/listings/626-1st-avenu...,"626 1st Avenue, Apt W17M","Murray Hill, Midtown Manhattan, Manhattan",3408,0,1.0,0.0
1,https://www.renthop.com/listings/east-20s/b208...,East 20's,"Kips Bay, Midtown Manhattan, Manhattan",3500,1,1.0,2.0
2,https://www.renthop.com/listings/20-exchange-p...,"20 Exchange Place, Apt 2601","Financial District, Downtown Manhattan, Manhattan",3673,1,1.0,0.0


In [27]:
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   url           20000 non-null  object 
 1   address       19988 non-null  object 
 2   neighborhood  20000 non-null  object 
 3   rent          20000 non-null  int32  
 4   beds          20000 non-null  int64  
 5   baths         20000 non-null  float64
 6   flexs         20000 non-null  float64
dtypes: float64(2), int32(1), int64(1), object(3)
memory usage: 1015.8+ KB


### Zip Codes
Using the Google Maps geocoding client to look up the ZIP code related to each listing.

In [28]:
# !pip install -U googlemaps

In [29]:
# Preview of a geocoding query: the address plus the last comma-separated part of the neighborhood
listings.loc[3, 'address'] + ' ' + listings.loc[3, 'neighborhood'].split(', ')[-1]

'Queens St Queens'

In [30]:

def get_key(file_path=os.path.join("Secrets", "key.txt")):
    """Return the API key stored in a local text file.

    Parameters
    ----------
    file_path : str, optional
        Location of the key file. Defaults to ``key.txt`` inside the
        ``Secrets`` folder, relative to the current working directory. The
        default is built with ``os.path.join`` so it resolves on every
        operating system instead of relying on a hard-coded backslash.

    Returns
    -------
    str
        The file contents with surrounding whitespace removed, so a trailing
        newline in the file does not become part of the key.

    Raises
    ------
    OSError
        If the key file cannot be opened.
    """
    with open(file_path, 'r') as file:
        return file.read().strip()


In [31]:
import googlemaps 
 
gmaps = googlemaps.Client(key=get_key())

# Build the query string for one record: its address followed by the last
# comma-separated part of its neighborhood, joined by a space
r = listings.loc[1, 'address'] + ' ' + listings.loc[1, 'neighborhood'].split(', ')[-1]
r

"East 20's Manhattan"

In [32]:
# Fetching Related data
geocode_result = gmaps.geocode(r) 
 
geocode_result 

[{'address_components': [{'long_name': 'East 20th Street',
    'short_name': 'E 20th St',
    'types': ['route']},
   {'long_name': 'Manhattan',
    'short_name': 'Manhattan',
    'types': ['political', 'sublocality', 'sublocality_level_1']},
   {'long_name': 'New York',
    'short_name': 'New York',
    'types': ['locality', 'political']},
   {'long_name': 'New York County',
    'short_name': 'New York County',
    'types': ['administrative_area_level_2', 'political']},
   {'long_name': 'New York',
    'short_name': 'NY',
    'types': ['administrative_area_level_1', 'political']},
   {'long_name': 'United States',
    'short_name': 'US',
    'types': ['country', 'political']}],
  'formatted_address': 'E 20th St, New York, NY, USA',
  'geometry': {'bounds': {'northeast': {'lat': 40.7401872,
     'lng': -73.97418739999999},
    'southwest': {'lat': 40.7316118, 'lng': -73.9910835}},
   'location': {'lat': 40.7360191, 'lng': -73.9822744},
   'location_type': 'GEOMETRIC_CENTER',
   'viewpo

In [33]:
geocode_result[0]['address_components']

[{'long_name': 'East 20th Street',
  'short_name': 'E 20th St',
  'types': ['route']},
 {'long_name': 'Manhattan',
  'short_name': 'Manhattan',
  'types': ['political', 'sublocality', 'sublocality_level_1']},
 {'long_name': 'New York',
  'short_name': 'New York',
  'types': ['locality', 'political']},
 {'long_name': 'New York County',
  'short_name': 'New York County',
  'types': ['administrative_area_level_2', 'political']},
 {'long_name': 'New York',
  'short_name': 'NY',
  'types': ['administrative_area_level_1', 'political']},
 {'long_name': 'United States',
  'short_name': 'US',
  'types': ['country', 'political']}]

The dictionary whose 'types' key contains 'postal_code'
holds the ZIP code under its 'short_name' key.

In [34]:
# Looping through the address components and printing the zipcode, if one is present
postal_components = (
    component
    for component in geocode_result[0]['address_components']
    if 'postal_code' in component['types']
)
for component in postal_components:
    print(component['short_name'])

**Putting all together**

In [35]:
import re 
from IPython.display import clear_output


def get_zip(row):
    """Look up the ZIP code of one listing with the module-level `gmaps` client.

    Source: ML Blueprints

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'address' and 'neighborhood'. The query sent to the
        geocoder is the address followed by the last comma-separated part of
        the neighborhood.

    Returns
    -------
    str or float
        The 'short_name' of the first address component whose 'types' contain
        'postal_code'. ``np.nan`` is returned in every other case: the query
        does not start with digits followed by whitespace and a word character,
        the geocoder returns nothing or no postal code, or building the query /
        calling the geocoder raises. Always using ``np.nan`` (never an implicit
        ``None``) keeps the resulting column's missing marker consistent.
    """
    # Remove the previous print and add the new one, so only the current
    # address is visible while the rows are processed
    clear_output(wait=True)
    print("...")
    print("...")

    try:
        addr = row['address'] + ' ' + row['neighborhood'].split(', ')[-1]

        print(addr)

        # Only queries that start with a number are sent to the geocoder
        if not re.match(r'^\d+\s\w', addr):
            return np.nan

        geocode_result = gmaps.geocode(addr)
        if not geocode_result:
            return np.nan

        for piece in geocode_result[0]['address_components']:
            if 'postal_code' in piece['types']:
                return piece['short_name']
    except Exception:
        # Best effort: a missing address or a failed request must not abort the
        # whole run. `Exception` rather than a bare `except` so that an
        # interrupt (KeyboardInterrupt) still stops a long loop.
        return np.nan

    # A result came back, but none of its components is a postal code
    return np.nan

In [None]:
# get_zip runs once per row; every row whose query passes its leading-number check triggers a gmaps.geocode call
listings['zip'] = listings.apply(get_zip, axis=1) 

...
...
400 West 63rd Street, Apt 1010... Manhattan


In [None]:
listings['zip'].unique()

Verify that None and np.nan are getting treated as NULLs

In [None]:
listings['zip'].value_counts()

In [None]:
dict(listings['zip'].value_counts()).keys()

In [None]:
# Number of listings for which no zip was found
total_null = int(listings['zip'].isnull().sum())
total_null

In [None]:
listings.shape[0] - total_null # we only got zip for this number of listings

We lost most of the data; still, it is useful to have ZIP codes for the listings that remain.

### Saving data

In [None]:
# Save every listing, including those without a zip.
# The path is built from DATA_DIR with os.path.join instead of a hard-coded
# backslash separator, so it does not depend on the operating system or on
# how '\l' is treated inside a string literal.
listings.to_csv(os.path.join(DATA_DIR, 'listings_interm_with_null_zips_2.csv'), index=False)

In [None]:
# Save only the listings for which a zip was found.
# The path is built from DATA_DIR with os.path.join instead of a hard-coded
# backslash separator, so it does not depend on the operating system or on
# how '\l' is treated inside a string literal.
listings[listings['zip'].notnull()].to_csv(
    os.path.join(DATA_DIR, 'listings_interm_with_zips_2.csv'), index=False
)