## Coded by 
- Frank Zhao
- Gustavo Ferreira
- Yan Li

## Data acquisition date: 10/16/2020

# Import modules

In [2]:
import glob
import json
import os
from datetime import datetime
from pprint import pprint

import pandas as pd
import requests

# Self-defined functions

In [2]:
## function for realtor api requests

def realtor_api_request(sort = "relevance", city = "Philadelphia", limit = "200", offset = "0", state_code = "PA"):
    """Query the RapidAPI Realtor ``list-for-sale`` endpoint and return the parsed JSON.

    Parameters
    ----------
    sort, city, limit, offset, state_code
        Sent unchanged as the query-string parameters of the same name.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    RuntimeError
        If the ``RAPIDAPI_KEY`` environment variable is not set.
    requests.HTTPError
        If the server answers with an error status code.
    """
    # The API key is a secret: it is read from the environment instead of being
    # stored in the notebook, where it would be shared with every copy of the file.
    api_key = os.environ.get("RAPIDAPI_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the RAPIDAPI_KEY environment variable to your RapidAPI key "
            "before calling realtor_api_request()."
        )

    url = "https://realtor.p.rapidapi.com/properties/v2/list-for-sale"

    querystring = {
        "sort": sort,
        "city": city,
        "limit": limit,
        "offset": offset,
        "state_code": state_code
    }

    headers = {
        'x-rapidapi-host': "realtor.p.rapidapi.com",
        'x-rapidapi-key': api_key
    }

    # Without a timeout a stalled connection would block the cell indefinitely.
    response = requests.get(url, headers=headers, params=querystring, timeout=30)

    # Fail loudly on HTTP errors instead of handing an error payload to the parser.
    response.raise_for_status()

    return response.json()

In [3]:
## convert response to pandas df

def process_response(response_json):
    '''
    Convert one API response into a single dataframe, one row per property.

    1. read the list stored under the key 'properties'
    2. return an empty dataframe when that list is empty or null
    3. convert each entry to a one-row dataframe
    4. concat the one-row dataframes; fields missing from an entry become null

    Parameters
    ----------
    response_json : dict
        Parsed response; must contain the key 'properties'.

    Returns
    -------
    pandas.DataFrame
        One row per entry of 'properties' (no rows when there are none).

    Raises
    ------
    KeyError
        If 'properties' is absent from the response.
    '''
    properties = response_json['properties']

    # pd.concat raises on an empty list, so a response without any property
    # (presumably an offset past the last listing) is answered with an empty frame.
    if not properties:
        return pd.DataFrame()

    # one single-row dataframe per property: keys become the columns
    single_dfs = [
        pd.DataFrame.from_dict(details, orient='index').T
        for details in properties
    ]

    # concat to a whole df, null for missing vals
    return pd.concat(single_dfs, axis = 0, ignore_index=True, sort=False)

# Get data

In [4]:
## run for each offset
'''
There are 200 instances in each request. Restricted by Rapidapi
'''
def response_to_df(sort = "relevance", city = "Philadelphia", limit = "200", offset = "0", state_code = "PA"):
    """Request one page of listings and return it as a dataframe.

    All arguments are forwarded to ``realtor_api_request``; the parsed response
    is then converted with ``process_response``.

    Side effect: sets the pandas option 'display.max_columns' to None.
    """
    raw_json = realtor_api_request(
        sort = sort,
        city = city,
        limit = limit,
        offset = offset,
        state_code = state_code,
    )
    listings = process_response(response_json = raw_json)

    # show every column whenever a frame is displayed afterwards
    pd.set_option('display.max_columns', None)
    return listings


# Export response to csv file

In [5]:
## export each response to csv file
# Each request returns at most PAGE_SIZE instances, so the offset advances by
# PAGE_SIZE after every request: the second run asks for instances 201 - 400, and so on.
# The listing keeps changing while we page through it, so consecutive pages may
# overlap or skip instances. Duplicated instances are removed later.
#
# NOTE(review): an earlier note here said the loop crashes when the last run holds
# fewer than 200 instances; the cause is not visible in this cell. The loop now stops
# right after writing a short page, so no request is made past the end of the listing.

PAGE_SIZE = 200      # offset step; matches the default `limit` of response_to_df
OFFSET_STOP = 9562   # exclusive upper bound of the offsets to request

start = datetime.now() ## to see how long the run takes
for i, offset in enumerate(range(0, OFFSET_STOP, PAGE_SIZE), start=1):
    page_df = response_to_df(offset = offset)
    page_df.to_csv(f'test{i}.csv',encoding='utf-8', index=False)
    if i % 10 == 0:
        print(f'The {i}th Time')
    if len(page_df) < PAGE_SIZE:
        # a short page means there is nothing left to fetch
        print(f'Stopping after offset {offset}: received {len(page_df)} instances')
        break
end = datetime.now() -start

print(f'Total time is: {end}')

The 10th Time
The 20th Time
The 30th Time
The 40th Time
Total time is: 0:02:54.604970


# Data preprocessing

In [3]:
## import all csv to one df

# folder that is searched for the csv files
# NOTE(review): presumably where the per-request exports were moved - confirm
path = r'E:\Projects\Drexel\DSCI 591 Capstone I\Projects\csv'
all_files = glob.glob(f"{path}/*.csv")

## one dataframe per file: the first line is the header, no column is used as index
file_list = [
    pd.read_csv(csv_file, index_col=None, header=0)
    for csv_file in all_files
]

## stack the frames on top of each other and renumber the rows
combined_df = pd.concat(file_list, ignore_index=True, axis = 0)

combined_df

Unnamed: 0,property_id,listing_id,rdc_web_url,prop_type,prop_sub_type,address,branding,prop_status,price,baths_full,...,rank,list_tracking,lot_size,mls,baths_half,virtual_tour,open_houses,plan_id,new_plan,quick_to_sell_days
0,M4046594895,2.922386e+09,https://www.realtor.com/realestateandhomes-det...,condo,duplex_triplex,"{'city': 'Philadelphia', 'line': '1516 N 62nd ...",{'listing_office': {'list_item': {'name': 'Arc...,for_sale,249900,3.0,...,1,type|property|data|prop_id|4046594895|list_id|...,"{'size': 1842, 'units': 'sqft'}","{'name': 'BrightMLS', 'id': 'PAPH944934', 'pla...",,,,,,
1,M3939384476,2.918257e+09,https://www.realtor.com/realestateandhomes-det...,condo,townhomes,"{'city': 'Philadelphia', 'line': '6102 Reedlan...",{'listing_office': {'list_item': {'name': 'Vih...,for_sale,116800,1.0,...,2,type|property|data|prop_id|3939384476|list_id|...,"{'size': 992, 'units': 'sqft'}","{'name': 'BrightMLS', 'id': 'PAPH911464', 'pla...",,,,,,
2,M4036371277,2.922385e+09,https://www.realtor.com/realestateandhomes-det...,condo,townhomes,"{'city': 'Philadelphia', 'line': '5703 N 13th ...",{'listing_office': {'list_item': {'name': 'Pre...,for_sale,215000,1.0,...,3,type|property|data|prop_id|4036371277|list_id|...,"{'size': 1501, 'units': 'sqft'}","{'name': 'BrightMLS', 'id': 'PAPH945010', 'pla...",1.0,,,,,
3,M3553029343,2.922384e+09,https://www.realtor.com/realestateandhomes-det...,single_family,,"{'city': 'Philadelphia', 'line': '1009 Rhawn S...",{'listing_office': {'list_item': {'name': 'Re/...,for_sale,394800,1.0,...,4,type|property|data|prop_id|3553029343|list_id|...,"{'size': 15913, 'units': 'sqft'}","{'name': 'BrightMLS', 'id': 'PAPH943096', 'pla...",1.0,,,,,
4,M3649199107,2.922384e+09,https://www.realtor.com/realestateandhomes-det...,condo,townhomes,"{'city': 'Philadelphia', 'line': '3850 N Bouvi...",{'listing_office': {'list_item': {'name': 'Re/...,for_sale,130000,1.0,...,5,type|property|data|prop_id|3649199107|list_id|...,"{'size': 1240, 'units': 'sqft'}","{'name': 'BrightMLS', 'id': 'PAPH918016', 'pla...",1.0,{'href': 'https://view.ricohtours.com/0701bde8...,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9757,M3400474681,2.869904e+09,https://www.realtor.com/realestateandhomes-det...,condo,townhomes,"{'city': 'Philadelphia', 'line': '2077 Bridge ...",{'listing_office': {'list_item': {'name': 'Re/...,for_sale,94900,1.0,...,33,type|property|data|prop_id|3400474681|list_id|...,"{'size': 1064, 'units': 'sqft'}","{'name': 'BrightMLS', 'id': 'PAPH858856', 'pla...",,,,,,
9758,M3654238233,2.890369e+09,https://www.realtor.com/realestateandhomes-det...,condo,townhomes,"{'city': 'Philadelphia', 'line': '1452 N 57th ...",{'listing_office': {'list_item': {'name': 'Kel...,for_sale,149900,1.0,...,34,type|property|data|prop_id|3654238233|list_id|...,"{'size': 1231, 'units': 'sqft'}","{'name': 'BrightMLS', 'id': 'PAPH858898', 'pla...",1.0,,,,,
9759,M9041602444,2.704677e+09,https://www.realtor.com/realestateandhomes-det...,condo,duplex_triplex,"{'city': 'Philadelphia', 'line': '7354 Ridge A...",{'listing_office': {'list_item': {'name': 'BHH...,for_sale,432990,3.0,...,35,type|property|data|prop_id|9041602444|list_id|...,,"{'name': 'BrightMLS', 'id': 'PAPH838626', 'pla...",1.0,,,,,
9760,M3276555576,2.915162e+09,https://www.realtor.com/realestateandhomes-det...,condo,condos,"{'city': 'Philadelphia', 'line': '1003 N Bodin...",{'listing_office': {'list_item': {'name': 'Kel...,for_sale,405000,2.0,...,36,type|property|data|prop_id|3276555576|list_id|...,,"{'name': 'BrightMLS', 'id': 'PAPH888146', 'pla...",,{'href': 'https://my.matterport.com/show/?m=eR...,,,,


We retrieved 9762 instances in total. However, some of them are likely to be duplicates.

In [5]:
## remove duplicated instances
# rows sharing a property_id count as duplicates; the first occurrence is the one kept
raw_data = combined_df.drop_duplicates(subset='property_id', keep='first')
raw_data

Unnamed: 0,property_id,listing_id,rdc_web_url,prop_type,prop_sub_type,address,branding,prop_status,price,baths_full,...,rank,list_tracking,lot_size,mls,baths_half,virtual_tour,open_houses,plan_id,new_plan,quick_to_sell_days
0,M4046594895,2.922386e+09,https://www.realtor.com/realestateandhomes-det...,condo,duplex_triplex,"{'city': 'Philadelphia', 'line': '1516 N 62nd ...",{'listing_office': {'list_item': {'name': 'Arc...,for_sale,249900,3.0,...,1,type|property|data|prop_id|4046594895|list_id|...,"{'size': 1842, 'units': 'sqft'}","{'name': 'BrightMLS', 'id': 'PAPH944934', 'pla...",,,,,,
1,M3939384476,2.918257e+09,https://www.realtor.com/realestateandhomes-det...,condo,townhomes,"{'city': 'Philadelphia', 'line': '6102 Reedlan...",{'listing_office': {'list_item': {'name': 'Vih...,for_sale,116800,1.0,...,2,type|property|data|prop_id|3939384476|list_id|...,"{'size': 992, 'units': 'sqft'}","{'name': 'BrightMLS', 'id': 'PAPH911464', 'pla...",,,,,,
2,M4036371277,2.922385e+09,https://www.realtor.com/realestateandhomes-det...,condo,townhomes,"{'city': 'Philadelphia', 'line': '5703 N 13th ...",{'listing_office': {'list_item': {'name': 'Pre...,for_sale,215000,1.0,...,3,type|property|data|prop_id|4036371277|list_id|...,"{'size': 1501, 'units': 'sqft'}","{'name': 'BrightMLS', 'id': 'PAPH945010', 'pla...",1.0,,,,,
3,M3553029343,2.922384e+09,https://www.realtor.com/realestateandhomes-det...,single_family,,"{'city': 'Philadelphia', 'line': '1009 Rhawn S...",{'listing_office': {'list_item': {'name': 'Re/...,for_sale,394800,1.0,...,4,type|property|data|prop_id|3553029343|list_id|...,"{'size': 15913, 'units': 'sqft'}","{'name': 'BrightMLS', 'id': 'PAPH943096', 'pla...",1.0,,,,,
4,M3649199107,2.922384e+09,https://www.realtor.com/realestateandhomes-det...,condo,townhomes,"{'city': 'Philadelphia', 'line': '3850 N Bouvi...",{'listing_office': {'list_item': {'name': 'Re/...,for_sale,130000,1.0,...,5,type|property|data|prop_id|3649199107|list_id|...,"{'size': 1240, 'units': 'sqft'}","{'name': 'BrightMLS', 'id': 'PAPH918016', 'pla...",1.0,{'href': 'https://view.ricohtours.com/0701bde8...,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9757,M3400474681,2.869904e+09,https://www.realtor.com/realestateandhomes-det...,condo,townhomes,"{'city': 'Philadelphia', 'line': '2077 Bridge ...",{'listing_office': {'list_item': {'name': 'Re/...,for_sale,94900,1.0,...,33,type|property|data|prop_id|3400474681|list_id|...,"{'size': 1064, 'units': 'sqft'}","{'name': 'BrightMLS', 'id': 'PAPH858856', 'pla...",,,,,,
9758,M3654238233,2.890369e+09,https://www.realtor.com/realestateandhomes-det...,condo,townhomes,"{'city': 'Philadelphia', 'line': '1452 N 57th ...",{'listing_office': {'list_item': {'name': 'Kel...,for_sale,149900,1.0,...,34,type|property|data|prop_id|3654238233|list_id|...,"{'size': 1231, 'units': 'sqft'}","{'name': 'BrightMLS', 'id': 'PAPH858898', 'pla...",1.0,,,,,
9759,M9041602444,2.704677e+09,https://www.realtor.com/realestateandhomes-det...,condo,duplex_triplex,"{'city': 'Philadelphia', 'line': '7354 Ridge A...",{'listing_office': {'list_item': {'name': 'BHH...,for_sale,432990,3.0,...,35,type|property|data|prop_id|9041602444|list_id|...,,"{'name': 'BrightMLS', 'id': 'PAPH838626', 'pla...",1.0,,,,,
9760,M3276555576,2.915162e+09,https://www.realtor.com/realestateandhomes-det...,condo,condos,"{'city': 'Philadelphia', 'line': '1003 N Bodin...",{'listing_office': {'list_item': {'name': 'Kel...,for_sale,405000,2.0,...,36,type|property|data|prop_id|3276555576|list_id|...,,"{'name': 'BrightMLS', 'id': 'PAPH888146', 'pla...",,{'href': 'https://my.matterport.com/show/?m=eR...,,,,


In [6]:
## export to csv file
# Writes the de-duplicated frame to raw_data.csv (relative path), leaving out the index column.
raw_data.to_csv('raw_data.csv', encoding='utf-8', index=False)

After removing the duplicates, we now have 9562 unique instances.

In [7]:
## Remove unuseful cols
# columns that are left out of the processed data
unused_columns = [
    'rdc_web_url',
    'office',
    'thumbnail',
    'list_tracking',
    'open_houses',
    'plan_id',
    'new_plan',
    'quick_to_sell_days',
    'virtual_tour',
    'client_display_flags',
    'lead_forms',
    'listing_id',
    'mls',
]

# raw_data itself is left untouched; drop returns a new frame
processed_data = raw_data.drop(columns=unused_columns)
processed_data

Unnamed: 0,property_id,prop_type,prop_sub_type,address,branding,prop_status,price,baths_full,baths,beds,building_size,agents,last_update,photo_count,page_no,rank,lot_size,baths_half
0,M4046594895,condo,duplex_triplex,"{'city': 'Philadelphia', 'line': '1516 N 62nd ...",{'listing_office': {'list_item': {'name': 'Arc...,for_sale,249900,3.0,3,6.0,"{'size': 1632, 'units': 'sqft'}","[{'primary': True, 'advertiser_id': '1291281',...",2020-10-13T17:54:05Z,9,1,1,"{'size': 1842, 'units': 'sqft'}",
1,M3939384476,condo,townhomes,"{'city': 'Philadelphia', 'line': '6102 Reedlan...",{'listing_office': {'list_item': {'name': 'Vih...,for_sale,116800,1.0,1,3.0,"{'size': 1092, 'units': 'sqft'}","[{'primary': True, 'advertiser_id': '347285', ...",2020-10-13T18:18:18Z,7,1,2,"{'size': 992, 'units': 'sqft'}",
2,M4036371277,condo,townhomes,"{'city': 'Philadelphia', 'line': '5703 N 13th ...",{'listing_office': {'list_item': {'name': 'Pre...,for_sale,215000,1.0,2,3.0,"{'size': 1360, 'units': 'sqft'}","[{'primary': True, 'photo': None, 'name': 'Kev...",2020-10-13T17:24:20Z,35,1,3,"{'size': 1501, 'units': 'sqft'}",1.0
3,M3553029343,single_family,,"{'city': 'Philadelphia', 'line': '1009 Rhawn S...",{'listing_office': {'list_item': {'name': 'Re/...,for_sale,394800,1.0,2,3.0,"{'size': 1856, 'units': 'sqft'}","[{'primary': True, 'advertiser_id': '4759', 'i...",2020-10-13T17:11:54Z,123,1,4,"{'size': 15913, 'units': 'sqft'}",1.0
4,M3649199107,condo,townhomes,"{'city': 'Philadelphia', 'line': '3850 N Bouvi...",{'listing_office': {'list_item': {'name': 'Re/...,for_sale,130000,1.0,2,3.0,"{'size': 1180, 'units': 'sqft'}","[{'primary': True, 'advertiser_id': '391546', ...",2020-10-13T17:02:13Z,33,1,5,"{'size': 1240, 'units': 'sqft'}",1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9757,M3400474681,condo,townhomes,"{'city': 'Philadelphia', 'line': '2077 Bridge ...",{'listing_office': {'list_item': {'name': 'Re/...,for_sale,94900,1.0,1,4.0,"{'size': 1296, 'units': 'sqft'}","[{'primary': True, 'photo': None, 'name': ''}]",2020-10-05T12:27:53Z,18,42,33,"{'size': 1064, 'units': 'sqft'}",
9758,M3654238233,condo,townhomes,"{'city': 'Philadelphia', 'line': '1452 N 57th ...",{'listing_office': {'list_item': {'name': 'Kel...,for_sale,149900,1.0,2,3.0,"{'size': 1026, 'units': 'sqft'}","[{'primary': True, 'advertiser_id': '950515', ...",2020-10-01T06:57:05Z,14,42,34,"{'size': 1231, 'units': 'sqft'}",1.0
9759,M9041602444,condo,duplex_triplex,"{'city': 'Philadelphia', 'line': '7354 Ridge A...",{'listing_office': {'list_item': {'name': 'BHH...,for_sale,432990,3.0,4,3.0,"{'size': 1614, 'units': 'sqft'}","[{'primary': True, 'advertiser_id': '948779', ...",2020-10-06T09:19:02Z,2,42,35,,1.0
9760,M3276555576,condo,condos,"{'city': 'Philadelphia', 'line': '1003 N Bodin...",{'listing_office': {'list_item': {'name': 'Kel...,for_sale,405000,2.0,2,2.0,"{'size': 1020, 'units': 'sqft'}","[{'primary': True, 'advertiser_id': '2229603',...",2020-10-05T10:35:58Z,25,42,36,,


## Reset index

In [8]:
# drop=True discards the old index instead of keeping it as a column,
# so the rows are renumbered from 0 after the duplicates were removed
processed_data = processed_data.reset_index(drop= True)
processed_data

Unnamed: 0,property_id,prop_type,prop_sub_type,address,branding,prop_status,price,baths_full,baths,beds,building_size,agents,last_update,photo_count,page_no,rank,lot_size,baths_half
0,M4046594895,condo,duplex_triplex,"{'city': 'Philadelphia', 'line': '1516 N 62nd ...",{'listing_office': {'list_item': {'name': 'Arc...,for_sale,249900,3.0,3,6.0,"{'size': 1632, 'units': 'sqft'}","[{'primary': True, 'advertiser_id': '1291281',...",2020-10-13T17:54:05Z,9,1,1,"{'size': 1842, 'units': 'sqft'}",
1,M3939384476,condo,townhomes,"{'city': 'Philadelphia', 'line': '6102 Reedlan...",{'listing_office': {'list_item': {'name': 'Vih...,for_sale,116800,1.0,1,3.0,"{'size': 1092, 'units': 'sqft'}","[{'primary': True, 'advertiser_id': '347285', ...",2020-10-13T18:18:18Z,7,1,2,"{'size': 992, 'units': 'sqft'}",
2,M4036371277,condo,townhomes,"{'city': 'Philadelphia', 'line': '5703 N 13th ...",{'listing_office': {'list_item': {'name': 'Pre...,for_sale,215000,1.0,2,3.0,"{'size': 1360, 'units': 'sqft'}","[{'primary': True, 'photo': None, 'name': 'Kev...",2020-10-13T17:24:20Z,35,1,3,"{'size': 1501, 'units': 'sqft'}",1.0
3,M3553029343,single_family,,"{'city': 'Philadelphia', 'line': '1009 Rhawn S...",{'listing_office': {'list_item': {'name': 'Re/...,for_sale,394800,1.0,2,3.0,"{'size': 1856, 'units': 'sqft'}","[{'primary': True, 'advertiser_id': '4759', 'i...",2020-10-13T17:11:54Z,123,1,4,"{'size': 15913, 'units': 'sqft'}",1.0
4,M3649199107,condo,townhomes,"{'city': 'Philadelphia', 'line': '3850 N Bouvi...",{'listing_office': {'list_item': {'name': 'Re/...,for_sale,130000,1.0,2,3.0,"{'size': 1180, 'units': 'sqft'}","[{'primary': True, 'advertiser_id': '391546', ...",2020-10-13T17:02:13Z,33,1,5,"{'size': 1240, 'units': 'sqft'}",1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9557,M3400474681,condo,townhomes,"{'city': 'Philadelphia', 'line': '2077 Bridge ...",{'listing_office': {'list_item': {'name': 'Re/...,for_sale,94900,1.0,1,4.0,"{'size': 1296, 'units': 'sqft'}","[{'primary': True, 'photo': None, 'name': ''}]",2020-10-05T12:27:53Z,18,42,33,"{'size': 1064, 'units': 'sqft'}",
9558,M3654238233,condo,townhomes,"{'city': 'Philadelphia', 'line': '1452 N 57th ...",{'listing_office': {'list_item': {'name': 'Kel...,for_sale,149900,1.0,2,3.0,"{'size': 1026, 'units': 'sqft'}","[{'primary': True, 'advertiser_id': '950515', ...",2020-10-01T06:57:05Z,14,42,34,"{'size': 1231, 'units': 'sqft'}",1.0
9559,M9041602444,condo,duplex_triplex,"{'city': 'Philadelphia', 'line': '7354 Ridge A...",{'listing_office': {'list_item': {'name': 'BHH...,for_sale,432990,3.0,4,3.0,"{'size': 1614, 'units': 'sqft'}","[{'primary': True, 'advertiser_id': '948779', ...",2020-10-06T09:19:02Z,2,42,35,,1.0
9560,M3276555576,condo,condos,"{'city': 'Philadelphia', 'line': '1003 N Bodin...",{'listing_office': {'list_item': {'name': 'Kel...,for_sale,405000,2.0,2,2.0,"{'size': 1020, 'units': 'sqft'}","[{'primary': True, 'advertiser_id': '2229603',...",2020-10-05T10:35:58Z,25,42,36,,


In [9]:
## Export to csv file
# Saves the trimmed, re-indexed frame as processed_data.csv (relative path), without the index column.

processed_data.to_csv('processed_data.csv',encoding='utf-8', index=False)

# Extract more info from within cols

In [10]:
## Check NaN in each col
# Counts the missing values of every column of processed_data.

processed_data.isna().sum()

property_id         0
prop_type           0
prop_sub_type    2512
address             0
branding            0
prop_status         0
price               0
baths_full       1760
baths               0
beds             1182
building_size    1381
agents             94
last_update         0
photo_count         0
page_no             0
rank                0
lot_size         1593
baths_half       6168
dtype: int64

In [144]:
# List the distinct values found in prop_sub_type
processed_data['prop_sub_type'].unique()

array(['duplex_triplex', 'townhomes', nan, 'condos'], dtype=object)

In [46]:
import numpy as np

In [14]:
## extra dic type strings from col
import ast

def str_to_dic(string):
    """Parse the text of a Python literal (dict, list, ...) into the object it describes.

    Parameters
    ----------
    string : str, None or float NaN
        Text such as "{'size': 1842, 'units': 'sqft'}".

    Returns
    -------
    The parsed object, or None when `string` is None or a float NaN
    (the value pandas stores for a missing cell).

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ast.literal_eval when the text is not a valid literal.
    """
    # NaN is the only float that is not equal to itself
    if string is None or (isinstance(string, float) and string != string):
        return None
    return ast.literal_eval(string)
    

In [63]:
def xtr_info_from_dict(col, level = 0, keys = None):
    """Follow a path of keys into a parsed cell and return the value found there.

    Parameters
    ----------
    col : dict, list of dict, or None
        The parsed cell.
    level : int
        0 when `col` itself is the dict to search; otherwise the 1-based position,
        inside the list `col`, of the dict to search.
    keys : list, optional
        Keys to follow, outermost first.

    Returns
    -------
    The value at the end of the path. `col` is returned unchanged when `keys`
    is empty. None is returned when `col` is None, when the list has fewer than
    `level` entries, when a key is missing, or when a value reached along the
    path is not a dict.

    Raises
    ------
    TypeError
        If the object selected by `level` is not a dict (for example a list
        passed with level=0).
    """
    if not keys:
        return col
    if col is None:
        return None

    if level == 0:
        node = col
    else:
        if len(col) < level:
            return None
        # the list is indexed once; the keys are then followed inside that entry
        node = col[level - 1]

    if not isinstance(node, dict):
        raise TypeError(
            f"expected a dict at level {level}, got {type(node).__name__}"
        )

    for key in keys:
        if not isinstance(node, dict) or key not in node:
            return None
        node = node[key]
    return node

In [18]:
# Collect the 'id' of the first agent of every listing (None where `agents` is missing).
l = []
for i in range(len(processed_data['agents'])):
    if str(processed_data['agents'][i]) == 'nan':
        l.append(None)
    else:
        # `agents` parses to a list of dicts, so the first entry has to be selected
        # with level=1; level=0 would treat the list itself as a dict and fail.
        l.append(xtr_info_from_dict(str_to_dic(processed_data['agents'][i]), level=1, keys = ['id']))

In [47]:
## function for extracting and converting info to df
def info_to_df(col, level, keys, df, col_name):
    """Extract one value from every cell of `col` and return them as a dataframe.

    Parameters
    ----------
    col : pandas.Series or sequence
        Cells holding the text of a dict / list literal, or NaN.
    level, keys
        Passed to ``xtr_info_from_dict`` to locate the value inside each parsed cell.
    df
        Unused; kept so existing calls keep working.
    col_name : list of str
        Column label(s) of the returned dataframe.

    Returns
    -------
    pandas.DataFrame
        One row per cell, NaN where the cell is missing. When `col` is a Series
        the result carries the same index, so a later join lines the rows up.
    """
    extracted = [
        np.nan if str(cell) == 'nan'
        else xtr_info_from_dict(str_to_dic(cell), level = level, keys = keys)
        # iterating the values avoids label lookups, which fail on a non-default index
        for cell in col
    ]

    # convert list to df, keeping the source index when there is one
    source_index = col.index if isinstance(col, pd.Series) else None
    new_df = pd.DataFrame(extracted, columns=col_name, index=source_index)
    return new_df

In [49]:
## function: append col to df

def append_df(col, level, keys, col_name, df):
    """Return `df` joined with the column(s) extracted from `col`.

    `col`, `level`, `keys` and `col_name` are handed to ``info_to_df``; the
    dataframe it builds is joined to `df` on the index. `df` is not modified.
    """
    extracted_columns = info_to_df(col=col, level=level, keys=keys, df=df, col_name=col_name)
    return df.join(extracted_columns)

Some cols contain nested details, such as `address`, `branding`, `agents`, `lot_size`, and `mls`. We need to discuss which features we should extract from them.

In [51]:
# Preview only: processed_data is not modified.
# `agents` parses to a list of dicts, so its first entry is selected with level = 1
# (level = 0 expects the parsed cell itself to be a dict).
append_df(col = processed_data['agents'], level = 1, keys = ['id'], df = processed_data, col_name = ['agent_id']).head()

Unnamed: 0,property_id,prop_type,prop_sub_type,address,branding,prop_status,price,baths_full,baths,beds,building_size,agents,last_update,photo_count,page_no,rank,lot_size,baths_half,agent_id
0,M4046594895,condo,duplex_triplex,"{'city': 'Philadelphia', 'line': '1516 N 62nd ...",{'listing_office': {'list_item': {'name': 'Arc...,for_sale,249900,3.0,3,6.0,"{'size': 1632, 'units': 'sqft'}","[{'primary': True, 'advertiser_id': '1291281',...",2020-10-13T17:54:05Z,9,1,1,"{'size': 1842, 'units': 'sqft'}",,1291281.0
1,M3939384476,condo,townhomes,"{'city': 'Philadelphia', 'line': '6102 Reedlan...",{'listing_office': {'list_item': {'name': 'Vih...,for_sale,116800,1.0,1,3.0,"{'size': 1092, 'units': 'sqft'}","[{'primary': True, 'advertiser_id': '347285', ...",2020-10-13T18:18:18Z,7,1,2,"{'size': 992, 'units': 'sqft'}",,347285.0
2,M4036371277,condo,townhomes,"{'city': 'Philadelphia', 'line': '5703 N 13th ...",{'listing_office': {'list_item': {'name': 'Pre...,for_sale,215000,1.0,2,3.0,"{'size': 1360, 'units': 'sqft'}","[{'primary': True, 'photo': None, 'name': 'Kev...",2020-10-13T17:24:20Z,35,1,3,"{'size': 1501, 'units': 'sqft'}",1.0,
3,M3553029343,single_family,,"{'city': 'Philadelphia', 'line': '1009 Rhawn S...",{'listing_office': {'list_item': {'name': 'Re/...,for_sale,394800,1.0,2,3.0,"{'size': 1856, 'units': 'sqft'}","[{'primary': True, 'advertiser_id': '4759', 'i...",2020-10-13T17:11:54Z,123,1,4,"{'size': 15913, 'units': 'sqft'}",1.0,4759.0
4,M3649199107,condo,townhomes,"{'city': 'Philadelphia', 'line': '3850 N Bouvi...",{'listing_office': {'list_item': {'name': 'Re/...,for_sale,130000,1.0,2,3.0,"{'size': 1180, 'units': 'sqft'}","[{'primary': True, 'advertiser_id': '391546', ...",2020-10-13T17:02:13Z,33,1,5,"{'size': 1240, 'units': 'sqft'}",1.0,391546.0


In [135]:
# Spot check on the first row: parse `branding` and read the name of the listing office
str_to_dic(processed_data['branding'][0])['listing_office']['list_item']['name']

'Archstone Realty'

In [136]:
# Number of rows in the branding column
len(processed_data['branding'])

9562

In [122]:
## copy df
# Work on a copy: the extracted columns are added to update_df while processed_data stays as it is
update_df = processed_data.copy()

In [123]:
# Maps a nested column to the keys to read from its parsed dict.
# The loop that consumes `extract` turns every listed key into a new column of the same name.
extract = {'address':['city', 'line', 'postal_code', 'state_code', 'county', 'lat', 'lon', 'neighborhood_name'],
           'building_size':['size']
          }

In [124]:
# every key listed in `extract` is read from its source column into a new column named after the key
for source_col, wanted_keys in extract.items():
    for wanted in wanted_keys:
        update_df = append_df(
            col = update_df[source_col],
            level = 0,
            keys = [wanted],
            df = update_df,
            col_name = [wanted],
        )

In [125]:
## rename building size
# the `size` column was read from building_size; give it a label that says so
# NOTE(review): 'buiding_size(sqft)' looks like a typo for 'building_size(sqft)' - confirm nothing
# downstream relies on the current spelling before changing it
update_df = update_df.rename(columns={'size':'buiding_size(sqft)'})

In [126]:
## get lot size
# reads the 'size' key of each parsed lot_size dict into a new lot_size(sqft) column
update_df = append_df(col = update_df['lot_size'], level = 0, keys = ['size'], df = update_df, col_name = ['lot_size(sqft)'])

In [130]:
## get agent id
# level = 1 selects the first entry of the parsed agents list before its 'id' is read
update_df = append_df(col = update_df['agents'], level = 1, keys = ['id'], df = update_df, col_name = ['agent_id'])

In [131]:
## get agent name
# level = 1 selects the first entry of the parsed agents list before its 'name' is read
update_df = append_df(col = update_df['agents'], level = 1, keys = ['name'], df = update_df, col_name = ['agent_name'])

In [139]:
## get branding office name
# the name sits at branding -> listing_office -> list_item -> name; missing cells stay NaN
branding = processed_data['branding']
brand_name = [
    np.nan if str(branding[i]) == 'nan'
    else str_to_dic(branding[i])['listing_office']['list_item']['name']
    for i in range(len(branding))
]

# convert list to df
brand_name = pd.DataFrame(brand_name, columns=['brand_name'])

# join on the index to add the new column
update_df = update_df.join(brand_name)

In [141]:
## drop original cols
# The nested source columns are no longer needed once their values were extracted.
# The result is rebound instead of using inplace=True, so the frame is never mutated in place.
update_df = update_df.drop(columns=['address', 'building_size', 'agents', 'branding', 'lot_size'])

In [142]:
# Preview the first rows of the reshaped frame
update_df.head()

Unnamed: 0,property_id,prop_type,prop_sub_type,prop_status,price,baths_full,baths,beds,last_update,photo_count,...,state_code,county,lat,lon,neighborhood_name,buiding_size(sqft),lot_size(sqft),agent_id,agent_name,brand_name
0,M4046594895,condo,duplex_triplex,for_sale,249900,3.0,3,6.0,2020-10-13T17:54:05Z,9,...,PA,Philadelphia,39.974408,-75.2437,West Philadelphia,1632.0,1842.0,1291281.0,Ausra Anusauskas,Archstone Realty
1,M3939384476,condo,townhomes,for_sale,116800,1.0,1,3.0,2020-10-13T18:18:18Z,7,...,PA,Philadelphia,39.926283,-75.225382,Southwest Philadelphia,1092.0,992.0,347285.0,Dr Hanh Vo,Vihi Realty
2,M4036371277,condo,townhomes,for_sale,215000,1.0,2,3.0,2020-10-13T17:24:20Z,35,...,PA,Philadelphia,40.039721,-75.1421,Upper North District,1360.0,1501.0,,Kevin Chen,Premium Realty Castor Inc
3,M3553029343,single_family,,for_sale,394800,1.0,2,3.0,2020-10-13T17:11:54Z,123,...,PA,Philadelphia,40.071978,-75.073303,Near Northeast Philadelphia,1856.0,15913.0,4759.0,Carol Mallen,Re/Max Services
4,M3649199107,condo,townhomes,for_sale,130000,1.0,2,3.0,2020-10-13T17:02:13Z,33,...,PA,Philadelphia,40.012784,-75.156405,Upper North Philadelphia,1180.0,1240.0,391546.0,Maria Quattrone CEO,Re/Max @ Home - Philadelphia


In [145]:
# Missing-value count per column after the extraction
update_df.isna().sum()

property_id              0
prop_type                0
prop_sub_type         2512
prop_status              0
price                    0
baths_full            1760
baths                    0
beds                  1182
last_update              0
photo_count              0
page_no                  0
rank                     0
baths_half            6168
city                     0
line                     9
postal_code              0
state_code               0
county                   0
lat                      0
lon                      0
neighborhood_name        8
buiding_size(sqft)    1381
lot_size(sqft)        1687
agent_id               986
agent_name             174
brand_name              80
dtype: int64

In [143]:
## export to csv
# update_df.to_csv('update_df.csv',encoding='utf-8', index=False)

In [146]:
# (rows, columns) of update_df
update_df.shape

(9562, 26)