## Set up Bigquery Database

In [1]:
import pandas as pd
import pandas_gbq

In [50]:
df = pd.read_csv('Dataset Real Estate.csv', index_col=0)
df.shape

(208150, 9)

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 208150 entries, 0 to 208149
Data columns (total 9 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   Address   208150 non-null  object 
 1   Beds      202309 non-null  object 
 2   Baths     208150 non-null  object 
 3   Area      169131 non-null  object 
 4   Price     207681 non-null  float64
 5   Street    208150 non-null  object 
 6   District  208150 non-null  object 
 7   City      208150 non-null  object 
 8   Zip Code  208150 non-null  object 
dtypes: float64(1), object(8)
memory usage: 15.9+ MB


In [52]:
df.head()

Unnamed: 0,Address,Beds,Baths,Area,Price,Street,District,City,Zip Code
0,"4-74 48th Ave #9K, Long Island City, NY 11109",1,1,700.0,555000.0,4-74 48th Ave #9K,Long Island City,NY,11109
1,"469 E 49th St #6, Brooklyn, NY 11203",10,6,,1250000.0,469 E 49th St #6,Brooklyn,NY,11203
2,"715 Avenue L, Brooklyn, NY 11230",8,6,2015.0,1690000.0,715 Avenue L,Brooklyn,NY,11230
3,"223 E 62nd St, New York, NY 10065",5,6,3750.0,7995000.0,223 E 62nd St,New York,NY,10065
4,"1824 E 17th St #B2, Brooklyn, NY 11229",2,2,888.0,579000.0,1824 E 17th St #B2,Brooklyn,NY,11229


In [53]:
df.columns = ['Address', 'Beds', 'Baths', 'Area', 'Price', 'Street', 'District',
       'City', 'Zip_Code']

In [54]:
df['Area'] = df['Area'].str.replace(',', '')
df['Area'] = df['Area'].str.split(" ", n=1, expand=True)[0]

In [59]:
df_clean=df.where(df.Area != r'n/a')
df_clean=df_clean.where(df.Baths != r'n/')
df_clean=df_clean.drop_duplicates()
df_clean=df_clean.dropna()
df_clean.head()

Unnamed: 0,Address,Beds,Baths,Area,Price,Street,District,City,Zip_Code
0,"4-74 48th Ave #9K, Long Island City, NY 11109",1,1,700,555000.0,4-74 48th Ave #9K,Long Island City,NY,11109
2,"715 Avenue L, Brooklyn, NY 11230",8,6,2015,1690000.0,715 Avenue L,Brooklyn,NY,11230
3,"223 E 62nd St, New York, NY 10065",5,6,3750,7995000.0,223 E 62nd St,New York,NY,10065
4,"1824 E 17th St #B2, Brooklyn, NY 11229",2,2,888,579000.0,1824 E 17th St #B2,Brooklyn,NY,11229
5,"335-341 Nostrand Avenue UNIT 401-A, Brooklyn, ...",1,1,469,539000.0,335-341 Nostrand Avenue UNIT 401-A,Brooklyn,NY,11216


In [60]:
df_clean['Area'] = df_clean['Area'].astype('int')
df_clean['Baths'] = df_clean['Baths'].astype('int')

In [61]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3472 entries, 0 to 208149
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Address   3472 non-null   object 
 1   Beds      3472 non-null   object 
 2   Baths     3472 non-null   int32  
 3   Area      3472 non-null   int32  
 4   Price     3472 non-null   float64
 5   Street    3472 non-null   object 
 6   District  3472 non-null   object 
 7   City      3472 non-null   object 
 8   Zip_Code  3472 non-null   object 
dtypes: float64(1), int32(2), object(6)
memory usage: 244.1+ KB


In [62]:
df_clean.shape

(3472, 9)

In [63]:
project_id = 'latihan-345909'
table_id = 'latihan-345909.real_estate.test_1'

In [64]:
pandas_gbq.to_gbq(df_clean, table_id, project_id=project_id,if_exists='append')

1it [00:06,  7.00s/it]


## Scrape New Data

In [None]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import urllib.parse 

real_estate_new=pd.DataFrame(columns=['Address', 'Beds', 'Baths', 'Area', 'Price'])
headers = ({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.54'})

address=[]
beds=[]
baths=[]
areas=[]
prices=[]

web = ['https://www.trulia.com/NY/New_York/',
      'https://www.trulia.com/CA/Los_Angeles/',
      'https://www.trulia.com/IL/Chicago/',
      'https://www.trulia.com/AZ/Phoenix/',
      'https://www.trulia.com/NV/Las_Vegas/']

for h in web:
    for i in range (1,16):
        website = requests.get(h + str(i) + '_p/', headers=headers)

        soup = BeautifulSoup(website.content, 'html.parser')

        result = soup.find_all('li', {'class' : 'Grid__CellBox-sc-144isrp-0 SearchResultsList__WideCell-b7y9ki-2 jiZmPM'})

        result_update = [k for k in result if k.has_attr('data-testid')]

        for result in result_update:

            try:
                address.append(result.find('div', {'data-testid':'property-address'}).get_text())
            except:
                address.append('n/a')

            try:
                beds.append(result.find('div', {'data-testid':'property-beds'}).get_text())
            except:
                beds.append('n/a')

            try:
                baths.append(result.find('div', {'data-testid':'property-baths'}).get_text())
            except:
                baths.append('n/a')    

            try:
                areas.append(result.find('div', {'data-testid':'property-floorSpace'}).get_text())
            except:
                areas.append('n/a')  

            try:
                prices.append(result.find('div', {'data-testid':'property-price'}).get_text())
            except:
                prices.append('n/a')      

        for j in range (len(address)):
            real_estate_new=real_estate_new.append({'Address':address[j], 'Beds':beds[j], 'Baths':baths[j], 'Area':areas[j], 'Price':prices[j]}, ignore_index=True)

In [19]:
real_estate_new['Beds'] = real_estate_new['Beds'].apply(lambda x: x.strip('bd'))
real_estate_new['Baths'] = real_estate_new['Baths'].apply(lambda x: x.strip('ba'))
real_estate_new['Price'] = real_estate_new['Price'].apply(lambda x: x.strip('$'))
real_estate_new['Price'] = real_estate_new['Price'].apply(lambda x: x.replace(",",""))
real_estate_new['Price'] = real_estate_new['Price'].apply(lambda x: x.replace("+",""))
real_estate_new['Area'] = real_estate_new['Area'].apply(lambda x: x.replace(" sqft",""))

real_estate_new['Street']=real_estate_new['Address'].apply(lambda x: x.split(',')[0])
real_estate_new['District']=real_estate_new['Address'].apply(lambda x: x.split(',')[1])
real_estate_new['City']=real_estate_new['Address'].apply(lambda x: x.split(',')[2].split(' ')[1])
real_estate_new['Zip_Code']=real_estate_new['Address'].apply(lambda x: x.split(',')[2].split(' ')[2])

In [20]:
real_estate_new['Area'] = real_estate_new['Area'].str.replace(',', '')
real_estate_new['Area'] = real_estate_new['Area'].str.split(" ", n=1, expand=True)[0]

In [70]:
real_estate_clean=real_estate_new.where(real_estate_new.Area != r'n/a')
real_estate_clean=real_estate_clean.where(real_estate_clean.Baths != r'n/')
real_estate_clean=real_estate_clean.where(real_estate_clean.Price != r'')
real_estate_clean=real_estate_clean.drop_duplicates()
real_estate_clean=real_estate_clean.dropna()
real_estate_clean.head()

Unnamed: 0,Address,Beds,Baths,Area,Price,Street,District,City,Zip_Code
0,"1371 Edwards Ave, Bronx, NY 10461",3,2,1800,698000,1371 Edwards Ave,Bronx,NY,10461
4,"730 E 95th St, Brooklyn, NY 11236",5,3,2700,999000,730 E 95th St,Brooklyn,NY,11236
5,"3751 86th St #3, Jackson Heights, NY 11372",1,1,800,258000,3751 86th St #3,Jackson Heights,NY,11372
6,"44 E 67th St #4D, New York, NY 10065",1,1,874,1495000,44 E 67th St #4D,New York,NY,10065
7,"190 22nd St #2, Brooklyn, NY 11232",9,9,6200,3500000,190 22nd St #2,Brooklyn,NY,11232


In [71]:
real_estate_clean['Area'] = real_estate_clean['Area'].astype('int')
real_estate_clean['Baths'] = real_estate_clean['Baths'].astype('int')
real_estate_clean['Price'] = real_estate_clean['Price'].astype('float64')

In [72]:
real_estate_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1713 entries, 0 to 50999
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Address   1713 non-null   object 
 1   Beds      1713 non-null   object 
 2   Baths     1713 non-null   int32  
 3   Area      1713 non-null   int32  
 4   Price     1713 non-null   float64
 5   Street    1713 non-null   object 
 6   District  1713 non-null   object 
 7   City      1713 non-null   object 
 8   Zip_Code  1713 non-null   object 
dtypes: float64(1), int32(2), object(6)
memory usage: 120.4+ KB


### Load initial Target Data

In [73]:
from google.cloud import bigquery
from google.oauth2 import service_account
credentials = service_account.Credentials.from_service_account_file(
'latihan-345909-d057684ecb42.json')

project_id = 'latihan-345909'
client = bigquery.Client(credentials= credentials,project=project_id)

In [74]:
sql = """
   SELECT *
   FROM real_estate.test_1
   """

df = client.query(sql).to_dataframe()
df.head()

Unnamed: 0,Address,Beds,Baths,Area,Price,Street,District,City,Zip_Code
0,"2014 2nd Ave #BB, New York, NY 10029",24,8,6000,2500000.0,2014 2nd Ave #BB,New York,NY,10029
1,"825 Riverside Dr, New York, NY 10032",9,4,4100,1995000.0,825 Riverside Dr,New York,NY,10032
2,"345 E 62nd St, New York, NY 10065",10,9,9453,9500000.0,345 E 62nd St,New York,NY,10065
3,"1902 8th Ave #1, Brooklyn, NY 11215",19,8,5890,3500000.0,1902 8th Ave #1,Brooklyn,NY,11215
4,"144 14th St, Brooklyn, NY 11215",12,4,2100,2399000.0,144 14th St,Brooklyn,NY,11215


In [75]:
df.dropna(inplace=True)

In [76]:
df.shape

(3472, 9)

### Detect Changes in data by comparing source and target

In [77]:
df.apply(tuple,1)

0       (2014 2nd Ave  #BB, New York, NY 10029, 24, 8,...
1       (825 Riverside Dr, New York, NY 10032, 9, 4, 4...
2       (345 E  62nd St, New York, NY 10065, 10, 9, 94...
3       (1902 8th Ave  #1, Brooklyn, NY 11215, 19, 8, ...
4       (144 14th St, Brooklyn, NY 11215, 12, 4, 2100,...
                              ...                        
3467    (620 S  Gramercy Pl #323, Los Angeles, CA 9000...
3468    (Studio Plan in Perla Condos, Los Angeles, CA ...
3469    (746 S  Los Angeles St #407, Los Angeles, CA 9...
3470    (633 Rose Ave, Venice, CA 90291, Studio, 5, 13...
3471    (20327 Saticoy St #207, Winnetka, CA 91306, St...
Length: 3472, dtype: object

In [78]:
real_estate_clean.apply(tuple,1).isin(df.apply(tuple,1))

0        False
4        False
5        False
6        False
7        False
         ...  
50995    False
50996    False
50997    False
50998    False
50999    False
Length: 1713, dtype: bool

In [79]:
changes = real_estate_clean[~real_estate_clean.apply(tuple,1).isin(df.apply(tuple,1))]
changes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1626 entries, 0 to 50999
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Address   1626 non-null   object 
 1   Beds      1626 non-null   object 
 2   Baths     1626 non-null   int32  
 3   Area      1626 non-null   int32  
 4   Price     1626 non-null   float64
 5   Street    1626 non-null   object 
 6   District  1626 non-null   object 
 7   City      1626 non-null   object 
 8   Zip_Code  1626 non-null   object 
dtypes: float64(1), int32(2), object(6)
memory usage: 114.3+ KB


### Load the Changes to Bigquery

In [80]:
pandas_gbq.to_gbq(changes, table_id, project_id=project_id, if_exists='append')

1it [00:04,  4.81s/it]
