In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import requests
import time

In [4]:
from listings_consumer import get_shard_iterator

In [5]:
def c(x):
    """Return the tidied text of ``x``, or ``''`` when ``x`` is falsy.

    Carriage returns and tabs are removed, every newline becomes a single
    space, and leading/trailing whitespace is stripped.
    """
    if not x:
        return ''

    text = x.text
    for char, substitute in (('\r', ''), ('\t', ''), ('\n', ' ')):
        text = text.replace(char, substitute)
    return text.strip()

In [6]:
def parse_pages(pages, timeout=30):
    """Download listing index pages and return them as parsed soups.

    Args:
        pages: Number of index pages to fetch. The URL suffix runs from
            ``pageNumber_0`` up to ``pageNumber_{pages - 1}``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list with one ``BeautifulSoup`` object (``lxml`` parser) per page,
        in page order. Empty when ``pages`` is 0 or negative.

    Raises:
        requests.RequestException: On a timeout, a connection failure, or
            an HTTP error status.
    """
    soups = []

    # NOTE(review): the progress message is 1-based while the URL suffix is
    # 0-based -- confirm that the site's first page really is pageNumber_0.
    for page in range(pages):
        print(f'Getting page {page+1} from looperghana')
        response = requests.get(
            f"https://listings.loopghana.com/pageNumber_{page}",
            timeout=timeout,  # without a timeout a stalled server blocks forever
        )
        # Fail here on 4xx/5xx instead of handing an error page to the parser.
        response.raise_for_status()
        soups.append(BeautifulSoup(response.text, 'lxml'))

    return soups

In [7]:
def extract_listings(soups):
    """Build one record per listing found in the given parsed pages.

    For every soup, the second ``<ul class="listings-list">`` element is
    taken and each of its ``<li>`` children becomes a dict with the keys
    ``broker``, ``category``, ``price``, ``area``, ``beds``, ``bath`` and
    ``url``. Span-based fields go through ``c``; ``category`` and ``url``
    are read from the ``data-id`` and ``href`` attributes of the item's
    first ``<a>``.
    """

    def span_text(node, css_class):
        # Cleaned text of the first <span class=css_class> under node.
        return c(node.find('span', {'class': css_class}))

    records = []

    for page in soups:
        listing_lists = page.find_all('ul', {'class': 'listings-list'})
        items = listing_lists[1].find_all('li')

        for item in items:
            record = {}
            record['broker'] = span_text(item, 'agt')
            record['category'] = item.find('a').attrs['data-id']
            record['price'] = span_text(item, 'price')
            record['area'] = span_text(item, 'size')
            record['beds'] = span_text(item, 'bedrooms')
            record['bath'] = span_text(item, 'bathrooms')
            record['url'] = item.find('a').attrs['href']
            records.append(record)

    return records

In [8]:
def clean(df):
    """Normalise the scraped listing columns in place and return ``df``.

    * ``currency`` (new): the leading run of capital letters / ``$`` in the
      raw price, e.g. ``'$'`` or ``'GHS'``.
    * ``price``: replaced by the first run of digits in the raw price after
      thousands separators (commas) are removed; stays a string.
    * ``area`` / ``beds`` / ``bath``: the ``m2`` / ``Bed`` / ``Bath`` unit
      labels are removed.
    * ``source`` (new): constant ``'loopghana'``.

    A price that is empty, not a string, or has no currency prefix / no
    digits produces ``None`` in the affected column instead of raising, so
    a single listing without a price no longer aborts the whole run.

    Args:
        df: DataFrame with string columns ``price``, ``area``, ``beds`` and
            ``bath``. It is modified in place.

    Returns:
        The same DataFrame object.
    """

    def currency_of(raw):
        # Leading currency marker, or None when there is none to read.
        if not isinstance(raw, str):
            return None
        found = re.match(r'[A-Z\$]+', raw.replace(',', ''))
        return found.group() if found else None

    def amount_of(raw):
        # First digit run once commas are dropped, or None when absent.
        if not isinstance(raw, str):
            return None
        digit_runs = re.findall(r'[0-9]+', raw.replace(',', ''))
        return digit_runs[0] if digit_runs else None

    # Currency has to be derived before 'price' is overwritten below.
    df['currency'] = df['price'].apply(currency_of)
    df['price'] = df['price'].apply(amount_of)
    df['area'] = df['area'].str.replace('m2', '')
    df['beds'] = df['beds'].str.replace('Bed', '')
    df['bath'] = df['bath'].str.replace('Bath', '')
    df['source'] = 'loopghana'

    return df

In [9]:
def get_coords(url, timeout=30):
    """Fetch a listing page and read its ``ws_lat`` / ``ws_lon`` values.

    The values are taken from assignments of the form
    ``ws_lat = '5.727062'`` inside the first ``text/javascript`` script tag
    of the page.

    Args:
        url: Listing detail URL.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A ``(lat, lon)`` tuple. Each element is the matched value as a
        string that still carries its surrounding single quotes (for
        example ``"'5.727062'"``), or ``None`` when that variable was not
        found. ``(None, None)`` is returned when the page cannot be fetched
        or parsed; this lookup is deliberately best-effort.
    """
    # Bound up front so a page without coordinates needs no exception path.
    lat = lon = None

    try:
        results = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(results.text, 'lxml')
        js = soup.find_all('script', {'type': "text/javascript"})[0]

        # Literal decimal point and any number of integer digits, so values
        # such as '12.345678' match as well as '5.727062'.
        pairs = re.findall(r"(ws_l[a-z]+) = ('-?[0-9]+\.[0-9]+')", js.text)
        found = dict(pairs)
        lat = found.get('ws_lat')
        lon = found.get('ws_lon')
    except Exception:
        # Exception (not a bare except) keeps Ctrl-C / SystemExit working
        # during a long scrape.
        return None, None

    return lat, lon

In [10]:
def enrich(df):
    """Add float ``lat`` / ``lon`` columns by looking up every listing URL.

    ``get_coords`` is called once per row of ``df['url']``. It returns each
    coordinate as a quoted string (e.g. ``"'5.727062'"``) or ``None``; the
    quotes are stripped and the value converted to ``float``. A missing or
    unparseable coordinate becomes ``NaN`` instead of raising, so a single
    odd page cannot discard the results of all the other lookups.

    Args:
        df: DataFrame with a ``url`` column. It is modified in place.

    Returns:
        The same DataFrame object with ``lat`` and ``lon`` as float columns.
    """

    def to_float(raw):
        # Quoted coordinate string -> float; anything unusable -> NaN.
        if not isinstance(raw, str):
            return float('nan')
        try:
            return float(raw.strip('\'"'))
        except ValueError:
            return float('nan')

    pairs = [get_coords(url) for url in df['url']]

    # Explicit index and dtype keep this correct for an empty frame too.
    df['lat'] = pd.Series([to_float(lat) for lat, _ in pairs],
                          index=df.index, dtype=float)
    df['lon'] = pd.Series([to_float(lon) for _, lon in pairs],
                          index=df.index, dtype=float)

    return df

In [11]:
def scrape_looper(pages=4, add_gps=False):
    """Scrape listing index pages into a cleaned DataFrame.

    Runs ``parse_pages`` -> ``extract_listings`` -> ``clean`` and, only when
    requested, ``enrich``.

    Args:
        pages: Number of index pages to download.
        add_gps: When true, ``enrich`` is applied to add ``lat`` / ``lon``
            columns, which costs one extra HTTP request per listing.

    Returns:
        The cleaned (and optionally enriched) DataFrame.
    """
    soups = parse_pages(pages)
    data = extract_listings(soups)
    df = pd.DataFrame(data)
    df = clean(df)

    # Gate on the caller's flag. Testing the `enrich` function object here
    # would always be truthy and make `add_gps` a no-op.
    if add_gps:
        df = enrich(df)

    return df

In [12]:
# Ask for GPS enrichment explicitly: the lat/lon columns in the table below
# come from it, at the cost of one extra HTTP request per listing.
df = scrape_looper(add_gps=True)

Getting page 1 from looperghana
Getting page 2 from looperghana
Getting page 3 from looperghana
Getting page 4 from looperghana


In [16]:
# Display the DataFrame returned by scrape_looper.
df

Unnamed: 0,area,bath,beds,broker,category,price,url,currency,source,lat,lon
0,133,2,2,THE GREENS,c8dp,76000,https://listings.loopghana.com/listing/c8dp,$,loopghana,5.727062,0.015778
1,133,2,2,THE GREENS,c8e5,76000,https://listings.loopghana.com/listing/c8e5,$,loopghana,5.726512,0.015912
2,133,2,2,THE GREENS,c8fg,76000,https://listings.loopghana.com/listing/c8fg,$,loopghana,5.726491,0.016515
3,133,2,2,THE GREENS,c8ex,76000,https://listings.loopghana.com/listing/c8ex,$,loopghana,5.726907,0.015757
4,133,3,3,THE GREENS,c888,125000,https://listings.loopghana.com/listing/c888,$,loopghana,5.726635,0.015955
5,7600,10,10,Abri Properties Gh,c9cg,895000,https://listings.loopghana.com/listing/c9cg,$,loopghana,5.618244,-0.140962
6,254,4,4,Abri Properties Gh,c18o,3000,https://listings.loopghana.com/listing/c18o,$,loopghana,5.638401,-0.150245
7,10000,7,6,Abri Properties Gh,dfmk,80000,https://listings.loopghana.com/listing/dfmk,$,loopghana,5.662551,-0.116835
8,19,3,3,Brainbox Consult,d3ob,200000,https://listings.loopghana.com/listing/d3ob,$,loopghana,5.629858,-0.145450
9,,,6,Brainbox Consult,d3kq,1200000,https://listings.loopghana.com/listing/d3kq,$,loopghana,5.636113,-0.162503
