# Import relevant libraries

In [1]:
import os
import re

import geopy
import numpy as np
import pandas as pd
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from tqdm.notebook import tqdm

# Load data

In [2]:
# Project root sits two directory levels above this notebook
base_path = os.path.join(os.pardir, os.pardir)

In [7]:
# Path of the cleaned input CSV inside the project's datasets folder
datasets_dir = os.path.join(base_path, 'datasets')
file_path = os.path.join(datasets_dir, 'cleaned_data.csv')

# Read the whole file in one pass (low_memory=False) into a DataFrame
df = pd.read_csv(file_path, low_memory=False)

In [8]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0,CompanyName,RegAddress.PostCode,RegAddress.PostTown
0,!BIG IMPACT GRAPHICS LIMITED,EC1V 9LT,LONDON
1,!NKED LTD,SW9 8QS,LONDON
2,!NVERTD DESIGNS LIMITED,W12 8DS,LONDON
3,"""A"" CONCEPT LIMITED",E1 7AA,LONDON
4,"""BSP RETAIL"" LIMITED",NW4 3XP,LONDON


In [9]:
# Summarise the columns: dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911230 entries, 0 to 911229
Data columns (total 3 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   CompanyName          911230 non-null  object
 1   RegAddress.PostCode  910330 non-null  object
 2   RegAddress.PostTown  911230 non-null  object
dtypes: object(3)
memory usage: 20.9+ MB


In [10]:
# Drop every row that contains a missing value. Rebinding the name instead of
# using inplace=True follows the recommended pandas style.
df = df.dropna()

# Keep only postcodes that start with EC1 or EC2
pattern = r'^EC[12]'

# Vectorised regex match: one boolean per remaining row, True where the
# postcode matches the pattern at its start. Stored as a plain list so it can
# be used as a positional boolean mask on df.
EC12 = df['RegAddress.PostCode'].str.match(pattern).tolist()

In [None]:
# Keep the rows flagged in EC12 and renumber them from 0; drop=True discards
# the old index directly instead of materialising it as a column first.
df = df[EC12].reset_index(drop=True)

In [None]:
# Distinct postcode values, in order of first appearance
postcode_column = df['RegAddress.PostCode']
unique_postcode = postcode_column.unique()

In [None]:
# Geocode every unique postcode; latitude/longitude stay index-aligned with
# unique_postcode (None where no coordinate could be obtained).
latitude = []
longitude = []

# Identify this application to the Nominatim service with a descriptive
# user agent rather than a generic string.
geolocator = Nominatim(user_agent="tech_roundabout_postcode_geocoder")

# Throttle to at most one request per second. RateLimiter also retries failed
# requests and, once retries are exhausted, returns None instead of raising,
# so a single service error cannot abort the loop and lose earlier results.
geocode = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=1,
    max_retries=2,
    swallow_exceptions=True,
)

# NOTE(review): postcodes are sent without a country restriction; consider
# passing country_codes to geocode if foreign matches show up — confirm.
for postcode in tqdm(unique_postcode):
    location = geocode(postcode)
    if location is None:
        # No match or request failed: append placeholders to keep alignment
        latitude.append(None)
        longitude.append(None)
    else:
        latitude.append(location.latitude)
        longitude.append(location.longitude)

In [None]:
# Lookup table: one row per unique postcode with its geocoded coordinates
coordinate_columns = {
    'RegAddress.PostCode': unique_postcode,
    'latitude': latitude,
    'longitude': longitude,
}
coordinates = pd.DataFrame(coordinate_columns)

In [None]:
# Left join: every row of df is kept and gains the coordinates of its postcode
EC12_coordinates = df.merge(coordinates, how='left', on='RegAddress.PostCode')

In [None]:
# Write the merged frame to the datasets folder, omitting the row index
output_path = os.path.join(base_path, 'datasets', 'tech_roundabout_coordinates.csv')
EC12_coordinates.to_csv(output_path, index=False)