In [1]:
import os
import json
import progressbar
import math
import pandas as pd
import numpy as np
from pandas import json_normalize
from functools import reduce

In [2]:
CITIES_FILEPATH = '../Dataset/Location/cities.json'
STATES_FILEPATH = '../Dataset/Location/states.json'
COUNTRIES_FILEPATH = '../Dataset/Location/countries.json'
LOCATION_FILEPATH = '../Dataset/Processed/Locations.csv'
USERS_FILEPATH = '../Dataset/BX-CSV-Dump/BX-Users.csv'
USERS_NORMALIZED_FILEPATH = '../Dataset/Processed/BX-Users.csv'
LOC_FUZZY_MAP_FILEPATH = '../Dataset/Processed/mappings.json'

In [3]:
def sanitizeNumericData(row, as_types):
    for col in as_types:
        row[col] = int(''.join(filter(str.isdigit, row[col]))) if as_types[col] == 'int64' else row[col]
    return row

In [4]:
def loadDataframeFromJSON(filepath, key, rename_columns, as_types={}):
    with open(filepath) as f: 
        d = json.load(f)  
    return json_normalize(d[key]).apply(sanitizeNumericData, args=(as_types, ), axis=1).rename(columns=rename_columns)

In [5]:
if not os.path.exists(LOCATION_FILEPATH):
    cities = loadDataframeFromJSON(CITIES_FILEPATH, 'cities', {'id': 'city_id', 'name': 'city'}, {'id': 'int64', 'state_id': 'int64'})
    states = loadDataframeFromJSON(STATES_FILEPATH, 'states', {'id': 'state_id', 'name': 'state'}, {'id': 'int64','country_id': 'int64'})
    countries = loadDataframeFromJSON(COUNTRIES_FILEPATH, 'countries', {'id': 'country_id', 'sortname': 'country_code', 'phoneCode': 'phone_code', 'name': 'country'})
    locations = cities.merge(states, on='state_id', how="inner").merge(countries, on='country_id', how="inner")
    locations.to_csv(LOCATION_FILEPATH, index=False, index_label=False, columns=['city_id', 'city', 'state_id', 'state', 'country_id', 'country', 'country_code', 'phone_code'])
else:
    locations = pd.read_csv(LOCATION_FILEPATH);

In [6]:
locations['Location'] = locations['city'] + ", " + locations['state'] + ", " + locations['country']

In [7]:
locations.head()

Unnamed: 0,city_id,city,state_id,state,country_id,country,country_code,phone_code,Location
0,1,Bombuflat,1,Andaman and Nicobar Islands,101,India,IN,91,"Bombuflat, Andaman and Nicobar Islands, India"
1,2,Garacharma,1,Andaman and Nicobar Islands,101,India,IN,91,"Garacharma, Andaman and Nicobar Islands, India"
2,3,Port Blair,1,Andaman and Nicobar Islands,101,India,IN,91,"Port Blair, Andaman and Nicobar Islands, India"
3,4,Rangat,1,Andaman and Nicobar Islands,101,India,IN,91,"Rangat, Andaman and Nicobar Islands, India"
4,5,Addanki,2,Andhra Pradesh,101,India,IN,91,"Addanki, Andhra Pradesh, India"


In [13]:
def loadNormalizedUsers():
    if not os.path.exists(USERS_NORMALIZED_FILEPATH):
        users = pd.read_csv(USERS_FILEPATH, sep=";")
        users_loc = pd.DataFrame([ x.split(', ', 2)[0:3] for x in users['Location'].tolist() ])
        users_loc.columns = ['city', 'state', 'country']
        users_normalized = pd.concat([users, users_loc], axis=1)
        users_normalized['fuzzy_match'] = np.nan
    else:
        users_normalized = pd.read_csv(USERS_NORMALIZED_FILEPATH, dtype={'fuzzy_match': str, "city": str, "state": str})
    return users_normalized

users_normalized = loadNormalizedUsers()
users_normalized.loc[120:140]

In [None]:
# ! conda install -c conda-forge fuzzywuzzy

In [14]:
from fuzzywuzzy import fuzz, process

In [93]:
def getPotentialMatch(city, state):
    booleans = []
    for row in locations.itertuples():
        if fuzz.WRatio(city, row.state) > 80:
            booleans.append(True)
        else:
            booleans.append(False)
    return booleans

In [16]:
f = open(LOC_FUZZY_MAP_FILEPATH,)  
mapping = json.load(f)
def fuzzy_match_loc(row):
    index = row.city + "|" + row.state
    if(index not in mapping):
        filters = getPotentialMatch(row.city, row.state);
        potential_matches = locations[filters]
        if potential_matches['Location'].count() > 0:
            row.fuzzy_match = process.extractOne(row.Location, list(potential_matches['Location'].values))[0]
            mapping[index] = row.fuzzy_match
    else:
       row.fuzzy_match  = mapping[index]
    return row

In [94]:
n = 10
# chunk=0
# stop_limit = 1
users_normalized = loadNormalizedUsers()
to_process_users= users_normalized[users_normalized['fuzzy_match'].isna()].fillna('')
# total_chunk = to_process_users.shape[0] // n
list_df = [to_process_users[i:i+n] for i in range(0,to_process_users.shape[0],n)]
for df in progressbar.progressbar(list_df, redirect_stdout=True):
    updated_df = df.apply(fuzzy_match_loc, axis=1)
    users_normalized.loc[users_normalized['User-ID'].isin(updated_df['User-ID']), ['fuzzy_match']] = updated_df[['fuzzy_match']]
    users_normalized.to_csv(USERS_NORMALIZED_FILEPATH, index=False, index_label=False)
    saveLocMapping()
    '''
    chunk=chunk+1
    if chunk == stop_limit:
        updated_df.head(20)
        break
    '''

N/A% (0 of 5210) |                       | Elapsed Time: 0:00:00 ETA:  --:--:--

In [124]:
len(mapping)

31388

In [18]:
def saveLocMapping():
    json_content = json.dumps(mapping)
    f = open(LOC_FUZZY_MAP_FILEPATH,"w")
    f.write(json_content)
    f.close()

In [28]:
len(users_normalized['Location'])

278770

In [24]:
users_normalized[users_normalized['fuzzy_match'] == '']

Unnamed: 0,User-ID,Location,Age,city,state,country,fuzzy_match
0,1,"nyc, new york, usa",,nyc,new york,usa,
13,14,"mediapolis, iowa, usa",,mediapolis,iowa,usa,
17,18,"rio de janeiro, rio de janeiro, brazil",25.0,rio de janeiro,rio de janeiro,brazil,
20,21,"ferrol / spain, alabama, spain",46.0,ferrol / spain,alabama,spain,
53,54,"eubank, kentucky, usa",44.0,eubank,kentucky,usa,
...,...,...,...,...,...,...,...
278819,278820,"ninety six, south carolina, usa",59.0,ninety six,south carolina,usa,
278840,278841,"llangollen, denbighshire county, united kingdom.",,llangollen,denbighshire county,united kingdom.,
278844,278845,"järvenpää, uusimaa, finland",,järvenpää,uusimaa,finland,
278847,278848,"köln, nordrhein-westfalen, germany",,köln,nordrhein-westfalen,germany,


In [29]:
locations.head()

Unnamed: 0,city_id,city,state_id,state,country_id,country,country_code,phone_code,Location
0,1,Bombuflat,1,Andaman and Nicobar Islands,101,India,IN,91,"Bombuflat, Andaman and Nicobar Islands, India"
1,2,Garacharma,1,Andaman and Nicobar Islands,101,India,IN,91,"Garacharma, Andaman and Nicobar Islands, India"
2,3,Port Blair,1,Andaman and Nicobar Islands,101,India,IN,91,"Port Blair, Andaman and Nicobar Islands, India"
3,4,Rangat,1,Andaman and Nicobar Islands,101,India,IN,91,"Rangat, Andaman and Nicobar Islands, India"
4,5,Addanki,2,Andhra Pradesh,101,India,IN,91,"Addanki, Andhra Pradesh, India"


In [90]:
locations = locations.fillna('')
locations[locations['state'].astype('str').str.contains('Denbighshire')]

Unnamed: 0,city_id,city,state_id,state,country_id,country,country_code,phone_code,Location
41544,41520,Denbigh,3828,Denbighshire,230,United Kingdom,GB,44,"Denbigh, Denbighshire, United Kingdom"


In [92]:
fuzz.WRatio('Denbighshire', 'denbighshire county')

90