In [1]:
import os
import json
import progressbar
import math
import pandas as pd
import numpy as np
from pandas import json_normalize
from functools import reduce

In [2]:
CITIES_FILEPATH = '../Dataset/Location/cities.json'
STATES_FILEPATH = '../Dataset/Location/states.json'
COUNTRIES_FILEPATH = '../Dataset/Location/countries.json'
LOCATION_FILEPATH = '../Dataset/Processed/Locations.csv'
USERS_FILEPATH = '../Dataset/BX-CSV-Dump/BX-Users.csv'
USERS_NORMALIZED_FILEPATH = '../Dataset/Processed/BX-Users.csv'
LOC_FUZZY_MAP_FILEPATH = '../Dataset/Processed/mappings.json'

In [3]:
def sanitizeNumericData(row, as_types):
    for col in as_types:
        row[col] = int(''.join(filter(str.isdigit, row[col]))) if as_types[col] == 'int64' else row[col]
    return row

In [4]:
def loadDataframeFromJSON(filepath, key, rename_columns, as_types={}):
    with open(filepath) as f: 
        d = json.load(f)  
    return json_normalize(d[key]).apply(sanitizeNumericData, args=(as_types, ), axis=1).rename(columns=rename_columns)

In [5]:
if not os.path.exists(LOCATION_FILEPATH):
    cities = loadDataframeFromJSON(CITIES_FILEPATH, 'cities', {'id': 'city_id', 'name': 'city'}, {'id': 'int64', 'state_id': 'int64'})
    states = loadDataframeFromJSON(STATES_FILEPATH, 'states', {'id': 'state_id', 'name': 'state'}, {'id': 'int64','country_id': 'int64'})
    countries = loadDataframeFromJSON(COUNTRIES_FILEPATH, 'countries', {'id': 'country_id', 'sortname': 'country_code', 'phoneCode': 'phone_code', 'name': 'country'})
    locations = cities.merge(states, on='state_id', how="inner").merge(countries, on='country_id', how="inner")
    locations.to_csv(LOCATION_FILEPATH, index=False, index_label=False, columns=['city_id', 'city', 'state_id', 'state', 'country_id', 'country', 'country_code', 'phone_code'])
else:
    locations = pd.read_csv(LOCATION_FILEPATH);

In [6]:
locations['Location'] = locations['city'] + ", " + locations['state'] + ", " + locations['country']

In [7]:
locations.head()

Unnamed: 0,city_id,city,state_id,state,country_id,country,country_code,phone_code,Location
0,1,Bombuflat,1,Andaman and Nicobar Islands,101,India,IN,91,"Bombuflat, Andaman and Nicobar Islands, India"
1,2,Garacharma,1,Andaman and Nicobar Islands,101,India,IN,91,"Garacharma, Andaman and Nicobar Islands, India"
2,3,Port Blair,1,Andaman and Nicobar Islands,101,India,IN,91,"Port Blair, Andaman and Nicobar Islands, India"
3,4,Rangat,1,Andaman and Nicobar Islands,101,India,IN,91,"Rangat, Andaman and Nicobar Islands, India"
4,5,Addanki,2,Andhra Pradesh,101,India,IN,91,"Addanki, Andhra Pradesh, India"


In [13]:
def loadNormalizedUsers():
    if not os.path.exists(USERS_NORMALIZED_FILEPATH):
        users = pd.read_csv(USERS_FILEPATH, sep=";")
        users_loc = pd.DataFrame([ x.split(', ', 2)[0:3] for x in users['Location'].tolist() ])
        users_loc.columns = ['city', 'state', 'country']
        users_normalized = pd.concat([users, users_loc], axis=1)
        users_normalized['fuzzy_match'] = np.nan
    else:
        users_normalized = pd.read_csv(USERS_NORMALIZED_FILEPATH, dtype={'fuzzy_match': str, "city": str, "state": str})
    return users_normalized

users_normalized = loadNormalizedUsers()
users_normalized.loc[120:140]

In [None]:
# ! conda install -c conda-forge fuzzywuzzy

In [14]:
from fuzzywuzzy import fuzz, process

In [131]:
def getPotentialMatch(city, state):
    booleans = []
    for row in locations.itertuples():
        if fuzz.WRatio(state, row.state) >= 90:
            booleans.append(True)
        else:
            booleans.append(False)
    return booleans

In [138]:
f = open(LOC_FUZZY_MAP_FILEPATH,)  
mapping = json.load(f)
def fuzzy_match_loc(row):
    index = row.city + "|" + row.state
    if(index not in mapping):
        filters = getPotentialMatch(row.city, row.state);
        potential_matches = locations[filters]
        if potential_matches['Location'].count() > 0:
            row.fuzzy_match = process.extractOne(row.Location, list(potential_matches['Location'].values))[0]
            mapping[index] = row.fuzzy_match
    else:
       row.fuzzy_match  = mapping[index]
    return row

In [137]:
n = 1000
users_normalized = loadNormalizedUsers()
to_process_users= users_normalized[users_normalized['fuzzy_match'].isna()].fillna('')
total_chunk = to_process_users.shape[0] // n
list_df = [to_process_users[i:i+n] for i in range(0,to_process_users.shape[0],n)]
for df in progressbar.progressbar(list_df, redirect_stdout=True):
    updated_df = df.apply(fuzzy_match_loc, axis=1)
    users_normalized.loc[users_normalized['User-ID'].isin(updated_df['User-ID']), ['fuzzy_match']] = updated_df[['fuzzy_match']]
    users_normalized.to_csv(USERS_NORMALIZED_FILEPATH, index=False, index_label=False)
    saveLocMapping()

100% (53 of 53) |#######################| Elapsed Time: 19:16:44 Time: 19:16:44


In [18]:
def saveLocMapping():
    json_content = json.dumps(mapping)
    f = open(LOC_FUZZY_MAP_FILEPATH,"w")
    f.write(json_content)
    f.close()

In [140]:
users_normalized = loadNormalizedUsers()
users_normalized[users_normalized['fuzzy_match'].isna()]

Unnamed: 0,User-ID,Location,Age,city,state,country,fuzzy_match
17,18,"rio de janeiro, rio de janeiro, brazil",25.0,rio de janeiro,rio de janeiro,brazil,
64,65,"n/a, n/a, australia",,,,australia,
90,91,"toronto/newmarket, ,",,toronto/newmarket,",",,
111,112,"mexico city, d.f., mexico",32.0,mexico city,d.f.,mexico,
155,156,"sarajevo, n/a, bosnia and herzegovina",20.0,sarajevo,,bosnia and herzegovina,
...,...,...,...,...,...,...,...
278607,278608,"west vancouver, british columbia, canada",,west vancouver,british columbia,canada,
278700,278701,"vila das aves, minho, portugal",39.0,vila das aves,minho,portugal,
278711,278712,"cranbrook, british columbia, canada",,cranbrook,british columbia,canada,
278781,278782,"larnaca, n/a, cyprus",,larnaca,,cyprus,


In [141]:
users_normalized[users_normalized['fuzzy_match'].notna()]

Unnamed: 0,User-ID,Location,Age,city,state,country,fuzzy_match
0,1,"nyc, new york, usa",,nyc,new york,usa,"Airmont, New York, United States"
1,2,"stockton, california, usa",18.0,stockton,california,usa,"Stockton, California, United States"
2,3,"moscow, yukon territory, russia",,moscow,yukon territory,russia,"Moscow, Moscow, Russia"
3,4,"porto, v.n.gaia, portugal",17.0,porto,v.n.gaia,portugal,"Porto, Norte, Portugal"
4,5,"farnborough, hants, united kingdom",,farnborough,hants,united kingdom,"Farnborough, England, United Kingdom"
...,...,...,...,...,...,...,...
278853,278854,"portland, oregon, usa",,portland,oregon,usa,"Portland, Connecticut, United States"
278854,278855,"tacoma, washington, united kingdom",50.0,tacoma,washington,united kingdom,"Tacoma, Washington, United States"
278855,278856,"brampton, ontario, canada",,brampton,ontario,canada,"Brampton, Ontario, Canada"
278856,278857,"knoxville, tennessee, usa",,knoxville,tennessee,usa,"Knoxville, Tennessee, United States"


In [146]:
def finalOptimization(row):
    if pd.isna(row.fuzzy_match):
        row.city = 'UNK'
        row.state = 'UNK'
        row.country = 'UNK'
    else:
        loc = row.fuzzy_match.split(', ', 2)
        row.city = loc[0]
        row.state = loc[1]
        row.country = loc[2]
    return row

In [154]:
cleaned_users = users_normalized.apply(finalOptimization, axis=1).filter(['User-ID', 'Age', 'city', 'state', 'country']).rename(columns={'city': 'City', 'state': 'State', 'country': 'Country'})
cleaned_users

Unnamed: 0,User-ID,Age,City,State,Country
0,1,,Airmont,New York,United States
1,2,18.0,Stockton,California,United States
2,3,,Moscow,Moscow,Russia
3,4,17.0,Porto,Norte,Portugal
4,5,,Farnborough,England,United Kingdom
...,...,...,...,...,...
278853,278854,,Portland,Connecticut,United States
278854,278855,50.0,Tacoma,Washington,United States
278855,278856,,Brampton,Ontario,Canada
278856,278857,,Knoxville,Tennessee,United States


In [165]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='constant')
cleaned_users['Age'] = imp.fit_transform(cleaned_users[['Age']])

In [166]:
cleaned_users

Unnamed: 0,User-ID,Age,City,State,Country
0,1,0.0,Airmont,New York,United States
1,2,18.0,Stockton,California,United States
2,3,0.0,Moscow,Moscow,Russia
3,4,17.0,Porto,Norte,Portugal
4,5,0.0,Farnborough,England,United Kingdom
...,...,...,...,...,...
278853,278854,0.0,Portland,Connecticut,United States
278854,278855,50.0,Tacoma,Washington,United States
278855,278856,0.0,Brampton,Ontario,Canada
278856,278857,0.0,Knoxville,Tennessee,United States


In [168]:
m = cleaned_users.groupby(['Country', 'State']).mean()

In [178]:
m

Unnamed: 0_level_0,Unnamed: 1_level_0,User-ID,Age
Country,State,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,Jawzjan,106847.222222,27.111111
Afghanistan,Kabul,157527.600000,20.000000
Afghanistan,Lawghar,192226.000000,22.000000
Albania,Korce,85748.000000,0.000000
Albania,Lezhe,111121.250000,8.500000
...,...,...,...
Zambia,Lusaka,116846.000000,22.750000
Zimbabwe,Bulawayo,190757.000000,10.000000
Zimbabwe,Harare,145401.750000,15.625000
Zimbabwe,Masvingo,31295.000000,30.000000


In [196]:
z = {}
for i, a in m.iterrows():
    z[i] = a['Age']

In [217]:
def impMissingAgeWithGroupedAvg(r, z):
    if r.Age == 0:
        r.Age = round(z[(r.Country, r.State)])
    return r

In [218]:
final_users= cleaned_users.apply(impMissingAgeWithGroupedAvg, args=(z, ), axis=1)
final_users

Unnamed: 0,User-ID,Age,City,State,Country
0,1,20.0,Airmont,New York,United States
1,2,18.0,Stockton,California,United States
2,3,21.0,Moscow,Moscow,Russia
3,4,17.0,Porto,Norte,Portugal
4,5,23.0,Farnborough,England,United Kingdom
...,...,...,...,...,...
278853,278854,19.0,Portland,Connecticut,United States
278854,278855,50.0,Tacoma,Washington,United States
278855,278856,20.0,Brampton,Ontario,Canada
278856,278857,21.0,Knoxville,Tennessee,United States


In [219]:
dir = '../Dataset/Final'
if not os.path.exists(dir):
    os.makedirs(dir)

In [220]:
final_users.to_csv(dir + "/BX-Users.csv", index=False, index_label=False)

In [221]:
pd.read_csv(dir + "/BX-Users.csv")

Unnamed: 0,User-ID,Age,City,State,Country
0,1,20.0,Airmont,New York,United States
1,2,18.0,Stockton,California,United States
2,3,21.0,Moscow,Moscow,Russia
3,4,17.0,Porto,Norte,Portugal
4,5,23.0,Farnborough,England,United Kingdom
...,...,...,...,...,...
278853,278854,19.0,Portland,Connecticut,United States
278854,278855,50.0,Tacoma,Washington,United States
278855,278856,20.0,Brampton,Ontario,Canada
278856,278857,21.0,Knoxville,Tennessee,United States
