In [1]:
import re
import pandas as pd
import numpy as np
import unicodedata
import pickle
from cities_coordinates import CityCoordinator

from _html_parser import HtmlParser, ParsingDataPrepare

# Notebook display limits: wide text cells, and at most 50 columns / 50 rows per rendered frame.
for option_name, option_value in (
    ('display.max_colwidth', 1000),
    ('display.max_columns', 50),
    ('display.max_rows', 50),
):
    pd.set_option(option_name, option_value)

In [2]:
# Enable IPython's autoreload extension in mode 2 so edited project modules are re-imported automatically.
%load_ext autoreload
%autoreload 2

In [3]:
# Load the ISO country-code table.
# keep_default_na=False: by default pandas parses the literal text "NA" as a missing value,
# which would wipe out the alpha-2 code "NA" (Namibia) and later turn it into the string 'NAN'.
# Only truly empty cells are treated as missing.
df_country_code = pd.read_csv(
    'csv_files/wikipedia-iso-country-codes.csv',
    keep_default_na=False,
    na_values=[''],
)

In [4]:
# Normalise the country-code table:
#   * lower-case every column header,
#   * expose the two join columns under short names ('country', 'iso_code'),
#   * store the alpha-2 code as an upper-case string.
# The cell is written so that it can be re-run without raising: the rename is a no-op
# once the columns already carry their new names.
df_country_code.columns = df_country_code.columns.str.lower()
df_country_code = df_country_code.rename(
    columns={'english short name lower case': 'country', 'alpha-2 code': 'iso_code'}
)
# A single upper-casing step replaces the previous lower-case -> upper-case round trip.
df_country_code['iso_code'] = df_country_code['iso_code'].astype(str).str.upper()
df_country_code.shape

(246, 5)

In [5]:
df_country_code.head(3)

Unnamed: 0,country,iso_code,alpha-3 code,numeric code,iso 3166-2
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX
2,Albania,AL,ALB,8,ISO 3166-2:AL


In [6]:
df_country_code[df_country_code.country.isin(['England', 'Northern Ireland', 'Scotland', 'Wales'])]

Unnamed: 0,country,iso_code,alpha-3 code,numeric code,iso 3166-2


In [7]:
# Restore the games dataframe that still needs geo coordinates.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles produced by this project.
games_pickle_path = 'pickle_files/cities_countries/df_for_geo_coords_26_05_2022'
with open(games_pickle_path, mode='rb') as pickle_fh:
    df_for_coords = pickle.load(pickle_fh)

#### Remove some countries

In [8]:
# Excluded countries:
#   'USA'                - many city names exist in more than one state,
#   'Republic of Kosovo' - has no iso_code,
#   'Syria'              - coordinates are not reliable.
excluded_countries = ['USA', 'Republic of Kosovo', 'Syria']

print('shape - ', df_for_coords.shape[0])
keep_rows = ~df_for_coords.country.isin(excluded_countries)
df_for_coords = df_for_coords[keep_rows].copy()
print('shape - ', df_for_coords.shape[0])

shape -  9210
shape -  9002


In [9]:
# Teams dataframe: assign the iso_code 'GB' to the UK home nations:
# ['England', 'Wales', 'Scotland', 'Northern Ireland'] -> 'GB'

# Country-code dataframe (df_country_code): rename countries to the spellings used in the teams data:
# 'Tanzania, United Republic of' -> 'Tanzania', 'Moldova, Republic of' -> 'Moldova',
# 'Congo, the Democratic Republic of the' -> 'Congo DR', 'United States Of America' -> 'USA', 'Syrian Arab Republic' -> 'Syria',
# 'China' -> 'China PR', 'Korea, Republic of (South Korea)' -> 'Korea Republic', 'Macedonia, the former Yugoslav Republic of' -> 'FYR Macedonia',
# 'Bosnia and Herzegovina' -> 'Bosnia-Herzegovina', 'Ireland' -> 'Republic of Ireland', 'Libyan Arab Jamahiriya' -> 'Libya',
# 'Côte d'Ivoire' -> 'Ivory Coast'

In [10]:
# Maps country names as spelled in the ISO table to the spellings used in the teams data.
countries_chng = {
    'Tanzania, United Republic of': 'Tanzania',
    'Moldova, Republic of': 'Moldova',
    'Congo, the Democratic Republic of the': 'Congo DR',
    'United States Of America': 'USA',
    'Syrian Arab Republic': 'Syria',
    'China': 'China PR',
    'Korea, Republic of (South Korea)': 'Korea Republic',
    'Macedonia, the former Yugoslav Republic of': 'FYR Macedonia',
    'Bosnia and Herzegovina': 'Bosnia-Herzegovina',
    'Ireland': 'Republic of Ireland',
    'Libyan Arab Jamahiriya': 'Libya',
    "Côte d'Ivoire": 'Ivory Coast',
}

In [11]:
# Replace the ISO-table spelling with the teams-data spelling; names without a mapping stay unchanged.
df_country_code.country = df_country_code.country.map(lambda name: countries_chng.get(name, name))

In [12]:
# Attach the ISO alpha-2 code to every game row by country name (left join: unmatched rows get NaN).
country_iso_lookup = df_country_code[['country', 'iso_code']]
df_for_coords = df_for_coords.merge(country_iso_lookup, how='left', on='country')
df_for_coords.shape

(9002, 21)

In [13]:
df_for_coords[~df_for_coords.country.isin(df_country_code.country.unique())].country.unique()

array(['England', 'Scotland', 'Northern Ireland', 'Wales'], dtype=object)

In [14]:
# The UK home nations have no entry of their own in the ISO table, so they all receive 'GB'.
british_list = ['England', 'Northern Ireland', 'Scotland', 'Wales']

# Vectorised replacement: rows whose country is a home nation get 'GB', all others keep their code.
# (The previous row-wise lambda used x[0]/x[1], i.e. positional access on a label-indexed row,
# which pandas 2.x deprecates.)
is_british = df_for_coords['country'].isin(british_list)
df_for_coords['iso_code'] = df_for_coords['iso_code'].mask(is_british, 'GB')

In [15]:
df_for_coords[df_for_coords.iso_code.isna()].values

array([], shape=(0, 21), dtype=object)

In [16]:
df_for_coords.head(3)

Unnamed: 0,game_id,ligue_header,comp_id,season_id,game_utc,game_title,goals,ligue,f_team,pos_f,pts_f,pos_s,pts_s,fs_pos_max,fs_pts_max,f_pow,desc,country,team_id,city,iso_code
0,1708335,Premier League,483,301,26.05.2022 15:00,FC Minsk - Arsenal Dzerzhinsk,2:0,top,FC Minsk,4.0,19.0,15.0,4.0,"4 - 15, 16","19 - 4, 23",Fav,fav_pre,Belarus,540.0,Minsk,BY
1,1687696,OBOS liga,668,301,26.05.2022 15:00,Skeid - Sandnes Ulf,0:1,thr,Skeid,16.0,3.0,4.0,18.0,"16 - 4, 16","3 - 18, 23",Pre,fav_pre,Norway,10294.0,Oslo,NO
2,1687694,OBOS liga,668,301,26.05.2022 16:00,Kongsvinger - Brann,1:3,thr,Kongsvinger,13.0,7.0,1.0,23.0,"13 - 1, 16","7 - 23, 23",Pre,fav_pre,Norway,,Kongsvinger,NO


In [17]:
# Restore the table of already-resolved city coordinates.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles produced by this project.
known_coords_path = 'pickle_files/cities_countries/df_uniq_cities_coords_created_28_06_22'
with open(known_coords_path, mode='rb') as pickle_fh:
    df_uniq_cities_coords = pickle.load(pickle_fh)

In [18]:
print('shape - ', df_uniq_cities_coords.shape[0])
df_uniq_cities_coords.tail(3)

shape -  2251


Unnamed: 0,city,iso_code,country,longitude,latitude,lat,lon
2248,Tvøroyri,FO,Faroe Islands,-6.803333,61.555833,61n33,6w48
2249,Ñemby,PY,Paraguay,-57.5443,-25.3935,25s23,57w32
2250,Štip,MK,FYR Macedonia,22.193558,41.737503,41n44,22e11


In [19]:
# Bring in the known coordinates; a city is identified by (city, iso_code, country).
col_names = ['city', 'iso_code', 'country']
df_for_coords = df_for_coords.merge(df_uniq_cities_coords, how='left', on=col_names)
print('shape - ', df_for_coords.shape[0])

shape -  9002


In [20]:
df_for_coords.head(3)

Unnamed: 0,game_id,ligue_header,comp_id,season_id,game_utc,game_title,goals,ligue,f_team,pos_f,pts_f,pos_s,pts_s,fs_pos_max,fs_pts_max,f_pow,desc,country,team_id,city,iso_code,longitude,latitude,lat,lon
0,1708335,Premier League,483,301,26.05.2022 15:00,FC Minsk - Arsenal Dzerzhinsk,2:0,top,FC Minsk,4.0,19.0,15.0,4.0,"4 - 15, 16","19 - 4, 23",Fav,fav_pre,Belarus,540.0,Minsk,BY,27.56667,53.9,53n53,2.7e+35
1,1687696,OBOS liga,668,301,26.05.2022 15:00,Skeid - Sandnes Ulf,0:1,thr,Skeid,16.0,3.0,4.0,18.0,"16 - 4, 16","3 - 18, 23",Pre,fav_pre,Norway,10294.0,Oslo,NO,10.74609,59.91273,59n54,1.0000000000000001e+45
2,1687694,OBOS liga,668,301,26.05.2022 16:00,Kongsvinger - Brann,1:3,thr,Kongsvinger,13.0,7.0,1.0,23.0,"13 - 1, 16","7 - 23, 23",Pre,fav_pre,Norway,,Kongsvinger,NO,11.99772,60.19049,60n11,1.1e+60


In [21]:
# Rows whose city already has a longitude are finished; keep them in their own frame.
print('shape - ', df_for_coords.shape[0])
has_longitude = df_for_coords.longitude.notna()
df_include_coords = df_for_coords[has_longitude].copy().reset_index(drop=True)
print('shape - ', df_include_coords.shape[0])

shape -  9002
shape -  7977


In [22]:
# Rows still lacking a longitude: these cities have to be resolved below.
missing_longitude = df_for_coords.longitude.isna()
df_wt_coords = df_for_coords[missing_longitude].copy().reset_index(drop=True)
print('shape - ', df_wt_coords.shape)

shape -  (1025, 25)


In [23]:
# One row per (city, iso_code); the country kept is the first one seen for that pair.
city_key_cols = ['city', 'iso_code']
df_uniq_city_wt_coords = (
    df_wt_coords[['country', 'city', 'iso_code']]
    .groupby(by=city_key_cols)
    .first()
    .reset_index()
)
print('shape - ', df_uniq_city_wt_coords.shape)
df_uniq_city_wt_coords.head(3)

shape -  (216, 3)


Unnamed: 0,city,iso_code,country
0,'s-Hertogenbosch,NL,Netherlands
1,Aberdeen,GB,Scotland
2,Aberystwyth,GB,Wales


In [24]:
df_uniq_city_wt_coords[df_uniq_city_wt_coords.duplicated(subset=['city', 'iso_code'])]

Unnamed: 0,city,iso_code,country


#### Get first geo coordinates from CityCoordinator()

In [25]:
# Library usage example: create the coordinator once, then look cities up by name and ISO code.
c = CityCoordinator()
# c.get_city(city_name="London", country_code_iso="GB")

In [26]:
# Look up every unresolved city with CityCoordinator; the result is a Series with one lookup result per row.
# Row values are read by column label: positional access (x[0], x[1]) on a label-indexed row
# is deprecated in pandas 2.x.
df_for_loc = df_uniq_city_wt_coords[['city', 'iso_code']].apply(
    lambda row: c.get_city(city_name=row['city'], country_code_iso=row['iso_code']),
    axis=1,
)

In [27]:
df_for_loc[0]['location']

{'lon': '5.30417', 'lat': '51.69917'}

In [28]:
# Pull the 'location' dict out of every lookup result, then split it into longitude / latitude.
# A failed lookup (None) propagates as None in all three lists.
# `is not None` is used instead of `!= None` so the check never depends on an object's custom equality.
location  = [found['location'] if found is not None else None for found in df_for_loc]
longitude = [loc['lon'] if loc is not None else None for loc in location]
latitude  = [loc['lat'] if loc is not None else None for loc in location]

In [29]:
# Store the looked-up coordinates next to each city (None where the lookup failed).
df_uniq_city_wt_coords = df_uniq_city_wt_coords.assign(longitude=longitude, latitude=latitude)

In [30]:
print('shape - ', df_uniq_city_wt_coords.shape[0])
df_uniq_city_wt_coords.head(3)

shape -  216


Unnamed: 0,city,iso_code,country,longitude,latitude
0,'s-Hertogenbosch,NL,Netherlands,5.30417,51.69917
1,Aberdeen,GB,Scotland,-2.09814,57.14369
2,Aberystwyth,GB,Wales,-4.08292,52.41548


In [31]:
# Cities that CityCoordinator resolved.
resolved_mask = df_uniq_city_wt_coords.longitude.notna()
df_uniq_city_new = df_uniq_city_wt_coords[resolved_mask].copy().reset_index(drop=True)
print('shape - ', df_uniq_city_new.shape[0])

shape -  163


In [32]:
# Cities that CityCoordinator could not resolve; they go to the geolocator next.
unresolved_mask = df_uniq_city_wt_coords.longitude.isna()
df_city_for_geolocator = df_uniq_city_wt_coords[unresolved_mask].copy().reset_index(drop=True)
print('shape - ', df_city_for_geolocator.shape[0])

shape -  53


In [33]:
df_city_for_geolocator.head(3)

Unnamed: 0,city,iso_code,country,longitude,latitude
0,Aue-Bad Schlema,DE,Germany,,
1,Bab Ben Gashier,LY,Libya,,
2,Ballybofe,IE,Republic of Ireland,,


In [34]:
# List of (city, country) pairs to send to the geolocator.
# zip over the two columns avoids the row-wise apply with positional x[0]/x[1] access
# (deprecated in pandas 2.x) and also yields an empty list for an empty frame.
cities_countries = list(zip(df_city_for_geolocator['city'], df_city_for_geolocator['country']))

In [35]:
cities_countries[-3:]

[('the city Odense', 'Denmark'),
 ('Ñemby in the Central Department', 'Paraguay'),
 ('Žižkov', 'Czech Republic')]

#### Parsing from geolocator

In [36]:
# The geolocator call is disabled; its result is loaded from a saved pickle in the next cell.
# HtmlParser.cities_data_with_geocoordinates(cities_countries)
# All cities data were saved; the file name ends with the last pair: ('Žižkov', 'Czech Republic')

In [37]:
# Restore the saved geolocator results.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles produced by this project.
geolocator_pickle_path = 'pickle_files/cities_countries/cities_data_26_05_2022_Žižkov_Czech_Republic'
with open(geolocator_pickle_path, mode='rb') as pickle_fh:
    cities_data = pickle.load(pickle_fh)

In [38]:
cities_data[:2]

[[('Aue-Bad Schlema', 'Germany'),
  Location(Aue-Bad Schlema, Erzgebirgskreis, Sachsen, Deutschland, (50.6039112, 12.683981, 0.0))],
 [('Bab Ben Gashier', 'Libya'), None]]

In [39]:
# Split every geolocator record [(city, country), location-or-None] into three parallel lists.
# list_data holds [latitude, longitude], or [None, None] when nothing was found.
list_cities, list_countries, list_data = [], [], []

for record in cities_data:
    city_country, found_location = record[0], record[1]
    list_cities.append(city_country[0])
    list_countries.append(city_country[1])
    # Identity check: `!= None` would go through the location object's own equality operator.
    if found_location is not None:
        list_data.append([found_location.latitude, found_location.longitude])
    else:
        list_data.append([None, None])

In [40]:
# Separate the [latitude, longitude] pairs into two column lists.
list_lat = [lat_value for lat_value, _ in list_data]
list_lon = [lon_value for _, lon_value in list_data]

In [41]:
# Geolocator results as a dataframe, one row per requested city.
df_coord_parsing = pd.DataFrame(
    {
        'city': list_cities,
        'country': list_countries,
        'latitude': list_lat,
        'longitude': list_lon,
    }
)

In [42]:
print('shape - df_city_for_geolocator - ', df_city_for_geolocator.shape[0])
print('shape - df_coord_parsing       - ', df_coord_parsing.shape[0])

shape - df_city_for_geolocator -  53
shape - df_coord_parsing       -  53


In [43]:
# Add iso_code by row position (index-to-index left join).
# NOTE(review): this relies on df_coord_parsing and df_city_for_geolocator listing the cities in the same order — confirm.
df_coord_parsing = df_coord_parsing.join(df_city_for_geolocator.iso_code, how='left')

In [44]:
# Cities for which the geolocator returned coordinates (original row labels are kept).
geolocated_mask = df_coord_parsing.latitude.notna()
df_coord_prs = df_coord_parsing[geolocated_mask].copy()
print('shape - ', df_coord_prs.shape[0])
df_coord_prs.head(3)

shape -  36


Unnamed: 0,city,country,latitude,longitude,iso_code
0,Aue-Bad Schlema,Germany,50.603911,12.683981,DE
3,Bishop's Stortford,England,51.87288,0.158782,GB
4,Borisov,Belarus,54.224066,28.511785,BY


In [45]:
# Append the geolocator hits to the CityCoordinator hits and renumber the rows.
df_uniq_city_new = pd.concat([df_uniq_city_new, df_coord_prs], ignore_index=True)
print('shape - ', df_uniq_city_new.shape[0])

shape -  199


In [46]:
# Cities neither source could resolve; they need manual coordinates.
not_geolocated_mask = df_coord_parsing.latitude.isna()
df_manual_adding = df_coord_parsing[not_geolocated_mask].copy().reset_index(drop=True)
print('shape - ', df_manual_adding.shape[0])

shape -  17


In [47]:
# df_manual_adding

### Manual adding cities coordinates and fixing teams data

In [48]:
# df_manual_adding.to_csv('csv_files/df_cities_manual_adding_26_05_2022.csv', header=True, index=False)

#### Latitude: North–South, Longitude: East–West

In [49]:
# Replace df_manual_adding with the manually completed CSV; dtype=str reads every column as text.
df_manual_adding = pd.read_csv('csv_files/df_cities_manual_adding_26_05_2022.csv', dtype=str)

In [50]:
# City cells written as "old name:new name" describe a rename.
# Collect them and turn each ':' into the separator "':'" that the next cell splits on.
cities_rename_lst = [
    city_cell.replace(':', '\':\'')
    for city_cell in df_manual_adding.city
    if ':' in str(city_cell)
]
str(cities_rename_lst)

'["Bab Ben Gashier\':\'Tripoli", "Hai Ouled Smail\':\'nd", "Laval in western France\':\'Laval", "MariehamnAland\':\'Mariehamn", "Pagranichnaja\':\'nd", "no define\':\'nd", "no determinate\':\'nd", "no determine\':\'nd", "the Betrideildin\':\'nd", "the League of Ireland Premier Division\':\'nd", "the city Odense\':\'Odense", "Ñemby in the Central Department\':\'Ñemby", "Ydrenewydd\':\'Newtown"]'

In [51]:
# Split every "old':'new" entry into [old, new] and build the old -> new rename mapping.
keys_vals_lst      = [rename_entry.split('\':\'') for rename_entry in cities_rename_lst]
cities_rename_dict = {old_new[0]: old_new[1] for old_new in keys_vals_lst}

#### Fix errors in city or country names in df_wt_coords

In [52]:
str(cities_rename_dict)

"{'Bab Ben Gashier': 'Tripoli', 'Hai Ouled Smail': 'nd', 'Laval in western France': 'Laval', 'MariehamnAland': 'Mariehamn', 'Pagranichnaja': 'nd', 'no define': 'nd', 'no determinate': 'nd', 'no determine': 'nd', 'the Betrideildin': 'nd', 'the League of Ireland Premier Division': 'nd', 'the city Odense': 'Odense', 'Ñemby in the Central Department': 'Ñemby', 'Ydrenewydd': 'Newtown'}"

In [53]:
# Apply the manual rename mapping (cities_rename_dict) to the 'city' column of the rows without coordinates.
# NOTE(review): replace_rare_symbols lives in _html_parser and is not visible here; presumably it returns the corrected column — confirm.
df_wt_coords.city = ParsingDataPrepare.replace_rare_symbols(df_wt_coords, col_name='city', dict_obj=cities_rename_dict).copy()

#### Clear manual data

In [54]:
# Apply the same rename mapping to the manually completed table.
# NOTE(review): replace_rare_symbols is defined in _html_parser and not visible here — confirm what it returns.
df_manual_adding.city = ParsingDataPrepare.replace_rare_symbols(df_manual_adding, col_name='city', dict_obj=cities_rename_dict).copy()
# For any value that still contains ':', keep only the text before the first ':'.
df_manual_adding.city = df_manual_adding.city.map(lambda x: x.split(':')[0] if re.findall(':', x) else x)

In [55]:
# Hand-typed coordinates may carry stray whitespace; trim both columns.
for coord_col in ('latitude', 'longitude'):
    df_manual_adding[coord_col] = df_manual_adding[coord_col].str.strip()

In [56]:
# Drop the rows whose city was renamed to the placeholder 'nd'.
print('shape - ', df_manual_adding.shape[0])
is_placeholder = df_manual_adding.city.isin(['nd'])
df_manual_adding = df_manual_adding[~is_placeholder].copy().reset_index(drop=True)
print('shape - ', df_manual_adding.shape[0])

shape -  24
shape -  17


In [57]:
# Append the manually resolved cities to the automatically resolved ones.
print('shape - ', df_uniq_city_new.shape[0])
df_uniq_city_new = pd.concat([df_uniq_city_new, df_manual_adding], ignore_index=True)
print('shape - ', df_uniq_city_new.shape[0])

shape -  199
shape -  216


In [58]:
# Keep the first row for every (city, iso_code, country) and renumber.
df_uniq_city_new = df_uniq_city_new.drop_duplicates(
    subset=['city', 'iso_code', 'country'],
    ignore_index=True,
)
print('shape - ', df_uniq_city_new.shape[0])

shape -  216


In [59]:
df_uniq_city_new[(df_uniq_city_new.longitude.isna())|df_uniq_city_new.latitude.isna()]

Unnamed: 0,city,iso_code,country,longitude,latitude


### Lat_lon

In [60]:
# LatLon library usage example (kept for reference, not executed):
# palmyra = LatLon(Latitude(42.4833333), Longitude(1.4666667)) # Location of Palmyra Atoll in decimal degrees
# palmyra = LatLon(5.8833, -162.0833) # Same thing but simpler!
# palmyra = LatLon(Latitude(degree = 5, minute = 52, second = 59.88), Longitude(degree = -162, minute = -4.998)) # or more complicated!
# coordinates = palmyra.to_string('d% %m% %S% %H') # Print coordinates to degree minute second
# ('5 52 59.88 N', '162 4 59.88 W')

In [61]:
# Derive the compact 'lat' / 'lon' columns from the decimal 'latitude' / 'longitude' columns.
# NOTE(review): lat_lon_calculate is defined in _html_parser and not visible here; presumably it returns two
# column-like sequences in (lat, lon) order — confirm.
df_uniq_city_new['lat'], df_uniq_city_new['lon'] = ParsingDataPrepare.lat_lon_calculate(df_uniq_city_new, col_lat='latitude', col_lon='longitude')

In [62]:
df_uniq_city_new.head(3)

Unnamed: 0,city,iso_code,country,longitude,latitude,lat,lon
0,'s-Hertogenbosch,NL,Netherlands,5.30417,51.69917,51n41,5e18
1,Aberdeen,GB,Scotland,-2.09814,57.14369,57n8,2w5
2,Aberystwyth,GB,Wales,-4.08292,52.41548,52n24,4w4


#### Concatenate old dataset with new data

In [63]:
# Extend the table of known city coordinates with the newly resolved cities.
print('shape - ', df_uniq_cities_coords.shape[0])
df_uniq_cities_coords = pd.concat([df_uniq_cities_coords, df_uniq_city_new], ignore_index=True)
print('shape - ', df_uniq_cities_coords.shape[0])
df_uniq_cities_coords.tail(3)

shape -  2251
shape -  2467


Unnamed: 0,city,iso_code,country,longitude,latitude,lat,lon
2464,Ñemby,PY,Paraguay,-57.5443,-25.3935,25s23,57w32
2465,Cei Conna,GB,Wales,-3.057,53.218,53n13,3w3
2466,Newtown,GB,Wales,-3.3141,52.5132,52n30,3w18


In [64]:
# Where a city appears in both the old and the new data, the first (older) row wins.
df_uniq_cities_coords = df_uniq_cities_coords.drop_duplicates(
    subset=['city', 'iso_code', 'country'],
    ignore_index=True,
)

In [65]:
print('shape - ', df_uniq_cities_coords.shape)

shape -  (2455, 7)


In [66]:
# Disabled: save the extended coordinates table (uncomment to write the pickle).
# with open('pickle_files/cities_countries/df_uniq_cities_coords_created_30_06_22', 'wb') as file:
#     pickle.dump(df_uniq_cities_coords, file)

#### Concatenate all dataframes with coordinates

In [69]:
# Remove the coordinate columns left over from the earlier merge so the merge below can add them again.
stale_coord_cols = ['longitude', 'latitude', 'lat', 'lon']
df_wt_coords = df_wt_coords.drop(columns=stale_coord_cols)

In [70]:
# Attach the newly resolved coordinates to the game rows that were missing them.
cols_merge = ['city', 'iso_code', 'country']
print('shape - ', df_wt_coords.shape)
df_coords  = df_wt_coords.merge(df_uniq_city_new, how='left', on=cols_merge)
print('shape - ', df_coords.shape)

shape -  (1025, 21)
shape -  (1025, 25)


In [72]:
# Keep only rows where both longitude and latitude are present.
print('shape - ', df_coords.shape)
df_coords  = df_coords.dropna(subset=['longitude', 'latitude'])
print('shape - ', df_coords.shape)

shape -  (1025, 25)
shape -  (996, 25)


In [73]:
# Safety filter: remove any row whose city is the placeholder 'nd'.
print('shape - ', df_coords.shape)
is_placeholder_city = df_coords.city.isin(['nd'])
df_coords = df_coords[~is_placeholder_city].copy().reset_index(drop=True)
print('shape - ', df_coords.shape)

shape -  (996, 25)
shape -  (996, 25)


In [74]:
# Final table: rows that had coordinates from the start plus the rows resolved in this notebook.
print('shape - ', df_include_coords.shape)
df_include_coords = pd.concat([df_include_coords, df_coords], ignore_index=True)
print('shape - ', df_include_coords.shape)

shape -  (7977, 25)
shape -  (8973, 25)


In [76]:
df_include_coords[(df_include_coords.longitude.isna()) | (df_include_coords.latitude.isna())].values

array([], shape=(0, 25), dtype=object)

#### Check equal cities for each team

In [84]:
# For every team, collect the distinct cities it is associated with.
# NOTE(review): this checks df_coords (the rows resolved in this notebook) only, not df_include_coords — confirm that is intended.
df_fteam_cities = df_coords.groupby('f_team')['city'].unique().reset_index()

In [86]:
df_fteam_cities.head(3)

Unnamed: 0,f_team,city
0,1. FC Slovacko,[Uherské Hradiště]
1,AFC Bournemouth,[Bournemouth]
2,Aberdeen,[Aberdeen]


In [91]:
# Number of distinct cities per team; a maximum of 1 means every team maps to exactly one city.
len_city_lst = df_fteam_cities.city.map(len)
max(len_city_lst)

1

In [94]:
print('shape - ', df_include_coords.shape)
df_include_coords.tail(3)

shape -  (8973, 25)


Unnamed: 0,game_id,ligue_header,comp_id,season_id,game_utc,game_title,goals,ligue,f_team,pos_f,pts_f,pos_s,pts_s,fs_pos_max,fs_pts_max,f_pow,desc,country,team_id,city,iso_code,longitude,latitude,lat,lon
8970,15143801,League Two,694,306,30.03.2021 18:45,Brechin City - Elgin City,1:2,thr,Brechin City,10.0,6.0,5.0,29.0,"10 - 5, 10","6 - 29, 48",Pre,fav_pre,Scotland,,Brechin,GB,-2.65729,56.72993,56n43,2w39
8971,15143803,League Two,694,306,30.03.2021 18:45,Cowdenbeath - Edinburgh City,1:3,thr,Cowdenbeath,9.0,14.0,2.0,35.0,"9 - 2, 10","14 - 35, 48",Pre,fav_pre,Scotland,8843.0,Cowdenbeath,GB,-3.34426,56.11194,56n6,3w20
8972,15136213,Eerste Divisie,595,306,29.03.2021 19:00,Den Bosch - Go Ahead Eagles,3:3,sec,Den Bosch,19.0,32.0,2.0,77.0,"19 - 2, 20","32 - 77, 92",Pre,fav_pre,Netherlands,,'s-Hertogenbosch,NL,5.30417,51.69917,51n41,5e18


In [96]:
# Disabled: save the final games table with coordinates (uncomment to write the pickle).
# with open('pickle_files/cities_countries/df_with_geo_coords_26_05_2022', 'wb') as file:
#     pickle.dump(df_include_coords, file)

##### Example

In [None]:
# Example: geocode a single city with geopy's Nominatim geocoder (needs network access).
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="Your_Name")

# The values were previously swapped ("Russia" stored as the city, "Perm" as the country).
city = "Perm"
country = "Russia"
loc = geolocator.geocode(city + ',' + country)

# geocode() returns None when nothing matches; guard before reading the coordinates.
if loc is None:
    print('No geocoding result for', city, country)
else:
    lat = loc.latitude
    long = loc.longitude
    print(lat, long)