In [1]:
import re
import os
import pandas as pd
import numpy as np
import unicodedata
import pickle
from cities_coordinates import CityCoordinator

from _html_parser import HtmlParser, ParsingDataPrepare

pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
df_country_code = pd.read_csv('csv_files/wikipedia-iso-country-codes.csv') 

In [4]:
df_country_code.columns = [x.lower() for x in df_country_code.columns]
df_country_code['alpha-2 code'] = df_country_code['alpha-2 code'].map(lambda x: str(x).lower())
df_country_code.rename(columns={'english short name lower case':'country', 'alpha-2 code':'iso_code'}, inplace=True)
df_country_code.iso_code = df_country_code.iso_code.str.upper()
df_country_code.shape

(246, 5)

In [5]:
df_country_code.head(3)

Unnamed: 0,country,iso_code,alpha-3 code,numeric code,iso 3166-2
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX
2,Albania,AL,ALB,8,ISO 3166-2:AL


In [6]:
df_country_code[df_country_code.iso_code.isin(['NAN'])]

Unnamed: 0,country,iso_code,alpha-3 code,numeric code,iso 3166-2
151,Namibia,NAN,NAM,516,ISO 3166-2:NA


In [7]:
df_country_code.loc[151:151, 'iso_code'] = 'NA'

In [8]:
df_country_code[df_country_code.country.isin(['England', 'Northern Ireland', 'Scotland', 'Wales'])]

Unnamed: 0,country,iso_code,alpha-3 code,numeric code,iso 3166-2


In [9]:
with open('pickle_files/cities_countries/cities_data_26_05_2022/df_for_geo_coords_26_05_2022_sec', 'rb') as f:
    df_for_coords = pickle.load(f)

#### Remove some countries

In [10]:
# Remove USA country because many cities have twice locations in different states, 'Republic of Kosovo' - doesn't have iso_code, 'Syria' - not sure at game's location
print('shape - ', df_for_coords.shape[0])
df_for_coords = df_for_coords[~df_for_coords.country.isin(['USA', 'Republic of Kosovo', 'Syria'])].copy()
print('shape - ', df_for_coords.shape[0])

shape -  25697
shape -  25044


In [11]:
#  For teams dataframe appoint iso_code:
# ['England', 'Wales', 'Scotland', 'Northern Ireland'] - 'GB'

# For world cities dataframe renames countries:
# 'Tanzania, United Republic of' - Tanzania, 'Moldova, Republic of' - 'Moldova',
# 'Congo, the Democratic Republic of the' - 'Congo DR', 'United States Of America' - 'USA', 'Syrian Arab Republic' - 'Syria',
# 'China' - 'China PR', 'Korea, Republic of (South Korea) - 'Korea Republic', 'Macedonia, the former Yugoslav Republic of' - 'FYR Macedonia', 
# 'Bosnia and Herzegovina' - 'Bosnia-Herzegovina', 'Ireland' - 'Republic of Ireland', 'Libyan Arab Jamahiriya' - 'Libya', 
# 'Côte d'Ivoire' - 'Ivory Coast'

In [12]:
countries_chng = {'Tanzania, United Republic of':'Tanzania', 'Moldova, Republic of':'Moldova',
                  'Congo, the Democratic Republic of the':'Congo DR', 'United States Of America':'USA', 'Syrian Arab Republic':'Syria',
                  'China':'China PR', 'Korea, Republic of (South Korea)':'Korea Republic', 'Macedonia, the former Yugoslav Republic of':'FYR Macedonia', 
                  'Bosnia and Herzegovina':'Bosnia-Herzegovina', 'Ireland':'Republic of Ireland', 'Libyan Arab Jamahiriya':'Libya',
                  'Côte d\'Ivoire':'Ivory Coast', 'Brunei Darussalam':'Brunei'}

In [13]:
df_country_code.country = df_country_code.country.map(lambda x: countries_chng[x] if x in countries_chng.keys() else x)

In [14]:
df_for_coords = df_for_coords.merge(df_country_code[['country', 'iso_code']], how='left', left_on=['country'], right_on=['country'])
df_for_coords.shape

(25044, 21)

In [15]:
df_for_coords[~df_for_coords.country.isin(df_country_code.country.unique())].country.unique()

array(['England', 'Scotland', 'Northern Ireland', 'Wales'], dtype=object)

In [16]:
british_list           = ['England', 'Northern Ireland', 'Scotland', 'Wales']
df_for_coords.iso_code = df_for_coords[['country', 'iso_code']].apply(lambda x: 'GB' if x[0] in british_list else x[1], axis=1)

In [17]:
df_for_coords[df_for_coords.iso_code.isna()].values

array([], shape=(0, 21), dtype=object)

In [18]:
df_for_coords.head(3)

Unnamed: 0,game_id,ligue_header,comp_id,season_id,game_utc,game_title,goals,ligue,f_team,pos_f,pts_f,pos_s,pts_s,fs_pos_max,fs_pts_max,f_pow,desc,country,team_id,city,iso_code
0,1600750,FNL-2,1147,310,26.05.2022 15:00,Chernomorets Novorossiysk - Druzhba,5:1,thr,Chernomorets Novorossiysk,5.0,60.0,11.0,32.0,"5 - 11, 17","60 - 32, 80",Fav,sum_coef_13,Russia,7407.0,Novorossiysk,RU
1,1627637,Superliga,447,310,26.05.2022 15:00,Skenderbeu - Teuta Durrës,0:0,thr,Skenderbeu,10.0,26.0,6.0,50.0,"10 - 6, 10","26 - 50, 73",Pre,sum_coef_13,Albania,734.0,Korçë,AL
2,1627640,Superliga,447,310,26.05.2022 15:00,Tirana - Kukësi,2:3,thr,Tirana,1.0,73.0,4.0,55.0,"1 - 4, 10","73 - 55, 73",Fav,sum_coef_13,Albania,,Tirana,AL


#### Check last file version

In [22]:
with open('pickle_files/cities_countries/df_uniq_cities_coords_created_18_07_22', 'rb') as f:
    df_uniq_cities_coords = pickle.load(f)

In [23]:
print('shape - ', df_uniq_cities_coords.shape[0])
df_uniq_cities_coords.tail(3)

shape -  4951


Unnamed: 0,city,iso_code,country,longitude,latitude,lat,lon
4948,Laval in western France,FR,France,-0.7689,48.0733,48n4,0w46
4949,Totness,SR,Suriname,-56.329167,5.8775,5n52,56w19
4950,Mafeteng,LS,Lesotho,27.25,-29.816667,29s49,27e15


In [24]:
col_names = ['city', 'iso_code', 'country']
df_for_coords = df_for_coords.merge(df_uniq_cities_coords, how='left', left_on=col_names, right_on=col_names)
print('shape - ', df_for_coords.shape[0])

shape -  25044


In [25]:
df_for_coords.head(3)

Unnamed: 0,game_id,ligue_header,comp_id,season_id,game_utc,game_title,goals,ligue,f_team,pos_f,pts_f,pos_s,pts_s,fs_pos_max,fs_pts_max,f_pow,desc,country,team_id,city,iso_code,longitude,latitude,lat,lon
0,1600750,FNL-2,1147,310,26.05.2022 15:00,Chernomorets Novorossiysk - Druzhba,5:1,thr,Chernomorets Novorossiysk,5.0,60.0,11.0,32.0,"5 - 11, 17","60 - 32, 80",Fav,sum_coef_13,Russia,7407.0,Novorossiysk,RU,37.76752,44.72439,44n43,3.6999999999999996e+47
1,1627637,Superliga,447,310,26.05.2022 15:00,Skenderbeu - Teuta Durrës,0:0,thr,Skenderbeu,10.0,26.0,6.0,50.0,"10 - 6, 10","26 - 50, 73",Pre,sum_coef_13,Albania,734.0,Korçë,AL,20.78083,40.61861,40n37,2e+47
2,1627640,Superliga,447,310,26.05.2022 15:00,Tirana - Kukësi,2:3,thr,Tirana,1.0,73.0,4.0,55.0,"1 - 4, 10","73 - 55, 73",Fav,sum_coef_13,Albania,,Tirana,AL,19.81889,41.3275,41n19,1.9e+50


In [26]:
print('shape - ', df_for_coords.shape[0])
df_include_coords = df_for_coords[~df_for_coords.longitude.isna()].copy().reset_index(drop=True)
print('shape - ', df_include_coords.shape[0])

shape -  25044
shape -  22217


In [27]:
df_wt_coords = df_for_coords[df_for_coords.longitude.isna()].copy().reset_index(drop=True)
print('shape - ', df_wt_coords.shape)

shape -  (2827, 25)


In [28]:
df_uniq_city_wt_coords = df_wt_coords[['country', 'city', 'iso_code']].groupby(by=['city', 'iso_code']).first().reset_index()
print('shape - ', df_uniq_city_wt_coords.shape)
df_uniq_city_wt_coords.head(3)

shape -  (992, 3)


Unnamed: 0,city,iso_code,country
0,A Estrada,ES,Spain
1,Abobo,CI,Ivory Coast
2,Abu Halifa,KW,Kuwait


In [29]:
df_uniq_city_wt_coords[df_uniq_city_wt_coords.duplicated(subset=['city', 'iso_code'])]

Unnamed: 0,city,iso_code,country


#### Get first geo coordinates from CityCoordinator()

In [30]:
# labrary example
c = CityCoordinator()
# c.get_city(city_name="London", country_code_iso="GB")

In [31]:
df_uniq_city_wt_coords.iso_code.value_counts(dropna=False)

ES    70
DE    70
GB    63
IT    62
AT    38
      ..
GN     1
GR     1
KW     1
CN     1
IE     1
Name: iso_code, Length: 96, dtype: int64

In [32]:
df_for_loc = df_uniq_city_wt_coords[['city', 'iso_code']].apply(lambda x: c.get_city(city_name=x[0], country_code_iso=x[1]), axis=1)

In [33]:
df_for_loc

0            {'country_iso_code': 'ES', 'city_name': 'A Estrada', 'location': {'lon': '-8.48842', 'lat': '42.68911'}}
1                  {'country_iso_code': 'CI', 'city_name': 'Abobo', 'location': {'lon': '-4.0159', 'lat': '5.41613'}}
2                                                                                                                None
3             {'country_iso_code': 'ES', 'city_name': 'Aceuchal', 'location': {'lon': '-6.48636', 'lat': '38.64627'}}
4                {'country_iso_code': 'DE', 'city_name': 'Achern', 'location': {'lon': '8.07607', 'lat': '48.63115'}}
                                                            ...                                                      
987            {'country_iso_code': 'SK', 'city_name': 'Šamorín', 'location': {'lon': '17.30972', 'lat': '48.03015'}}
988    {'country_iso_code': 'SI', 'city_name': 'Šmartno ob Paki', 'location': {'lon': '15.03333', 'lat': '46.33333'}}
989         {'country_iso_code': 'CZ', 'city_name': 'Ště

In [34]:
location  = [x['location'] if x != None else None for x in df_for_loc]
longitude = [x['lon'] if x != None else None for x in location]
latitude  = [x['lat'] if x != None else None for x in location]

In [35]:
df_uniq_city_wt_coords['longitude'] = longitude
df_uniq_city_wt_coords['latitude']  = latitude

In [36]:
print('shape - ', df_uniq_city_wt_coords.shape[0])
df_uniq_city_wt_coords.head(3)

shape -  992


Unnamed: 0,city,iso_code,country,longitude,latitude
0,A Estrada,ES,Spain,-8.48842,42.68911
1,Abobo,CI,Ivory Coast,-4.0159,5.41613
2,Abu Halifa,KW,Kuwait,,


In [37]:
df_uniq_city_new = df_uniq_city_wt_coords[~df_uniq_city_wt_coords.longitude.isna()].copy().reset_index(drop=True)
print('shape - ', df_uniq_city_new.shape[0])

shape -  712


In [38]:
df_city_for_geolocator = df_uniq_city_wt_coords[df_uniq_city_wt_coords.longitude.isna()].copy().reset_index(drop=True)
print('shape - ', df_city_for_geolocator.shape[0])

shape -  280


In [39]:
df_city_for_geolocator.head(3)

Unnamed: 0,city,iso_code,country,longitude,latitude
0,Abu Halifa,KW,Kuwait,,
1,Ajdabya,LY,Libya,,
2,Al Bukayriyah,SA,Saudi Arabia,,


In [40]:
cities_countries = list(df_city_for_geolocator[['city', 'country']].apply(lambda x: (x[0], x[1]), axis=1))

In [41]:
cities_countries[-3:]

[('Čonoplja', 'Serbia'),
 ('Śląsk Wrocław', 'Poland'),
 ('Županja in Slavonia', 'Croatia')]

#### Parsing coords from geolocator

In [42]:
for i, city in enumerate(cities_countries):
    if city[0] == 'Ciudad de Florida':
        print(i, city)

In [43]:
cities_countries[:2] 

[('Abu Halifa', 'Kuwait'), ('Ajdabya', 'Libya')]

#### Fix errors in the cities or countries names during parsing:
##### GeocoderUnavailable: Max retries exceeded with url: /search?q=the+city+Odense%2CDenmark

In [46]:
for x, y in enumerate(cities_countries[:]):
    if y[0] == 'the city Odense':
        print(x, y)

In [47]:
# cities_countries[273] = ('Odense', 'Denmark')

In [None]:
# HtmlParser.cities_data_with_geocoordinates(cities_countries[201:], name_saved_file='cities_data_26_05_2022')
# All cities data saved with last names: ('Žižkov', 'Czech Republic')

In [None]:
## If parser save data in one file start this open file code and don't use remainder_data.
# with open('pickle_files/cities_countries/cities_data_26_03_2021_Selemet_Moldova', 'rb') as f:
#     cities_data = pickle.load(f)

In [48]:
with open('pickle_files/cities_countries/cities_data_26_05_2022_Pullach_Germany', 'rb') as f:
    cities_data_1 = pickle.load(f)

In [49]:
with open('pickle_files/cities_countries/cities_data_26_05_2022_Županja in Slavonia_Croatia', 'rb') as f:
    cities_data_2 = pickle.load(f)

In [50]:
all_cities_data = cities_data_1 + cities_data_2

In [51]:
len(all_cities_data)

280

#### This parser needs to start until len(remainder_data) == 0, if haven't errors

In [None]:
# path = 'pickle_files/cities_countries/cities_data_26_03_2021/'
# all_cities_data = []

# for file in os.listdir(path):
#     if file.startswith('cities_data'):
#         with open(path + file, 'rb') as f:
#             cities_data = pickle.load(f)
#             all_cities_data = all_cities_data + cities_data

In [52]:
len(all_cities_data)

280

In [53]:
all_unique_data = []

for city_data in all_cities_data:
    if city_data not in all_unique_data:
        all_unique_data.append(city_data)

len(all_unique_data)        

280

In [54]:
remainder_data = cities_countries.copy()

for city_data in all_unique_data:
    remainder_data.remove(city_data[0])
    
len(remainder_data)    

0

In [55]:
remainder_data

[]

In [None]:
# HtmlParser.cities_data_with_geocoordinates(remainder_data, name_saved_file='cities_data_26_03_2021')

#### Adding error parser city's name data deleted beefore if need

In [None]:
# all_unique_data.insert(-1, [('Ciudad de Florida', 'Uruguay'), None])

In [56]:
cities_data = all_unique_data.copy()

In [57]:
len(cities_data)

280

In [58]:
cities_data[-2:]

[[('Śląsk Wrocław', 'Poland'),
  Location(Kibice Razem Śląsk Wrocław, Szewska, Dzielnica Czterech Wyznań, Osiedle Stare Miasto, Wrocław, województwo dolnośląskie, 50-151, Polska, (51.1107923, 17.0347956, 0.0))],
 [('Županja in Slavonia', 'Croatia'), None]]

In [59]:
list_cities, list_countries, list_data = [], [], []

for data in cities_data:
    list_cities.append(data[0][0])
    list_countries.append(data[0][1])
    if data[1] != None:
        list_data.append([data[1].latitude, data[1].longitude])
    else:
        list_data.append([None, None])

In [60]:
list_lat = [x[0] for x in list_data]
list_lon = [x[1] for x in list_data]

In [61]:
df_coord_parsing = pd.DataFrame({'city':list_cities, 'country':list_countries, 'latitude':list_lat, 'longitude':list_lon})

In [62]:
df_coord_parsing

Unnamed: 0,city,country,latitude,longitude
0,Abu Halifa,Kuwait,29.130114,48.124021
1,Ajdabya,Libya,30.755004,20.223326
2,Al Bukayriyah,Saudi Arabia,26.139618,43.657703
3,Al-Abyar,Libya,32.189645,20.594509
4,Al-Musannah,Oman,23.739567,57.645387
...,...,...,...,...
275,Ñemby in the Central Department,Paraguay,,
276,Østerbro,Denmark,55.705084,12.582614
277,Čonoplja,Serbia,45.812126,19.248848
278,Śląsk Wrocław,Poland,51.110792,17.034796


In [68]:
# df_city_for_geolocator

In [65]:
print('shape - df_city_for_geolocator - ', df_city_for_geolocator.shape[0])
print('shape - df_coord_parsing       - ', df_coord_parsing.shape[0])

shape - df_city_for_geolocator -  280
shape - df_coord_parsing       -  280


In [66]:
col_names = ['iso_code', 'city', 'country']
df_coord_parsing = df_coord_parsing.merge(df_city_for_geolocator[col_names], how='left', left_on=['city', 'country'], right_on=['city', 'country'])

In [69]:
df_coord_prs = df_coord_parsing[~df_coord_parsing.latitude.isna()].copy()
print('shape - ', df_coord_prs.shape[0])
df_coord_prs.head(3)

shape -  234


Unnamed: 0,city,country,latitude,longitude,iso_code
0,Abu Halifa,Kuwait,29.130114,48.124021,KW
1,Ajdabya,Libya,30.755004,20.223326,LY
2,Al Bukayriyah,Saudi Arabia,26.139618,43.657703,SA


In [70]:
df_uniq_city_new = pd.concat([df_uniq_city_new, df_coord_prs]).reset_index(drop=True)
print('shape - ', df_uniq_city_new.shape[0])

shape -  946


In [71]:
df_manual_adding = df_coord_parsing[df_coord_parsing.latitude.isna()].copy().reset_index(drop=True)
print('shape - ', df_manual_adding.shape[0])

shape -  46


#### Format fixing errors in cities names: 'error_name' : 'fixing_name'

In [73]:
# df_manual_adding

### Manual adding cities coordinates and fixing teams data.

In [None]:
# df_manual_adding.to_csv('csv_files/df_cities_manual_adding_26_05_2022_four.csv', header=True, index=False)

#### Latitude North-South, Longitude - East-West

In [75]:
# df_wt_coords

In [255]:
# Fix some citie's errors names - 'the Eerste divisie', and change it in manual adding file.
# Errors countries names or if one city name has multiple teams - need to change manual in 'df_wt_coords' file and don't use ':' in manual adding file.
# df_wt_coords[df_wt_coords.city == 'the Eerste divisie']
# df_wt_coords[df_wt_coords.f_team == 'Roda']

#### Change error country name for some cities and after that change country name and iso_code in df_for_manual:

In [77]:
# df_country_code[df_country_code.iso_code == 'BN']

In [256]:
# df_country_code[df_country_code.country == 'Lesotho']

In [153]:
# df_wt_coords.loc[df_wt_coords[df_wt_coords.city == 'Bandar Seri Begawan'].index, 'country']  = 'Brunei'
# df_wt_coords.loc[df_wt_coords[df_wt_coords.city == 'Bandar Seri Begawan'].index, 'iso_code'] = 'BR'
# df_wt_coords.loc[df_wt_coords[df_wt_coords.f_team == 'Zebras'].index, 'city'] = 'Warwick Parish'
# df_wt_coords.loc[df_wt_coords[df_wt_coords.f_team == 'Al Ittihad'].index, 'country']  = 'Saudi Arabia'
# df_wt_coords.loc[df_wt_coords[df_wt_coords.f_team == 'Al Ittihad'].index, 'iso_code'] = 'SA'

In [79]:
# file = open('pickle_files/df_wt_coords_before_manual_add_26_03_2021', 'wb')
# pickle.dump(df_wt_coords, file)  
# file.close()

#### Fix team data:

In [283]:
# df_teams_data_main
with open('pickle_files/teams_data/df_tmd_main_created_22_07_2022', 'rb') as f:
    df_teams_data_main = pickle.load(f)

In [285]:
# df_teams_data_main.shape

In [249]:
df_teams_data_main[df_teams_data_main.city == 'the League of Ireland Premier Division']

Unnamed: 0,f_team,team_id,country,city
4119,Drogheda United,7556,Republic of Ireland,the League of Ireland Premier Division


In [None]:
# df_teams_data_main.drop([7188], inplace=True)
# df_teams_data_main.reset_index(drop=True, inplace=True)

In [None]:
# df_teams_data_main[df_teams_data_main.team == 'Aigle Noir']

In [252]:
df_teams_data_main.loc[4119].city #= 'Drogheda'

'Drogheda'

In [None]:
# df_teams_data_main.loc[7201].country #= 'Lesotho'

In [254]:
# file = open('pickle_files/df_tmd_main_created_22_07_2022', 'wb')
# pickle.dump(df_teams_data_main, file)  
# file.close()

#### After manual adding

In [257]:
df_manual_adding = pd.read_csv('csv_files/df_cities_manual_adding_26_05_2022_four.csv', dtype=str)

In [258]:
df_manual_adding.head(3)

Unnamed: 0,city,country,latitude,longitude,iso_code
0,Al-Qasim City:Al-Qasim,Iraq,32.301389,44.689167,IQ
1,Albufeira,Spain,37.089722,-8.245833,ES
2,Arnhem,Burkina Faso,51.983333,5.916667,BF


In [259]:
rename_lst = list(df_manual_adding.city.map(lambda x: x if re.findall(':', str(x)) else None))
rename_lst = [x for x in rename_lst if x != None]
rename_lst = [re.sub(':', '\':\'', x) for x in rename_lst]
str(rename_lst)

'["Al-Qasim City\':\'Al-Qasim", "Becerril de Campos in the autonomous community of Castile and León\':\'Becerril de Campos", "Belenenses II\':\'Lisbon", "Beograde\':\'Belgrade", "Birmingham\':\'Bweyogerere", "Botola\':\'Zemamra", "Botola Pro Inwi\':\'Mohammedia", "Bunschotenlso\':\'Bunschoten", "Buzău\':\'Vacaria", "Cherkassy\':\'na", "Gwelup in Perth\':\' Perth", "Hai Ouled Smail\':\'Teleghma", "Kant\':\'Krasnasielski", "Karagiannia\':\'Kozani", "Kerkrade\':\'La Roda", "Kibungo\':\'Troyes", "Kolkata\':\'Oranjestad", "Le Robert\':\'na", "Linafoot\':\'Kindu", "Pagranichnaja\':\'nd", "Paragominas in Pará\':\'Paragominas", "Saint Petersberg\':\'Saint Petersburg", "Sekolah Tinggi Ilmu Kepolisian\':\'Jakarta", "Sfântu Gheorghe\':\'Otok", "Strassen\':\'Veldhoven", "Yaypan\':\'Turkistan", "Zagreb\':\'Ljubljana", "no determine\':\'Bishkek", "the Betrideildin\':\'Trongisvágur", "the Eerste divisie\':\'Doetinchem", "the League of Ireland Premier Division\':\'Drogheda", "Ñemby in the Central Depa

In [260]:
rename_dict = {}
keys_vals_lst      = []

for x in rename_lst:
    keys_vals = x.split('\':\'')
    keys_vals_lst.append(keys_vals)

for key_val in keys_vals_lst:
    rename_dict[key_val[0]] = key_val[1]

#### Change errors in cities or countries name for df_wt_coords

In [261]:
str(rename_dict)

"{'Al-Qasim City': 'Al-Qasim', 'Becerril de Campos in the autonomous community of Castile and León': 'Becerril de Campos', 'Belenenses II': 'Lisbon', 'Beograde': 'Belgrade', 'Birmingham': 'Bweyogerere', 'Botola': 'Zemamra', 'Botola Pro Inwi': 'Mohammedia', 'Bunschotenlso': 'Bunschoten', 'Buzău': 'Vacaria', 'Cherkassy': 'na', 'Gwelup in Perth': ' Perth', 'Hai Ouled Smail': 'Teleghma', 'Kant': 'Krasnasielski', 'Karagiannia': 'Kozani', 'Kerkrade': 'La Roda', 'Kibungo': 'Troyes', 'Kolkata': 'Oranjestad', 'Le Robert': 'na', 'Linafoot': 'Kindu', 'Pagranichnaja': 'nd', 'Paragominas in Pará': 'Paragominas', 'Saint Petersberg': 'Saint Petersburg', 'Sekolah Tinggi Ilmu Kepolisian': 'Jakarta', 'Sfântu Gheorghe': 'Otok', 'Strassen': 'Veldhoven', 'Yaypan': 'Turkistan', 'Zagreb': 'Ljubljana', 'no determine': 'Bishkek', 'the Betrideildin': 'Trongisvágur', 'the Eerste divisie': 'Doetinchem', 'the League of Ireland Premier Division': 'Drogheda', 'Ñemby in the Central Department': 'Ñemby', 'Županja in S

In [262]:
df_wt_coords.city = df_wt_coords.city.map(lambda x: rename_dict[x] if x in rename_dict.keys() else x)

In [265]:
# df_wt_coords[df_wt_coords.city == 'Al-Qasim']

#### Clear manual data

In [266]:
df_manual_adding.city = df_manual_adding.city.map(lambda x: rename_dict[x] if x in rename_dict.keys() else x)
df_manual_adding.city = df_manual_adding.city.map(lambda x: x.split(':')[1] if re.findall(':', x) else x)

In [267]:
df_manual_adding.latitude  = df_manual_adding.latitude.str.strip()
df_manual_adding.longitude = df_manual_adding.longitude.str.strip()

In [268]:
print('shape - ', df_manual_adding.shape[0])
df_manual_adding = df_manual_adding[~df_manual_adding.city.isin(['nd'])].copy().reset_index(drop=True)
print('shape - ', df_manual_adding.shape[0])

shape -  46
shape -  45


In [269]:
print('shape - ', df_uniq_city_new.shape[0])
df_uniq_city_new = pd.concat([df_uniq_city_new, df_manual_adding]).reset_index(drop=True)
print('shape - ', df_uniq_city_new.shape[0])

shape -  946
shape -  991


In [270]:
df_uniq_city_new.drop_duplicates(subset=['city', 'iso_code', 'country'], inplace=True, ignore_index=True)
print('shape - ', df_uniq_city_new.shape[0])

shape -  989


In [271]:
df_uniq_city_new[(df_uniq_city_new.longitude.isna())|df_uniq_city_new.latitude.isna()]

Unnamed: 0,city,iso_code,country,longitude,latitude
956,na,BY,Belarus,,
970,na,ES,Spain,,


In [275]:
df_uniq_city_new = df_uniq_city_new[df_uniq_city_new.city != 'na'].copy().reset_index(drop=True)
df_uniq_city_new.shape

(987, 5)

### Lat_lon

In [None]:
# palmyra = LatLon(Latitude(42.4833333), Longitude(1.4666667)) # Location of Palmyra Atoll in decimal degrees
# palmyra = Lat# Change cities names errors in dataframe:Lon(5.8833, -162.0833) # Same thing but simpler!
# palmyra = LatLon(Latitude(degree = 5, minute = 52, second = 59.88), Longitude(degree = -162, minute = -4.998) # or more complicated!
# cordinates = palmyra.to_string('d% %m% %S% %H') # Print coordinates to degree minute second
# ('5 52 59.88 N', '162 4 59.88 W')

In [276]:
df_uniq_city_new['lat'], df_uniq_city_new['lon'] = ParsingDataPrepare.lat_lon_calculate(df_uniq_city_new, col_lat='latitude', col_lon='longitude')

In [277]:
print('shape - ', df_uniq_city_new.shape)
df_uniq_city_new.head(3)

shape -  (987, 7)


Unnamed: 0,city,iso_code,country,longitude,latitude,lat,lon
0,A Estrada,ES,Spain,-8.48842,42.68911,42n41,8w29
1,Abobo,CI,Ivory Coast,-4.0159,5.41613,5n24,4w0
2,Aceuchal,ES,Spain,-6.48636,38.64627,38n38,6w29


#### Concatinate old dataset with new data

In [278]:
print('shape - ', df_uniq_cities_coords.shape[0])
df_uniq_cities_coords = pd.concat([df_uniq_cities_coords, df_uniq_city_new]).reset_index(drop=True)
print('shape - ', df_uniq_cities_coords.shape[0])
df_uniq_cities_coords.tail(3)

shape -  4951
shape -  5938


Unnamed: 0,city,iso_code,country,longitude,latitude,lat,lon
5935,Drogheda,IE,Republic of Ireland,-6.3525,53.715,53n42,6w21
5936,Ñemby,PY,Paraguay,-57.5443,-25.3935,25s23,57w32
5937,Županja,HR,Croatia,18.7,45.07,45n4,18e41


In [279]:
df_uniq_cities_coords.drop_duplicates(subset=['city', 'iso_code', 'country'], inplace=True, ignore_index=True)

In [280]:
print('shape - ', df_uniq_cities_coords.shape)

shape -  (5924, 7)


In [282]:
# file = open('pickle_files/cities_countries/df_uniq_cities_coords_created_23_07_2022', 'wb')
# pickle.dump(df_uniq_cities_coords, file)  
# file.close()

#### Concatinate all dataframes with coordinates

In [286]:
df_wt_coords.drop(columns=['longitude',	'latitude',	'lat', 'lon'], inplace=True)

In [287]:
print('shape - ', df_wt_coords.shape)
cols_merge = ['city', 'iso_code', 'country']
df_coords  = df_wt_coords.merge(df_uniq_city_new, how='left', left_on=cols_merge, right_on=cols_merge)
print('shape - ', df_coords.shape)

shape -  (2827, 21)
shape -  (2827, 25)


In [288]:
print('shape - ', df_coords.shape)
df_coords  = df_coords[(~df_coords.longitude.isna()) & (~df_coords.latitude.isna())].copy()
print('shape - ', df_coords.shape)

shape -  (2827, 25)
shape -  (2812, 25)


In [289]:
print('shape - ', df_coords.shape)
df_coords = df_coords[~df_coords.city.isin(['nd'])].copy().reset_index(drop=True)
print('shape - ', df_coords.shape)

shape -  (2812, 25)
shape -  (2812, 25)


In [290]:
print('shape - ', df_include_coords.shape)
df_include_coords = pd.concat([df_include_coords, df_coords]).reset_index(drop=True)
print('shape - ', df_include_coords.shape)

shape -  (22217, 25)
shape -  (25029, 25)


In [291]:
df_include_coords[(df_include_coords.longitude.isna()) | (df_include_coords.latitude.isna())].values

array([], shape=(0, 25), dtype=object)

#### Check equal cities for each team

In [292]:
df_fteam_cities = df_coords.groupby('f_team').agg({'city':'unique'}).reset_index()

In [295]:
df_fteam_cities.head(3)

Unnamed: 0,f_team,city
0,1954 Kelkit Belediyespor,[Kelkit]
1,2 de Mayo,[Pedro Juan Caballero]
2,AC Leon,[Vimercate]


In [296]:
len_city_lst = df_fteam_cities.city.map(lambda x: len(x))
max(len_city_lst) # must could == 1

2

In [317]:
# if max(len_city_lst) > 1 need check team for different countries names:
for lt_city in df_fteam_cities.city:
    if len(lt_city) > 1:
        print(lt_city)

['Mbour' 'Dakar']


In [312]:
# df_teams_data_main[df_teams_data_main.city == 'Mbour' ]

In [313]:
# df_teams_data_main[df_teams_data_main.city == 'Dakar' ]

In [315]:
df_fteam_cities[df_fteam_cities.f_team == 'Douanes']

Unnamed: 0,f_team,city
253,Douanes,"[Mbour, Dakar]"


In [316]:
df_include_coords[df_include_coords.f_team == 'Douanes']

Unnamed: 0,game_id,ligue_header,comp_id,season_id,game_utc,game_title,goals,ligue,f_team,pos_f,pts_f,pos_s,pts_s,fs_pos_max,fs_pts_max,f_pow,desc,country,team_id,city,iso_code,longitude,latitude,lat,lon
24406,15209963,Ligue 1,696,306,11.07.2021 17:00,Douanes - Generation Foot,1:1,thr,Douanes,8.0,30.0,3.0,46.0,"8 - 3, 14","30 - 46, 52",Pre,sum_coef_13,Senegal,,Mbour,SN,-16.829798,14.385877,14n23,16w49
24524,15192934,Premier League,1081,306,23.06.2021 16:00,Douanes - ASFA-Yennenga,3:2,thr,Douanes,1.0,64.0,6.0,45.0,"1 - 6, 18","64 - 45, 64",Fav,sum_coef_13,Burkina Faso,14453.0,Dakar,BF,-17.486199,14.717922,14n43,17w29
24601,15209933,Ligue 1,696,306,13.06.2021 17:00,Douanes - Teungueth,0:4,thr,Douanes,8.0,30.0,1.0,52.0,"8 - 1, 14","30 - 52, 52",Pre,sum_coef_13,Senegal,,Mbour,SN,-16.829798,14.385877,14n23,16w49


In [320]:
print('shape - ', df_include_coords.shape)
df_include_coords.tail(3)

shape -  (25029, 25)


Unnamed: 0,game_id,ligue_header,comp_id,season_id,game_utc,game_title,goals,ligue,f_team,pos_f,pts_f,pos_s,pts_s,fs_pos_max,fs_pts_max,f_pow,desc,country,team_id,city,iso_code,longitude,latitude,lat,lon
25026,15164040,Srpska Liga,698,306,28.03.2021 13:30,Zeleznicar Lajkovac - Sloboda Užice,1:0,thr,Zeleznicar Lajkovac,15.0,36.0,4.0,57.0,"15 - 4, 20","36 - 57, 94",Pre,sum_coef_13,Serbia,,Lajkovac,RS,20.16528,44.36944,44n22,20000000000.0
25027,15147393,Third Division,553,306,28.03.2021 12:30,Ormideia - O Kormakitis,2:0,thr,Ormideia,8.0,40.0,16.0,16.0,"8 - 16, 16","40 - 16, 68",Fav,sum_coef_13,Cyprus,,Ormideia,CY,33.775628,34.99351,34n59,3.3e+47
25028,15109589,Thai League 1,718,298,28.03.2021 11:00,Prachuap - Bangkok United,0:1,thr,Prachuap,10.0,37.0,5.0,51.0,"10 - 5, 16","37 - 51, 77",Pre,sum_coef_13,Thailand,13605.0,Prachuap Khiri Khan,TH,99.7841,11.82098,11n49,9.9e+48


In [319]:
# file = open('pickle_files/cities_countries/df_with_geo_coords_26_05_2022_sum_coef_13', 'wb')
# pickle.dump(df_include_coords, file)  
# file.close()

##### Example

In [None]:
# Exempl
from  geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="Your_Name")

city ="Russia"
country ="Perm"
loc = geolocator.geocode(city+','+ country)
lat = loc.latitude
long = loc.longitude

print(lat, long)