In [1]:
import re
import pandas as pd
import numpy as np
import unicodedata
import pickle
from cities_coordinates import CityCoordinator

from _html_parser import HtmlParser, ParsingDataPrepare

pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 20)

In [2]:
%load_ext autoreload

In [3]:
df_country_code = pd.read_csv('csv_files/wikipedia-iso-country-codes.csv') 

In [4]:
df_country_code.columns = [x.lower() for x in df_country_code.columns]
df_country_code['alpha-2 code'] = df_country_code['alpha-2 code'].map(lambda x: str(x).lower())
df_country_code.rename(columns={'english short name lower case':'country', 'alpha-2 code':'iso_code'}, inplace=True)
df_country_code.iso_code = df_country_code.iso_code.str.upper()
df_country_code.shape

(246, 5)

In [5]:
df_country_code.head(3)

Unnamed: 0,country,iso_code,alpha-3 code,numeric code,iso 3166-2
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX
2,Albania,AL,ALB,8,ISO 3166-2:AL


In [6]:
with open('pickle_files/cities_countries/df_for_geo_coords_26_05_2022', 'rb') as f:
    df_wt_coords = pickle.load(f)

In [7]:
# Remove USA country because many cities have twice locations in different states, 'Republic of Kosovo' - doesn't have iso_code
print('shape - ', df_wt_coords.shape[0])
df_wt_coords = df_wt_coords[~df_wt_coords.country.isin(['USA', 'Republic of Kosovo'])].copy()
print('shape - ', df_wt_coords.shape[0])

shape -  39069
shape -  38474


In [8]:
df_wt_coords = df_wt_coords.merge(df_country_code[['country', 'iso_code']], how='left', left_on=['country'], right_on=['country'])
df_wt_coords.shape

(38474, 41)

In [9]:
df_wt_coords.head(3)

Unnamed: 0,game_id,ligue_header,comp_id,season_id,game_utc,game_title,goals,ligue,f_team,pos_f,...,tm_2,ls_25,mr_25,f_pow_bet,f_pow_pts,f_pow,desc,country,city,iso_code
0,1687696,OBOS liga,668,301,26.05.2022 15:00,Skeid - Sandnes Ulf,0:1,thr,Skeid,16.0,...,,,,,Pre,Pre,fav_pre,Norway,Oslo,NO
1,1687694,OBOS liga,668,301,26.05.2022 16:00,Kongsvinger - Brann,1:3,thr,Kongsvinger,13.0,...,,,,,Pre,Pre,fav_pre,Norway,Kongsvinger,NO
2,1687692,OBOS liga,668,301,26.05.2022 16:00,Bryne - Stabaek,0:0,thr,Bryne,12.0,...,,,,,Pre,Pre,fav_pre,Norway,Bryne,NO


In [10]:
country_wt_iso = list(set(df_wt_coords[df_wt_coords.iso_code.isna()].country.values))
str(country_wt_iso)

"['Moldova', 'Republic of Ireland', 'Northern Ireland', 'Syria', 'FYR Macedonia', 'China PR', 'Scotland', 'Wales', 'Korea Republic', 'Libya', 'Congo DR', 'Tanzania', 'England', 'Bosnia-Herzegovina', 'Ivory Coast']"

In [11]:
#  For teams dataframe:
# ['England', 'Wales', 'Scotland', 'Northern Ireland'] - 'United Kingdom'

# For world cities dataframe:
# 'Tanzania, United Republic of' - Tanzania, 'Moldova, Republic of' - 'Moldova',
# 'Congo, the Democratic Republic of the' - 'Congo DR', 'United States Of America' - 'USA', 'Syrian Arab Republic' - 'Syria',
# 'China' - 'China PR', 'Korea, Republic of (South Korea) - 'Korea Republic', 'Macedonia, the former Yugoslav Republic of' - 'FYR Macedonia', 
# 'Bosnia and Herzegovina' - 'Bosnia-Herzegovina', 'Ireland' - 'Republic of Ireland', 'Libyan Arab Jamahiriya' - 'Libya', 
# 'Côte d'Ivoire' - 'Ivory Coast'

In [12]:
df_country_code[df_country_code.country == 'Serbia']

Unnamed: 0,country,iso_code,alpha-3 code,numeric code,iso 3166-2
195,Serbia,RS,SRB,688,ISO 3166-2:RS


In [13]:
british_list          = ['England', 'Northern Ireland', 'Scotland', 'Wales']
df_wt_coords.iso_code = df_wt_coords[['country', 'iso_code']].apply(lambda x: 'GB' if x[0] in british_list else x[1], axis=1)

In [14]:
countries_chng = {'Tanzania, United Republic of':'Tanzania', 'Moldova, Republic of':'Moldova',
                  'Congo, the Democratic Republic of the':'Congo DR', 'United States Of America':'USA', 'Syrian Arab Republic':'Syria',
                  'China':'China PR', 'Korea, Republic of (South Korea)':'Korea Republic', 'Macedonia, the former Yugoslav Republic of':'FYR Macedonia', 
                  'Bosnia and Herzegovina':'Bosnia-Herzegovina', 'Ireland':'Republic of Ireland', 'Libyan Arab Jamahiriya':'Libya',
                  'Côte d\'Ivoire':'Ivory Coast'}

In [15]:
df_country_code.country = df_country_code.country.map(lambda x: countries_chng[x] if x in countries_chng.keys() else x)

In [16]:
df_wt_coords.drop(columns=['iso_code'], inplace=True)

In [17]:
df_wt_coords = df_wt_coords.merge(df_country_code[['country', 'iso_code']], how='left', left_on=['country'], right_on=['country'])
df_wt_coords.shape

(38474, 41)

In [18]:
# Changed there counties on 'GB' yet
country_wt_iso = list(set(df_wt_coords[df_wt_coords.iso_code.isna()].country.values))
str(country_wt_iso)

"['England', 'Wales', 'Northern Ireland', 'Scotland']"

In [19]:
df_uniq_city_iso = df_wt_coords[['country', 'city', 'iso_code']].groupby(by=['city', 'iso_code']).first().reset_index()
df_uniq_city_iso.head(3)

Unnamed: 0,city,iso_code,country
0,A Coruña,ES,Spain
1,Aalborg,DK,Denmark
2,Aarau,CH,Switzerland


In [20]:
# labrary example
c = CityCoordinator()
# c.get_city(city_name="London", country_code_iso="GB")

In [21]:
df_for_loc = df_uniq_city_iso[['city', 'iso_code']].apply(lambda x: c.get_city(city_name=x[0], country_code_iso=x[1]), axis=1)

In [22]:
df_for_loc[0]['location']

{'lon': '-8.396', 'lat': '43.37135'}

In [23]:
location = [x['location'] if x != None else None for x in df_for_loc]
lon      = [x['lon'] if x != None else None for x in location]
lat      = [x['lat'] if x != None else None for x in location]

In [24]:
df_uniq_city_iso['lon'] = lon
df_uniq_city_iso['lat'] = lat

In [None]:
df_uniq_city_iso

In [25]:
df_wt_coord = df_uniq_city_iso[df_uniq_city_iso.lon.isna()]
df_wt_coord.head(3)

Unnamed: 0,city,iso_code,country,lon,lat
5,Abo,FI,Finland,,
8,Accre,IL,Israel,,
13,Adjara,GE,Georgia,,
18,Agrinio,GR,Greece,,
21,Ahwaz,IR,Iran,,
...,...,...,...,...,...
2243,Óbuda,HU,Hungary,,
2246,Östergötland,SE,Sweden,,
2248,Újpest,HU,Hungary,,
2253,İstanbul,TR,Turkey,,


In [27]:
cities_countries = list(df_wt_coord[['city', 'country']].apply(lambda x: (x[0], x[1]), axis=1))

In [50]:
cities_countries[:3]

[('Abo', 'Finland'), ('Accre', 'Israel'), ('Adjara', 'Georgia')]

In [None]:
HtmlParser.cities_data_with_geocoordinates(cities_countries)

Cities_data_saved - ('Sakakah', 'United Arab Emirates')


In [38]:
with open('pickle_files/cities_countries/cities_data_26_05_2022_Al Qara_Saudi Arabia', 'rb') as f:
    cities_data = pickle.load(f)

### Lat_lon

In [33]:
# palmyra = LatLon(Latitude(42.4833333), Longitude(1.4666667)) # Location of Palmyra Atoll in decimal degrees
# palmyra = LatLon(5.8833, -162.0833) # Same thing but simpler!
# palmyra = LatLon(Latitude(degree = 5, minute = 52, second = 59.88), Longitude(degree = -162, minute = -4.998) # or more complicated!
# cordinates = palmyra.to_string('d% %m% %S% %H') # Print coordinates to degree minute second
# ('5 52 59.88 N', '162 4 59.88 W')

In [34]:
# df = df_world_cities[['latitude', 'longitude']].copy()

In [35]:
# df_world_cities['lat'], df_world_cities['lon'] = ParsingDataPrepare.lat_lon_calculate(df_world_cities)

In [36]:
# df_world_cities

In [None]:
# with open('pickle_files/cities_countries/cities_data_26_05_2022_Bou Saâda_Algeria', 'rb') as f:
#     cities_data = pickle.load(f)

In [39]:
len(cities_data)

10

In [40]:
cities_data[:3]

[[('Abo', 'Finland'),
  Location(Turku, Turun seutukunta, Varsinais-Suomi, Lounais-Suomen aluehallintovirasto, Manner-Suomi, Suomi / Finland, (60.4517531, 22.2670522, 0.0))],
 [('Accre', 'Israel'), None],
 [('Adjara', 'Georgia'),
  Location(აჭარის ავტონომიური რესპუბლიკა, საქართველო, (41.67315085, 41.91966521300377, 0.0))]]

In [41]:
list_cities, list_countries, list_data = [], [], []

for data in cities_data:
    list_cities.append(data[0][0])
    list_countries.append(data[0][1])
    if data[1] != None:
        list_data.append([data[1].latitude, data[1].longitude])
    else:
        list_data.append([None, None])

In [42]:
list_lat = [x[0] for x in list_data]
list_lon = [x[1] for x in list_data]

In [43]:
df_data = pd.DataFrame({'country':list_countries, 'city':list_cities, 'latitude':list_lat, 'longitude':list_lon})

In [44]:
df_data.shape

(10, 4)

In [45]:
df_data = df_data[(~df_data.latitude.isna()) | (~df_data.longitude.isna())].copy()
df_data.reset_index(drop=True, inplace=True)

In [46]:
df_lat_lon = df_data[['latitude', 'longitude']].copy()

In [47]:
df_data['lat'], df_data['lon'] = ParsingDataPrepare.lat_lon_calculate(df_lat_lon)

In [48]:
df_data

Unnamed: 0,country,city,latitude,longitude,lat,lon
0,Finland,Abo,60.451753,22.267052,60n27,2.2e+17
1,Georgia,Adjara,41.673151,41.919665,41n40,4.1e+56
2,Greece,Agrinio,38.624828,21.409421,38n37,2.1e+25
3,Iran,Ahwaz,31.323164,48.679254,31n19,4.8e+41
4,Libya,Al Bayda',32.760953,21.757717,32n45,2.0999999999999998e+46
5,Qatar,Al Khor,25.683733,51.503121,25n41,5.1e+31
6,Saudi Arabia,Al Majma'ah,25.903889,45.345556,25n54,4.5e+21
7,Saudi Arabia,Al Qara,20.25078,41.350399,20n15,4.1e+22


In [None]:
# file = open('pickle_files/cities_countries/df_world_cities_3_own', 'wb')
# pickle.dump(df_data, file)  
# file.close()

In [None]:
# Exempl
from  geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="Your_Name")

city ="Russia"
country ="Perm"
loc = geolocator.geocode(city+','+ country)
lat = loc.latitude
long = loc.longitude

print(lat, long)

In [None]:
# find_c = df_country_code.country.apply(lambda x: x if re.findall(r'Macedonia', str(x)) else None)
# find_c.value_counts()

In [None]:
# df_country_code[df_country_code.country == 'Macedonia, the former Yugoslav Republic of']

In [None]:
df_wt_coords = df_wt_coords.merge(df_iso_codes, how='left', left_on=['country'], right_on=['country'])
df_wt_coords.shape

In [None]:
df_wt_coords.iso_code = df_wt_coords[['iso_code', 'iso_code_own']].apply(lambda x: x[1] if pd.isna(x[0]) else x[0], axis=1)

In [None]:
df_wt_coords[df_wt_coords.iso_code.isna()]

In [None]:
# df_location = df_wt_coords[['city', 'iso_code']].apply(lambda x: c.get_city(city_name=x[0], country_code_iso=x[1]), axis=1)

In [None]:
df_tmp

In [None]:
df_countries = df_wt_coords[df_wt_coords.iso_code.isna()].country

In [None]:
df_countries.drop_duplicates(inplace=True)

In [None]:
df_countries.values