In [1]:
import re
import pickle
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup

from _html_parser import HtmlParser

# Raise pandas' display limits to 500 rows / columns / characters per cell so
# wide frames and long text fields are not truncated in cell output.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, 500)

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [20]:
# Load the pickled input frame (presumably the teams whose city is still
# missing, judging by the file name — TODO confirm).
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open('pickle_files/teams_data/tmd_26_05_2022/df_teams_cities_na_26_05_2022_third', 'rb') as pickle_file:
    df_game_ids_na_cities = pickle.load(pickle_file)

In [6]:
# Team names to look up, taken from the `f_team` column as a NumPy array.
teams_for_ids = df_game_ids_na_cities['f_team'].values
# Sanity check: number of names and the last three of them.
print('shape - ', len(teams_for_ids))
teams_for_ids[-3:]

shape -  149


array(['Nepal', 'Sinalunghese', 'Coimbra'], dtype=object)

#### Parsing team IDs

In [7]:
# Look up an id for every team name. Judging by the log printed by this cell and
# by the file loaded in the following cell, the helper persists its result to a
# pickle named after the last processed team instead of returning it —
# TODO confirm in _html_parser.
HtmlParser.find_teams_ids(teams_for_ids)

Current_1000_teams_ids_saved - Coimbra
Teams ids saved with last name: Coimbra


In [8]:
# Load the pickled team-name / team-id pairs.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open('pickle_files/teams_data/tmd_26_05_2022/teams_ids_na_cities_Coimbra', 'rb') as pickle_file:
    lst_teams_ids = pickle.load(pickle_file)

In [9]:
# Sanity check: number of [team_name, team_id] pairs loaded, and the last three.
print('shape - ', len(lst_teams_ids))
lst_teams_ids[-3:]

shape -  149


[['Nepal', '7107'], ['Sinalunghese', '19864'], ['Coimbra', '14892']]

In [10]:
# Keep only the ids that were actually resolved; each entry is a
# [team_name, team_id] pair and unresolved teams carry None as id.
teams_ids = [team_id for _team_name, team_id in lst_teams_ids if team_id is not None]

In [11]:
# Sanity check: number of resolved ids and the last three of them.
print('shape - ', len(teams_ids))
teams_ids[-3:]

shape -  136


['7107', '19864', '14892']

In [12]:
# Names of the teams for which no id was found (their id entry is None).
lst_teams_wt_id = [team_name for team_name, team_id in lst_teams_ids if team_id is None]

In [13]:
# Sanity check: number of teams without an id and the last three names.
print('shape - ', len(lst_teams_wt_id))
lst_teams_wt_id[-3:]

shape -  13


['Minaj U19', 'Belaya Rus', 'Sputnik Rechitsa']

In [14]:
# file = open('pickle_files/teams_data/lst_teams_wt_id_main', 'wb')
# pickle.dump(lst_teams_wt_id, file)  
# file.close()

In [14]:
# Two-column frame (id first, then name) built in one step from the
# [team_name, team_id] pairs; teams without an id keep None in `teams_ids`.
df_teams_ids = pd.DataFrame({
    'teams_ids': [pair[1] for pair in lst_teams_ids],
    'team': [pair[0] for pair in lst_teams_ids],
})

In [15]:
# Attach each team's country with an index-on-index left join.
# A `country` column left over from a previous run of this cell is dropped first;
# without that, re-running would yield `country_x` / `country_y` instead of `country`.
# NOTE(review): assumes df_game_ids_na_cities carries the same default 0..n-1
# index and row order as lst_teams_ids — TODO confirm.
df_teams_ids = (
    df_teams_ids
    .drop(columns='country', errors='ignore')
    .merge(df_game_ids_na_cities.country, how='left', left_index=True, right_index=True)
)

In [25]:
# Preview the first three rows after the country join.
df_teams_ids.head(3)

Unnamed: 0,teams_ids,team,country
0,,Lillestrøm,Norway
1,8631.0,HIFK,Finland
2,7556.0,Drogheda United,Republic of Ireland


#### Parsing teams data

In [26]:
# Fetch the descriptive text for every resolved team id; the helper receives a
# shallow copy of the id list. Judging by the log printed by this cell, results
# are saved to disk under the last processed id — TODO confirm in _html_parser.
HtmlParser.find_teams_data(list(teams_ids))

Current_500_teams_data_saved - 14892
Teams data saved with last id: 14892


In [27]:
# Known snapshot files (presumably from successive parsing runs — TODO confirm):
# teams_wiki_data_8636, teams_wiki_data_12683, teams_wiki_data_14892
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open('pickle_files/teams_data/tmd_26_05_2022/teams_wiki_data_14892', 'rb') as pickle_file:
    wiki_data = pickle.load(pickle_file)

In [28]:
# Number of (team_id, description) records loaded.
len(wiki_data)

136

In [29]:
# Show a single record to inspect its (team_id, description) structure.
wiki_data[2:3]

[('8228',
  'Finn Harps Football Club are an Irish football club that play in the Premier Division of the League of Ireland, as of 2020. The club was founded in 1954 and elected to the league in 1969. However, it was technically founded on 30 November 1953.')]

In [30]:
# Turn the (team_id, description) records into a frame and left-join them onto
# the team list, so teams without a description keep a missing `tm_data`.
df_wiki_data = pd.DataFrame(wiki_data, columns=['teams_ids', 'tm_data'])
df_teams_data = pd.merge(df_teams_ids, df_wiki_data, how='left', on='teams_ids')

In [31]:
# Remove rows repeated for the same (id, team) pair, then renumber the index 0..n-1.
df_teams_data = (
    df_teams_data
    .drop_duplicates(subset=['teams_ids', 'team'])
    .reset_index(drop=True)
)

In [33]:
# df_teams_data

#### Data cleaning

In [34]:
# For every description containing "is a", keep the fragment selected by
# HtmlParser.cut_part_of_string between 'is a' and a literal period (presumably
# the rest of that sentence — TODO confirm against the helper); all other rows
# become None.
# The period pattern is a raw string: '\.' in a normal literal is an invalid
# escape sequence and triggers a warning on current Python versions.
data_is_a = df_teams_data.tm_data.map(
    lambda x: ''.join(HtmlParser.cut_part_of_string(x, 'is a', r'\.'))
    if re.search('is a', str(x)) else None
)

In [35]:
# Keep only the fragments that mention a location phrase; everything else
# (including rows that are already None) becomes None.
data_based_in = data_is_a.map(
    lambda sentence: sentence
    if re.search('(based in)|(city of)|(town of)|(located in)|(club from)|(club in)|(playing in)', str(sentence))
    else None
)

In [36]:
def _cut_after_marker(text, marker):
    """Cut ``text`` from ``marker + ' '`` to the end of the string.

    The cut itself is delegated to ``HtmlParser.cut_part_of_string`` and its
    pieces are joined into one string. Values that do not contain ``marker``
    (including ``None``) are returned unchanged.
    """
    if re.search(marker, str(text)):
        return ''.join(HtmlParser.cut_part_of_string(text, marker + ' ', '$'))
    return text


# Location phrases, applied one after another in exactly this order.
# NOTE(review): every pass appears to discard the text before its marker, so a
# marker that occurs later in the sentence wins over an earlier location phrase
# (e.g. "... based in Drogheda, County Louth playing in the League ..." ends up
# as "the League of Ireland Premier Division"). The order is kept as-is here;
# such rows presumably rely on the later manual correction — TODO confirm.
_LOCATION_MARKERS = ('city of', 'town of', 'located in', 'club from', 'club in', 'playing in', 'based in')

for _marker in _LOCATION_MARKERS:
    data_based_in = data_based_in.map(lambda value, marker=_marker: _cut_after_marker(value, marker))

In [37]:
# For values containing a comma, keep what HtmlParser.cut_part_of_string selects
# between the start of the string and a comma (presumably the part before the
# first comma, e.g. "Helsinki, Finland" -> "Helsinki" — TODO confirm).
data_based_in = data_based_in.map(
    lambda place: ''.join(HtmlParser.cut_part_of_string(place, '^', ',')) if ',' in str(place) else place
)

In [39]:
# data_based_in

In [40]:
# Store the extracted location as the `city` column; values are attached by
# position (raw array), not by index alignment.
df_teams_data = df_teams_data.assign(city=data_based_in.values)

In [41]:
# Sanity check: row count and the first three rows including the new `city` column.
print('shape - ', len(df_teams_data))
df_teams_data.head(3)

shape -  149


Unnamed: 0,teams_ids,team,country,tm_data,city
0,,Lillestrøm,Norway,,
1,8631.0,HIFK,Finland,"HIFK, the Idrottsföreningen Kamraterna, Helsingfors (IFK, Helsingfors) rf (officially abbreviated IFK Helsingfors, colloquially often Helsingfors IFK or Helsingin IFK ) is a multi-sport association based in Helsinki, Finland.",Helsinki
2,7556.0,Drogheda United,Republic of Ireland,"Drogheda United Football Club is an Irish association football club based in Drogheda, County Louth playing in the League of Ireland Premier Division. They play their home matches at Head In The Game Park.",the League of Ireland Premier Division


In [43]:
# df_teams_data.to_csv('csv_files/df_teams_data_26_05_2022_third.csv', index = None, header=True)

#### Work after manual correction

In [58]:
# Column names assigned to the manually corrected CSV when it is read back in.
colnames = ['teams_ids', 'team', 'country', 'tm_data', 'city']

In [59]:
# Read the manually corrected CSV. The file is treated as having no header row
# (header=None); column names come from `colnames`.
df_after_corr = pd.read_csv(
    'csv_files/df_teams_data_26_05_2022_sec.csv',
    header=None,
    names=colnames,
)

In [62]:
# Look for stray header rows that were read in as data (a row whose
# `teams_ids` cell holds the literal column name).
df_after_corr.loc[df_after_corr['teams_ids'] == 'teams_ids']

Unnamed: 0,teams_ids,team,country,tm_data,city


In [65]:
# Remove the row labelled 1293 in place.
# NOTE(review): 1293 is a hard-coded row label and this cell is not re-runnable:
# once the index has been reset, running it again either raises KeyError or
# removes a different row. TODO confirm which row this is meant to remove and
# select it by content instead.
df_after_corr.drop(index=1293, inplace=True)

In [66]:
# Renumber the index 0..n-1 after the row removal, discarding the old labels.
df_after_corr = df_after_corr.reset_index(drop=True)

In [67]:
# Display the corrected frame (pandas truncates the rendering for long frames).
df_after_corr

Unnamed: 0,teams_ids,team,country,tm_data,city
0,7998,Magdeburg,Germany,"1. FC Magdeburg is a German association football club based in the city of Magdeburg. The club was founded in 1965 and spent all but one season in East Germany top flight, the DDR-Oberliga, winning three championships and seven cup titles.",Magdeburg
1,9863,Koln II,Germany,"1. Fußball-Club Köln 01/07 e. V. II, commonly known as simply 1. FC Köln II, is a German football team based in Cologne. It is the reserve team of German association football club 1. FC Köln.",Cologne
2,8056,Lokomotive Leipzig,Germany,"1. Fußballclub Lokomotive Leipzig e.V. is a German football club based in the locality of Probstheida in the Südost borough of Leipzig, Saxony. The club may be more familiar to many of the country's football fans as the historic side VfB Leipzig the first national champion of Germany.",Leipzig
3,14511,Navad Urmia,Iran,"90 Urmia Football Club is a professional football club based in Urmia, Iran actually playing in the second tier Azadegan League. The club was founded in 2011 and joined the Azadegan League in August of 2018, after the transfer of Gostaresh Foulad F.C. to the city of Urmia.",Urmia
4,10571,Trento,Italy,"A.C. Trento 1921 S.S.D. is an Italian football club, and the major club in Trento. Currently they play in Serie C Group A. In 2014 Società Sportiva Dilettantistica Trento Calcio 1921 S.r.l. went bankrupt. The sports title was transferred to A.C. Trento S.C.S.D.",Trento
...,...,...,...,...,...
1288,,EPS,Finland,,Espoo
1289,,Ilves-Kissat,Finland,,Tampere
1290,15282,RoPS II,Finland,,Rovaniemi
1291,,Atletico Cali,Colombia,,Cali


In [68]:
# Rows whose city is still missing. read_csv turns empty fields into real NaN
# values, which never compare equal to the string 'nan', so the check covers
# both true missing values and a literal 'nan' string.
df_after_corr[df_after_corr.city.isna() | (df_after_corr.city == 'nan')]

Unnamed: 0,teams_ids,team,country,tm_data,city


In [70]:
# file = open('pickle_files/teams_data/df_after_corr_26_05_2022_sec', 'wb')
# pickle.dump(df_after_corr, file)  
# file.close()