In [1]:
import re
import pickle
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup

from _html_parser import HtmlParser

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
with open('pickle_files/teams_data/tmd_26_03_2021/df_teams_cities_na_26_03_2021_sec', 'rb') as f:
    df_game_ids_na_cities = pickle.load(f)

In [4]:
teams_for_ids = df_game_ids_na_cities.f_team.values
print('shape - ', len(teams_for_ids))
teams_for_ids[-3:]

shape -  2283


array(['Pakistan Airlines', 'Habib Bank', 'Waltersdorf'], dtype=object)

#### Parsing teams ids

In [5]:
# HtmlParser.find_teams_ids(teams_for_ids[:]) 

In [6]:
with open('pickle_files/teams_data/tmd_26_03_2021/teams_ids_na_cities_Waltersdorf', 'rb') as f:
    lst_teams_ids = pickle.load(f)

In [7]:
print('shape - ', len(lst_teams_ids))
lst_teams_ids[-3:]

shape -  2283


[['Pakistan Airlines', None], ['Habib Bank', None], ['Waltersdorf', None]]

In [8]:
teams_ids = [x[1] for x in lst_teams_ids if x[1] is not None]

In [9]:
print('shape - ', len(teams_ids))
teams_ids[-3:]

shape -  752


['11907', '19689', '7887']

In [10]:
lst_teams_wt_id = [x[0] for x in lst_teams_ids if x[1] is None]

In [11]:
print('shape - ', len(lst_teams_wt_id))
lst_teams_wt_id[-3:]

shape -  1531


['Pakistan Airlines', 'Habib Bank', 'Waltersdorf']

In [12]:
df_teams_ids           = pd.DataFrame(columns=['teams_ids', 'team'])
df_teams_ids.teams_ids = [x[1] for x in lst_teams_ids]
df_teams_ids.team      = [x[0] for x in lst_teams_ids]

In [13]:
df_teams_ids = df_teams_ids.merge(df_game_ids_na_cities.country, how='left', left_index=True, right_index=True)

In [14]:
df_teams_ids.head(3)

Unnamed: 0,teams_ids,team,country
0,,Fortuna Becicherecu Mic,Romania
1,,Concordia Chiajna II,Romania
2,10398.0,Flacara,Romania


#### Parsing teams data

In [15]:
# HtmlParser.find_teams_data(teams_ids[:])

In [16]:
# tmd_26_05_2022: (teams_wiki_data_8636, teams_wiki_data_12683, teams_wiki_data_14892), 
with open('pickle_files/teams_data/tmd_26_03_2021/teams_wiki_data_7887', 'rb') as f:
    wiki_data = pickle.load(f)

In [17]:
len(wiki_data)

752

In [18]:
wiki_data[2:3]

[('16569',
  'Nakhon Pathom United Football Club, formerly known as Nakhon Pathom Football Club (Thai: สโมสรฟุตบอลจังหวัดนครปฐม), is a Thailand professional football club based in Nakhon Pathom province and currently play in Thai League 2. Their home stadium is Nakhon Pathom Municipality Sport School Stadium.')]

In [19]:
df_wiki_data  = pd.DataFrame(wiki_data, columns=['teams_ids', 'tm_data'])
df_teams_data = df_teams_ids.merge(df_wiki_data, how='left', left_on='teams_ids', right_on='teams_ids')

In [20]:
df_teams_data.drop_duplicates(subset=['teams_ids', 'team'], inplace=True)
df_teams_data.reset_index(drop=True, inplace=True)

In [21]:
print('shape - ', df_teams_data.shape)
df_teams_data.head()

shape -  (2278, 4)


Unnamed: 0,teams_ids,team,country,tm_data
0,,Fortuna Becicherecu Mic,Romania,
1,,Concordia Chiajna II,Romania,
2,10398.0,Flacara,Romania,"Club Sportiv Municipal Flacăra Moreni,, commonly known as Flacăra Moreni or simply as Flacăra, is a Romanian football club based in Moreni, Dâmbovița County."
3,16570.0,Phrae United,Thailand,Phrae United Football Club is a Thai semi-professional football club based in Phrae Province. They currently play in Thai League 2.
4,16569.0,Nakhon Pathom,Thailand,"Nakhon Pathom United Football Club, formerly known as Nakhon Pathom Football Club (Thai: สโมสรฟุตบอลจังหวัดนครปฐม), is a Thailand professional football club based in Nakhon Pathom province and currently play in Thai League 2. Their home stadium is Nakhon Pathom Municipality Sport School Stadium."


#### Data clear

In [22]:
data_is_a = df_teams_data.tm_data.map(lambda x: ''.join(HtmlParser.cut_part_of_string(x, 'is a', '\.')) if re.findall('is a', str(x)) else None)

In [23]:
data_based_in = data_is_a.map(lambda x: x if re.findall('(based in)|(city of)|(town of)|(located in)|(club from)|(club in)|(playing in)', str(x)) else None)

In [24]:
data_based_in = data_based_in.map(lambda x: ''.join(HtmlParser.cut_part_of_string(x, 'city of ', '$')) if re.findall('city of', str(x)) else x)
data_based_in = data_based_in.map(lambda x: ''.join(HtmlParser.cut_part_of_string(x, 'town of ', '$')) if re.findall('town of', str(x)) else x)
data_based_in = data_based_in.map(lambda x: ''.join(HtmlParser.cut_part_of_string(x, 'located in ', '$')) if re.findall('located in', str(x)) else x)
data_based_in = data_based_in.map(lambda x: ''.join(HtmlParser.cut_part_of_string(x, 'club from ', '$')) if re.findall('club from', str(x)) else x)
data_based_in = data_based_in.map(lambda x: ''.join(HtmlParser.cut_part_of_string(x, 'club in ', '$')) if re.findall('club in', str(x)) else x)
data_based_in = data_based_in.map(lambda x: ''.join(HtmlParser.cut_part_of_string(x, 'playing in ', '$')) if re.findall('playing in', str(x)) else x)
data_based_in = data_based_in.map(lambda x: ''.join(HtmlParser.cut_part_of_string(x, 'based in ', '$')) if re.findall('based in', str(x)) else x)

In [25]:
data_based_in = data_based_in.map(lambda x: ''.join(HtmlParser.cut_part_of_string(x, '^', ',')) if re.findall(',', str(x)) else x)

In [26]:
# data_based_in

In [27]:
df_teams_data['city'] = data_based_in.values

In [28]:
print('shape - ', df_teams_data.shape[0])
df_teams_data.head(3)

shape -  2278


Unnamed: 0,teams_ids,team,country,tm_data,city
0,,Fortuna Becicherecu Mic,Romania,,
1,,Concordia Chiajna II,Romania,,
2,10398.0,Flacara,Romania,"Club Sportiv Municipal Flacăra Moreni,, commonly known as Flacăra Moreni or simply as Flacăra, is a Romanian football club based in Moreni, Dâmbovița County.",Moreni


In [29]:
# df_teams_data.to_csv('csv_files/df_teams_data_26_03_2021_sec.csv', index = None, header=True)

#### Parsing data from soccerway

In [38]:
df_wt_data = df_teams_data[df_teams_data.teams_ids.isna()].copy().reset_index(drop=True)
print('shape - ', df_wt_data.shape)
df_wt_data.tail(3)

shape -  (1531, 5)


Unnamed: 0,teams_ids,team,country,tm_data,city
1528,,Pakistan Airlines,Pakistan,,
1529,,Habib Bank,Pakistan,,
1530,,Waltersdorf,Germany,,


In [None]:
teams_countries_lt

In [82]:
teams_countries_lt = df_wt_data[['team', 'country']].apply(lambda x: [x[0], x[1]], axis=1)

In [83]:
teams_countries_lt[:3]

0    [Fortuna Becicherecu Mic, Romania]
1       [Concordia Chiajna II, Romania]
2          [Urania Baška Voda, Croatia]
dtype: object

In [98]:
def extract_source(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    source=requests.get(url, headers=headers).text
    return source

In [100]:
# extract_source('https://int.soccerway.com/search/teams/?q=Waltersdorf')

In [116]:
def find_soccerway_team_data(teams_countries_lt: list, url='https://int.soccerway.com/search/teams/?q=', name_saved_file='soccerway_teams_data'):
    headers = {'User-Agent': 'Mozilla/5.0'}
    team_data = []

    for count, team_country in enumerate(teams_countries_lt):
    
        html = requests.get(url + team_country[0], headers=headers).content # + 
        soup = BeautifulSoup(html, "html.parser")

        ul_find   = soup.find('ul', {'class':'tree search-results'})
        span_find = re.sub(r'[^\w]', '', ul_find.find('span', {'class':'meta'}).text)
        
        print(ul_find)

In [None]:
<span class="meta">(Germany)</span>

In [117]:
find_soccerway_team_data(teams_countries_lt[:3])

<ul class="tree search-results">
<li><a href="/teams/belgium/koninklijke-atletiek-associatie-gent/214/">Gent</a> <span class="meta">(Belgium)</span></li>
<li><a href="/teams/germany/dusseldorfer-tus-fortuna-1895-ev/1029/">Fortuna Düsseldorf</a> <span class="meta">(Germany)</span></li>
<li><a href="/teams/netherlands/fortuna-sittard/1551/">Fortuna Sittard</a> <span class="meta">(Netherlands)</span></li>
<li><a href="/teams/romania/poiana-campina/1805/">Poiana</a> <span class="meta">(Romania)</span></li>
<li><a href="/teams/germany/sc-fortuna-koln/2367/">Fortuna Köln</a> <span class="meta">(Germany)</span></li>
<li><a href="/teams/finland/hifk-soccer-helsinki/2432/">HIFK</a> <span class="meta">(Finland)</span></li>
<li><a href="/teams/moldova/fortuna-pleseni/3374/">Fortuna</a> <span class="meta">(Moldova)</span></li>
<li><a href="/teams/germany/spvgg-oberaussem-fortuna/5200/">Oberaussem</a> <span class="meta">(Germany)</span></li>
<li><a href="/teams/norway/fortuna-aalesund-fk/5234/">For

In [None]:
https://int.soccerway.com/search/teams/?q=Waltersdorf

In [None]:
'Waltersdorf'

In [None]:
<ul class="tree search-results">

#### Work after manual correct

In [30]:
colnames = ['team_id', 'f_team', 'country', 'tm_data', 'city']

In [31]:
df_after_corr = pd.read_csv('csv_files/df_teams_data_26_05_2022_four.csv', names=colnames, header=None, dtype=str)

In [32]:
print('shape - ', df_after_corr.shape[0])
df_after_corr.head(3)

shape -  1586


Unnamed: 0,team_id,f_team,country,tm_data,city
0,teams_ids,team,country,tm_data,city
1,,RSV Eintracht 1949,Germany,,Stahnsdorf
2,,TuS Mechtersheim,Germany,,Mechtersheim


In [33]:
df_after_corr.drop_duplicates(subset=['f_team', 'country'], inplace=True)
df_after_corr.reset_index(drop=True, inplace=True)

In [34]:
df_after_corr.drop(columns='tm_data', inplace=True)

In [35]:
df_after_corr.tail(3)

Unnamed: 0,team_id,f_team,country,city
1583,11391.0,Song Lam Nghe An,Vietnam,Vinh
1584,12529.0,Heidelberg United,Australia,Melbourne
1585,,Mladost Medoševac,Serbia,Medoševac


In [36]:
# file = open('pickle_files/teams_data/df_after_corr_26_05_2022_four', 'wb')
# pickle.dump(df_after_corr, file)  
# file.close()