In [85]:
import os
import re
from bs4 import BeautifulSoup
import pandas as pd

In [86]:
folder_path = "downloads_clubs"

In [87]:
html_files = [f for f in os.listdir(folder_path) if f.endswith('.html')]

In [88]:
html_files

['%C3%85bo_IFK.html',
 '%C3%85tvidabergs_FF.html',
 '%C3%87aykur_Rizespor.html',
 '%C3%8D%C3%BEr%C3%B3ttabandalag_Akraness.html',
 '%C3%8D%C3%BEr%C3%B3ttabandalag_Vestmannaeyja.html',
 '%C3%8DBV_men%27s_football.html',
 '%C3%8DB_Akureyri.html',
 '%C3%8DF_Fuglafj%C3%B8r%C3%B0ur.html',
 '%C3%96rebro_SK.html',
 '%C3%96rgryte_IS.html',
 '%C3%96stersunds_FK.html',
 '%C3%96sters_IF.html',
 '%C3%9Ajpest_FC.html',
 '%C3%9E%C3%B3r_Akureyri.html',
 '%C4%A6amrun_Spartans_F.C..html',
 '%C4%B0stanbulspor_A.%C5%9E..html',
 '%C4%B0stanbul_Ba%C5%9Fak%C5%9Fehir_F.K..html',
 '%C5%81KS_%C5%81%C3%B3d%C5%BA.html',
 '%C5%9Al%C4%85sk_Wroc%C5%82aw.html',
 '%C5%A0K_Futura_Humenn%C3%A9.html',
 '%C5%A0K_Slovan_Bratislava.html',
 '%C5%BBurrieq_F.C..html',
 '1.FC_Kaiserslautern.html',
 '1._FC_Brno.html',
 '1._FC_Frankfurt.html',
 '1._FC_K%C3%B6ln.html',
 '1._FC_Kaiserslautern.html',
 '1._FC_Lokomotive_Leipzig.html',
 '1._FC_Magdeburg.html',
 '1._FC_N%C3%BCrnberg.html',
 '1._FC_Saarbr%C3%BCcken.html',
 '1._FC_Slov%

In [117]:
def scrape_clubs(filename):
    
    with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")
    
    klub = {}
    
    klub['teamName'] = soup.find('title').text.split(' - Wikipedia')[0]
    
    klub['teamUrl'] = filename.replace('.html','')
    
    nicknames = []
    
    if soup.find('table', class_='infobox'):
    
        t = soup.find('table', class_='infobox')

        for r in t.find_all('tr'):
            if r.text[0:6] == 'Nickna':
                if r.find('span', class_='nowrap'):
                    spans = r.find_all('span')
                    for s in spans:
                        if len(s.text) > 2:
                            nicknames.append(s.text.split('[')[0].strip())
                    klub['nicknames'] = nicknames
                else:
                    klub['nicknames'] = [i.text for i in r.find_all('i')]
            elif r.text[0:7] == 'Founded':
                try:
                    klub['founded'] = re.search('\d{4}', r.text).group(0)
                except:
                    klub['founded'] = None
            elif r.text[0:7] == 'Full na':
                klub['full'] = r.find(class_='infobox-data').text.strip().split('[')[0]
            elif r.text[0:7] == 'Short n':
                klub['short'] = r.find(class_='infobox-data').text.strip().split('[')[0]
            elif r.text[0:6] == 'League':
                try:
                    klub['league'] = r.find('a').text.strip()
                except:
                    klub['league'] = r.find('td').text.strip()
    
    return klub

In [113]:
scrape_clubs('Zag%C5%82%C4%99bie_Lubin.html')

{'teamName': 'Zagłębie Lubin',
 'teamUrl': 'Zag%C5%82%C4%99bie_Lubin',
 'full': 'Zagłębie Lubin Spółka Akcyjna',
 'nicknames': ['Miedziowi'],
 'founded': '1945',
 'league': 'Ekstraklasa'}

In [118]:
scrape_clubs('A.S._Livorno_Calcio.html')

{'teamName': 'US Livorno 1915',
 'teamUrl': 'A.S._Livorno_Calcio',
 'full': 'Unione Sportiva Livorno 1915',
 'nicknames': ['Gli Amaranto', 'I Labronici', 'Le Triglie'],
 'founded': '1915',
 'league': 'Serie D'}

In [119]:
df = pd.DataFrame()
for h in html_files:
    print(h)
    k = pd.DataFrame([scrape_clubs(h)])
    df = pd.concat([df, k])

%C3%85bo_IFK.html
%C3%85tvidabergs_FF.html
%C3%87aykur_Rizespor.html
%C3%8D%C3%BEr%C3%B3ttabandalag_Akraness.html
%C3%8D%C3%BEr%C3%B3ttabandalag_Vestmannaeyja.html
%C3%8DBV_men%27s_football.html
%C3%8DB_Akureyri.html
%C3%8DF_Fuglafj%C3%B8r%C3%B0ur.html
%C3%96rebro_SK.html
%C3%96rgryte_IS.html
%C3%96stersunds_FK.html
%C3%96sters_IF.html
%C3%9Ajpest_FC.html
%C3%9E%C3%B3r_Akureyri.html
%C4%A6amrun_Spartans_F.C..html
%C4%B0stanbulspor_A.%C5%9E..html
%C4%B0stanbul_Ba%C5%9Fak%C5%9Fehir_F.K..html
%C5%81KS_%C5%81%C3%B3d%C5%BA.html
%C5%9Al%C4%85sk_Wroc%C5%82aw.html
%C5%A0K_Futura_Humenn%C3%A9.html
%C5%A0K_Slovan_Bratislava.html
%C5%BBurrieq_F.C..html
1.FC_Kaiserslautern.html
1._FC_Brno.html
1._FC_Frankfurt.html
1._FC_K%C3%B6ln.html
1._FC_Kaiserslautern.html
1._FC_Lokomotive_Leipzig.html
1._FC_Magdeburg.html
1._FC_N%C3%BCrnberg.html
1._FC_Saarbr%C3%BCcken.html
1._FC_Slov%C3%A1cko.html
1._FC_Tatran_Pre%C5%A1ov.html
1._FC_Union_Berlin.html
1._FK_P%C5%99%C3%ADbram.html
1._FSV_Mainz_05.html
1._Wiene

F.C._Copenhagen.html
F.C._Dinamo_Bucure%C8%99ti.html
F.C._Domagnano.html
F.C._Etar.html
F.C._Hansa_Rostock.html
F.C._Internazionale_Milano.html
F.C._Pa%C3%A7os_de_Ferreira.html
F.C._Porto.html
F.C._Rustavi_Metalurgist.html
F.C._Utrecht.html
F91_Dudelange.html
Falkirk_F.C..html
FA_%C5%A0iauliai.html
FA_Red_Boys_Differdange.html
FBK_Kaunas.html
FCI_Levadia_Tallinn.html
FCI_Tallinn.html
FCM_Bac%C4%83u.html
FCM_Dun%C4%83rea_Gala%C8%9Bi.html
FCSB.html
FCV_Farul_Constan%C8%9Ba.html
FC_%C5%BDalgiris.html
FC_Aarau.html
FC_Admira_Wacker_M%C3%B6dling.html
FC_Akhmat_Grozny.html
FC_Aktobe.html
FC_Alania_Vladikavkaz.html
FC_Alashkert.html
FC_Alma-Ata.html
FC_Ameri_Tbilisi.html
FC_Amkar_Perm.html
FC_Amsterdam.html
FC_Anzhi_Makhachkala.html
FC_AP_Brera.html
FC_Ararat-Armenia.html
FC_Ararat_Yerevan.html
FC_Arda_Kardzhali.html
FC_Arge%C5%9F_Pite%C5%9Fti.html
FC_Arge%C8%99_Pite%C8%99ti.html
FC_Aris_Bonnevoie.html
FC_Arsenal_Kyiv.html
FC_Arsenal_Tula.html
FC_Artmedia_Bratislava.html
FC_Artmedia_Petr%C5%B

FK_Belasica.html
FK_Bokelj.html
FK_Bor.html
FK_Borac_%C4%8Ca%C4%8Dak.html
FK_Borac_Banja_Luka.html
FK_Budu%C4%87nost_Banatski_Dvor.html
FK_Budu%C4%87nost_Banovi%C4%87i.html
FK_Budu%C4%87nost_Podgorica.html
FK_Cementarnica_55.html
FK_Chmel_Bl%C5%A1any.html
FK_DAC_1904_Dunajsk%C3%A1_Streda.html
FK_Daugava_(2003).html
FK_De%C4%8Di%C4%87.html
FK_Dinamo_Tirana.html
FK_Drnovice.html
FK_Dukla_Bansk%C3%A1_Bystrica.html
FK_Ekranas.html
FK_Gj%C3%B8vik-Lyn.html
FK_Grbalj.html
FK_Hajduk_Kula.html
FK_Haugesund.html
FK_Inkaras_Kaunas.html
FK_Inter_Bratislava.html
FK_Iskra_Danilovgrad.html
FK_Jablonec.html
FK_Jablonec_97.html
FK_Jagodina.html
FK_Jelgava.html
FK_Karabakh.html
FK_Kareda_Kaunas.html
FK_Karvan.html
FK_Kauno_%C5%BDalgiris.html
FK_Khazar_Lankaran.html
FK_Khazar_Lenkoran.html
FK_Kruoja_Pakruojis.html
FK_Kuk%C3%ABsi.html
FK_Leotar.html
FK_Liep%C4%81ja.html
FK_Liep%C4%81jas_Metalurgs.html
FK_Lov%C4%87en.html
FK_Lyn.html
FK_Makedonija_G.P..html
FK_Masall%C4%B1.html
FK_Metalurg_Skopje.html
FK_M

KuPS.html
KV_Mechelen.html
L.B._Ch%C3%A2teauroux.html
L.R._Vicenza_Virtus.html
Landskrona_BoIS.html
Larne_F.C..html
LASK.html
LASK_Linz.html
Lausanne_Sports.html
Lechia_Gda%C5%84sk.html
Lech_Pozna%C5%84.html
Leeds_United_A.F.C..html
Leeds_United_F.C..html
Legia_Warsaw.html
Legia_Warszawa.html
Leicester_City_F.C..html
Leiftur_%C3%93lafsfj%C3%B6r%C3%B0ur.html
Leipzig.html
Leix%C3%B5es_S.C..html
Levante_UD.html
Levski_Sofia.html
Liep%C4%81jas_Metalurgs.html
Lierse_S.K..html
Lillestr%C3%B8m_S.K..html
Lillestr%C3%B8m_SK.html
Lille_OSC.html
Limerick_F.C..html
Limerick_United_F.C..html
Lincoln_Red_Imps_F.C..html
Linfield_F.C..html
Lisburn_Distillery_F.C..html
Litex_Lovech.html
Liverpool_F.C..html
Livingston_F.C..html
Llandudno_F.C..html
Llanelli_A.F.C..html
Llanelli_AFC.html
Llanelli_Town_A.F.C..html
Lokomotiv_Plovdiv.html
Lolland-Falster_Alliancen.html
Lombard-P%C3%A1pa_TFC.html
London_XI.html
Longford_Town_F.C..html
LR_Vicenza.html
Luft%C3%ABtari_Gjirokast%C3%ABr.html
Lusitanos.html
Lyngby_

SKN_St._P%C3%B6lten.html
Skoda_Xanthi.html
Skoda_Xanthi_F.C..html
Skonto_FC.html
Skonto_Riga.html
SK_Brann.html
SK_Haugar.html
SK_Rapid_Wien.html
SK_Sigma_Olomouc.html
SK_Slavia_Prague.html
SK_Slovan_Bratislava.html
SK_Sturm_Graz.html
SK_Vorw%C3%A4rts_Steyr.html
Slavia_Prague.html
Slavija_Isto%C4%8Dno_Sarajevo.html
Sliema_Wanderers_F.C..html
Sligo_Rovers_F.C..html
Slovan_Liberec.html
Southampton_F.C..html
Spartak_Brno_KPS.html
Spartak_Myjava.html
Spartak_Trnava.html
Spartak_Yerevan_FC.html
Sparta_Prague.html
Sparta_Rotterdam.html
Speran%C8%9Ba_Nisporeni.html
Sporting_Clube_de_Portugal.html
Sporting_Club_de_Portugal.html
Sporting_CP.html
Sporting_de_Gij%C3%B3n.html
Sporting_Fingal_F.C..html
SP_La_Fiorita.html
SP_Tre_Penne.html
SR_Bra%C8%99ov.html
SSC_Napoli.html
SS_Lazio.html
St%C3%A6vnet.html
St._Patrick%27s_Athletic.html
St._Patrick%27s_Athletic_F.C..html
Stab%C3%A6k_Fotball.html
Stade_de_Reims.html
Stade_Dudelange.html
Stade_Fran%C3%A7ais_(association_football).html
Stade_Lavallois.h

In [131]:
df

Unnamed: 0,teamName,teamUrl,full,nicknames,founded,league,short
0,Åbo IFK,%C3%85bo_IFK,Idrottsföreningen Kamraterna i Åbo,[],1908,Kakkonen,
0,Åtvidabergs FF,%C3%85tvidabergs_FF,Åtvidabergs Fotbollförening,,1907,Ettan Fotboll,
0,Çaykur Rizespor,%C3%87aykur_Rizespor,Çaykur Rize Gençlik ve Spor Kulübü Derneği,[Karadeniz Atmacası (The Black Sea Sparrowhaw...,1953,Süper Lig,Rizespor
0,Íþróttabandalag Akraness,%C3%8D%C3%BEr%C3%B3ttabandalag_Akraness,Íþróttabandalag Akraness,[Skagamenn],1946,Besta deildin,ÍA
0,Íþróttabandalag Vestmannaeyja,%C3%8D%C3%BEr%C3%B3ttabandalag_Vestmannaeyja,Íþróttabandalag Vestmannaeyja,,1903,,
...,...,...,...,...,...,...,...
0,Zagreb,Zagreb,,,,,
0,Zalaegerszegi TE,Zalaegerszegi_TE,Zalaegerszegi Torna Egylet Football Club,[],1920,NB I,ZTE
0,Zawisza Bydgoszcz,Zawisza_Bydgoszcz,SP Zawisza Bydgoszcz,"[Wojskowi, Niebiesko-Czarni, Rycerze Pomorza]",1946,III liga,
0,Zira FK,Zira_FK,Zirə Futbol Klubu,[Qartallar],2014,Azerbaijan Premier League,


In [121]:
df.sort_values(by='founded').head(50)

Unnamed: 0,teamName,teamUrl,full,nicknames,founded,league,short
0,VfL Bochum,VfL_Bochum,Verein für Leibesübungen Bochum 1848 Fußballge...,"[Die Unabsteigbaren, citation needed, Die Blau...",1848,Bundesliga,
0,TSV 1860 Munich,TSV_1860_M%C3%BCnchen,Turn- und Sportverein München von 1860,"[Die Löwen, Sechzig, (Die) Sechzger, Weiß und ...",1860,3. Liga,
0,TSV 1860 Munich,TSV_1860_Munich,Turn- und Sportverein München von 1860,"[Die Löwen, Sechzig, (Die) Sechzger, Weiß und ...",1860,3. Liga,
0,Stoke City F.C.,Stoke_City_F.C.,Stoke City Football Club,[],1863,EFL Championship,
0,K.A.A. Gent,KAA_Gent,Koninklijke Atletiek Associatie Gent,[De Buffalo's],1864,Belgian Pro League,
0,K.A.A. Gent,K.A.A._Gent,Koninklijke Atletiek Associatie Gent,[De Buffalo's],1864,Belgian Pro League,
0,Wrexham A.F.C.,Wrexham_F.C.,Wrexham Association Football Club,[],1864,EFL League One,Wrexham AFCCPD Wrecsam (Welsh)
0,Wrexham A.F.C.,Wrexham_A.F.C.,Wrexham Association Football Club,[],1864,EFL League One,Wrexham AFCCPD Wrecsam (Welsh)
0,Nottingham Forest F.C.,Nottingham_Forest,Nottingham Forest Football Club,[],1865,Premier League,Forest
0,Nottingham Forest F.C.,Nottingham_Forest_F.C.,Nottingham Forest Football Club,[],1865,Premier League,Forest


In [124]:
df2 = df[['teamName','teamUrl','full','founded']].groupby(df[['teamName','teamUrl','full','founded']].columns.difference(['teamUrl']).tolist(), as_index=False).agg(list)

In [125]:
df2

Unnamed: 0,founded,full,teamName,teamUrl
0,1848,Verein für Leibesübungen Bochum 1848 Fußballge...,VfL Bochum,[VfL_Bochum]
1,1860,Turn- und Sportverein München von 1860,TSV 1860 Munich,"[TSV_1860_M%C3%BCnchen, TSV_1860_Munich]"
2,1863,Stoke City Football Club,Stoke City F.C.,[Stoke_City_F.C.]
3,1864,Koninklijke Atletiek Associatie Gent,K.A.A. Gent,"[K.A.A._Gent, KAA_Gent]"
4,1864,Wrexham Association Football Club,Wrexham A.F.C.,"[Wrexham_A.F.C., Wrexham_F.C.]"
...,...,...,...,...
1203,2017,Football Club Noah,FC Noah,[FC_Noah]
1204,2017,Sabah Futbol Klubu,Sabah FC (Azerbaijan),[Sabah_FC_(Azerbaijan)]
1205,2018,Футбольный клуб Сочи(Football Club Sochi),PFC Sochi,[PFC_Sochi]
1206,2020,Club Football Estrela da Amadora SAD,C.F. Estrela da Amadora,[C.F._Estrela_da_Amadora]


In [132]:
df2 = df.explode('nicknames').groupby(df.columns.difference(['teamUrl','nicknames']).tolist(), as_index=False).agg(list)

In [134]:
df3 = df.explode('nicknames')
df3

Unnamed: 0,teamName,teamUrl,full,nicknames,founded,league,short
0,Åbo IFK,%C3%85bo_IFK,Idrottsföreningen Kamraterna i Åbo,,1908,Kakkonen,
0,Åtvidabergs FF,%C3%85tvidabergs_FF,Åtvidabergs Fotbollförening,,1907,Ettan Fotboll,
0,Çaykur Rizespor,%C3%87aykur_Rizespor,Çaykur Rize Gençlik ve Spor Kulübü Derneği,Karadeniz Atmacası (The Black Sea Sparrowhawk),1953,Süper Lig,Rizespor
0,Çaykur Rizespor,%C3%87aykur_Rizespor,Çaykur Rize Gençlik ve Spor Kulübü Derneği,Karadeniz Atmacası,1953,Süper Lig,Rizespor
0,Íþróttabandalag Akraness,%C3%8D%C3%BEr%C3%B3ttabandalag_Akraness,Íþróttabandalag Akraness,Skagamenn,1946,Besta deildin,ÍA
...,...,...,...,...,...,...,...
0,Zawisza Bydgoszcz,Zawisza_Bydgoszcz,SP Zawisza Bydgoszcz,Wojskowi,1946,III liga,
0,Zawisza Bydgoszcz,Zawisza_Bydgoszcz,SP Zawisza Bydgoszcz,Niebiesko-Czarni,1946,III liga,
0,Zawisza Bydgoszcz,Zawisza_Bydgoszcz,SP Zawisza Bydgoszcz,Rycerze Pomorza,1946,III liga,
0,Zira FK,Zira_FK,Zirə Futbol Klubu,Qartallar,2014,Azerbaijan Premier League,


In [None]:
df3.groupby(df3.columns.difference(['teamUrl','nicknames']).tolist(), as_index=False)

In [141]:
df['teamUrl'].nunique()

1636

In [142]:
df['full'].nunique()

1208

In [143]:
df['short'].nunique()

234

In [138]:
df3_filled = df3.fillna('NaN_Placeholder')

In [139]:
df3.groupby(df.columns.difference(['teamUrl','nicknames']).tolist(), as_index=False).agg(lambda x: list(x.dropna()))

Unnamed: 0,founded,full,league,short,teamName,teamUrl,nicknames
0,1864,Wrexham Association Football Club,EFL League One,Wrexham AFCCPD Wrecsam (Welsh),Wrexham A.F.C.,"[Wrexham_A.F.C., Wrexham_F.C.]",[]
1,1865,Nottingham Forest Football Club,Premier League,Forest,Nottingham Forest F.C.,"[Nottingham_Forest, Nottingham_Forest_F.C.]",[]
2,1867,Sheffield Wednesday Football Club,EFL Championship,SWFC,Sheffield Wednesday F.C.,[Sheffield_Wednesday_F.C.],[]
3,1874,Aston Villa Football Club,Premier League,Villa,Aston Villa F.C.,"[Aston_Villa, Aston_Villa_F.C., Aston_Villa_F.C]",[]
4,1876,Kjøbenhavns Boldklub,Københavnsserien (tier 5),KB,Kjøbenhavns Boldklub,[Kj%C3%B8benhavns_Boldklub],[]
...,...,...,...,...,...,...,...
213,2009,RasenBallsport Leipzig e.V.,Bundesliga,RBL,RB Leipzig,[RB_Leipzig],[Die Roten Bullen]
214,2011,Asociația Club Sportiv Sepsi OSK Sfântu Gheorghe,Liga I,Sepsi,Sepsi OSK Sfântu Gheorghe,[Sepsi_OSK_Sf%C3%A2ntu_Gheorghe],[Piros-fehérek / Roș-albii (The Red and Whites)]
215,2013,Nykøbing Football Club,Danish 2nd Division,NFC,Nykøbing FC,"[Boldklubben_1901, Lolland-Falster_Alliancen]","[Alliancen, Alliancen]"
216,2013,Penybont Football Club,Cymru Premier,Penybont,Penybont F.C.,[Penybont_F.C.],[]


In [137]:
df3.groupby(df.columns.difference(['teamUrl','nicknames']).tolist(), as_index=False).agg(lambda x: list(x.dropna()))

Unnamed: 0,founded,full,league,short,teamName,teamUrl,nicknames
0,1864,Wrexham Association Football Club,EFL League One,Wrexham AFCCPD Wrecsam (Welsh),Wrexham A.F.C.,"[Wrexham_A.F.C., Wrexham_F.C.]",[]
1,1865,Nottingham Forest Football Club,Premier League,Forest,Nottingham Forest F.C.,"[Nottingham_Forest, Nottingham_Forest_F.C.]",[]
2,1867,Sheffield Wednesday Football Club,EFL Championship,SWFC,Sheffield Wednesday F.C.,[Sheffield_Wednesday_F.C.],[]
3,1874,Aston Villa Football Club,Premier League,Villa,Aston Villa F.C.,"[Aston_Villa, Aston_Villa_F.C., Aston_Villa_F.C]",[]
4,1876,Kjøbenhavns Boldklub,Københavnsserien (tier 5),KB,Kjøbenhavns Boldklub,[Kj%C3%B8benhavns_Boldklub],[]
...,...,...,...,...,...,...,...
213,2009,RasenBallsport Leipzig e.V.,Bundesliga,RBL,RB Leipzig,[RB_Leipzig],[Die Roten Bullen]
214,2011,Asociația Club Sportiv Sepsi OSK Sfântu Gheorghe,Liga I,Sepsi,Sepsi OSK Sfântu Gheorghe,[Sepsi_OSK_Sf%C3%A2ntu_Gheorghe],[Piros-fehérek / Roș-albii (The Red and Whites)]
215,2013,Nykøbing Football Club,Danish 2nd Division,NFC,Nykøbing FC,"[Boldklubben_1901, Lolland-Falster_Alliancen]","[Alliancen, Alliancen]"
216,2013,Penybont Football Club,Cymru Premier,Penybont,Penybont F.C.,[Penybont_F.C.],[]


In [120]:
df3 = df.groupby(df.columns.difference(['teamUrl']).tolist(), as_index=False).agg(list)

TypeError: unhashable type: 'list'

In [None]:
df

KeyError: 'homeTeam'

In [144]:
df.to_csv(os.path.join('data','teams.csv'), index=False)

In [145]:
df.reset_index(drop=True).to_json(os.path.join('data','teams.json'))