# Data Analysis

In [1]:
import pandas as pd
import numpy as np
import panel as pn
pn.extension()               # pn.extension('tabulator') Switching between extensions might turn plots invisible in the notebook

import holoviews as hv
import hvplot.pandas         # adds hvplot method to pandas objects. Also, sets bokeh as pn extension
from hvplot import hvPlot
import sqlite3

import matplotlib as mpl

mpl.use('agg')                # select matplotlib's non-interactive Agg (Anti-Grain Geometry) raster backend; "agg" is a renderer name, not "aggregation"

In [2]:
# Cache the dataset in Panel's global state so that re-running the notebook or
# reloading the dashboard does not read the CSV from disk again.
if 'data' not in pn.state.cache:
    # Forward slashes resolve on Windows, Linux and macOS alike; the previous
    # backslash-separated path only resolved on Windows.
    pn.state.cache['data'] = pd.read_csv('dataSets/allTimeSquads.csv')

# Always hand out a private copy, so later in-place edits to `df` can never
# leak into the cached frame (previously only the first run was isolated).
df = pn.state.cache['data'].copy()

In [3]:
df.shape

(10964, 12)

In [4]:
# Squad rows of the 1930 World Cup.
df1930 = df.loc[df['year'] == 1930]

In [5]:
df1930

Unnamed: 0,player,national_team,position,birthday,age,caps,club,club_origin,wc_date,year,confederation,imputed
0,Ángel Bossio,Argentina,GK,1905-05-05,25,16,Talleres (BA),Argentina,1930-07-17,1930,South America,-
1,Juan Botasso,Argentina,GK,1908-10-23,21,2,Argentino (Q),Argentina,1930-07-17,1930,South America,-
2,Roberto Cherro,Argentina,FW,1907-02-23,23,10,Boca Juniors,Argentina,1930-07-17,1930,South America,-
3,Alberto Chividini,Argentina,DF,1907-02-23,23,2,Central Norte,Argentina,1930-07-17,1930,South America,-
4,Attilio Demaría,Argentina,FW,1909-03-19,21,0,Estudiantil Porteño,Argentina,1930-07-17,1930,South America,-
...,...,...,...,...,...,...,...,...,...,...,...,...
236,Ljubiša Stefanović,Yugoslavia,MF,1910-01-04,20,0,FC Sète,France,1930-07-17,1930,Europe,-
237,Milan Stojanović,Yugoslavia,GK,1911-12-28,18,0,BSK Beograd,Kingdom of Yugoslavia,1930-07-17,1930,Europe,-
238,Aleksandar Tirnanić,Yugoslavia,FW,1910-07-15,20,5,BSK Beograd,Kingdom of Yugoslavia,1930-07-17,1930,Europe,-
239,Dragomir Tošić,Yugoslavia,DF,1909-11-08,20,0,BSK Beograd,Kingdom of Yugoslavia,1930-07-17,1930,Europe,-


In [6]:
df.head()

Unnamed: 0,player,national_team,position,birthday,age,caps,club,club_origin,wc_date,year,confederation,imputed
0,Ángel Bossio,Argentina,GK,1905-05-05,25,16,Talleres (BA),Argentina,1930-07-17,1930,South America,-
1,Juan Botasso,Argentina,GK,1908-10-23,21,2,Argentino (Q),Argentina,1930-07-17,1930,South America,-
2,Roberto Cherro,Argentina,FW,1907-02-23,23,10,Boca Juniors,Argentina,1930-07-17,1930,South America,-
3,Alberto Chividini,Argentina,DF,1907-02-23,23,2,Central Norte,Argentina,1930-07-17,1930,South America,-
4,Attilio Demaría,Argentina,FW,1909-03-19,21,0,Estudiantil Porteño,Argentina,1930-07-17,1930,South America,-


In [7]:
# Drop `birthday`; the cells visible below only use the precomputed `age` column.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
df = df.drop(columns='birthday', errors='ignore')

In [8]:
# Squad rows of the 1930 World Cup (same filter as `df1930`, taken after `birthday` was dropped).
wc = df.loc[df['year'] == 1930]

In [9]:
wc.age.mean()

24.443983402489625

In [10]:
# Squad rows of the 2022 World Cup.
wc2 = df.loc[df['year'] == 2022]

In [11]:
wc2.age.mean()

26.94344163658243

In [12]:
# Squad rows of the 2018 World Cup.
wc3 = df.loc[df['year'] == 2018]

In [13]:
wc3.age 

9397     32
9398     31
9399     25
9400     31
9401     32
         ..
10128    27
10129    28
10130    31
10131    31
10132    35
Name: age, Length: 736, dtype: int64

In [14]:
df.year.unique()

array([1930, 1934, 1938, 1950, 1954, 1958, 1962, 1966, 1970, 1974, 1978,
       1982, 1986, 1990, 1994, 1998, 2002, 2006, 2010, 2014, 2018, 2022],
      dtype=int64)

In [15]:
# Calculate the average age of players for each world cup

In [16]:
# Print the mean squad age, rounded to two decimals, one line per tournament
# (in the order the years first appear in `df`).
for y in df['year'].unique():
    means = round(df.loc[df['year'] == y, 'age'].mean(), 2)
    print(means)

24.44
24.92
25.43
26.46
26.38
26.27
25.78
25.79
25.76
26.21
26.1
26.61
26.63
26.39
26.85
27.01
26.97
26.88
26.9
26.85
27.44
26.94


In [17]:
groupYear = df.groupby(['year'])

In [18]:
#countAge = groupYear['age'].value_counts().to_frame().rename(columns = {'age':'count'}).reset_index(1)

In [19]:
# Mean squad age per World Cup, rounded to two decimals.
# Note: despite its name, `countAge` holds means, not counts.
countAge = groupYear['age'].mean().round(2)
# Display as a two-column (year, age) table.
countAge.to_frame().reset_index()

Unnamed: 0,year,age
0,1930,24.44
1,1934,24.92
2,1938,25.43
3,1950,26.46
4,1954,26.38
5,1958,26.27
6,1962,25.78
7,1966,25.79
8,1970,25.76
9,1974,26.21


In [20]:
# calculate median age of players

In [21]:
# Print the median squad age for each tournament, preceded by a header line.
print("Medians")
for y in df['year'].unique():
    medians = df.loc[df['year'] == y, 'age'].median()
    print(f"{y} = {medians} years old")

Medians
1930 = 24.0 years old
1934 = 25.0 years old
1938 = 25.0 years old
1950 = 27.0 years old
1954 = 26.0 years old
1958 = 26.0 years old
1962 = 25.0 years old
1966 = 25.0 years old
1970 = 26.0 years old
1974 = 26.0 years old
1978 = 26.0 years old
1982 = 26.0 years old
1986 = 26.0 years old
1990 = 26.0 years old
1994 = 27.0 years old
1998 = 27.0 years old
2002 = 27.0 years old
2006 = 27.0 years old
2010 = 27.0 years old
2014 = 27.0 years old
2018 = 27.0 years old
2022 = 27.0 years old


In [22]:
#hvplot.help('bar')

In [23]:
def plot_world_avg_exp(year):
    """Bar chart of the mean number of caps over all squads of one World Cup.

    Parameters
    ----------
    year : int
        Tournament year used to filter ``df``. When the function is wired up
        with ``pn.bind`` it receives the slider's current value, not the
        slider widget itself.

    Returns
    -------
    The HoloViews object produced by ``hvplot.bar(...).aggregate(...).opts(...)``.
    """
    year_df = df[df['year'] == year]

    bars = year_df.hvplot.bar(
        'year', y='caps',            # if y is omitted, every numeric column is plotted
        height=320, width=550,
        line_color='black', color='purple',
        ylim=(0, 50),
        legend=False, yformatter='%.0f',
    )
    # Aggregate the rows that share the same year into their mean caps value.
    return bars.aggregate(function=np.mean).opts(
        xlabel="Year", ylabel="Avg. caps",
        title="International experience (world)",   # typo fixed: was "Iternational"
        xrotation=20,
    )

In [24]:
def plot_sa_avg_exp(year):
    """Bar chart of the mean number of caps over the South American squads of one World Cup.

    Parameters
    ----------
    year : int
        Tournament year used to filter ``df``. When the function is wired up
        with ``pn.bind`` it receives the slider's current value, not the
        slider widget itself.

    Returns
    -------
    The HoloViews object produced by ``hvplot.bar(...).aggregate(...).opts(...)``.
    """
    # Keep only the selected tournament and the 'South America' confederation.
    year_df = df[(df['year'] == year) & (df['confederation'] == 'South America')]

    bars = year_df.hvplot.bar(
        'year', y='caps',            # if y is omitted, every numeric column is plotted
        height=320, width=550,
        line_color='black', color='purple',
        ylim=(0, 50),
        legend=False, yformatter='%.0f',
    )
    # Aggregate the rows that share the same year into their mean caps value.
    return bars.aggregate(function=np.mean).opts(
        xlabel="Year", ylabel="Avg. caps",
        title="International experience (south america)",   # typo fixed: was "Iternational"
        xrotation=20,
    )

In [25]:
# Year selector for the experience plots; step=4 follows the four-year World Cup cadence.
# NOTE(review): 1942 and 1946 are selectable but do not occur in df.year.unique(),
# so the bound plot has no rows for them -- consider a DiscreteSlider over the
# actual tournament years.
year_slider2 = pn.widgets.IntSlider(
    name='Year Slider', width=200,
    start=1930, end=2022, step=4,
    value=1930,
    value_throttled=1930,   # initial throttled value, kept equal to `value`
)


# Depend on `value_throttled` rather than `value`, so the heading is recomputed
# from the throttled slider value instead of on every intermediate change.
@pn.depends(year_slider2.param.value_throttled)
def year_selected(year_slider2):
    """Return a Markdown heading for the selected year.

    The argument is the slider's throttled *value* (an int); inside this
    function it shadows the module-level widget of the same name.
    """
    # The heading previously read "Jobs in <year>", a leftover from an unrelated dashboard.
    return '### World Cup {}'.format(year_slider2)

In [26]:
year_slider2 

In [27]:
# Bind the world-average plot to the slider so it is redrawn for the selected
# year, then place it in a stretch-width row inside a widget box.
world_exp_plot = pn.bind(plot_world_avg_exp, year_slider2)
plots_box2 = pn.WidgetBox(
    pn.Row(world_exp_plot, align="start", sizing_mode="stretch_width")
)

In [28]:
plots_box2

In [29]:
df

Unnamed: 0,player,national_team,position,age,caps,club,club_origin,wc_date,year,confederation,imputed
0,Ángel Bossio,Argentina,GK,25,16,Talleres (BA),Argentina,1930-07-17,1930,South America,-
1,Juan Botasso,Argentina,GK,21,2,Argentino (Q),Argentina,1930-07-17,1930,South America,-
2,Roberto Cherro,Argentina,FW,23,10,Boca Juniors,Argentina,1930-07-17,1930,South America,-
3,Alberto Chividini,Argentina,DF,23,2,Central Norte,Argentina,1930-07-17,1930,South America,-
4,Attilio Demaría,Argentina,FW,21,0,Estudiantil Porteño,Argentina,1930-07-17,1930,South America,-
...,...,...,...,...,...,...,...,...,...,...,...
10959,Sorba Thomas,Wales,MF,23,6,Huddersfield Town,England,2022-11-20,2022,Europe,-
10960,Dylan Levitt,Wales,MF,22,13,Dundee United,Scotland,2022-11-20,2022,Europe,-
10961,Ben Cabango,Wales,DF,22,5,Swansea City,Wales,2022-11-20,2022,Europe,-
10962,Rubin Colwill,Wales,MF,20,7,Cardiff City,Wales,2022-11-20,2022,Europe,-


In [30]:
df.hvplot.line(x= 'year' , y= ['age']).aggregate(function=np.mean)

### Average international player experience, by world cup

In [31]:
df.hvplot.line(x= 'year' , y= ['caps']).aggregate(function=np.mean)

In [32]:
conts = df.groupby(['year','confederation'])['caps'].mean()

In [33]:
# Mean caps per (year, confederation) as a flat three-column table.
# Computed straight from `df` with as_index=False so the cell can be re-run:
# the previous `conts.to_frame()` raised AttributeError on a second execution,
# because `conts` was by then already a DataFrame.
conts = df.groupby(['year', 'confederation'], as_index=False)['caps'].mean()
conts

Unnamed: 0,year,confederation,caps
0,1930,Europe,6.406250
1,1930,North America,0.727273
2,1930,South America,4.270833
3,1934,Africa,1.650000
4,1934,Europe,9.787072
...,...,...,...
99,2022,Asia,39.038760
100,2022,Europe,35.872781
101,2022,North America,38.240385
102,2022,Oceania,20.653846


In [34]:
# Order the table by confederation, then chronologically, and renumber the rows 0..n-1.
sorted_conts = conts.sort_values(['confederation', 'year'], ignore_index=True)

In [35]:
sorted_conts.iloc[100:106]

Unnamed: 0,year,confederation,caps
100,2010,South America,28.86087
101,2014,South America,35.072464
102,2018,South America,35.269565
103,2022,South America,35.125


## Países con más presencias

In [36]:
nations = df[['year','national_team']]

In [37]:
# One row per (year, national_team) with the number of squad members in `jugadores`.
naciones = (
    nations
    .groupby(['year', 'national_team'])
    .size()
    .reset_index(name='jugadores')
)

In [38]:
naciones

Unnamed: 0,year,national_team,jugadores
0,1930,Argentina,22
1,1930,Belgium,16
2,1930,Bolivia,17
3,1930,Brazil,22
4,1930,Chile,19
...,...,...,...
484,2022,Switzerland,26
485,2022,Tunisia,26
486,2022,United States,26
487,2022,Uruguay,26


In [39]:
# Smallest squad size found in the 'jugadores' column.
min_count = naciones.jugadores.min()

In [40]:
# Keep every (year, team) row whose squad size equals `min_count`.
min_players = naciones.loc[naciones['jugadores'].eq(min_count)]

In [41]:
min_players

Unnamed: 0,year,national_team,jugadores
9,1930,Romania,15
31,1938,Cuba,15


In [42]:
naciones.drop('jugadores',axis=1)

Unnamed: 0,year,national_team
0,1930,Argentina
1,1930,Belgium
2,1930,Bolivia
3,1930,Brazil
4,1930,Chile
...,...,...
484,2022,Switzerland
485,2022,Tunisia
486,2022,United States
487,2022,Uruguay


In [43]:
# Number of World Cups each team name appears in; show the 50 most frequent.
naciones['national_team'].value_counts().head(50)

Brazil            22
Argentina         18
Italy             18
Mexico            17
France            16
England           16
Spain             16
Belgium           14
Uruguay           14
Switzerland       12
Sweden            12
South Korea       11
Netherlands       11
United States     11
West Germany      10
Germany           10
Hungary            9
Yugoslavia         9
Chile              9
Poland             9
Czechoslovakia     8
Cameroon           8
Portugal           8
Scotland           8
Paraguay           8
Austria            7
Soviet Union       7
Romania            7
Japan              7
Bulgaria           7
Iran               6
Tunisia            6
Denmark            6
Costa Rica         6
Nigeria            6
Croatia            6
Saudi Arabia       6
Morocco            6
Australia          6
Colombia           6
Peru               5
Algeria            4
Ecuador            4
Russia             4
Ghana              4
Norway             3
Bolivia            3
South Africa 

In [44]:
# Germany's appearances are split across two team names in the data.
# Select them by label: the previous positional slice (iloc[14:16]) was only
# correct while those two names happened to sit at ranks 14 and 15 of the
# frequency table, which depends on how ties are ordered.
Germany = naciones['national_team'].isin(['West Germany', 'Germany']).sum()

In [45]:
Germany

20

## Clubes que más jugadores aportaron

In [46]:
df

Unnamed: 0,player,national_team,position,age,caps,club,club_origin,wc_date,year,confederation,imputed
0,Ángel Bossio,Argentina,GK,25,16,Talleres (BA),Argentina,1930-07-17,1930,South America,-
1,Juan Botasso,Argentina,GK,21,2,Argentino (Q),Argentina,1930-07-17,1930,South America,-
2,Roberto Cherro,Argentina,FW,23,10,Boca Juniors,Argentina,1930-07-17,1930,South America,-
3,Alberto Chividini,Argentina,DF,23,2,Central Norte,Argentina,1930-07-17,1930,South America,-
4,Attilio Demaría,Argentina,FW,21,0,Estudiantil Porteño,Argentina,1930-07-17,1930,South America,-
...,...,...,...,...,...,...,...,...,...,...,...
10959,Sorba Thomas,Wales,MF,23,6,Huddersfield Town,England,2022-11-20,2022,Europe,-
10960,Dylan Levitt,Wales,MF,22,13,Dundee United,Scotland,2022-11-20,2022,Europe,-
10961,Ben Cabango,Wales,DF,22,5,Swansea City,Wales,2022-11-20,2022,Europe,-
10962,Rubin Colwill,Wales,MF,20,7,Cardiff City,Wales,2022-11-20,2022,Europe,-


## Normalizar nombres de países, para no tener datos divididos

In [47]:
df.club_origin.unique()

array(['Argentina', 'Belgium', 'Bolivia', 'Brazil', 'Chile', 'France',
       'Mexico', 'Paraguay', 'Peru', 'Romania', 'United States',
       'Uruguay', 'Kingdom of Yugoslavia', 'Austria', 'Czechoslovakia',
       'Egypt', 'Germany', 'Hungary', 'Italy', 'Netherlands', 'Spain',
       'Sweden', 'Switzerland', 'Cuba', 'Dutch East Indies', 'Norway',
       'Poland', 'England', 'Socialist Federal Republic of Yugoslavia',
       'Scotland', 'South Korea', 'Turkey', 'West Germany',
       'Northern Ireland', 'Soviet Union', 'Wales', 'Bulgaria',
       'Colombia', 'North Korea', 'Portugal', 'El Salvador', 'Israel',
       'Morocco', 'Australia', 'East Germany', 'Haiti',
       'Trinidad and Tobago', 'Zaire', '-', 'Iran', 'Tunisia',
       'Saudi Arabia', 'Algeria', 'Cameroon', 'Ivory Coast', 'Guatemala',
       'Honduras', 'Kuwait', 'New Zealand', 'Canada', 'Denmark', 'Greece',
       'Iraq', 'Réunion', 'Costa Rica', 'United Arab Emirates', 'Japan',
       'Qatar', 'Russia', 'Ecuador', 'Croa

In [48]:
# Map historical / split country names onto a single label so that one
# country's clubs are not spread across several `club_origin` values.
origin_aliases = {
    'Kingdom of Yugoslavia': 'Yugoslavia',
    'Socialist Federal Republic of Yugoslavia': 'Yugoslavia',
    'Federal Republic of Yugoslavia': 'Yugoslavia',
    'East Germany': 'Germany',
    'West Germany': 'Germany',
}
# Series.replace with a dict substitutes exact matches and leaves other values untouched.
df['club_origin'] = df['club_origin'].replace(origin_aliases)

In [49]:
# Squad size per (year, national_team). `nations` holds only `year` and
# `national_team`, so the `club_origin` clean-up does not affect this count.
naciones = (
    nations
    .groupby(['year', 'national_team'])
    .size()
    .reset_index(name='jugadores')
)

In [50]:
origen_club=df.club_origin.unique().tolist()

In [51]:
#sorted(origen_club)

In [52]:
dash = '-'

In [53]:
# Boolean mask: True where the club's country is the placeholder stored in `dash`.
coincidencias = df['club_origin'].eq(dash)

In [54]:
# Rows selected by the placeholder mask.
filas = df.loc[coincidencias]

In [55]:
filas

Unnamed: 0,player,national_team,position,age,caps,club,club_origin,wc_date,year,confederation,imputed
3303,Alberto Tarantini,Argentina,DF,22,26,Free Agent,-,1978-06-01,1978,South America,-


In [56]:
df['club_and_origin'] = df['club'] + ' from ' + df['club_origin']

In [57]:
clubs = df['club_and_origin'].to_frame()

In [58]:
clubs

Unnamed: 0,club_and_origin
0,Talleres (BA) from Argentina
1,Argentino (Q) from Argentina
2,Boca Juniors from Argentina
3,Central Norte from Argentina
4,Estudiantil Porteño from Argentina
...,...
10959,Huddersfield Town from England
10960,Dundee United from Scotland
10961,Swansea City from Wales
10962,Cardiff City from Wales


In [59]:
clubs_or = clubs.club_and_origin.unique().tolist()

In [60]:
sorted(clubs_or)

['1. FC Kaiserslautern from Germany',
 '1. FC Kaiserslautern[8] from Germany',
 '1. FC Köln from Germany',
 '1. FC Köln[7] from Germany',
 '1. FC Lokomotive Leipzig from Germany',
 '1. FC Magdeburg from Germany',
 '1. FC Nürnberg from Germany',
 '1. FC Nürnberg[105] from Germany',
 '1. FC Saarbrücken from Germany',
 '1. FC Schweinfurt 05 from Germany',
 '1. Simmeringer SC from Austria',
 '1860 Munich from Germany',
 '1899 Hoffenheim from Germany',
 '1899 Hoffenheim[141] from Germany',
 '8 August from North Korea',
 '8 February from North Korea',
 'A.C. Mantova from Italy',
 'A.S.V. Oostende K.M. from Belgium',
 'AC Bellinzona from Switzerland',
 'AC Sparta Prague from Czechoslovakia',
 'ADO Den Haag from Netherlands',
 'AEK Athens F.C. from Greece',
 'AEK Athens from Greece',
 'AFC Bournemouth from England',
 'AFC Wimbledon from England',
 'AGF from Denmark',
 'AIK from Sweden',
 'AKS Chorzów from Poland',
 'AMEF Arad from Romania',
 'APIA Leichhardt from Australia',
 'APOEL from Cypru

In [61]:
len(clubs_or)

1971

Hay algunos nombres de clubes que vienen con una referencia numérica entre corchetes en los datos originales (por ejemplo, `1. FC Köln[7]`); para eliminarla vamos a utilizar expresiones regulares.

In [62]:
import re

In [63]:
# Some club names carry a bracketed numeric footnote marker from the source
# (e.g. "1. FC Köln[7]"). Remove it with the vectorised .str.replace(), which
# does the same as apply + a lambda but more efficiently.
footnote_marker = r'\[\d+\]'
clubs['club_and_origin'] = clubs['club_and_origin'].str.replace(
    footnote_marker, '', regex=True
)


In [64]:
clubs_or = clubs.club_and_origin.unique().tolist()

In [65]:
len(clubs_or)

1809

In [67]:
sorted(clubs_or)

['1. FC Kaiserslautern from Germany',
 '1. FC Köln from Germany',
 '1. FC Lokomotive Leipzig from Germany',
 '1. FC Magdeburg from Germany',
 '1. FC Nürnberg from Germany',
 '1. FC Saarbrücken from Germany',
 '1. FC Schweinfurt 05 from Germany',
 '1. Simmeringer SC from Austria',
 '1860 Munich from Germany',
 '1899 Hoffenheim from Germany',
 '8 August from North Korea',
 '8 February from North Korea',
 'A.C. Mantova from Italy',
 'A.S.V. Oostende K.M. from Belgium',
 'AC Bellinzona from Switzerland',
 'AC Sparta Prague from Czechoslovakia',
 'ADO Den Haag from Netherlands',
 'AEK Athens F.C. from Greece',
 'AEK Athens from Greece',
 'AFC Bournemouth from England',
 'AFC Wimbledon from England',
 'AGF from Denmark',
 'AIK from Sweden',
 'AKS Chorzów from Poland',
 'AMEF Arad from Romania',
 'APIA Leichhardt from Australia',
 'APOEL from Cyprus',
 'AS Bilima from Zaire',
 'AS Marsa from Tunisia',
 'AS Monaco from France',
 'AS Saint-Étienne from France',
 'AS Sale from Morocco',
 'AS Tro

1) 'AEK Athens F.C. from Greece','AEK Athens from Greece',
2) Athletic Bilbao from Spain, Atlético Bilbao from Spain,Atlético de Bilbao from Spain Pasar todos a Athletic Bilbao
3) Arsenal F.C. from England, Arsenal from England. Pasar a Arsenal F.C.
4) 'Atlante F.C. from Mexico', 'Atlante from Mexico'
5) 'Austria Vienna from Austria', 'Austria Wien from Austria'
6) 'Borussia Dortmund from Germany', 'BV Borussia Dortmund from Germany'
7) 'Barcelona SC from Ecuador','Barcelona from Ecuador',
8) "Beijing Guo'an from China",'Beijing Guoan from China'
9) 'Beşiktaş J.K. from Turkey','Beşiktaş from Turkey',
10) 'Blackburn Rovers F.C. from England', 'Blackburn Rovers from England',
11) 'Blackpool F.C. from England','Blackpool from England',
12) 'Budapest Honvéd FC from Hungary','Budapest Honvéd from Hungary',
13) 'Burnley F.C. from England', 'Burnley from England',
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)
13)


In [None]:
# Number of squad entries per "club from country" label, as a one-column frame
# indexed by that label.
per_club = clubs.groupby('club_and_origin')['club_and_origin']
cuenta = per_club.count().to_frame()

In [None]:
cuenta = cuenta.rename(columns={'club_and_origin': 'count'})

In [None]:
cuenta.reset_index()

In [None]:
idf = df.interactive()           # creates an interactive DataFrame with hvPlot

In [None]:
# Define Panel widgets
# step=4 follows the four-year World Cup cadence and matches `year_slider2`;
# the previous step=5 put the ticks on non-tournament years (1935, 1940, ...).
year_slider = pn.widgets.IntSlider(
    name='Year slider',
    start=1930, end=2022, step=4,
    value=1930,
    width=230,
)

In [None]:
# Radio buttons selecting which column ('national_team' or 'player') the
# interactive pipeline reads after grouping by year.
yAxis = pn.widgets.RadioButtonGroup(
    name='Y axis',
    options=['national_team', 'player'],
    button_type='primary',   # one of the predefined colour categories (as in bootstrap)
    width=250,
    height=30,
)
#yAxis.servable()

In [None]:
# Interactive pipeline: for every World Cup up to the year chosen on
# `year_slider`, the number of distinct values of the column picked with
# `yAxis` (teams or players). Because it is built from the interactive frame
# and the two widgets, the result updates whenever either widget changes.
mPPipeline = (
    idf[idf.year <= year_slider]
    # nunique() counts distinct teams/players per year. The previous sum()
    # operated on text columns and therefore concatenated the names instead
    # of counting them.
    .groupby("year")[yAxis].nunique()
    .to_frame()
    .reset_index()
    .sort_values(by='year')
    .reset_index(drop=True)
)

In [None]:
year_slider

In [None]:
mPPipeline