# Data Analysis

In [1]:
import pandas as pd
import numpy as np
import panel as pn
pn.extension()               # pn.extension('tabulator') Switching between extensions might turn plots invisible in the notebook

import holoviews as hv
import hvplot.pandas         # adds hvplot method to pandas objects. Also, sets bokeh as pn extension
from hvplot import hvPlot
import sqlite3

import matplotlib as mpl

mpl.use('agg')                # we'll use some matplotlib aggregation functions in this project 

In [2]:
# cache data to improve the dashboard performance
if 'data' not in pn.state.cache.keys():

    df = pd.read_csv('dataSets\\allTimeSquads.csv')

    pn.state.cache['data'] = df.copy()

else: 

    df = pn.state.cache['data']

In [3]:
df.shape

(10964, 12)

In [4]:
df1930 = df[df.year == 1930]

In [5]:
df1930

Unnamed: 0,player,national_team,position,birthday,age,caps,club,club_origin,wc_date,year,confederation,imputed
0,Ángel Bossio,Argentina,GK,1905-05-05,25,16,Talleres (BA),Argentina,1930-07-17,1930,South America,-
1,Juan Botasso,Argentina,GK,1908-10-23,21,2,Argentino (Q),Argentina,1930-07-17,1930,South America,-
2,Roberto Cherro,Argentina,FW,1907-02-23,23,10,Boca Juniors,Argentina,1930-07-17,1930,South America,-
3,Alberto Chividini,Argentina,DF,1907-02-23,23,2,Central Norte,Argentina,1930-07-17,1930,South America,-
4,Attilio Demaría,Argentina,FW,1909-03-19,21,0,Estudiantil Porteno,Argentina,1930-07-17,1930,South America,-
...,...,...,...,...,...,...,...,...,...,...,...,...
236,Ljubiša Stefanović,Yugoslavia,MF,1910-01-04,20,0,Sete,France,1930-07-17,1930,Europe,-
237,Milan Stojanović,Yugoslavia,GK,1911-12-28,18,0,BSK Beograd,Kingdom of Yugoslavia,1930-07-17,1930,Europe,-
238,Aleksandar Tirnanić,Yugoslavia,FW,1910-07-15,20,5,BSK Beograd,Kingdom of Yugoslavia,1930-07-17,1930,Europe,-
239,Dragomir Tošić,Yugoslavia,DF,1909-11-08,20,0,BSK Beograd,Kingdom of Yugoslavia,1930-07-17,1930,Europe,-


In [6]:
df.head()

Unnamed: 0,player,national_team,position,birthday,age,caps,club,club_origin,wc_date,year,confederation,imputed
0,Ángel Bossio,Argentina,GK,1905-05-05,25,16,Talleres (BA),Argentina,1930-07-17,1930,South America,-
1,Juan Botasso,Argentina,GK,1908-10-23,21,2,Argentino (Q),Argentina,1930-07-17,1930,South America,-
2,Roberto Cherro,Argentina,FW,1907-02-23,23,10,Boca Juniors,Argentina,1930-07-17,1930,South America,-
3,Alberto Chividini,Argentina,DF,1907-02-23,23,2,Central Norte,Argentina,1930-07-17,1930,South America,-
4,Attilio Demaría,Argentina,FW,1909-03-19,21,0,Estudiantil Porteno,Argentina,1930-07-17,1930,South America,-


In [7]:
df = df.drop(columns = 'birthday')

In [8]:
wc = df[df.year == 1930]

In [9]:
wc.age.mean()

24.443983402489625

In [10]:
wc2 = df[df.year == 2022]

In [11]:
wc2.age.mean()

26.94344163658243

In [12]:
wc3 = df[df.year == 2018]

In [13]:
wc3.age 

9397     32
9398     31
9399     25
9400     31
9401     32
         ..
10128    27
10129    28
10130    31
10131    31
10132    35
Name: age, Length: 736, dtype: int64

In [14]:
df.year.unique()

array([1930, 1934, 1938, 1950, 1954, 1958, 1962, 1966, 1970, 1974, 1978,
       1982, 1986, 1990, 1994, 1998, 2002, 2006, 2010, 2014, 2018, 2022],
      dtype=int64)

## Calculate the average age of players for each world cup

In [15]:
for y in df.year.unique():
    means = df[df.year == y].age.mean()
    means = round(means,2)
    print(means)

24.44
24.92
25.43
26.46
26.38
26.27
25.78
25.79
25.76
26.21
26.1
26.61
26.63
26.39
26.85
27.01
26.97
26.88
26.9
26.85
27.44
26.94


In [16]:
groupYear = df.groupby(['year'])

In [17]:
#countAge = groupYear['age'].value_counts().to_frame().rename(columns = {'age':'count'}).reset_index(1)

In [18]:
countAge = groupYear['age'].mean()
countAge = round(countAge,2)
countAge.to_frame().reset_index()

Unnamed: 0,year,age
0,1930,24.44
1,1934,24.92
2,1938,25.43
3,1950,26.46
4,1954,26.38
5,1958,26.27
6,1962,25.78
7,1966,25.79
8,1970,25.76
9,1974,26.21


## calculate median age of players

In [19]:
print("Medians")
for y in df.year.unique():
    medians = df[df.year == y].age.median()
    print(f"{y} = {medians} years old")

Medians
1930 = 24.0 years old
1934 = 25.0 years old
1938 = 25.0 years old
1950 = 27.0 years old
1954 = 26.0 years old
1958 = 26.0 years old
1962 = 25.0 years old
1966 = 25.0 years old
1970 = 26.0 years old
1974 = 26.0 years old
1978 = 26.0 years old
1982 = 26.0 years old
1986 = 26.0 years old
1990 = 26.0 years old
1994 = 27.0 years old
1998 = 27.0 years old
2002 = 27.0 years old
2006 = 27.0 years old
2010 = 27.0 years old
2014 = 27.0 years old
2018 = 27.0 years old
2022 = 27.0 years old


In [20]:
#hvplot.help('bar')

In [21]:
def plot_world_avg_exp(year): # receives a slider object.
    year_df = df[df['year'] == year]
    
    return year_df.hvplot.bar('year', y='caps',   # y default is = all numeric columns
                              height=320, width=550,line_color = 'black', color = 'purple',ylim=(0,50),
                              legend=False, yformatter='%.0f'
                              
                             ).aggregate(function=np.mean).opts(xlabel="Year", ylabel="Avg. caps",
                                                                title="Iternational experience (world)",xrotation=20) 

In [22]:
def plot_sa_avg_exp(year): # receives a slider object.

    year_df = df[(df['year'] == year) & (df['confederation']== 'South America')]
    
    return year_df.hvplot.bar('year', y='caps',   # y default is = all numeric columns
                              height=320, width=550,line_color = 'black', color = 'purple',ylim=(0,50),
                              legend=False, yformatter='%.0f'
                              
                             ).aggregate(function=np.mean).opts(xlabel="Year", ylabel="Avg. caps",
                                                                title="Iternational experience (south america)",xrotation=20) 

In [23]:
# Create a slider widget 
year_slider2 = pn.widgets.IntSlider(name='Year Slider', width=200,
                                 start=1930, end=2022, value=(1930),
                                 step=4,value_throttled=(1930))
# Apply a callback policy                                 
@pn.depends(year_slider2.param.value_throttled)
def year_selected(year_slider2):
    return '### Jobs in {}'.format(year_slider2)

In [24]:
year_slider2 

In [25]:
plots_box2 = pn.WidgetBox(pn.Row(pn.bind(plot_world_avg_exp,year_slider2), align="start", sizing_mode="stretch_width"))

In [26]:
plots_box2

In [27]:
df

Unnamed: 0,player,national_team,position,age,caps,club,club_origin,wc_date,year,confederation,imputed
0,Ángel Bossio,Argentina,GK,25,16,Talleres (BA),Argentina,1930-07-17,1930,South America,-
1,Juan Botasso,Argentina,GK,21,2,Argentino (Q),Argentina,1930-07-17,1930,South America,-
2,Roberto Cherro,Argentina,FW,23,10,Boca Juniors,Argentina,1930-07-17,1930,South America,-
3,Alberto Chividini,Argentina,DF,23,2,Central Norte,Argentina,1930-07-17,1930,South America,-
4,Attilio Demaría,Argentina,FW,21,0,Estudiantil Porteno,Argentina,1930-07-17,1930,South America,-
...,...,...,...,...,...,...,...,...,...,...,...
10959,Sorba Thomas,Wales,MF,23,6,Huddersfield Town,England,2022-11-20,2022,Europe,-
10960,Dylan Levitt,Wales,MF,22,13,Dundee United,Scotland,2022-11-20,2022,Europe,-
10961,Ben Cabango,Wales,DF,22,5,Swansea City,Wales,2022-11-20,2022,Europe,-
10962,Rubin Colwill,Wales,MF,20,7,Cardiff City,Wales,2022-11-20,2022,Europe,-


In [28]:
df.hvplot.line(x= 'year' , y= ['age']).aggregate(function=np.mean)

## Average international player experience, by world cup

In [29]:
df.hvplot.line(x= 'year' , y= ['caps']).aggregate(function=np.mean)

In [30]:
conts = df.groupby(['year','confederation'])['caps'].mean()

In [31]:
conts = conts.to_frame().reset_index()
conts

Unnamed: 0,year,confederation,caps
0,1930,Europe,6.406250
1,1930,North America,0.727273
2,1930,South America,4.270833
3,1934,Africa,1.650000
4,1934,Europe,9.787072
...,...,...,...
99,2022,Asia,39.038760
100,2022,Europe,35.872781
101,2022,North America,38.240385
102,2022,Oceania,20.653846


In [32]:
sorted_conts = conts.sort_values(by=['confederation', 'year']).reset_index(drop = True)

In [33]:
sorted_conts.iloc[100:106]

Unnamed: 0,year,confederation,caps
100,2010,South America,28.86087
101,2014,South America,35.072464
102,2018,South America,35.269565
103,2022,South America,35.125


## Países con más presencias

In [34]:
nations = df[['year','national_team']]

In [35]:
nat_teams= nations.groupby(['year','national_team'])['national_team'].count().reset_index(name='jugadores')

In [36]:
Germany = {'West Germany': 'Germany'}

In [37]:
nat_teams['national_team'] = nat_teams['national_team'].replace(Germany)

In [38]:
nat_teams

Unnamed: 0,year,national_team,jugadores
0,1930,Argentina,22
1,1930,Belgium,16
2,1930,Bolivia,17
3,1930,Brazil,22
4,1930,Chile,19
...,...,...,...
484,2022,Switzerland,26
485,2022,Tunisia,26
486,2022,United States,26
487,2022,Uruguay,26


In [39]:
# Encontrar el valor mínimo en la columna 'count'
min_count = nat_teams['jugadores'].min()

In [40]:
# Crear una máscara booleana para identificar las filas con el valor mínimo
min_players = nat_teams[nat_teams['jugadores'] == min_count]

In [41]:
min_players

Unnamed: 0,year,national_team,jugadores
9,1930,Romania,15
31,1938,Cuba,15


In [42]:
nat_teams.drop('jugadores',axis=1)

Unnamed: 0,year,national_team
0,1930,Argentina
1,1930,Belgium
2,1930,Bolivia
3,1930,Brazil
4,1930,Chile
...,...,...
484,2022,Switzerland
485,2022,Tunisia
486,2022,United States
487,2022,Uruguay


In [43]:
nat_teams.national_team.value_counts().iloc[0:50]

Brazil            22
Germany           20
Argentina         18
Italy             18
Mexico            17
France            16
England           16
Spain             16
Belgium           14
Uruguay           14
Switzerland       12
Sweden            12
South Korea       11
Netherlands       11
United States     11
Poland             9
Hungary            9
Yugoslavia         9
Chile              9
Paraguay           8
Cameroon           8
Scotland           8
Czechoslovakia     8
Portugal           8
Romania            7
Austria            7
Soviet Union       7
Bulgaria           7
Japan              7
Nigeria            6
Costa Rica         6
Denmark            6
Saudi Arabia       6
Croatia            6
Morocco            6
Tunisia            6
Iran               6
Australia          6
Colombia           6
Peru               5
Algeria            4
Russia             4
Ecuador            4
Ghana              4
Serbia             3
Ivory Coast        3
Egypt              3
Bolivia      

## Clubes que más jugadores aportaron

In [44]:
df

Unnamed: 0,player,national_team,position,age,caps,club,club_origin,wc_date,year,confederation,imputed
0,Ángel Bossio,Argentina,GK,25,16,Talleres (BA),Argentina,1930-07-17,1930,South America,-
1,Juan Botasso,Argentina,GK,21,2,Argentino (Q),Argentina,1930-07-17,1930,South America,-
2,Roberto Cherro,Argentina,FW,23,10,Boca Juniors,Argentina,1930-07-17,1930,South America,-
3,Alberto Chividini,Argentina,DF,23,2,Central Norte,Argentina,1930-07-17,1930,South America,-
4,Attilio Demaría,Argentina,FW,21,0,Estudiantil Porteno,Argentina,1930-07-17,1930,South America,-
...,...,...,...,...,...,...,...,...,...,...,...
10959,Sorba Thomas,Wales,MF,23,6,Huddersfield Town,England,2022-11-20,2022,Europe,-
10960,Dylan Levitt,Wales,MF,22,13,Dundee United,Scotland,2022-11-20,2022,Europe,-
10961,Ben Cabango,Wales,DF,22,5,Swansea City,Wales,2022-11-20,2022,Europe,-
10962,Rubin Colwill,Wales,MF,20,7,Cardiff City,Wales,2022-11-20,2022,Europe,-


#### Normalizar nombres de países, para no tener datos divididos

In [45]:
df.club_origin.unique()

array(['Argentina', 'Belgium', 'Bolivia', 'Brazil', 'Chile', 'France',
       'Mexico', 'Paraguay', 'Peru', 'Romania', 'United States',
       'Uruguay', 'Kingdom of Yugoslavia', 'Austria', 'Czechoslovakia',
       'Egypt', 'Germany', 'Hungary', 'Italy', 'Netherlands', 'Spain',
       'Sweden', 'Switzerland', 'Cuba', 'Dutch East Indies', 'Norway',
       'Poland', 'England', 'Socialist Federal Republic of Yugoslavia',
       'Scotland', 'South Korea', 'Turkey', 'West Germany',
       'Northern Ireland', 'Soviet Union', 'Wales', 'Bulgaria',
       'Colombia', 'North Korea', 'Portugal', 'El Salvador', 'Israel',
       'Morocco', 'Australia', 'East Germany', 'Haiti',
       'Trinidad and Tobago', 'Zaire', '-', 'Iran', 'Tunisia',
       'Saudi Arabia', 'Algeria', 'Cameroon', 'Ivory Coast', 'Guatemala',
       'Honduras', 'Kuwait', 'New Zealand', 'Canada', 'Denmark', 'Greece',
       'Iraq', 'Réunion', 'Costa Rica', 'United Arab Emirates', 'Japan',
       'Qatar', 'Russia', 'Ecuador', 'Croa

In [46]:
df2 = df

In [47]:
nations2 = df2[['year','national_team']]

In [48]:
# replace country names
df2['club_origin'] = df2['club_origin'].replace({'Kingdom of Yugoslavia': 'Yugoslavia',
                                     'Socialist Federal Republic of Yugoslavia': 'Yugoslavia',
                                     'Federal Republic of Yugoslavia': 'Yugoslavia',
                                     'East Germany': 'Germany',
                                     'West Germany': 'Germany'})

In [49]:
cuenta = cuenta.rename(columns={'club_and_origin': 'count'})
cuenta.reset_index()

NameError: name 'cuenta' is not defined

In [None]:
naciones = nations2.groupby(['year','national_team'])['national_team'].count().reset_index(name='jugadores')

In [None]:
df2['club_and_origin'] = df2['club'] + ' (' + df2['club_origin'] + ')'

In [None]:
clubs = df2['club_and_origin'].to_frame()

In [None]:
#clubs

In [None]:
cuenta = clubs.groupby(['club_and_origin'])['club_and_origin'].count().to_frame()

In [None]:
cuenta = cuenta.rename(columns={'club_and_origin': 'count'})

In [None]:
cuenta = cuenta.sort_values(by='count', ascending=False)

In [None]:
cuenta.reset_index()

In [None]:
cuenta.iloc[0:50]

In [None]:
df2.head(4)

In [None]:
df2.info()

In [None]:
filas_club = (df2['club'] == 'River Plate') & (df2['national_team'] == 'Argentina')

In [None]:
filas_club = df2[filas_club]

In [None]:
len(filas_club)

In [None]:
filas_club

## Ligas con más convocados

In [None]:
df2.columns

In [None]:
df2.head()

In [None]:
leagues = df2[['year','club_origin']]

In [None]:
leagues = df2.groupby(['year','club_origin'])['year']

In [None]:
leagues = leagues.count().to_frame()

In [None]:
leagues = leagues.rename(columns={'year': 'count'})

In [None]:
leagues

In [None]:
leagues.iloc[150:200]

In [64]:
# Agrupar por 'club_origin' e 'year' y contar las ocurrencias
origin_count = df2.groupby(['club_origin', 'year']).size().reset_index(name='count')

In [65]:
origin_count

Unnamed: 0,club_origin,year,count
0,-,1978,1
1,Algeria,1982,15
2,Algeria,1986,11
3,Algeria,2010,3
4,Algeria,2014,2
...,...,...,...
646,Yugoslavia,1982,16
647,Yugoslavia,1990,13
648,Yugoslavia,1998,3
649,Yugoslavia,2002,1


In [63]:
nat_teams

Unnamed: 0,year,national_team,jugadores
0,1930,Argentina,22
1,1930,Belgium,16
2,1930,Bolivia,17
3,1930,Brazil,22
4,1930,Chile,19
...,...,...,...
484,2022,Switzerland,26
485,2022,Tunisia,26
486,2022,United States,26
487,2022,Uruguay,26


In [80]:
# Obtener una lista de años únicos en ambos DataFrames
unique_years = origin_count['year'].unique()

# Crear una lista para almacenar los resultados
results = []

# Iterar a través de cada año único
for year in unique_years:
    # Obtener los valores únicos de club_origin para el año actual
    club_origin_values = origin_count[origin_count['year'] == year]['club_origin'].unique()
    
    # Verificar si hay al menos un valor que coincide en national_team
    for club in club_origin_values:
        if club in nat_teams[nat_teams['year'] == year]['national_team'].values:
            results.append({'year': year, 'club_origin': club, 'matches': True})
        else:
            results.append({'year': year, 'club_origin': club, 'matches': False})

# Crear un DataFrame a partir de la lista de resultados
results_df = pd.DataFrame(results)

# Imprimir el DataFrame de resultados
print(results_df)
# Este código verificará para cada año si cada valor único en la columna "club_origin" del primer 
# DataFrame coincide con al menos un valor en la columna "national_team" del segundo DataFrame y 
# almacenará los resultados en un nuevo DataFrame llamado "results_df". Los resultados se marcan como True
# si hay coincidencia y como False si no hay coincidencia.


     year    club_origin  matches
0    1978              -    False
1    1978      Argentina     True
2    1978        Austria     True
3    1978        Belgium    False
4    1978         Brazil     True
..    ...            ...      ...
646  1950         Sweden     True
647  1950    Switzerland     True
648  1950  United States     True
649  1950        Uruguay     True
650  1950     Yugoslavia     True

[651 rows x 3 columns]


In [81]:
results_df

Unnamed: 0,year,club_origin,matches
0,1978,-,False
1,1978,Argentina,True
2,1978,Austria,True
3,1978,Belgium,False
4,1978,Brazil,True
...,...,...,...
646,1950,Sweden,True
647,1950,Switzerland,True
648,1950,United States,True
649,1950,Uruguay,True


In [None]:
idf = df.interactive()           # creates an interactive DataFrame with hvPlot

In [None]:
# Define Panel widgets
year_slider = pn.widgets.IntSlider(name='Year slider', start=1930, end=2022, step=5, value=1930,width=230)

In [None]:
# Radio buttons for counting fatalities
yAxis = pn.widgets.RadioButtonGroup(
    name='Y axis', 
    options=['national_team','player'],
    button_type='primary'   # choose a color with a predetermined category (as in bootstrap)
    , width=250,height=30
)
#yAxis.servable()

In [None]:
# Make a pipeline by processing the interactive DataFrame. 
# The slider widget should be incorporated here to separate data by year. 
# That will update the output dynamically(year-by-year).
mPPipeline = (
    idf[
        (idf.year <= year_slider)
    ]
    .groupby("year")[yAxis].sum()
    .to_frame()
    .reset_index()
    .sort_values(by='year')  
    .reset_index(drop=True)
)

In [None]:
year_slider

In [None]:
mPPipeline