# Data Analysis

In [1]:
import pandas as pd
import numpy as np
import panel as pn
pn.extension()               # pn.extension('tabulator') Switching between extensions might turn plots invisible in the notebook

import holoviews as hv
import hvplot.pandas         # adds hvplot method to pandas objects. Also, sets bokeh as pn extension
from hvplot import hvPlot
import sqlite3

import matplotlib as mpl

mpl.use('agg')                # select matplotlib's non-interactive Agg backend (renders to an image buffer, no GUI window); unrelated to aggregation

In [2]:
# Cache the data set in Panel's state so that re-running this cell (or serving
# the dashboard again) does not re-read the CSV every time.
if 'data' not in pn.state.cache:
    # Forward slashes resolve on Windows, Linux and macOS alike; the previous
    # backslash-separated path only worked on Windows.
    pn.state.cache['data'] = pd.read_csv('dataSets/allTimeSquads.csv')

# Always work on a copy so later changes to `df` never alter the cached frame.
df = pn.state.cache['data'].copy()

In [3]:
df.shape

(10964, 12)

In [20]:
# Rows of the squads table whose tournament year is 1930.
df1930 = df.loc[df['year'] == 1930]

In [21]:
df1930

Unnamed: 0,player,national_team,position,age,caps,club,club_origin,wc_date,year,confederation,imputed
0,Ángel Bossio,Argentina,GK,25,16,Talleres (BA),Argentina,1930-07-17,1930,South America,-
1,Juan Botasso,Argentina,GK,21,2,Argentino (Q),Argentina,1930-07-17,1930,South America,-
2,Roberto Cherro,Argentina,FW,23,10,Boca Juniors,Argentina,1930-07-17,1930,South America,-
3,Alberto Chividini,Argentina,DF,23,2,Central Norte,Argentina,1930-07-17,1930,South America,-
4,Attilio Demaría,Argentina,FW,21,0,Estudiantil Porteño,Argentina,1930-07-17,1930,South America,-
...,...,...,...,...,...,...,...,...,...,...,...
236,Ljubiša Stefanović,Yugoslavia,MF,20,0,FC Sète,France,1930-07-17,1930,Europe,-
237,Milan Stojanović,Yugoslavia,GK,18,0,BSK Beograd,Kingdom of Yugoslavia,1930-07-17,1930,Europe,-
238,Aleksandar Tirnanić,Yugoslavia,FW,20,5,BSK Beograd,Kingdom of Yugoslavia,1930-07-17,1930,Europe,-
239,Dragomir Tošić,Yugoslavia,DF,20,0,BSK Beograd,Kingdom of Yugoslavia,1930-07-17,1930,Europe,-


In [3]:
df.head()

Unnamed: 0,player,national_team,position,birthday,age,caps,club,club_origin,wc_date,year,confederation,imputed
0,Ángel Bossio,Argentina,GK,1905-05-05,25,16,Talleres (BA),Argentina,1930-07-17,1930,South America,-
1,Juan Botasso,Argentina,GK,1908-10-23,21,2,Argentino (Q),Argentina,1930-07-17,1930,South America,-
2,Roberto Cherro,Argentina,FW,1907-02-23,23,10,Boca Juniors,Argentina,1930-07-17,1930,South America,-
3,Alberto Chividini,Argentina,DF,1907-02-23,23,2,Central Norte,Argentina,1930-07-17,1930,South America,-
4,Attilio Demaría,Argentina,FW,1909-03-19,21,0,Estudiantil Porteño,Argentina,1930-07-17,1930,South America,-


In [5]:
# Remove the date-of-birth column. errors='ignore' keeps this cell re-runnable:
# without it a second execution raises KeyError because 'birthday' is already gone.
df = df.drop(columns='birthday', errors='ignore')

In [6]:
# Squad rows for the 1930 tournament.
wc = df.loc[df['year'] == 1930]

In [7]:
wc.age.mean()

24.443983402489625

In [8]:
# Squad rows for the 2022 tournament.
wc2 = df.loc[df['year'] == 2022]

In [9]:
wc2.age.mean()

26.94344163658243

In [10]:
# Squad rows for the 2018 tournament.
wc3 = df.loc[df['year'] == 2018]

In [11]:
wc3.age 

9397     32
9398     31
9399     25
9400     31
9401     32
         ..
10128    27
10129    28
10130    31
10131    31
10132    35
Name: age, Length: 736, dtype: int64

In [12]:
df.year.unique()

array([1930, 1934, 1938, 1950, 1954, 1958, 1962, 1966, 1970, 1974, 1978,
       1982, 1986, 1990, 1994, 1998, 2002, 2006, 2010, 2014, 2018, 2022],
      dtype=int64)

In [None]:
# Calculate the average age of players for each World Cup

In [12]:
# Print the mean player age, rounded to two decimals, one line per tournament year.
for wc_year in df.year.unique():
    print(round(df.loc[df.year == wc_year, 'age'].mean(), 2))

24.44
24.92
25.43
26.46
26.38
26.27
25.78
25.79
25.76
26.21
26.1
26.61
26.63
26.39
26.85
27.01
26.97
26.88
26.9
26.85
27.44
26.94


In [37]:
# Group the squads table by tournament year for the per-year statistics below.
groupYear = df.groupby(by=['year'])

In [38]:
#countAge = groupYear['age'].value_counts().to_frame().rename(columns = {'age':'count'}).reset_index(1)

In [40]:
# Mean age per tournament year, rounded to two decimals and shown as a
# year/age table. Despite its name, countAge holds means, not counts.
countAge = groupYear['age'].mean().round(2)
countAge.to_frame().reset_index()

Unnamed: 0,year,age
0,1930,24.44
1,1934,24.92
2,1938,25.43
3,1950,26.46
4,1954,26.38
5,1958,26.27
6,1962,25.78
7,1966,25.79
8,1970,25.76
9,1974,26.21


In [24]:
# Calculate the median age of players for each World Cup

In [30]:
# Print the median player age for every tournament year under a "Medians" header.
print("Medians")
for wc_year in df.year.unique():
    median_age = round(df.loc[df.year == wc_year, 'age'].median(), 2)
    print(f"{wc_year} = {median_age} years old")

Medians
1930 = 24.0 years old
1934 = 25.0 years old
1938 = 25.0 years old
1950 = 27.0 years old
1954 = 26.0 years old
1958 = 26.0 years old
1962 = 25.0 years old
1966 = 25.0 years old
1970 = 26.0 years old
1974 = 26.0 years old
1978 = 26.0 years old
1982 = 26.0 years old
1986 = 26.0 years old
1990 = 26.0 years old
1994 = 27.0 years old
1998 = 27.0 years old
2002 = 27.0 years old
2006 = 27.0 years old
2010 = 27.0 years old
2014 = 27.0 years old
2018 = 27.0 years old
2022 = 27.0 years old


In [17]:
#hvplot.help('bar')

In [69]:
def plot_world_avg_exp(year):
    """Bar chart of the average caps of all players at one World Cup.

    Parameters
    ----------
    year : int
        Tournament year to plot; only rows of ``df`` whose ``year`` equals it
        are used. When the function is wrapped with ``pn.bind`` the slider's
        current value (not the widget itself) is passed in.

    Returns
    -------
    The hvplot bar plot of ``caps`` by ``year`` after aggregating with
    ``np.mean`` and applying the axis labels and title.
    """
    year_df = df[df['year'] == year]

    return year_df.hvplot.bar('year', y='caps',   # y default is = all numeric columns
                              height=320, width=550, line_color='black', color='purple', ylim=(0, 50),
                              legend=False, yformatter='%.0f'
                             ).aggregate(function=np.mean).opts(xlabel="Year", ylabel="Avg. caps",
                                                                title="International experience (world)", xrotation=20)

In [75]:
def plot_sa_avg_exp(year):
    """Bar chart of the average caps of South American players at one World Cup.

    Parameters
    ----------
    year : int
        Tournament year to plot; only rows of ``df`` whose ``year`` equals it
        and whose ``confederation`` is ``'South America'`` are used. When the
        function is wrapped with ``pn.bind`` the slider's current value (not
        the widget itself) is passed in.

    Returns
    -------
    The hvplot bar plot of ``caps`` by ``year`` after aggregating with
    ``np.mean`` and applying the axis labels and title.
    """
    year_df = df[(df['year'] == year) & (df['confederation'] == 'South America')]

    return year_df.hvplot.bar('year', y='caps',   # y default is = all numeric columns
                              height=320, width=550, line_color='black', color='purple', ylim=(0, 50),
                              legend=False, yformatter='%.0f'
                             ).aggregate(function=np.mean).opts(xlabel="Year", ylabel="Avg. caps",
                                                                title="International experience (South America)", xrotation=20)

In [76]:
# Create a slider widget limited to the years that actually occur in the data.
# An IntSlider stepping by 4 from 1930 also stops at 1942 and 1946, which are
# absent from df.year.unique(), so the bound plots would receive an empty frame.
year_slider2 = pn.widgets.DiscreteSlider(name='Year Slider', width=200,
                                         options=sorted(int(y) for y in df['year'].unique()),
                                         value=1930, value_throttled=1930)
# Apply a callback policy: depend on value_throttled so the callback only
# fires once the slider is released, not on every intermediate position.
@pn.depends(year_slider2.param.value_throttled)
def year_selected(year_slider2):
    """Return a Markdown heading naming the currently selected tournament year.

    The parameter receives the slider's throttled value (it shadows the
    module-level widget of the same name inside this function).
    """
    # The previous heading text ("Jobs in ...") was a leftover from another dashboard.
    return '### World Cup {}'.format(year_slider2)

In [82]:
year_slider2 

In [87]:
# Bind the world-average plot to the slider, then place it in a row inside a widget box.
bound_world_plot = pn.bind(plot_world_avg_exp, year_slider2)
plots_box2 = pn.WidgetBox(
    pn.Row(bound_world_plot, align="start", sizing_mode="stretch_width")
)

In [88]:
plots_box2

In [23]:
df

Unnamed: 0,player,national_team,position,age,caps,club,club_origin,wc_date,year,confederation,imputed
0,Ángel Bossio,Argentina,GK,25,16,Talleres (BA),Argentina,1930-07-17,1930,South America,-
1,Juan Botasso,Argentina,GK,21,2,Argentino (Q),Argentina,1930-07-17,1930,South America,-
2,Roberto Cherro,Argentina,FW,23,10,Boca Juniors,Argentina,1930-07-17,1930,South America,-
3,Alberto Chividini,Argentina,DF,23,2,Central Norte,Argentina,1930-07-17,1930,South America,-
4,Attilio Demaría,Argentina,FW,21,0,Estudiantil Porteño,Argentina,1930-07-17,1930,South America,-
...,...,...,...,...,...,...,...,...,...,...,...
10959,Sorba Thomas,Wales,MF,23,6,Huddersfield Town,England,2022-11-20,2022,Europe,-
10960,Dylan Levitt,Wales,MF,22,13,Dundee United,Scotland,2022-11-20,2022,Europe,-
10961,Ben Cabango,Wales,DF,22,5,Swansea City,Wales,2022-11-20,2022,Europe,-
10962,Rubin Colwill,Wales,MF,20,7,Cardiff City,Wales,2022-11-20,2022,Europe,-


In [33]:
# Line plot of caps against year over the whole table; the plot is then
# aggregated with np.mean.
df.hvplot.line(x= 'year' , y= ['caps']).aggregate(function=np.mean)

In [25]:
df.hvplot()

In [27]:
#(weekly_mean_magnitude_plot + weekly_count_plot).cols(1)

In [118]:
# Mean caps for every (year, confederation) combination.
conts = df.groupby(by=['year', 'confederation'])['caps'].mean()

In [121]:
conts.to_frame().reset_index()

Unnamed: 0,year,confederation,caps
0,1930,Europe,6.406250
1,1930,North America,0.727273
2,1930,South America,4.437500
3,1934,Africa,1.650000
4,1934,Europe,9.787072
...,...,...,...
99,2022,Asia,39.038760
100,2022,Europe,35.872781
101,2022,North America,38.240385
102,2022,Oceania,20.653846


In [99]:
# NOTE(review): this rebinds `conts` from a Series of means to a GroupBy object
# keyed by year; a distinct name would avoid stale-state confusion on re-runs.
conts = df.groupby(by=['year'])

In [100]:
# Unrounded mean age per tournament year, displayed as a year/age table.
# Despite its name, countAge holds means, not counts.
countAge = conts.age.mean()
countAge.reset_index()

Unnamed: 0,year,age
0,1930,24.443983
1,1934,24.917404
2,1938,25.427673
3,1950,26.462366
4,1954,26.38
5,1958,26.269886
6,1962,25.775568
7,1966,25.792614
8,1970,25.759312
9,1974,26.207386


In [None]:
# Wrap df with hvPlot's .interactive() so that Panel widgets can be used as
# arguments inside pandas-style method chains built from idf.
idf = df.interactive()           # creates an interactive DataFrame with hvPlot

In [None]:
# Define Panel widgets.
# step=4 follows the four-year tournament cadence and makes end=2022 reachable
# (2022 - 1930 = 92 is a multiple of 4); the former step of 5 never landed on 2022.
year_slider = pn.widgets.IntSlider(name='Year slider', start=1930, end=2022, step=4, value=1930, width=230)

In [None]:
# Radio buttons that choose which column the pipeline aggregates.
# button_type picks one of the predefined (bootstrap-like) colour categories.
yAxis = pn.widgets.RadioButtonGroup(
    name='Y axis',
    options=['national_team', 'player'],
    button_type='primary',
    width=250,
    height=30,
)
#yAxis.servable()

In [None]:
# Build a pipeline on the interactive DataFrame.
# The slider keeps every tournament up to and including the selected year, and
# the radio buttons choose which column is summarised, so the resulting table
# updates whenever either widget changes.
mPPipeline = (
    idf[
        (idf.year <= year_slider)
    ]
    # Both selectable columns hold names (strings); .sum() would concatenate
    # them into one long string per year, so count the distinct values instead.
    .groupby("year")[yAxis].nunique()
    .to_frame()
    .reset_index()
    .sort_values(by='year')
    .reset_index(drop=True)
)

In [None]:
year_slider

In [None]:
mPPipeline