# Aggregating
* DataFrame.apply() vs DataFrame.agg()
* pd.pivot() vs pd.pivot_table()
* counting groups : .groupby() and .size() vs pivot_table()
* cumulative statistics

In [2]:
import pandas as pd
import warnings
# Silence every warning category for the rest of the notebook session.
# NOTE(review): this also hides pandas deprecation/performance warnings.
warnings.simplefilter("ignore", Warning)

In [3]:
# Load the players_20 table that most of the examples below operate on.
data_frame = pd.read_csv(filepath_or_buffer="../PROJECTS/FIFA/players_20.csv")

# .apply() vs .agg()
* `.agg()` applies the given summary function(s) to a DataFrame, so it returns one scalar value per column for each function
* `.apply()` with an element-wise function does not change the input shape; it just calculates a new value for each numerical cell

In [5]:
def in_milion(df):
    """Return ``df`` divided by one million.

    ``df`` may be anything that supports true division by an integer
    (a number, a Series, a DataFrame, ...); the result has the same shape.
    """
    one_million = 1_000_000
    return df / one_million

In [6]:
# Rows 1-4 of the value/wage columns.  Columns are picked by label instead of
# the positional slice 7:9 so the cell keeps working if the column order of
# the CSV changes (.loc row slices are inclusive, hence 1:4).
data_frame.loc[1:4, ["value_eur", "wage_eur"]]

Unnamed: 0,value_eur,wage_eur
1,58500000.0,410000.0
2,105500000.0,290000.0
3,90000000.0,470000.0
4,90000000.0,370000.0


In [7]:
# Element-wise conversion to millions: the shape of the input is preserved.
# Columns are selected by label rather than by the positional slice 7:9 so the
# result does not depend on the column order (.loc row slices are inclusive).
data_frame.loc[1:4, ["value_eur", "wage_eur"]].apply([in_milion])

Unnamed: 0_level_0,value_eur,wage_eur
Unnamed: 0_level_1,in_milion,in_milion
1,58.5,0.41
2,105.5,0.29
3,90.0,0.47
4,90.0,0.37


In [8]:
# Convert to millions (shape preserved), then reduce each column to its max.
# Columns are selected by label rather than by the positional slice 7:9 so the
# result does not depend on the column order (.loc row slices are inclusive).
(
    data_frame.loc[1:4, ["value_eur", "wage_eur"]]
    .apply([in_milion])
    .agg(["max"])
)

Unnamed: 0_level_0,value_eur,wage_eur
Unnamed: 0_level_1,in_milion,in_milion
max,105.5,0.47


In [9]:
# List the column labels of the loaded frame.
data_frame.columns

Index(['sofifa_id', 'player_url', 'short_name', 'long_name',
       'player_positions', 'overall', 'potential', 'value_eur', 'wage_eur',
       'age',
       ...
       'lcb', 'cb', 'rcb', 'rb', 'gk', 'player_face_url', 'club_logo_url',
       'club_flag_url', 'nation_logo_url', 'nation_flag_url'],
      dtype='object', length=110)

In [10]:
# overall/potential indexed by (nationality_name, preferred_foot), index sorted.
multiindex = (
    data_frame[["nationality_name", "preferred_foot", "overall", "potential"]]
    .set_index(keys=["nationality_name", "preferred_foot"])
    .sort_index()
)

In [11]:
# Apply in_milion to every cell of the multi-indexed frame: one output row per
# input row.  Passing the function in a list adds its name as an extra column
# level in the result.
multiindex.apply([in_milion])

Unnamed: 0_level_0,Unnamed: 1_level_0,overall,potential
Unnamed: 0_level_1,Unnamed: 1_level_1,in_milion,in_milion
nationality_name,preferred_foot,Unnamed: 2_level_2,Unnamed: 3_level_2
Afghanistan,Left,0.000064,0.000065
Afghanistan,Right,0.000060,0.000071
Albania,Left,0.000076,0.000076
Albania,Left,0.000071,0.000071
Albania,Left,0.000070,0.000070
...,...,...,...
Zimbabwe,Right,0.000066,0.000066
Zimbabwe,Right,0.000065,0.000074
Zimbabwe,Right,0.000060,0.000073
Zimbabwe,Right,0.000058,0.000066


In [12]:
# Reduce each column to a single value: one output row per summary function.
multiindex.agg(["mean"])

Unnamed: 0,overall,potential
mean,66.198885,71.500893


# .pivot() vs .pivot_table()

* Check how many players there are for each nationality / preferred-foot combination in the DataFrame

In [15]:
# Working frame for the pivot examples: two rating columns plus the two keys.
# Selecting with a list of labels already yields a DataFrame.
nationality_preferred_foot = data_frame[
    ["overall", "potential", "nationality_name", "preferred_foot"]
]

In [16]:
# Number of rows per (nationality_name, preferred_foot) pair, largest first.
# The unnamed size Series becomes a one-column frame whose column label is 0.
(
    nationality_preferred_foot
    .groupby(["nationality_name", "preferred_foot"])
    .size()
    .to_frame()
    .sort_values(by=0, ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,0
nationality_name,preferred_foot,Unnamed: 2_level_1
England,Right,1274
Germany,Right,941
Spain,Right,759
France,Right,737
Argentina,Right,668
...,...,...
New Caledonia,Right,1
São Tomé e Príncipe,Right,1
New Caledonia,Left,1
Thailand,Left,1


# The `pivot` method cannot be used when index/column combinations are duplicated: it has no `aggfunc` parameter, so duplicated combinations would have to occupy the same cell of the result (pandas raises a `ValueError` instead)

# Dropping duplicates is one way to deal with this; the other is to aggregate the duplicated values while pivoting - that is the job of `pivot_table`

In [19]:
# Keep only the first row seen for every (nationality_name, preferred_foot)
# pair so that each pair is unique.
pivot_data = nationality_preferred_foot.drop_duplicates(
    ["nationality_name", "preferred_foot"],
    keep="first",
)

In [20]:
# Display the de-duplicated frame.
pivot_data

Unnamed: 0,overall,potential,nationality_name,preferred_foot
0,94,94,Argentina,Left
1,93,93,Portugal,Right
2,92,92,Brazil,Right
3,91,91,Belgium,Right
5,91,93,Slovenia,Right
...,...,...,...,...
15747,59,60,South Sudan,Right
16532,57,57,Malta,Right
16658,57,69,Grenada,Right
17078,56,74,Indonesia,Left


In [21]:
# Reshape without aggregation: one row per nationality, one column per
# (rating, preferred_foot) pair.
pd.pivot(
    pivot_data,
    values=["overall", "potential"],
    columns="preferred_foot",
    index="nationality_name",
)

Unnamed: 0_level_0,overall,overall,potential,potential
preferred_foot,Left,Right,Left,Right
nationality_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Afghanistan,64.0,60.0,65.0,71.0
Albania,76.0,82.0,76.0,87.0
Algeria,84.0,80.0,84.0,80.0
Angola,70.0,78.0,74.0,79.0
Antigua and Barbuda,62.0,69.0,64.0,77.0
...,...,...,...,...
Venezuela,73.0,81.0,76.0,84.0
Vietnam,,62.0,,67.0
Wales,85.0,83.0,85.0,83.0
Zambia,64.0,74.0,73.0,74.0


# Using the second method - `pivot_table`, which allows duplicates in the dataset

In [23]:
# Same reshape as the pivot cell, but run on the frame that still contains
# duplicates; aggfunc="max" collapses each group of duplicates to its maximum.
pivot_table = pd.pivot_table(
    nationality_preferred_foot,
    values=["overall", "potential"],
    columns="preferred_foot",
    index="nationality_name",
    aggfunc="max",
)

In [24]:
# Display the aggregated pivot table.
pivot_table

Unnamed: 0_level_0,overall,overall,potential,potential
preferred_foot,Left,Right,Left,Right
nationality_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Afghanistan,64.0,60.0,65.0,71.0
Albania,76.0,82.0,76.0,87.0
Algeria,84.0,80.0,84.0,86.0
Angola,70.0,78.0,78.0,81.0
Antigua and Barbuda,62.0,69.0,64.0,77.0
...,...,...,...,...
Venezuela,73.0,81.0,79.0,87.0
Vietnam,,62.0,,67.0
Wales,85.0,83.0,86.0,86.0
Zambia,64.0,74.0,73.0,82.0


# Check if results are the same

In [26]:
# Pull the "Brazil" row out of the pivot table and show it as a one-column frame.
pivot_table.loc["Brazil"].to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Brazil
Unnamed: 0_level_1,preferred_foot,Unnamed: 2_level_1
overall,Left,88.0
overall,Right,92.0
potential,Left,91.0
potential,Right,92.0


In [27]:
# Cross-check against the raw rows: all ("Brazil", "Left") rows, sorted by
# overall and then potential, both descending.
(
    nationality_preferred_foot
    .set_index(["nationality_name", "preferred_foot"])
    .loc[("Brazil", "Left"), :]
    .sort_values(by=["overall", "potential"], ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,overall,potential
nationality_name,preferred_foot,Unnamed: 2_level_1,Unnamed: 3_level_1
Brazil,Left,88,91
Brazil,Left,85,85
Brazil,Left,85,85
Brazil,Left,84,86
Brazil,Left,84,84
Brazil,...,...,...
Brazil,Left,62,64
Brazil,Left,62,62
Brazil,Left,61,69
Brazil,Left,61,67


# Cumulative statistics

In [29]:
# Read the four yearly player tables, in year order, one frame per file.
df19, df20, df21, df22 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../PROJECTS/FIFA/players_19.csv",
        "../PROJECTS/FIFA/players_20.csv",
        "../PROJECTS/FIFA/players_21.csv",
        "../PROJECTS/FIFA/players_22.csv",
    )
)

In [30]:
# Tag each yearly frame with a constant "Year" column (assigned left to right).
df19["Year"], df20["Year"], df21["Year"], df22["Year"] = 2019, 2020, 2021, 2022

In [31]:
# Keep only the three columns used by the cumulative-statistics example and
# draw three rows from each year.  A fixed random_state makes the sample - and
# therefore the output of the cells that follow - reproducible across kernel
# restarts; without it every run shows different numbers.
df19 = df19.loc[:, ["Year", "overall", "potential"]].sample(n=3, random_state=42)
df20 = df20.loc[:, ["Year", "overall", "potential"]].sample(n=3, random_state=42)
df21 = df21.loc[:, ["Year", "overall", "potential"]].sample(n=3, random_state=42)
df22 = df22.loc[:, ["Year", "overall", "potential"]].sample(n=3, random_state=42)

In [32]:
# Stack the four yearly samples on top of each other (row-wise).
df_years = pd.concat(objs=[df19, df20, df21, df22], axis="index")

In [33]:
# Running maximum and minimum of "overall", with rows ordered by year and the
# year used as the index.
pd.DataFrame(
    df_years
    .sort_values(by="Year")
    .set_index("Year")["overall"]
    .agg(["cummax", "cummin"])
)

Unnamed: 0_level_0,cummax,cummin
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2019,72,72
2019,72,69
2019,72,69
2020,72,67
2020,72,67
2020,72,66
2021,72,52
2021,72,52
2021,72,52
2022,72,52


# Counting groups and other summary statistics
* using .groupby() and .agg()
* using .groupby() and .size()
* using .value_counts()
* using .pivot_table()

In [35]:
# Narrow frame for the group-summary examples: the grouping key plus two ratings.
df = data_frame[["nationality_name", "overall", "potential"]]

In [36]:
# Display the narrowed frame.
df

Unnamed: 0,nationality_name,overall,potential
0,Argentina,94,94
1,Portugal,93,93
2,Brazil,92,92
3,Belgium,91,91
4,Belgium,91,91
...,...,...,...
18478,England,48,60
18479,Cyprus,48,67
18480,England,48,73
18481,England,48,72


In [37]:
# Four summary statistics per nationality for both rating columns.
# Result columns are grouped as (rating, statistic).
(
    df.groupby(by="nationality_name")[["overall", "potential"]]
    .agg(func=["mean", "max", "min", "count"])
)

Unnamed: 0_level_0,overall,overall,overall,overall,potential,potential,potential,potential
Unnamed: 0_level_1,mean,max,min,count,mean,max,min,count
nationality_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Afghanistan,62.000000,64,60,2,68.000000,71,65,2
Albania,66.418605,82,49,43,72.116279,87,62,43
Algeria,71.380000,84,59,50,74.240000,86,63,50
Angola,69.375000,78,62,16,73.437500,81,63,16
Antigua and Barbuda,59.428571,69,49,7,68.571429,77,62,7
...,...,...,...,...,...,...,...,...
Venezuela,67.075758,81,54,66,72.848485,87,57,66
Vietnam,62.000000,62,62,1,67.000000,67,67,1
Wales,64.692308,85,48,117,70.316239,86,59,117
Zambia,66.727273,74,60,11,73.090909,82,65,11


In [38]:
# The same four statistics via pivot_table.
# Result columns are grouped as (statistic, rating).
pd.pivot_table(
    df,
    values=["overall", "potential"],
    index="nationality_name",
    aggfunc=["mean", "max", "min", "count"],
)

Unnamed: 0_level_0,mean,mean,max,max,min,min,count,count
Unnamed: 0_level_1,overall,potential,overall,potential,overall,potential,overall,potential
nationality_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Afghanistan,62.000000,68.000000,64,71,60,65,2,2
Albania,66.418605,72.116279,82,87,49,62,43,43
Algeria,71.380000,74.240000,84,86,59,63,50,50
Angola,69.375000,73.437500,78,81,62,63,16,16
Antigua and Barbuda,59.428571,68.571429,69,77,49,62,7,7
...,...,...,...,...,...,...,...,...
Venezuela,67.075758,72.848485,81,87,54,57,66,66
Vietnam,62.000000,67.000000,62,67,62,67,1,1
Wales,64.692308,70.316239,85,86,48,59,117,117
Zambia,66.727273,73.090909,74,82,60,65,11,11


In [39]:
# Count rows per (nationality_name, preferred_foot) pair with value_counts and
# show the counts as a one-column frame.
data_frame[["nationality_name", "preferred_foot"]].value_counts().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
nationality_name,preferred_foot,Unnamed: 2_level_1
England,Right,1274
Germany,Right,941
Spain,Right,759
France,Right,737
Argentina,Right,668
...,...,...
New Caledonia,Right,1
São Tomé e Príncipe,Right,1
New Caledonia,Left,1
Thailand,Left,1


In [40]:
# The same counts via groupby + size, largest group first.
# The unnamed size Series becomes a one-column frame whose column label is 0.
(
    data_frame
    .groupby(["nationality_name", "preferred_foot"])
    .size()
    .to_frame()
    .sort_values(by=0, ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,0
nationality_name,preferred_foot,Unnamed: 2_level_1
England,Right,1274
Germany,Right,941
Spain,Right,759
France,Right,737
Argentina,Right,668
...,...,...
New Caledonia,Right,1
São Tomé e Príncipe,Right,1
New Caledonia,Left,1
Thailand,Left,1


# Summarising groups: a single column --> Series

In [42]:
# Selecting one column with a plain label gives a Series of per-group means.
(
    df.groupby(by=["nationality_name"])["overall"]
    .mean()
)

nationality_name
Afghanistan            62.000000
Albania                66.418605
Algeria                71.380000
Angola                 69.375000
Antigua and Barbuda    59.428571
                         ...    
Venezuela              67.075758
Vietnam                62.000000
Wales                  64.692308
Zambia                 66.727273
Zimbabwe               67.250000
Name: overall, Length: 162, dtype: float64

# Summarising groups: many columns --> DataFrame

In [44]:
# Selecting columns with a list of labels gives a DataFrame of per-group means.
(
    df.groupby(by=["nationality_name"])[["overall", "potential"]]
    .mean()
)

Unnamed: 0_level_0,overall,potential
nationality_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,62.000000,68.000000
Albania,66.418605,72.116279
Algeria,71.380000,74.240000
Angola,69.375000,73.437500
Antigua and Barbuda,59.428571,68.571429
...,...,...
Venezuela,67.075758,72.848485
Vietnam,62.000000,67.000000
Wales,64.692308,70.316239
Zambia,66.727273,73.090909
