In [233]:
import pandas as pd

mixed_table = pd.read_excel('Table mixed.xlsx')
players_2019 = pd.read_excel('Players 2019.xlsx')
players_2020 = pd.read_excel('Players 2020.xlsx')
players_2021 = pd.read_excel('Players 2021.xlsx')
players_2019_sample = players_2019.sample(160)
players_2020_sample = players_2020.sample(160)
players_2021_sample = players_2021.sample(160)

In [234]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
import scipy.stats as st

In [235]:
#Checking the types present in all the databases I have

players_2019_sample.dtypes
players_2020_sample.dtypes
players_2021_sample.dtypes
mixed_table.dtypes

Position                           int64
Team                              object
Matches                            int64
W                                  int64
D                                  int64
L                                  int64
Scored                             int64
Conceded                           int64
Diff                               int64
Tries                              int64
Tries conceded                     int64
Points/visit to opposition 22    float64
Conversions                        int64
Missed Conversions                 int64
Carries                            int64
Line breaks                        int64
Passes                             int64
Knock Ons                          int64
Attempted Tackles                  int64
Tackles Made                       int64
Dominant Tackles                   int64
Total points                       int64
Possession                       float64
Year                               int64
dtype: object

In [236]:
display(mixed_table)

Unnamed: 0,Position,Team,Matches,W,D,L,Scored,Conceded,Diff,Tries,...,Carries,Line breaks,Passes,Knock Ons,Attempted Tackles,Tackles Made,Dominant Tackles,Total points,Possession,Year
0,1,Wales,5,5,0,0,114,65,49,10,...,548,13,630,32,114,925,70,23,47.8,2019
1,2,England,5,3,1,1,184,101,83,24,...,596,29,757,41,1164,1030,146,18,45.8,2019
2,3,Ireland,5,3,0,2,101,100,1,14,...,72,18,948,45,795,771,56,14,57.8,2019
3,4,France,5,2,0,3,93,118,-25,12,...,581,21,679,44,979,880,76,10,49.6,2019
4,5,Scotland,5,1,1,3,105,125,-20,14,...,676,22,919,28,1125,1001,88,9,50.0,2019
5,6,Italy,5,0,0,5,79,167,-88,10,...,635,17,771,27,959,855,50,0,49.0,2019
6,1,England,5,4,0,1,121,77,44,14,...,454,15,590,32,877,795,117,18,48.4,2020
7,2,France,5,4,0,1,138,117,21,17,...,536,20,547,34,976,879,98,18,45.8,2020
8,3,Ireland,5,3,0,2,132,102,30,17,...,620,29,785,20,793,711,58,14,53.8,2020
9,4,Scotland,5,3,0,2,77,59,18,7,...,553,22,645,25,818,757,76,14,50.6,2020


In [237]:
high_scoring = mixed_table[mixed_table['Diff']>20]
low_scoring = mixed_table[mixed_table['Diff']<20]



In [238]:
#mixed_table.columns.tolist()
mixed_table['Points/visit to opposition 22']

0     11.70
1     14.84
2     13.52
3     12.85
4     11.95
5      8.80
6     13.25
7     13.36
8      5.31
9      5.71
10    12.81
11     2.46
12     5.60
13     6.40
14     3.00
15     2.56
16     2.63
17     2.64
Name: Points/visit to opposition 22, dtype: float64

In [239]:
#First thing I would like to check is whether a higher possession means also more points, 
#I will use the mixed table for this purpose

high_scoring = mixed_table[mixed_table['Diff']>20]
low_scoring = mixed_table[mixed_table['Diff']<20]
low_s_possession = low_scoring['Possession']
high_s_possession = high_scoring['Possession']
stats.ttest_ind(low_s_possession,high_s_possession)

#Being the pvalue very high we can accept the hypothesis. So having a lower possession over the ball 
#does not mean scoring less points.

Ttest_indResult(statistic=1.1754696193441692, pvalue=0.25700288820466116)

In [240]:
#Let's also check if the value of 'Points/visit to opposition 22' affects the total point scored.
#First, let's see what is the average of 'Points/visit to opposition 22':

points_22 = mixed_table['Points/visit to opposition 22']
std = points_22.std()
mean = points_22.mean()
n=20


print(st.t.interval(0.95,n-1, loc=mean, scale=std/np.sqrt(n)))


#Are they independent?
high_points = mixed_table[mixed_table['Diff']>20]
low_points = mixed_table[mixed_table['Diff']<20]
low_p_22_scoring = low_points['Points/visit to opposition 22']
high_p_22_scoring = high_points['Points/visit to opposition 22']

stats.ttest_ind(high_p_22_scoring,low_p_22_scoring)

#We can reject the h0
#If you have a higher points 22, there is evidence that it leads to win more.



(6.109618577085573, 10.489270311803313)


Ttest_indResult(statistic=2.132886990250231, pvalue=0.048770497215068265)

In [241]:
#Do more knock ons mean less points?
#mixed_table['Knock Ons'].mean()
low_knock_ons = low_points['Knock Ons']

high_knock_ons = high_points['Knock Ons']


stats.ttest_ind(high_knock_ons,low_knock_ons)

#Pvalue has a high value which means we cannot reject this hypothesis
#But what can be the one affecting it the most then? 

Ttest_indResult(statistic=1.463170728344971, pvalue=0.1627880463276268)

In [242]:
#Maybe the problem is in the defence? When you do a dominant tackle you discourage a bit the attack of your opponents
#I can check then if the number of dominant tackles ifluence the points conceeded.

high_conceded = mixed_table[mixed_table['Conceded']>100]
low_conceded = mixed_table[mixed_table['Conceded']<100]
low_defense = low_conceded['Dominant Tackles']
high_defense = high_conceded['Dominant Tackles']
stats.ttest_ind(high_defense,low_defense)

#Low pvalue, so we can rejectd the null hypothesis.
#Dominant Tackles are influencing the points conceded.

Ttest_indResult(statistic=2.425642590497127, pvalue=0.02836508894611877)

In [243]:
#What is the average of dominant tackles for Italy compared to the average of the other teams?
italy_table = mixed_table[mixed_table['Team']=='\xa0Italy']
print('Italian dominant tackles average: ',italy_table['Dominant Tackles'].mean())
other_teams = mixed_table[mixed_table['Team']!='\xa0Italy']
print('Other teams dominant tackles average: ',other_teams['Dominant Tackles'].mean())

#DO THE CONFIDENCE INTERVAL FOR ITALY:
dominant_tackles_ita = italy_table['Dominant Tackles']
std = dominant_tackles.std()
mean = dominant_tackles.mean()
n=20
print('Confidence interval of Italian dominant tackles is between:',
      st.t.interval(0.95,n-1, loc=mean, scale=std/np.sqrt(n)))

#DO THE CONFIDENCE INTERVAL FOR THE OTHER TEAMS:
dominant_tackles_others = other_teams['Dominant Tackles']
std = dominant_tackles_others.std()
mean = dominant_tackles_others.mean()
n=20
print('Confidence interval of the other teams dominant tackles is between:',
      st.t.interval(0.95,n-1, loc=mean, scale=std/np.sqrt(n)))

#As we can see the italian average is considerably lower than the other teams.
#This may actually be a problem for the italian squad.

Italian dominant tackles average:  39.333333333333336
Other teams dominant tackles average:  59.46666666666667
Confidence interval of Italian dominant tackles is between: (25.977827915238205, 52.68883875142846)
Confidence interval of the other teams dominant tackles is between: (39.06726085277818, 79.86607248055515)


In [244]:
#In terms of player positions, how do teams play? Are there different patterns? Which roles score more points?
#Are they changing through the years?
players_2019.head()
players_2019_others = players_2019[players_2019['Nationality']!='Italy']
scrum_players_2019 = players_2019_others[players_2019_others['Positions']==0]
back_players_2019 = players_2019_others[players_2019_others['Positions']==1]
print('Tries of scrum players: ',scrum_players_2019['Tries'].sum())
print('Tries of back players: ',back_players_2019['Tries'].sum())

Tries of scrum players:  18
Tries of back players:  55


In [245]:
players_2020.head()
players_2020_others = players_2020[players_2020['Nationality']!='Italy']
scrum_players_2020 = players_2020_others[players_2020_others['Positions']==0]
back_players_2020 = players_2020_others[players_2020_others['Positions']==1]
print('Tries of scrum players: ',scrum_players_2020['Tries'].sum())
print('Tries of back players: ',back_players_2020['Tries'].sum())

Tries of scrum players:  24
Tries of back players:  44


In [246]:
players_2021.head()

players_2021_others = players_2021[players_2021['Nationality']!='Italy']
scrum_players_2021 = players_2021_others[players_2021_others['Positions']==0]
back_players_2021 = players_2021_others[players_2021_others['Positions']==1]
print('Tries of scrum players: ',scrum_players_2021['Tries'].sum())
print('Tries of back players: ',back_players_2021['Tries'].sum())

#So back players score on average a higher number of tries even though we can se that from 2019 to 2020
#they got a closer average. And even if the tournament of 2021 has just started it seems to follow the same pattern of 2019

Tries of scrum players:  7
Tries of back players:  20


In [247]:
#But how is Italy doing? What playing style do they have?
#Let's start from the situation in 2019

italian_players_2019 = players_2019[players_2019['Nationality']=='Italy']
italian_backs_2019 = italian_players_2019[italian_players_2019['Positions']==1]
italian_scrum_2019 = italian_players_2019[italian_players_2019['Positions']==0]
print('Tries of italian scrum players in 2019: ',italian_scrum_2019['Tries'].sum())
print('Tries of italian back players in 2019: ',italian_backs_2019['Tries'].sum())

Tries of italian scrum players in 2019:  1
Tries of italian back players in 2019:  9


In [248]:
#What happened in 2020?

italian_players_2020 = players_2020[players_2020['Nationality']=='Italy']
italian_backs_2020 = italian_players_2020[italian_players_2020['Positions']==1]
italian_scrum_2020 = italian_players_2020[italian_players_2020['Positions']==0]
print('Tries of italian scrum players in 2020: ',italian_scrum_2020['Tries'].sum())
print('Tries of italian back players in 2020: ',italian_backs_2020['Tries'].sum())

Tries of italian scrum players in 2020:  2
Tries of italian back players in 2020:  4


In [249]:
#What about 2021 so far?

italian_players_2021 = players_2021[players_2021['Nationality']=='Italy']
italian_backs_2021 = italian_players_2021[italian_players_2021['Positions']==1]
italian_scrum_2021 = italian_players_2021[italian_players_2021['Positions']==0]
print('Tries of italian scrum players in 2021: ',italian_scrum_2021['Tries'].sum())
print('Tries of italian back players in 2021: ',italian_backs_2021['Tries'].sum())

Tries of italian scrum players in 2021:  0
Tries of italian back players in 2021:  3


In [250]:
#So far we observed that Italy follows the average trends, 
#which also means that its stronger point stays in the backs players for the attack,
#let's go check then something else.
#What can we say now about their defensive skills? First I will analyze the average for all the players
#and then the italians.
players_2019.head()

Unnamed: 0,Nationality,PLAYER,Minutes played,Tries,Conversions,Carries,Passes,Knock Ons,Tackles Made,Missed tackles,Dominant Tackles,Year,Position
0,Italy,Jayden Hayward,400,0,0,42,52,2,26,3,1,2019,1
1,Ireland,Peter O'Mahony,400,0,0,32,25,2,50,6,2,2019,0
2,England,Elliot Daly,400,1,0,45,38,3,12,6,2,2019,1
3,Wales,Josh Adams,400,3,0,39,9,3,17,4,2,2019,1
4,Wales,Jonathan Davies,400,1,0,26,16,1,47,5,3,2019,1


In [251]:
#We'll check for both Tackles Made and Missed Tackles.
#But as we also see in the 'mixed table', the Dominant tackles are influencing the negative score we'll
#also analyze that.
#First Tackles Made

In [252]:
#Let's create a function to make the job easier:
def confidence_calculator(x):
    std = x.std()
    mean = x.mean()
    n=25
    return 'Confidence interval is between:',st.t.interval(0.95,n-1, loc=mean, scale=std/np.sqrt(n))


In [253]:
scrum = [scrum_players_2020, scrum_players_2021]
back = [back_players_2020,back_players_2021]
years = zip(scrum,back)
count = 2020

print('Tackles Scrum players 2019')
print(confidence_calculator(scrum_players_2019['Tackles Made\xa0']))
print('Tackles Back players 2019')
print(confidence_calculator(back_players_2019['Tackles Made\xa0']))


for s, b in years:
    print(f'Tackles Scrum players {count}')
    print(confidence_calculator(s['Tackles Made']))
    print(f'Tackles Back players {count}')
    print(confidence_calculator(b['Tackles Made']))
    count +=1
    


Tackles Scrum players 2019
('Confidence interval is between:', (22.67264264631446, 43.03569068701887))
Tackles Back players 2019
('Confidence interval is between:', (13.712554513294817, 26.66425708090808))
Tackles Scrum players 2020
('Confidence interval is between:', (20.89654664307842, 39.507947738944054))
Tackles Back players 2020
('Confidence interval is between:', (10.611218522385329, 21.81131668888228))
Tackles Scrum players 2021
('Confidence interval is between:', (9.201104177968247, 17.94556248869842))
Tackles Back players 2021
('Confidence interval is between:', (4.818742171607271, 9.08125782839273))


In [254]:
#Now it's time to see if for Italy is the same result for tackles made

In [255]:
scrum = [italian_scrum_2020, italian_scrum_2021]
back = [italian_backs_2020,italian_backs_2021]
years = zip(scrum,back)
count = 2020

print('Tackles italian Scrum players 2019')
print(confidence_calculator(italian_scrum_2019['Tackles Made\xa0']))
print('Tackles italian Back players 2019')
print(confidence_calculator(italian_backs_2019['Tackles Made\xa0']))


for s, b in years:
    print(f'Tackles italian Scrum players {count}')
    print(confidence_calculator(s['Tackles Made']))
    print(f'Tackles italian Back players {count}')
    print(confidence_calculator(b['Tackles Made']))
    count +=1
    
    
#There seems to be a quite good advantage of scrum players doing more tackles of backs, 
#as observed also for the rest of the teams.

Tackles italian Scrum players 2019
('Confidence interval is between:', (24.260662868424433, 41.73933713157557))
Tackles italian Back players 2019
('Confidence interval is between:', (13.404087940773659, 27.73876920208349))
Tackles italian Scrum players 2020
('Confidence interval is between:', (15.240334046175523, 32.577847772006294))
Tackles italian Back players 2020
('Confidence interval is between:', (10.963168867388585, 25.75111684689713))
Tackles italian Scrum players 2021
('Confidence interval is between:', (8.099369525043432, 14.567297141623236))
Tackles italian Back players 2021
('Confidence interval is between:', (4.551136549292167, 9.08522708707147))


In [256]:
#Which type of players are missing more tackles?

In [257]:
scrum = [scrum_players_2019,scrum_players_2020, scrum_players_2021]
back = [back_players_2019,back_players_2020,back_players_2021]
years = zip(scrum,back)
count = 2019

for s, b in years:
    print(f'Tackles missed by Scrum players {count}')
    print(confidence_calculator(s['Missed tackles']))
    print(f'Tackles missed by Back players {count}')
    print(confidence_calculator(b['Missed tackles']))
    count +=1
    
#While in 2019 the situation was balanced, it seems that scrum players 
#improved their defensive skills
#Back players are now missing more tackles.

Tackles missed by Scrum players 2019
('Confidence interval is between:', (2.0246311279537297, 4.433702205379603))
Tackles missed by Back players 2019
('Confidence interval is between:', (2.0480092224896183, 4.386773386206034))
Tackles missed by Scrum players 2020
('Confidence interval is between:', (1.2003097515029681, 2.9569936192835486))
Tackles missed by Back players 2020
('Confidence interval is between:', (1.9295244089251329, 4.549348830511487))
Tackles missed by Scrum players 2021
('Confidence interval is between:', (0.7143585158741453, 2.112308150792521))
Tackles missed by Back players 2021
('Confidence interval is between:', (0.8223726906240941, 2.177627309375906))


In [258]:
#Are then back italian players missing more tackles?

In [259]:
scrum = [italian_scrum_2019,italian_scrum_2020, italian_scrum_2021]
back = [italian_backs_2019,italian_backs_2020,italian_backs_2021]
years = zip(scrum,back)
count = 2019


for s, b in years:
    print(f'Missed Tackles italian Scrum players {count}')
    print(confidence_calculator(s['Missed tackles']))
    print(f'Missed Tackles italian Back players {count}')
    print(confidence_calculator(b['Missed tackles']))
    count +=1
    
#They seem to follow the general trend, backs players are still missing too many tackles
#which on a long term might be a problem.
#Especially in 2021 the situation seemed got even worse as just after 2 matches they have such a high average.
#Almost the same of the year before.

Missed Tackles italian Scrum players 2019
('Confidence interval is between:', (2.1287484599622304, 4.459486834155417))
Missed Tackles italian Back players 2019
('Confidence interval is between:', (1.9071698536551398, 4.521401574916289))
Missed Tackles italian Scrum players 2020
('Confidence interval is between:', (1.4866817261390355, 3.4224091829518737))
Missed Tackles italian Back players 2020
('Confidence interval is between:', (2.488786712550394, 4.654070430306749))
Missed Tackles italian Scrum players 2021
('Confidence interval is between:', (1.1744405753487919, 2.825559424651208))
Missed Tackles italian Back players 2021
('Confidence interval is between:', (1.2358561584253698, 2.945962023392812))


In [260]:
#Which type of players are doing more dominant tackles?

In [261]:
scrum = [scrum_players_2019,scrum_players_2020, scrum_players_2021]
back = [back_players_2019,back_players_2020,back_players_2021]
years = zip(scrum,back)
count = 2019

for s, b in years:
    print(f'Tackles dominant by Scrum players {count}')
    print(confidence_calculator(s['Dominant Tackles']))
    print(f'Tackles dominant by Back players {count}')
    print(confidence_calculator(b['Dominant Tackles']))
    count +=1
    
    
#As expected the scrum players are doing more dominant tackles.

Tackles dominant by Scrum players 2019
('Confidence interval is between:', (1.9133993164952943, 4.940767350171373))
Tackles dominant by Back players 2019
('Confidence interval is between:', (0.6768569922817396, 2.453577790326956))
Tackles dominant by Scrum players 2020
('Confidence interval is between:', (2.090533379924487, 5.167893586367648))
Tackles dominant by Back players 2020
('Confidence interval is between:', (0.9131148457434553, 2.241814731721333))
Tackles dominant by Scrum players 2021
('Confidence interval is between:', (0.07371830673655205, 0.8329483599301146))
Tackles dominant by Back players 2021
('Confidence interval is between:', (0.011534551061821247, 0.32179878227151204))


In [262]:
#Let's eventually check if italian players are dominant on the tackles:

In [263]:
scrum = [italian_scrum_2019,italian_scrum_2020, italian_scrum_2021]
back = [italian_backs_2019,italian_backs_2020,italian_backs_2021]
years = zip(scrum,back)
count = 2019


for s, b in years:
    print(f'Dominant Tackles italian Scrum players {count}')
    print(confidence_calculator(s['Dominant Tackles']))
    print(f'Dominant Tackles italian Back players {count}')
    print(confidence_calculator(b['Dominant Tackles']))
    count +=1
    

#In general they seem to follow the trend even if their averages are a bit lower,
#and in the current tournament they seem to be suffering much more than the others,
#especially the back players are a bit behind.
#This may be one of the reasons why Italy doesn't get to win a lot of matches
#and especially the reason why they do not score a lot of points.

Dominant Tackles italian Scrum players 2019
('Confidence interval is between:', (1.1190062009085695, 2.9986408579149595))
Dominant Tackles italian Back players 2019
('Confidence interval is between:', (0.6034140804226362, 1.3965859195773638))
Dominant Tackles italian Scrum players 2020
('Confidence interval is between:', (1.422375279957764, 3.6685338109513266))
Dominant Tackles italian Back players 2020
('Confidence interval is between:', (0.3942050013276872, 1.6057949986723128))
Dominant Tackles italian Scrum players 2021
('Confidence interval is between:', (0.13893518742840238, 0.6610648125715977))
Dominant Tackles italian Back players 2021
('Confidence interval is between:', (-0.03354867516862445, 0.21536685698680627))


In [264]:
#I can now check who is getting the most carries (bringing the ball ahead is very important in rugby)
#first for all the teams:

In [265]:
scrum = [scrum_players_2020, scrum_players_2021]
back = [back_players_2020,back_players_2021]
years = zip(scrum,back)
count = 2020

print('Carries Scrum players 2019')
print(confidence_calculator(scrum_players_2019['Carries\xa0']))
print('Carries Back players 2019')
print(confidence_calculator(back_players_2019['Carries\xa0']))


for s, b in years:
    print(f'Carries Scrum players {count}')
    print(confidence_calculator(s['Carries']))
    print(f'Carries Back players {count}')
    print(confidence_calculator(b['Carries']))
    count +=1
    

#Generally the situation seems balanced, just a bit on the side of back players, 
#but also because they have more chances to run with the ball.

Carries Scrum players 2019
('Confidence interval is between:', (11.71622167224869, 24.40877832775131))
Carries Back players 2019
('Confidence interval is between:', (15.198353360401926, 25.555269828003873))
Carries Scrum players 2020
('Confidence interval is between:', (10.724541631366993, 22.781076346161097))
Carries Back players 2020
('Confidence interval is between:', (12.17990621551426, 23.425727587302646))
Carries Scrum players 2021
('Confidence interval is between:', (5.371641133888209, 11.881692199445125))
Carries Back players 2021
('Confidence interval is between:', (5.642892855860303, 11.590440477473031))


In [266]:
#And now for Italy:

In [267]:
scrum = [italian_scrum_2020, italian_scrum_2021]
back = [italian_backs_2020,italian_backs_2021]
years = zip(scrum,back)
count = 2020

print('Carries by italian Scrum players 2019')
print(confidence_calculator(italian_scrum_2019['Carries\xa0']))
print('Carries by italian Back players 2019')
print(confidence_calculator(italian_backs_2019['Carries\xa0']))


for s, b in years:
    print(f'Carries by italian Scrum players {count}')
    print(confidence_calculator(s['Carries']))
    print(f'Carries by italian Back players {count}')
    print(confidence_calculator(b['Carries']))
    count +=1
    
#Ok here we might actually have something, it seems that the scrum players cannot really carry
#the ball for a lot of meters, which is a huge problem in rugby because they're the players supposed
#to create breaks where then the backs can run into, and this trend got inverted since 2019 when they
#were carrying more.
#To be at the level of the other teams these values should be more balanced.

Carries by italian Scrum players 2019
('Confidence interval is between:', (14.626858456106195, 28.66725919095263))
Carries by italian Back players 2019
('Confidence interval is between:', (11.594692610141848, 23.548164532715298))
Carries by italian Scrum players 2020
('Confidence interval is between:', (8.726629700872738, 22.000643026399988))
Carries by italian Back players 2020
('Confidence interval is between:', (12.481924903470581, 25.660932239386565))
Carries by italian Scrum players 2021
('Confidence interval is between:', (5.233428501813341, 10.499904831519991))
Carries by italian Back players 2021
('Confidence interval is between:', (8.029132902835585, 13.970867097164415))


In [268]:
#It might be useful now to check on the main table if actually the carries are influencing the points scored.


low_score = mixed_table[mixed_table['Scored']<70]
high_score = mixed_table[mixed_table['Scored']>100]

low_s_carry = low_score['Carries']
high_s_carry = high_score['Carries']
stats.ttest_ind(high_s_carry,low_s_carry)

#We can reject the null hypothesis.

Ttest_indResult(statistic=2.5610175277834193, pvalue=0.02369340007629521)

In [269]:
#One last thing I would like to check is the accurancy of Italy and of the other teams on average when they 
#have a free kick.

players_2019.head()

Unnamed: 0,Nationality,PLAYER,Minutes played,Tries,Conversions,Carries,Passes,Knock Ons,Tackles Made,Missed tackles,Dominant Tackles,Year,Position
0,Italy,Jayden Hayward,400,0,0,42,52,2,26,3,1,2019,1
1,Ireland,Peter O'Mahony,400,0,0,32,25,2,50,6,2,2019,0
2,England,Elliot Daly,400,1,0,45,38,3,12,6,2,2019,1
3,Wales,Josh Adams,400,3,0,39,9,3,17,4,2,2019,1
4,Wales,Jonathan Davies,400,1,0,26,16,1,47,5,3,2019,1


In [270]:
kickers_2019 = players_2019[players_2019['Conversions']>0]
kickers_2020 = players_2020[players_2020['Conversions']>0]
kickers_2021 = players_2021[players_2021['Conversions']>0]

In [271]:
other_kickers_2019 = kickers_2019[kickers_2019['Nationality']!='Italy']
other_kickers_2020 = kickers_2020[kickers_2020['Nationality']!='Italy']
other_kickers_2021 = kickers_2021[kickers_2021['Nationality']!='Italy']

In [272]:
italian_kickers_2019 = kickers_2019[kickers_2019['Nationality']=='Italy']
italian_kickers_2020 = kickers_2020[kickers_2020['Nationality']=='Italy']
italian_kickers_2021 = kickers_2021[kickers_2021['Nationality']=='Italy']

In [273]:
print('Kickers average conversion for 2019')
print(confidence_calculator(other_kickers_2019['Conversions']))
print('Kickers average conversion for 2020')
print(confidence_calculator(other_kickers_2020['Conversions']))
print('Kickers average conversion for 2021')
print(confidence_calculator(other_kickers_2021['Conversions']))

Kickers average conversion for 2019
('Confidence interval is between:', (2.3482290401628756, 5.366056674122839))
Kickers average conversion for 2020
('Confidence interval is between:', (4.0661378131060655, 7.48941774244949))
Kickers average conversion for 2021
('Confidence interval is between:', (1.6224198883746994, 3.520437254482444))


In [274]:
print('Italian conversions 2019:')
print((italian_kickers_2019['Conversions']).sum())
print('Italian conversions 2020:')
print((italian_kickers_2020['Conversions']).sum())
print('Italian conversions 2021:')
print((italian_kickers_2021['Conversions']).sum())

#It is pointless to check the confidence here as Italy has just one kicker or max two,
#but we can see that in 2019 the main kicker managed to score 4 free kicks,
#whereas in 2020 were needed two players to score the same amount, which means that probably the same kicker 
#could not keep the same average
#and now in 2021 they are keeping the same average.
#But we can notice that the conversions scored by Italy are drastically getting worse.

Italian conversions 2019:
4
Italian conversions 2020:
4
Italian conversions 2021:
2


In [289]:
#Let's also check how many missed conversions the other teams have compared to Italy:
#Does it mean to score more points overall?

high_scoring = mixed_table[mixed_table['Scored']>100]
low_scoring = mixed_table[mixed_table['Scored']<70]
low_s_conversions = low_scoring['Missed Conversions']
high_s_conversions = high_scoring['Missed Conversions']
stats.ttest_ind(high_s_conversions,low_s_conversions)

#It seems like they have a connection!

Ttest_indResult(statistic=2.0320954123037342, pvalue=0.0630989066582577)

In [275]:
list(players_2019.columns)
#back_players_2019['Nationality']

['Nationality',
 'PLAYER',
 'Minutes played',
 'Tries',
 'Conversions',
 'Carries\xa0',
 'Passes',
 'Knock Ons\xa0',
 'Tackles Made\xa0',
 'Missed tackles',
 'Dominant Tackles',
 'Year',
 'Position']

In [282]:
mixed_table.columns

Index(['Position', 'Team', 'Matches', 'W', 'D', 'L', 'Scored', 'Conceded',
       'Diff', 'Tries', 'Tries conceded', 'Points/visit to opposition 22',
       'Conversions', 'Missed Conversions', 'Carries', 'Line breaks', 'Passes',
       'Knock Ons', 'Attempted Tackles', 'Tackles Made', 'Dominant Tackles',
       'Total points', 'Possession', 'Year'],
      dtype='object')