## Indian Premier League (IPL) Regression Analysis

In [1]:
%%capture
# Due to the configuration of the base Jupter image, the following imports are required for the regressions in the assignment to report the correct metrics

import sys 
!{sys.executable} -m pip uninstall statsmodels --yes 
!{sys.executable} -m pip uninstall numpy --yes
!{sys.executable} -m pip uninstall pandas --yes 
!{sys.executable} -m pip uninstall patsy --yes 
!{sys.executable} -m pip install numpy==1.17
!{sys.executable} -m pip install pandas==1.0
!{sys.executable} -m pip install patsy==0.5.2
!{sys.executable} -m pip install statsmodels==0.11.1

In [2]:
# As usual, we begin by loading the packages we will need

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

In [37]:
# Now we load the data

IPL=pd.read_excel("Assignment Data/Week 5/IPL (assignment) data.xlsx")

In [38]:
IPL.head()

Unnamed: 0,year,team,played,won,lost,noresult,points,netrunrate,champions,runnersup,third,fourth,salaries
0,2008,Chennai Super Kings,14,8,6,0,16,-0.192,0,1,0,0,5825000
1,2009,Chennai Super Kings,14,8,5,1,17,0.951,0,0,0,0,6765000
2,2010,Chennai Super Kings,14,7,7,0,14,0.274,1,0,0,0,4890000
3,2011,Chennai Super Kings,14,9,5,0,18,0.443,1,0,0,0,6330000
4,2012,Chennai Super Kings,16,8,7,1,17,0.1,0,1,0,0,7900000


### 2. Create the sum of salaries in each season

In [40]:
IPL['total_salaries'] = IPL.groupby('team')['salaries'].transform('sum')
IPL.groupby('team').head()

'''
transform 在Pandas中是一個非常實用的函數，它常用於以下幾種情境：

1.標準化或歸一化：例如，在每個組內部標準化數據，使得每個組的數據都具有0的均值和1的標準差。
2.填充缺失值：可以在組內使用特定的聚合值填充NA，比如每個組的平均值或中位數。
3.計算組內排名或百分位數：如組內數據相對於組內其他數據的排名或百分位數。
4.廣播組統計：如將組的總和、平均值等廣播回每個組的成員。
5.transform 函數可以直接將操作結果插入到原始的DataFrame中，這樣做的好處是不改變原始數據的結構，同時能夠擴展原始數據的信息。
這與 aggregate 函數所返回的總結結果不同，aggregate 通常會減少數據的維度，只返回每個組的一個聚合值。
'''

Unnamed: 0,year,team,played,won,lost,noresult,points,netrunrate,champions,runnersup,third,fourth,salaries,total_salaries
0,2008,Chennai Super Kings,14,8,6,0,16,-0.192,0,1,0,0,5825000,74830760
1,2009,Chennai Super Kings,14,8,5,1,17,0.951,0,0,0,0,6765000,74830760
2,2010,Chennai Super Kings,14,7,7,0,14,0.274,1,0,0,0,4890000,74830760
3,2011,Chennai Super Kings,14,9,5,0,18,0.443,1,0,0,0,6330000,74830760
4,2012,Chennai Super Kings,16,8,7,1,17,0.1,0,1,0,0,7900000,74830760
9,2008,Deccan Chargers,14,2,12,0,4,-0.467,0,0,0,0,5885000,28210000
10,2009,Deccan Chargers,14,7,7,0,14,0.203,1,0,0,0,5250000,28210000
11,2010,Deccan Chargers,14,8,6,0,16,-0.297,0,0,0,0,5645000,28210000
12,2011,Deccan Chargers,14,6,8,0,12,0.222,0,0,0,0,5620000,28210000
13,2012,Deccan Chargers,16,4,11,1,9,-0.509,0,0,0,0,5810000,28210000


### 3. Create a variable for team salary divided by total salaries for that season (relsal).

In [41]:
IPL['season_all_teams_salaries'] = IPL.groupby('year')['salaries'].transform('sum')
IPL.groupby('year').head()

Unnamed: 0,year,team,played,won,lost,noresult,points,netrunrate,champions,runnersup,third,fourth,salaries,total_salaries,season_all_teams_salaries
0,2008,Chennai Super Kings,14,8,6,0,16,-0.192,0,1,0,0,5825000,74830760,34105000
1,2009,Chennai Super Kings,14,8,5,1,17,0.951,0,0,0,0,6765000,74830760,33445000
2,2010,Chennai Super Kings,14,7,7,0,14,0.274,1,0,0,0,4890000,74830760,33075000
3,2011,Chennai Super Kings,14,9,5,0,18,0.443,1,0,0,0,6330000,74830760,62210000
4,2012,Chennai Super Kings,16,8,7,1,17,0.1,0,1,0,0,7900000,74830760,59706250
5,2013,Chennai Super Kings,16,11,5,0,22,0.53,0,1,0,0,10740000,74830760,81535000
6,2014,Chennai Super Kings,14,9,5,0,18,0.385,0,0,1,0,11279000,74830760,73973332
7,2015,Chennai Super Kings,14,9,5,0,18,0.709,0,1,0,0,9780000,74830760,65242665
8,2018,Chennai Super Kings,14,9,5,0,18,0.253,1,0,0,0,11321760,74830760,84370160
9,2008,Deccan Chargers,14,2,12,0,4,-0.467,0,0,0,0,5885000,28210000,34105000


In [42]:
IPL["relsal"] = IPL["salaries"]/IPL['season_all_teams_salaries']

In [43]:
IPL.head()

Unnamed: 0,year,team,played,won,lost,noresult,points,netrunrate,champions,runnersup,third,fourth,salaries,total_salaries,season_all_teams_salaries,relsal
0,2008,Chennai Super Kings,14,8,6,0,16,-0.192,0,1,0,0,5825000,74830760,34105000,0.170796
1,2009,Chennai Super Kings,14,8,5,1,17,0.951,0,0,0,0,6765000,74830760,33445000,0.202272
2,2010,Chennai Super Kings,14,7,7,0,14,0.274,1,0,0,0,4890000,74830760,33075000,0.147846
3,2011,Chennai Super Kings,14,9,5,0,18,0.443,1,0,0,0,6330000,74830760,62210000,0.101752
4,2012,Chennai Super Kings,16,8,7,1,17,0.1,0,1,0,0,7900000,74830760,59706250,0.132314


### 4. Create a value for win percentage. Define win percentage as wins divided games with a result (= games played minus games with no result). 

In [44]:
IPL["wpc"] = IPL["won"]/IPL['played']

### 5. Create the lagged value of win percentage for each team

In [45]:
IPL["wpc_lagged"] = IPL.groupby("team")["wpc"].shift(1)

### 6. Regress win percentage on:
##### a) Relsal
##### b) Relsal + lagged win percentage
##### c) Relsal + lagged win percentage  + team fixed effects

In [46]:
#a
reg_1 = smf.ols(formula = 'wpc ~ relsal', data = IPL).fit()
print(reg_1.summary())

                            OLS Regression Results                            
Dep. Variable:                    wpc   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                 -0.010
Method:                 Least Squares   F-statistic:                    0.1428
Date:                Wed, 06 Mar 2024   Prob (F-statistic):              0.706
Time:                        08:24:55   Log-Likelihood:                 45.280
No. Observations:                  92   AIC:                            -86.56
Df Residuals:                      90   BIC:                            -81.52
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.4687      0.063      7.461      0.0

In [47]:
#b
reg_2 = smf.ols(formula = 'wpc ~ relsal + wpc_lagged', data = IPL).fit()
print(reg_2.summary())

                            OLS Regression Results                            
Dep. Variable:                    wpc   R-squared:                       0.035
Model:                            OLS   Adj. R-squared:                  0.010
Method:                 Least Squares   F-statistic:                     1.374
Date:                Wed, 06 Mar 2024   Prob (F-statistic):              0.259
Time:                        08:24:57   Log-Likelihood:                 43.331
No. Observations:                  79   AIC:                            -80.66
Df Residuals:                      76   BIC:                            -73.55
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.3527      0.086      4.088      0.0

In [48]:
#c
reg_3 = smf.ols(formula = 'wpc ~ relsal + wpc_lagged + C(team)', data = IPL).fit()
print(reg_3.summary())

                            OLS Regression Results                            
Dep. Variable:                    wpc   R-squared:                       0.290
Model:                            OLS   Adj. R-squared:                  0.148
Method:                 Least Squares   F-statistic:                     2.046
Date:                Wed, 06 Mar 2024   Prob (F-statistic):             0.0303
Time:                        08:25:01   Log-Likelihood:                 55.477
No. Observations:                  79   AIC:                            -82.95
Df Residuals:                      65   BIC:                            -49.78
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                                             coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------