In [7]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import scale, robust_scale
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import KFold, cross_val_score

### 5. 모델평가


* Model 평가 : 
     PCA를 적용한 Linear Regression 
     vs (TSR, PDO, Team_Value, AGE, Country Score) 변수들을 Feature로 선택 후 Linear Regression
     

In [8]:
# Load the Dataset
X = pd.read_csv('train.csv')

X.tail()

Unnamed: 0,Season,NAT,Club_Name,TSR,STR,PDO,Pass_Success,Possesion,Team_Value,AGE,FPPsq,Country Score,Rating,Pts
476,1516,GER,Wolfsburg,0.551852,0.560662,893.8322,0.820588,0.578824,7.58,24.5,0.542857,79.415,6.85,56.035
477,1516,GER,Eintracht Frankfurt,0.458194,0.439859,953.9767,0.721765,0.471765,2.5,23.9,0.571429,79.415,6.69,28.035
478,1516,GER,Borussia,0.493404,0.536775,949.0774,0.795152,0.556667,4.81,24.4,0.586207,79.415,6.94,42.035
479,1516,GER,Mainz 05,0.445851,0.466027,946.8986,0.72,0.461471,1.73,24.3,0.564103,79.415,6.84,18.035
480,1516,GER,Augsburg,0.437485,0.446503,1082.306,0.747647,0.468824,1.6,24.9,0.512821,79.415,6.8,23.035


In [9]:
#변수로 쓰지 않을 column 제거
del X["Season"]
del X["NAT"]
del X["Club_Name"]
X.tail()

Unnamed: 0,TSR,STR,PDO,Pass_Success,Possesion,Team_Value,AGE,FPPsq,Country Score,Rating,Pts
476,0.551852,0.560662,893.8322,0.820588,0.578824,7.58,24.5,0.542857,79.415,6.85,56.035
477,0.458194,0.439859,953.9767,0.721765,0.471765,2.5,23.9,0.571429,79.415,6.69,28.035
478,0.493404,0.536775,949.0774,0.795152,0.556667,4.81,24.4,0.586207,79.415,6.94,42.035
479,0.445851,0.466027,946.8986,0.72,0.461471,1.73,24.3,0.564103,79.415,6.84,18.035
480,0.437485,0.446503,1082.306,0.747647,0.468824,1.6,24.9,0.512821,79.415,6.8,23.035


In [10]:
# 실제 X Feature로 쓸 Feature 변수, Target 값, 데이터 수 표현 
dfX0 = X[[x for x in [0,1,2,3,4,5,6,7,8,9]]]
dfy = X[['Pts']]
dfX0.head()
len(dfy) # 데이터 수

481

### TSR, PDO, Team_value, AGE,  Country_Score를 Feature로 선택
### target = np.log(X['Pts'])

In [11]:
robust_tsr = robust_scale(dfX0['TSR'])
robust_pdo = robust_scale(dfX0['PDO'])
log_T_V = np.log(dfX0['Team_Value'])
scale_age = scale(dfX0['AGE'])
scale_C_S = scale(dfX0['Country Score'])
target = np.log(X['Pts'])



In [12]:
case5_array = np.array([robust_tsr, robust_pdo, log_T_V, scale_age, scale_C_S]).T
case_5 = pd.DataFrame(case5_array, columns = ['TSR', 'PDO', 'Team_Value', 
                                              'AGE' ,'Country Score'])

In [13]:
case_5_constant = sm.add_constant(case_5)
model = sm.OLS(target, case_5_constant)
result = model.fit()
print (result.summary())

                            OLS Regression Results                            
Dep. Variable:                    Pts   R-squared:                       0.676
Model:                            OLS   Adj. R-squared:                  0.673
Method:                 Least Squares   F-statistic:                     198.2
Date:                Mon, 26 Dec 2016   Prob (F-statistic):          8.72e-114
Time:                        17:08:43   Log-Likelihood:                -271.22
No. Observations:                 481   AIC:                             554.4
Df Residuals:                     475   BIC:                             579.5
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [95.0% Conf. Int.]
---------------------------------------------------------------------------------
const             2.7256      0.050     54.233

### PCA를 이용한 Linear Regression 

In [15]:
# Scale
Scale_data = scale(dfX0)
A = np.array(Scale_data) 
pca = PCA().fit(A)
pca = PCA(n_components=5).fit(A)
pca.components_.T.shape
soccer_PCA_analysis = pd.DataFrame(pca.components_.T, columns = ["PC1", "PC2", "PC3", "PC4", "PC5"], index = dfX0.columns)
soccer_PCA_analysis

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
TSR,0.320913,-0.276918,-0.444801,0.07976,-0.086319
STR,0.411223,0.162802,-0.016865,0.144534,-0.174635
PDO,0.208648,0.294922,0.517871,-0.228252,0.601846
Pass_Success,0.394824,0.138201,-0.096569,0.117614,0.151686
Possesion,0.420964,0.145239,-0.062862,0.146547,-0.028692
Team_Value,0.400406,-0.214424,0.176314,-0.038025,-0.177763
AGE,-0.011769,-0.445354,0.385594,0.768908,0.198709
FPPsq,0.136733,-0.504761,-0.351397,-0.286494,0.605112
Country Score,0.087044,-0.511256,0.450046,-0.44636,-0.349106
Rating,0.40564,0.095683,0.120872,-0.106581,-0.12993


In [18]:
# PCA 분석으로 OLS Summary 
pca_s = PCA(5).fit_transform(A)
T_features = pd.DataFrame(pca_s, columns = ["PC1","PC2","PC3","PC4","PC5"])
regression = "dfy ~ PC1 + PC2 + PC3 + PC4 + PC5"
model = sm.OLS.from_formula(regression, data = T_features)
result = model.fit()

#print (result.summary())
coef = np.array(result.params) # PCA로 모델링한 coef(가중치 계수) 저장

## Cross Validation 

* PCA를 적용한 모델 Score가 PCA를 적용하지 않은 모델 Score보다 높음
  * => PCA 모델 선택

In [19]:
#Cross Validation Score : KFold로 진행
X = T_features.values
y = dfy.values.flatten()

model = LinearRegression()
cv2 = KFold(len(y), 3)
cross_val_score(model, X, y, "r2", cv2).mean()

0.64170066915169643

In [20]:
X = case_5_constant.values
y = target.values.flatten()

model = LinearRegression()
cv2 = KFold(len(y), 3)
cross_val_score(model, X, y, "r2", cv2).mean()

0.61103119585818644

### 6. 최종 성능평가

In [21]:
Test = pd.read_csv('Test1617.csv') #EPL, Ligue 1, Bundesliga
Test.head()

Unnamed: 0,Season,NAT,Club_Name,TSR,STR,PDO,Pass_Success,Possesion,AGE,FPPsq,Team_Value,Country Score,Rating,Pts,Rank
0,1617,ENG,Manchester United,0.579077,0.55279,983.649439,0.835556,0.536667,26.7,0.666667,20.31,76.284,6.94,71.449,22
1,1617,ENG,Chelsea,0.646924,0.689434,1013.678451,0.846667,0.556667,26.9,0.769231,20.09,76.284,7.05,104.449,8
2,1617,ENG,Arsenal,0.564703,0.60259,1071.34865,0.845556,0.583333,26.5,0.724138,16.91,76.284,7.05,93.449,11
3,1617,ENG,Tottenham,0.633261,0.669257,1034.14183,0.811111,0.591111,25.2,0.652174,16.24,76.284,7.0,72.449,21
4,1617,ENG,Liverpool,0.654232,0.69599,964.98704,0.8225,0.59375,25.6,0.62963,14.01,76.284,6.97,54.449,29


In [22]:
Test_X = Test[[x for x in [3,4,5,6,7,8,9,10,11,12]]] #TSR ~ Rating까지 변수로 썼음.
Test_y = Test[['Pts']]

Test_X.head()

Unnamed: 0,TSR,STR,PDO,Pass_Success,Possesion,AGE,FPPsq,Team_Value,Country Score,Rating
0,0.579077,0.55279,983.649439,0.835556,0.536667,26.7,0.666667,20.31,76.284,6.94
1,0.646924,0.689434,1013.678451,0.846667,0.556667,26.9,0.769231,20.09,76.284,7.05
2,0.564703,0.60259,1071.34865,0.845556,0.583333,26.5,0.724138,16.91,76.284,7.05
3,0.633261,0.669257,1034.14183,0.811111,0.591111,25.2,0.652174,16.24,76.284,7.0
4,0.654232,0.69599,964.98704,0.8225,0.59375,25.6,0.62963,14.01,76.284,6.97


In [23]:
soccer_PCA_analysis #모델에서 추출한 PC1~PC5의 고유벡터

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
TSR,0.320913,-0.276918,-0.444801,0.07976,-0.086319
STR,0.411223,0.162802,-0.016865,0.144534,-0.174635
PDO,0.208648,0.294922,0.517871,-0.228252,0.601846
Pass_Success,0.394824,0.138201,-0.096569,0.117614,0.151686
Possesion,0.420964,0.145239,-0.062862,0.146547,-0.028692
Team_Value,0.400406,-0.214424,0.176314,-0.038025,-0.177763
AGE,-0.011769,-0.445354,0.385594,0.768908,0.198709
FPPsq,0.136733,-0.504761,-0.351397,-0.286494,0.605112
Country Score,0.087044,-0.511256,0.450046,-0.44636,-0.349106
Rating,0.40564,0.095683,0.120872,-0.106581,-0.12993


In [24]:
# 예측할 16/17 시즌들의 데이터와 PCA들의 행렬곱
convert_to_PCA = scale(Test_X).dot(np.array(soccer_PCA_analysis))
convert_to_PCA = sm.add_constant(convert_to_PCA)

In [25]:
# 모델에서의 coef 와 PCA의 성분들의 행렬곱
predict_to_PCA = convert_to_PCA.dot(coef)
predict_to_PCA

array([ 76.09602654,  92.38040651,  89.75467186,  88.98934797,
        88.98628107,  93.68233895,  83.01742086,  57.24282719,
        42.33836917,  79.75025561,  55.47569239,  40.06269225,
        93.47275414,  39.61435738,  47.59718258,  37.08213331,
        21.63390936,  44.33978537,  44.29385766,  54.52269546,
        48.31604878,  41.6535303 ,  48.86562425,  36.35019383,
        25.86442885,  16.88338765,  62.80295678,   6.21353477,
        27.29190518,  45.87539168,   1.25577217,   7.70642477,  48.86354171])

In [26]:
# 예측한 Pts값과 실제 Pts의 값
predict = pd.DataFrame(data = predict_to_PCA, columns = ["Predict"])
result = pd.concat([Test.Club_Name ,Test.Pts, predict, Test.Rank], axis=1)

result.tail()

Unnamed: 0,Club_Name,Pts,Predict,Rank
28,Saint-Etienne,26.899,27.291905,62
29,Monaco,43.399,45.875392,40
30,Montpellier,15.399,1.255772,114
31,Guingamp,18.399,7.706425,97
32,Nice,10.399,48.863542,129


In [27]:
result.sort(['Predict'], ascending=[False] ) # Predict 기준으로 내림차순으로 정렬

  if __name__ == '__main__':


Unnamed: 0,Club_Name,Pts,Predict,Rank
5,Manchester City,87.449,93.682339,13
12,Bayern Munich,138.928,93.472754,2
1,Chelsea,104.449,92.380407,8
2,Arsenal,93.449,89.754672,11
3,Tottenham,72.449,88.989348,21
4,Liverpool,54.449,88.986281,29
6,Everton,27.449,83.017421,66
9,Southampton,17.949,79.750256,98
0,Manchester United,71.449,76.096027,22
26,Paris Saint,113.399,62.802957,6


### 7. 한계점

* 보다 정확한 데이터 확보 부족
   + UEFA랭킹에는 존재하지만 해당시즌에 2부리그로 강등되어 있는 팀의 경우는 
     데이터를 전부 확보할 수 없는 경우가 많았음. 그래서 결국 2부리그팀을 기준선상에서 제외
     
   + 처음 주제였던 UEFA 랭킹에 나와있는 모든 리그들에 속해있는 팀들의 데이터를 분석하여 UEFA Coefficient를 예측하여
     최종적으로 순위를 예측하려고 했음. 하지만 역시 데이터확보 문제에 직면하여 확보할 수 있는 주요 리그 팀 데이터만 크롤링.
     보다 정확하게 설명할 수 있는 데이터 확보 부족
   ___
* 리그는 장기전인데 비해 유럽대회 단기전 성격이 강함
   + 단기전은 장기전인 리그에 비해 여러 변수에 의한 영향을 받는 정도가 더 큼. 
   ___
* 축구에는 다양한 전술, 다양한 변수 존재
   + 이 모든 것을 다 설명하기에는 기존의 분석에 쓰였던 독립변수로는 커버력이 부족하다는 것을 느꼈음
   ___
   
* 우리가 정한 변수들이 너무 공격력에 기반한 데이터였음.

* 독립변수들이 데이터를 가공한 평균값으로 되어 있어서 아웃라이어를 제대로 파악하지 못했음.