## 注意

如果在matplotlib绘图时出现异常，请删除"turtle.done()"

如果仍有问题，请在每次绘图时注释掉之前的绘图代码并重启kernel

可能是由于不同电脑之间存在差异，带来的不便烦请谅解

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
from factor_analyzer import FactorAnalyzer as FA
from sklearn.cross_decomposition import CCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression as LR
import turtle
# Glossary for the raw dataset's column names:
#   "offers to receive": direction of pass
#   "channel": the area called "肋部" in Chinese football terminology
# Use the Tk-based interactive backend for every figure drawn below.
matplotlib.use("TkAgg")

## 研究目标

### 球队水平

1. 是否存在潜在因子 FA 随后用这些因子的得分来进行后续计算（3、4、5）

2. 球队的风格（进攻/防守）间是否有相关性 CCA

3. 哪些球队较为相似 CA

### 比赛水平

（给定两球队的各自重要数据，由1决定，作差得到相对数据，来预测净胜球，作为胜率判断标准）

4. 哪些变量对于胜负的影响较为显著 Regression
   
5. 两球队胜率预测 Regression

## 选用变量

### 原数据集中变量对应关系

0 team1：主队

1 team2：客队

2 possession team1：主队控球率

3 possession team2：客队控球率

5 number of goals team1: 主队进球数

6 number of goals team2：客队进球数

10 total attempts team1：主队射门尝试次数

11 total attempts team2：客队射门尝试次数

12 conceded team1：主队失球数

13 conceded team2：客队失球数

18 assists team1：主队助攻数

19 assists team2：客队助攻数

20 on target attempts team1：主队射正次数

21 on target attempts team2：客队射正次数

（可选）38 total offers to receive team1：主队主动接球次数

（可选）39 total offers to receive team2：客队主动接球次数

60 fouls against team1：主队被犯规数

61 fouls against team2：客队被犯规数

64 passes team1：主队传球数

65 passes team2：客队传球数

66 passes completed team1：主队传球成功数

67 passes completed team2：客队传球成功数

80 goal preventions team1：主队扑救数

81 goal preventions team2：客队扑救数

84 forced turnovers team1：主队造成失误数

85 forced turnovers team2：客队造成失误数

86 defensive pressures applied team1：主队防守压迫次数

87 defensive pressures applied team2：客队防守压迫次数



### 最后选取变量

控球率

射门尝试次数

射正率

助攻数

被犯规数

传球数

传球成功率

扑救率

防守压迫次数

造成失误数

进球数

失球数

净胜球数


## 读取数据

### 读取数据代码

In [5]:
# Raw match-level dataset, one row per match.
data = pd.read_csv("./data/data.csv", header=0, encoding='utf-8')

# Positional indices of the columns kept for the analysis: the two team
# names first, then the paired team1/team2 statistics.
selected_columns = [
    0, 1,      # team1, team2
    2, 3,      # possession
    5, 6,      # number of goals
    10, 11,    # total attempts
    12, 13,    # conceded
    18, 19,    # assists
    20, 21,    # on target attempts
    60, 61,    # fouls against
    64, 65,    # passes
    66, 67,    # passes completed
    80, 81,    # goal preventions
    84, 85,    # forced turnovers
    86, 87,    # defensive pressures applied
]

# Reduced dataset: first 64 rows, selected columns only; also persisted to disk.
data_reduced = data.iloc[:64, selected_columns]
data_reduced.to_csv("./data/data_reduced.csv", encoding='utf-8')
data_reduced.head(10)

Unnamed: 0,team1,team2,possession team1,possession team2,number of goals team1,number of goals team2,total attempts team1,total attempts team2,conceded team1,conceded team2,...,passes team1,passes team2,passes completed team1,passes completed team2,goal preventions team1,goal preventions team2,forced turnovers team1,forced turnovers team2,defensive pressures applied team1,defensive pressures applied team2
0,QATAR,ECUADOR,0.42,0.5,0,2,5,6,2,0,...,450,480,381,409,6,5,52,72,256,279
1,ENGLAND,IRAN,0.72,0.19,6,2,13,8,2,6,...,809,224,730,154,8,13,63,72,139,416
2,SENEGAL,NETHERLANDS,0.44,0.45,0,2,14,9,2,0,...,383,438,313,374,9,15,63,73,263,251
3,UNITED STATES,WALES,0.51,0.39,1,1,6,7,1,1,...,569,409,509,321,7,7,81,72,242,292
4,ARGENTINA,SAUDI ARABIA,0.64,0.24,1,2,14,3,2,1,...,610,267,529,190,4,14,65,80,163,361
5,DENMARK,TUNISIA,0.55,0.33,0,0,11,11,0,0,...,594,387,523,302,11,11,76,73,226,287
6,MEXICO,POLAND,0.54,0.31,0,0,13,8,0,0,...,485,322,422,241,8,15,61,70,193,290
7,FRANCE,AUSTRALIA,0.56,0.35,4,1,22,4,1,4,...,734,466,672,399,4,22,64,56,250,316
8,MOROCCO,CROATIA,0.32,0.57,0,0,8,6,0,0,...,360,667,290,592,7,8,87,58,391,217
9,GERMANY,JAPAN,0.65,0.22,1,2,25,10,2,1,...,820,261,743,207,12,26,55,87,164,487


In [None]:
# Show every column label of the raw dataset.
print(data.columns)

## 数据预处理

### 数据预处理代码

In [None]:
# Build one record per (match, team) from the reduced match table, then
# average the records of each team.
#
# Column layout of data_reduced (positional): 0-1 team names, 2-3 possession,
# 4-5 goals, 6-7 total attempts, 8-9 conceded, 10-11 assists,
# 12-13 on-target attempts, 14-15 fouls against, 16-17 passes,
# 18-19 passes completed, 20-21 goal preventions, 22-23 forced turnovers,
# 24-25 defensive pressures applied.  Even index = team1, odd index = team2.
team_list = []
pos_list = []
goal_list = []
attempt_list = []
lost_list = []
assist_list = []
attempt_on_list = []
foul_list = []
passes_list = []
passes_ok_list = []
prevent_list = []
turnover_list = []
pressure_list = []
net_list = []

# NOTE(review): the last two rows of data_reduced are skipped here, while the
# regression step iterates over every row; confirm this is intentional.
for i in range(data_reduced.shape[0] - 2):
    line = list(data_reduced.iloc[i,:])
    # j = 0 handles team1, j = 1 handles team2 of the same match.
    for j in range(2):
        attempts = int(line[6+j])
        on_target = int(line[12+j])
        goals = int(line[4+j])
        conceded = int(line[8+j])

        team_list.append(line[0+j])
        pos_list.append(float(line[2+j]))
        goal_list.append(goals)
        attempt_list.append(attempts)
        lost_list.append(conceded)
        assist_list.append(int(line[10+j]))
        try:
            attempt_on_list.append(on_target / attempts)
        except ZeroDivisionError:
            # A team may have no attempt at all; keep the raw on-target count.
            attempt_on_list.append(on_target)
        # 15-j reads the opponent's "fouls against" column.
        foul_list.append(int(line[15-j]))
        passes_list.append(int(line[16+j]))
        passes_ok_list.append(int(line[18+j])/int(line[16+j]))
        try:
            # Goal preventions divided by the opponent's total attempts (7-j).
            prevent_list.append(int(line[20+j])/int(line[7-j]))
        except (ZeroDivisionError, ValueError):
            # Undefined ratio (opponent never shot, or the value is missing).
            prevent_list.append(np.nan)
        # 23-j and 25-j read the opponent's column of the respective pair.
        turnover_list.append(int(line[23-j]))
        pressure_list.append(int(line[25-j]))
        net_list.append(goals - conceded)

data_dict = {
    '球队': team_list,
    '控球率': pos_list,
    '进球数': goal_list,
    '射门数': attempt_list,
    '射正率': attempt_on_list,
    '失球数': lost_list,
    '助攻数': assist_list,
    '犯规数': foul_list,
    '传球数': passes_list,
    '传球成功率': passes_ok_list,
    '扑救率': prevent_list,
    '造成失误数': turnover_list,
    '压迫数': pressure_list,
    '净胜球': net_list
}

data_df = pd.DataFrame(data_dict)
# Average every statistic over each team's matches.
data_df = data_df.groupby(['球队'], as_index=False).agg("mean")
data_df.to_csv("./data/data_df.csv", encoding='utf-8')
data_df  # overall per-team performance statistics

In [None]:
# Descriptive statistics of the per-team averages.
data_df.describe()

## 因子分析FA

### 因子分析代码

In [None]:
# Feature matrix: columns 1..12 of data_df, i.e. every per-team statistic
# except the team name (column 0) and the last column.
data_X = data_df.iloc[:, 1:13]

# Rotation handed to the factor analyzer (varimax/oblimax).
method = 'varimax'
fa = FA(3, rotation=method)
fa.fit(data_X)

print(data_X.columns)
print(fa.loadings_)

# Factor scores per team; the three factors are named midfield / attack /
# defence, and the defence score is negated before use.
df_new = pd.DataFrame(fa.transform(data_X))
df_new.columns = ['中场', '前场', '后场']
df_new['后场'] = -df_new['后场']
df_new['球队'] = data_df['球队']
df_new.to_csv("./data/df_new.csv", encoding='utf-8')
df_new  # factor scores of every team

### 因子分析结果

中场（组织、拦截）：控球率、传球数、传球成功率、造成失误数、压迫数

前场（进攻）：进球数、射正率、助攻数

后场（防守）：失球数（正）、扑救率（反）（与防守能力反相关）

### 因子分析解释比例

In [None]:
# Variance information of the fitted factors, as reported by the analyzer.
fa.get_factor_variance()

### 因子分析可视化

#### 崖底碎石图

In [None]:
# Scree plot: eigenvalue of each factor against its rank.
ev, v = fa.get_eigenvalues()
factor_ids = range(1, data_X.shape[1]+1)
plt.scatter(factor_ids, ev)
plt.plot(factor_ids, ev)
plt.title("Scree Plot")
plt.xlabel("Factors")
plt.ylabel("Eigenvalue")
plt.grid()
# Save BEFORE showing: once the window opened by show() is closed the figure
# is gone, and a later savefig() would write an empty image.
plt.savefig('./pics/scree.png', bbox_inches='tight')
plt.show()
turtle.done()

#### 因子载荷可视化

In [None]:
# Heatmap of the absolute factor loadings, one row per input variable.
df_loadings = pd.DataFrame(np.abs(fa.loadings_), index=data_X.columns)
# SimHei is required to render the Chinese labels; unicode_minus=False keeps
# the minus sign displayable with that font.
font = {'family' : 'SimHei',
    'weight' : 'bold',
    'size'  : '16'}
plt.rc('font', **font)
plt.rc('axes', unicode_minus=False)
sns.heatmap(df_loadings, annot=True, cmap='BuPu')
plt.title('因子载荷可视化示意图')
plt.xlabel('因子')
plt.ylabel('球队数据变量')
file_path  = './pics/FA_' + method + '.png'
# Save BEFORE showing: once the window opened by show() is closed the figure
# is gone, and a later savefig() would write an empty image.
plt.savefig(file_path, bbox_inches='tight')
plt.show()
turtle.done()

## 典型相关分析CCA

### 典型相关分析代码

In [None]:
# Variable groups taken from data_df by position:
#   front: 进球数, 射正率, 助攻数
#   mid:   控球率, 传球数, 传球成功率, 造成失误数, 压迫数
#   back:  失球数, 扑救率
front = data_df.iloc[:, [2, 4, 6]]
mid = data_df.iloc[:, [1, 8, 9, 11, 12]]
back = data_df.iloc[:, [5, 10]]

# First canonical pair between the front and mid groups.
cca_1 = CCA(n_components=1).fit(front, mid)
front_mid, mid_front = cca_1.transform(front, mid)
for loadings in (cca_1.x_loadings_, cca_1.y_loadings_):
    print(loadings, '\n')

# First canonical pair between the mid and back groups.
cca_2 = CCA(n_components=1).fit(mid, back)
mid_back, back_mid = cca_2.transform(mid, back)
print(cca_2.x_loadings_, '\n')
print(cca_2.y_loadings_)

In [None]:
# Print and draw the coefficient matrix of the front/mid CCA model.
coef_matrix = cca_1.coef_
print(coef_matrix)
cca_1_df = pd.DataFrame(coef_matrix)

# Font able to render Chinese glyphs; keep the minus sign displayable.
font = dict(family='SimHei', weight='bold', size='16')
plt.rc('font', **font)
plt.rc('axes', unicode_minus=False)

sns.heatmap(cca_1_df, annot=True, cmap='BuPu')
plt.show()
turtle.done()

### 典型相关分析结果

前场-中场：进球能力（进球数、助攻数）& 综合支配球能力（包括控球、传递球以及拦截抢球）

中场-后场：控球能力（控球率、传球）& 防失球能力（失球数 反相关） 

## 聚类分析CA

### 聚类分析代码

In [None]:
# K-means clustering of the teams on their three factor scores, drawn on the
# first two principal components.
n_clusters = 4

# Guarded so that re-running the cell does not fail once the index is set.
if "球队" in df_new.columns:
    df_new = df_new.set_index("球队")

km = KMeans(n_clusters=n_clusters, n_init=20, max_iter=300).fit(df_new.iloc[:,0:3])
df_new['分类'] = km.labels_
print(df_new)

# 2-D projection used only for plotting.
pca = PCA(n_components=2)
res_pca = pca.fit_transform(df_new.iloc[:,0:3])
df_pca = pd.DataFrame(res_pca, index=df_new.index)
df_pca['分类'] = df_new['分类']
x = df_pca.iloc[:,0]
y = df_pca.iloc[:,1]

# One colour per cluster, so that every cluster found by KMeans is drawn.
colors = ['r', 'g', 'b', 'm']
text = df_new.index
n_teams = len(text)

# Default label offset, with hand-tuned exceptions for a few crowded points.
x_nudges = {23: -0.15, 27: 0, 16: -0.15}
y_nudges = {16: -0.15}
xx = [x_nudges.get(i, -0.1) for i in range(n_teams)]
yy = [y_nudges.get(i, 0.05) for i in range(n_teams)]

plot = []
for i in range(n_clusters):
    df_i = df_pca[df_pca['分类']==i]
    temp = plt.scatter(df_i[0], df_i[1], marker='o', color=colors[i], s=50)
    plot.append(temp)
for i in range(n_teams):
    # .iloc: x and y are indexed by team name, so positions must be explicit.
    px, py = x.iloc[i], y.iloc[i]
    plt.annotate(text[i], xy=(px, py), xytext=(px+xx[i], py+yy[i]))
plt.legend(handles=plot, labels=['Class %d' % (i+1) for i in range(n_clusters)], loc='best')
plt.show()
turtle.done()

### 聚类分析结果

In [None]:
# Cluster label assigned to each team by the KMeans step.
df_new['分类']

## 回归Regression

### 球队数据差距处理代码

In [None]:
def sigmoid(x, steepness=0.8):
    """Map a goal difference to a value in (0, 1) with a logistic curve.

    Args:
        x: Scalar or array-like input (here: goals scored minus conceded).
        steepness: Slope factor applied to ``x`` before the logistic
            transform. Defaults to 0.8, the value used by this analysis.

    Returns:
        ``1 / (1 + exp(-steepness * x))``; 0.5 for ``x == 0``.
    """
    return 1 / (1+np.exp(-steepness*x))

def find(team):
    """Return the position of ``team`` in the module-level ``text`` index.

    Args:
        team: Team name exactly as it appears in ``text``.

    Returns:
        Zero-based position of the first matching entry.

    Raises:
        ValueError: If ``team`` is not present. Failing here is clearer than
            returning ``None``, which would only crash later when used as a
            list index.
    """
    for i, name in enumerate(text):
        if team == name:
            return i
    raise ValueError("unknown team: %r" % (team,))
        
# Factor scores per team, in the same order as the team index used by find().
mid_list = list(df_new['中场'])
front_list = list(df_new['前场'])
back_list = list(df_new['后场'])

team1_list = [] # team
team2_list = [] # opponent
mid_diff = []
front_diff = []
back_diff = []
win_rate = []

# Every match yields two rows: one from each side's point of view, with the
# factor-score gaps and the goal difference mirrored.
for i in range(data_reduced.shape[0]):
    line = list(data_reduced.iloc[i,:])
    team1 = line[0]
    team2 = line[1]
    id1 = find(team1)
    id2 = find(team2)
    goals1 = int(line[4])
    goals2 = int(line[5])
    sides = ((team1, id1, goals1), (team2, id2, goals2))
    for (team, tid, scored), (opponent, oid, conceded) in (sides, sides[::-1]):
        team1_list.append(team)
        team2_list.append(opponent)
        mid_diff.append(mid_list[tid]-mid_list[oid])
        front_diff.append(front_list[tid]-front_list[oid])
        back_diff.append(back_list[tid]-back_list[oid])
        # The goal difference is squashed into (0, 1) as a win-probability proxy.
        win_rate.append(sigmoid(scored-conceded))

diff_dict = {
    '球队': team1_list,
    '对手': team2_list,
    '前场差距': front_diff,
    '中场差距': mid_diff,
    '后场差距': back_diff,
    '获胜概率': win_rate
}
diff_df = pd.DataFrame(diff_dict)
# Written next to the other generated tables instead of a machine-specific
# absolute path, so the notebook runs on any computer.
diff_df.to_csv("./data/diff_df.csv", encoding='utf-8')
diff_df  # per-match performance gaps between the two teams

### 回归模型建立代码

In [None]:
# Feature columns of diff_df, in the order the regression expects them.
feature_columns = ['前场差距', '中场差距', '后场差距']


def normalize(data_X):
    """Standardize each column to zero mean and unit (population) variance.

    The reductions are requested explicitly along axis 0 so the result is
    column-wise regardless of the installed pandas/NumPy version.

    Args:
        data_X: DataFrame, Series or ndarray of numeric values.

    Returns:
        Object of the same shape with every column standardized.
    """
    return (data_X - data_X.mean(axis=0)) / data_X.std(axis=0, ddof=0)


lm = LR()
diff_raw = diff_df[feature_columns]
# Training statistics are kept so predictions can be scaled the same way.
diff_mean = diff_raw.mean(axis=0)
diff_std = diff_raw.std(axis=0, ddof=0)
diff_X = normalize(diff_raw)
diff_Y = diff_df['获胜概率']
lm.fit(diff_X, diff_Y)
print(lm.coef_, lm.intercept_)


def predict(team1, team2):
    """Print the modelled win probability for both sides of a fixture.

    Args:
        team1: Name of the first team, as stored in the team index.
        team2: Name of the second team, as stored in the team index.

    Returns:
        None. Two lines are printed, one per point of view.
    """
    id1 = find(team1)
    id2 = find(team2)
    gap = pd.DataFrame(
        [[front_list[id1]-front_list[id2],
          mid_list[id1]-mid_list[id2],
          back_list[id1]-back_list[id2]]],
        columns=feature_columns,
    )
    # The model was fitted on standardized gaps, so the inputs must be scaled
    # with the same training mean and standard deviation.
    x_pred_1 = (gap - diff_mean) / diff_std
    x_pred_2 = (-gap - diff_mean) / diff_std
    # [0]: predict() returns a one-element array; format the scalar itself.
    rate_1 = lm.predict(x_pred_1)[0]
    rate_2 = lm.predict(x_pred_2)[0]
    print(team1, 'against', team2, ':', "%.2f" % (rate_1*100), '%')
    print(team2, 'against', team1, ':', "%.2f" % (rate_2*100), '%\n')



### 比赛结果预测代码

In [None]:
# Add with your will
predict('ARGENTINA', 'SAUDI ARABIA')
predict('ARGENTINA', 'MEXICO')
predict('ARGENTINA', 'POLAND')
predict('ARGENTINA', 'AUSTRALIA')
predict('ARGENTINA', 'NETHERLANDS')
predict('ARGENTINA', 'CROATIA')
predict('ARGENTINA', 'FRANCE')
predict('ARGENTINA', 'PORTUGAL')