In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.ticker as plticker
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
#Загружаем данные 
schedule = pd.read_csv('fifa-world-cup-2022.csv')
matches = pd.read_csv('results.csv')


Уберём из датасета с результатами команды, которые не участвуют в чемпионате

In [3]:
teams = schedule['Home Team'].unique()[:32] # все команды-участники ЧМ
print(f'Размер исходного датасета - {matches.shape}')
teams_home = matches[matches['home_team'].isin(teams)]
teams_away = matches[matches['away_team'].isin(teams)]
matches = pd.concat((teams_home, teams_away))
print(f'Размер нового датасета - {matches.shape}')

Размер исходного датасета - (43752, 9)
Размер нового датасета - (21016, 9)


Подготовим финальную версию датасета - оставим только столбцы с участниками матча и добавим столбец с результатом матча

In [4]:
matches =matches.reset_index(drop=True)
result = []
'''МО работает только с числами, поэтому заполним колонку результат следующим образом: 
    если победила команда слева - 2, если команда справа - 1, если ничья - 0'''
for i in range (len(matches['home_team'])):
    if matches ['home_score'][i] > matches['away_score'][i]:
        result.append(2)
    elif matches['home_score'][i] < matches ['away_score'][i]:
        result.append(0)
    else:
        result.append(1)
matches['result'] = result
matches = matches.drop(['date','home_score','away_score','tournament','city','country','neutral'], axis=1)
#также, требуется перевести названия команд в численный формат

df = pd.get_dummies(matches, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])
matches

Unnamed: 0,home_team,away_team,result
0,England,Scotland,2
1,England,Scotland,1
2,England,Scotland,0
3,Wales,Scotland,0
4,England,Wales,2
...,...,...,...
21011,Jamaica,Mexico,1
21012,Netherlands,Wales,2
21013,Poland,Belgium,0
21014,Chile,Ghana,1


Обучим модель логистической регрессии определять результат матча

In [5]:

X = df.drop(['result'], axis=1)
y = df["result"]
y = y.astype('int')

# Делим данные на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

logreg = LogisticRegression( max_iter=1500 )
logreg.fit(X_train, y_train)
score = logreg.score(X_train, y_train)
score2 = logreg.score(X_test, y_test)

print("Training set accuracy: ", '%.3f'%(score))
print("Test set accuracy: ", '%.3f'%(score2))
a=np.array(X.columns)
a

Training set accuracy:  0.575
Test set accuracy:  0.565


array(['home_team_Afghanistan', 'home_team_Albania', 'home_team_Algeria',
       'home_team_Andalusia', 'home_team_Andorra', 'home_team_Angola',
       'home_team_Argentina', 'home_team_Armenia', 'home_team_Aruba',
       'home_team_Australia', 'home_team_Austria', 'home_team_Azerbaijan',
       'home_team_Bahrain', 'home_team_Bangladesh', 'home_team_Barbados',
       'home_team_Basque Country', 'home_team_Belarus',
       'home_team_Belgium', 'home_team_Belize', 'home_team_Benin',
       'home_team_Bermuda', 'home_team_Bhutan', 'home_team_Bolivia',
       'home_team_Bosnia and Herzegovina', 'home_team_Botswana',
       'home_team_Brazil', 'home_team_Brittany', 'home_team_Brunei',
       'home_team_Bulgaria', 'home_team_Burkina Faso',
       'home_team_Burundi', 'home_team_Cambodia', 'home_team_Cameroon',
       'home_team_Canada', 'home_team_Cape Verde', 'home_team_Catalonia',
       'home_team_Cayman Islands', 'home_team_Central African Republic',
       'home_team_Chad', 'home_team_

Данная модель обучалась, учитывая матчи, которые проходили на территории одной из команд, что, возможно, давало команде-хозяину преимущество. Поэтому перед тем, как предсказывать победителя в матче, нужно будет как-то выдать преимущество одной из команд. Для этого воспользуемся файлом с рейтингом фифа и будем помечать как фаворита команду с большим рейтингом фифа.

In [30]:
df

Unnamed: 0,result,home_team_Afghanistan,home_team_Albania,home_team_Algeria,home_team_Andalusia,home_team_Andorra,home_team_Angola,home_team_Argentina,home_team_Armenia,home_team_Aruba,...,away_team_Vanuatu,away_team_Venezuela,away_team_Vietnam,away_team_Vietnam Republic,away_team_Wales,away_team_Yemen,away_team_Yemen DPR,away_team_Yugoslavia,away_team_Zambia,away_team_Zimbabwe
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21011,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21012,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
21013,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21014,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
ranking=pd.read_csv('fifa_rankings.csv')
#ranking=ranking[['rank','country_full','country_abrv']]

group_schedule=schedule[:48] #групповой этап
group_schedule.insert(1, 'first_rank', group_schedule['Home Team'].map(ranking.set_index('Team')['Position']))
group_schedule.insert(2, 'second_rank', group_schedule['Away Team'].map(ranking.set_index('Team')['Position']))
pred_df=[]
for index, row in group_schedule.iterrows():
    if row['first_rank'] < row['second_rank']:
        pred_df.append({'home_team': row['Home Team'], 'away_team': row['Away Team'], 'winning_team': None})
    else:
        pred_df.append({'home_team': row['Away Team'], 'away_team': row['Home Team'], 'winning_team': None})
        
pred_df = pd.DataFrame(pred_df)
backup_pred_set=pred_df
df2 = pd.get_dummies(pred_df, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])
df2 = df2.drop(['winning_team'], axis=1)
missing_cols = set(df.columns) - set(df2.columns)
for c in missing_cols:
    df2[c] = 0
df2 = df2[df.columns]
df2=df2.drop(['result'],axis=1)


Создадим словарь с количеством очков каждой команды в группе и функцию для подсчета очков

In [7]:
#schedule[schedule['Group']=='Group A'][:2]
groups=schedule.Group.unique()[:8]
points=[0,0,0,0]
group_dict={}
for group in groups:
    teams1=schedule[schedule['Group']==group][:2]['Home Team'].unique()
    teams2=schedule[schedule['Group']==group][:2]['Away Team'].unique()
    teams=[]
    for el in teams1:
        teams.append(el)
    for el in teams2:
        teams.append(el)
    keys = teams
    values = points
    dictionary = dict(zip(keys, values))
    group_dict[group]=dictionary


def add_points(team,points):

    group=[key for key,value in group_dict.items() if team in value.keys()][0]
    group_dict[group][team]+=points    

Запустим симуляцию группового этапа

In [8]:
fixtures=schedule[:48]
predictions = logreg.predict(df2)
for i in range(fixtures.shape[0]):
    print(backup_pred_set.iloc[i, 0] + " and " + backup_pred_set.iloc[i, 1])
    if predictions[i] == 2:
        print("Winner: " + backup_pred_set.iloc[i, 0])
        add_points(backup_pred_set.iloc[i, 0],3)
    elif predictions[i] == 1:
        print("Draw")
        add_points(backup_pred_set.iloc[i, 0],1)
        add_points(backup_pred_set.iloc[i, 1],1)
    elif predictions[i] == 0:
        print("Winner: " + backup_pred_set.iloc[i, 1])
        add_points(backup_pred_set.iloc[i, 1],3)
    print('Probability of ' + backup_pred_set.iloc[i, 0] + ' winning: ', '%.3f'%(logreg.predict_proba(df2)[i][2]))
    print('Probability of Draw: ', '%.3f'%(logreg.predict_proba(df2)[i][1]))
    print('Probability of ' + backup_pred_set.iloc[i, 1] + ' winning: ', '%.3f'%(logreg.predict_proba(df2)[i][0]))
    print("")

Netherlands and Senegal
Winner: Netherlands
Probability of Netherlands winning:  0.647
Probability of Draw:  0.237
Probability of Senegal winning:  0.115

England and Iran
Winner: England
Probability of England winning:  0.634
Probability of Draw:  0.262
Probability of Iran winning:  0.105

Ecuador and Qatar
Winner: Ecuador
Probability of Ecuador winning:  0.568
Probability of Draw:  0.293
Probability of Qatar winning:  0.140

Wales and USA
Winner: Wales
Probability of Wales winning:  0.660
Probability of Draw:  0.174
Probability of USA winning:  0.166

Argentina and Saudi Arabia
Winner: Argentina
Probability of Argentina winning:  0.803
Probability of Draw:  0.152
Probability of Saudi Arabia winning:  0.045

Denmark and Tunisia
Winner: Denmark
Probability of Denmark winning:  0.636
Probability of Draw:  0.222
Probability of Tunisia winning:  0.142

Poland and Mexico
Winner: Poland
Probability of Poland winning:  0.427
Probability of Draw:  0.285
Probability of Mexico winning:  0.288



In [15]:
group_dict

{'Group A': {'Senegal': 6, 'Qatar': 0, 'Netherlands': 9, 'Ecuador': 3},
 'Group B': {'England': 9, 'USA': 0, 'Iran': 3, 'Wales': 6},
 'Group C': {'Argentina': 9, 'Mexico': 3, 'Saudi Arabia': 0, 'Poland': 6},
 'Group D': {'Denmark': 6, 'France': 9, 'Tunisia': 3, 'Australia': 0},
 'Group F': {'Morocco': 3, 'Belgium': 9, 'Croatia': 6, 'Canada': 0},
 'Group E': {'Germany': 9, 'Spain': 6, 'Japan': 0, 'Costa Rica': 3},
 'Group G': {'Switzerland': 6, 'Brazil': 9, 'Cameroon': 0, 'Serbia': 3},
 'Group H': {'Uruguay': 6, 'Portugal': 9, 'Korea Republic': 0, 'Ghana': 3}}