In [1078]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import ast
from typing import Dict, List
from catboost import CatBoostClassifier
import json
from pandas import json_normalize
from tqdm import tqdm_notebook as tqdm
import catboost

In [1079]:
# Show every column when a DataFrame is displayed, and switch off pandas'
# chained-assignment warning for the whole notebook.
pd.options.display.max_columns = None
pd.set_option("mode.chained_assignment", None)

In [1080]:
# Load the raw fight events and the raw fighter profiles (paths are relative to the notebook).
events = pd.read_csv('../input/hackaton-ufc/data/0.events_raw.csv')
fighters = pd.read_csv('../input/hackaton-ufc/data/0.fighters_raw.csv')
# Names of fighter profile columns. The list is not referenced again in the
# visible part of the notebook.
fighters_cols = [
    "id",
    "name",
    "weight",
    "height",
    "armSpan",
    "legSwing",
    "weightCategory.id",
    "weight_category_name",
    "dateOfBirth",
    "country",
    "city",
    "timezone",
]

## Исследование данных о бойцах

In [1081]:
# Preview the first rows of the fighters table.
fighters.head()

In [1082]:
# Column dtypes and non-null counts.
fighters.info()

In [1083]:
# Count fully duplicated rows.
fighters.duplicated().sum()

In [1084]:
# Convert dateOfBirth to datetime.
fighters['dateOfBirth'] = pd.to_datetime(fighters["dateOfBirth"])

In [1085]:
# List the distinct country values of the fighters.
fighters['country'].unique()

In [1086]:
# Number of distinct country values.
len(fighters['country'].unique())

Найденные ошибки в странах:

Названия штатов США и United States заменить на USA; Dagestan и Rostov-On-Don — это Russia; Wales и England — United Kingdom; встречается Soviet Union, а также Ukrainian SSR (наряду с Ukraine) и Armenian SSR (наряду с Armenia); Saint Pierre and Miquelon — France; 'Los Angeles' и 'Saint Louis' — USA; Ontario и Quebec — Canada; Floirda (опечатка во Florida) и Califorina (опечатка в California) — USA; три варианта написания Конго: Democratic Republic of Congo / Democratic Republic Of The Congo / Republic of Congo; Bosnia / Bosnia Herzegovina; British Columbia — провинция Канады (не путать с Colombia).

Исправим то, что нашли

In [1087]:
# Values of the `country` column that actually denote a place inside the USA:
# state and territory names, two cities, and two misspelled state names.
# Every entry must end with a comma: adjacent string literals are silently
# concatenated ("Floirda" "Georgia" becomes "FloirdaGeorgia").
# "Georgia" is intentionally left out: in a country column it is ambiguous with
# the country of Georgia, and the previous version of this list never matched
# it either.
usa_state_names = [
    "Alaska",
    "Alabama",
    "Arkansas",
    "American Samoa",
    "Arizona",
    "California",
    "Califorina",  # misspelling of California
    "Colorado",
    "Connecticut",
    "District of Columbia",
    "Delaware",
    "Florida",
    "Floirda",  # misspelling of Florida
    "Guam",
    "Hawaii",
    "Iowa",
    "Idaho",
    "Illinois",
    "Indiana",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Saint Louis",
    "Los Angeles",
    "Massachusetts",
    "Maryland",
    "Maine",
    "Michigan",
    "Minnesota",
    "Missouri",
    "Mississippi",
    "Montana",
    "North Carolina",
    "North Dakota",
    "Nebraska",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "Nevada",
    "New York",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Puerto Rico",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Virginia",
    "Virgin Islands",
    "Vermont",
    "Washington",
    "Wisconsin",
    "West Virginia",
    "Wyoming",
]


In [1088]:
# Collapse "United States" and every entry of usa_state_names into the single label "USA".
fighters.loc[fighters['country'].isin(['United States', *usa_state_names]), 'country'] = 'USA'


# Number of distinct country values left after the merge.
fighters['country'].unique().size

In [1089]:
# Map region names, historical names and spelling variants found in the
# `country` column to one canonical country label each.
country_aliases = {
    'Dagestan': 'Russia',
    'Rostov-On-Don': 'Russia',
    'Wales': 'United Kingdom',
    'England': 'United Kingdom',
    'Saint Pierre and Miquelon': 'France',
    'Ontario': 'Canada',
    'Quebec': 'Canada',
    # British Columbia is a Canadian province; it used to be mapped to 'Colombia'.
    'British Columbia': 'Canada',
    'Ukrainian SSR': 'Soviet Union',
    'Armenian SSR': 'Soviet Union',
    'Democratic Republic Of The Congo': 'Democratic Republic of Congo',
    # NOTE(review): the Republic of the Congo and the DR Congo are different
    # countries; the mapping is kept as it was - confirm it is intended.
    'Republic of Congo': 'Democratic Republic of Congo',
    'Bosnia Herzegovina': 'Bosnia',
    'Holland': 'Netherlands',
}
for alias, canonical in country_aliases.items():
    fighters.loc[fighters['country'] == alias, 'country'] = canonical


In [1090]:
# Number of distinct country values after the clean-up.
len(fighters['country'].unique())

In [1091]:
# The remaining distinct country values.
fighters['country'].unique()

In [1092]:
# Summary statistics of the fighters table.
fighters.describe()


## Посмотрим на выбросы в параметрах тела

In [1093]:
# Summary statistics of the arm span.
fighters['armSpan'].describe()

In [1094]:
# Box plot of the arm span to spot outliers.
fighters['armSpan'].plot.box()

In [1095]:
# How many fighters have an arm span above 210.
fighters[fighters['armSpan'] > 210]['armSpan'].count()

In [1096]:
# The arm spans above 210, with their row labels.
fighters[fighters['armSpan'] > 210]['armSpan']

In [1097]:
# Remove the arm-span outlier found above. The row is addressed by its index
# label, so this relies on the index `fighters` has at this point.
# errors="ignore" keeps the cell re-runnable: dropping an already removed label
# is a no-op instead of a KeyError.
fighters = fighters.drop(index=3405, errors="ignore")

In [1098]:
# Check: list any arm spans above 400 that are still present.
fighters[fighters['armSpan'] > 400]['armSpan']

In [1099]:
# Summary statistics of the leg swing.
fighters['legSwing'].describe()

In [1100]:
# Box plot of the leg swing to spot outliers.
fighters['legSwing'].plot.box()

In [1101]:
# Leg swings below 85.
fighters[fighters['legSwing'] < 85]['legSwing']

In [1102]:
# Leg swings above 120.
fighters[fighters['legSwing'] > 120]['legSwing']

In [1103]:
# Summary statistics of the height.
fighters['height'].describe()

In [1104]:
# Box plot of the height to spot outliers.
fighters['height'].plot.box()

In [1105]:
# Heights above 220.
fighters[fighters['height'] > 220]['height']

In [1106]:
# Heights below 154.
fighters[fighters['height'] < 154]['height']

In [1107]:
# Summary statistics of the weight.
fighters['weight'].describe()

In [1108]:
# Box plot of the weight to spot outliers.
fighters['weight'].plot.box()

In [1109]:
# Weights below 52.
fighters[fighters['weight'] < 52]['weight']

In [1110]:
# Weights above 200.
fighters[fighters['weight'] > 200]['weight']

In [1111]:
# Remove the weight outlier found above. The row is addressed by its index
# label, so this relies on the index `fighters` has at this point.
# errors="ignore" keeps the cell re-runnable: dropping an already removed label
# is a no-op instead of a KeyError.
fighters = fighters.drop(index=1904, errors="ignore")

In [1112]:
# Check: list any weights above 200 that are still present.
fighters[fighters['weight'] > 200]['weight']

## Посмотрим на количество нулевых значений в параметрах тела

In [1113]:
# Number of missing weights.
fighters['weight'].isnull().sum() 

In [1114]:
# Number of missing heights.
fighters['height'].isnull().sum()

In [1115]:
# Number of missing leg swings.
fighters['legSwing'].isnull().sum()

In [1116]:
# Number of missing arm spans.
fighters['armSpan'].isnull().sum()

In [1117]:
# Number of non-null fighter ids, for comparison with the counts above.
fighters['id'].count()

## Заменим пропуски в параметрах тела на медианные значения по весовым категориям

In [1118]:
# Distinct weight-category names; the imputation loops below iterate over them.
groupNames = fighters['weightCategory.name'].unique()
groupNames

In [1119]:
median_armSpan = fighters.groupby(['weightCategory.name']).median()['armSpan']

for name in groupNames:
    fighters.loc[ (fighters['weightCategory.name']==name) & 
                 (fighters['armSpan'].isnull()), 
                 'armSpan'] = median_armSpan[name]

In [1120]:
fighters['armSpan'].isnull().sum()

In [1121]:
median_legSwing = fighters.groupby(['weightCategory.name']).median()['legSwing']

for name in groupNames:
    fighters.loc[ (fighters['weightCategory.name']==name) & 
                 (fighters['legSwing'].isnull()), 
                 'legSwing'] = median_legSwing[name]

In [1122]:
fighters['legSwing'].isnull().sum()

In [1123]:
median_height = fighters.groupby(['weightCategory.name']).median()['height']

for name in groupNames:
    fighters.loc[ (fighters['weightCategory.name']==name) & 
                 (fighters['height'].isnull()), 
                 'height'] = median_height[name]

In [1124]:
fighters['height'].isnull().sum()

In [1125]:
meadian_weight = fighters.groupby(['weightCategory.name']).median()['weight']

for name in groupNames:
    fighters.loc[ (fighters['weightCategory.name']==name) & 
                 (fighters['weight'].isnull()), 
                 'weight'] = meadian_weight[name]

In [1126]:
fighters['weight'].isnull().sum()

## Переведем среднее время боя в секунды

In [1127]:
def fightTimeToSeconds(x):
    """Convert a "minutes:seconds" string into a whole number of seconds.

    Parameters
    ----------
    x : str or missing value
        Time such as "05:30". The minutes part may have any number of digits
        ("5:30", "125:07"); only the first two characters after the colon are
        read as seconds.

    Returns
    -------
    int
        Total seconds, or 0 when ``x`` is missing (None/NaN).

    Raises
    ------
    ValueError
        If the minutes or seconds part is not an integer.
    """
    if pd.isnull(x):
        return 0
    # Split on the colon instead of slicing fixed positions, so the width of
    # the minutes field does not matter.
    minutes, _, seconds = str(x).strip().partition(":")
    return int(minutes) * 60 + int(seconds[:2])

# Replace the textual average fight time with its value in seconds.
fighters['avgFightTime'] = fighters['avgFightTime'].apply(fightTimeToSeconds)

In [1128]:
# First values of the converted average fight time.
fighters['avgFightTime'].head()

In [1129]:
# Summary statistics of the fighters table after the conversion.
fighters.describe()

## Терминология

https://ufc.ru/intro-to-mma

armSpan Arm span or reach (sometimes referred to as wingspan, or spelled "armspan") is the physical measurement of the length from one end of an individual's arms (measured at the fingertips) to the other when raised parallel to the ground at shoulder height at a 90° angle.

legSwing размах ноги

Draw ничья

significantStrikesMissedPerMinute Значительные удары Significant Strikes are basically any strike that significantly improves your chance of winning the fight. That basically boils down to any strike thrown to the legs, body, or head with enough force to be considered damaging.

Judges' Decision (U-Dec, S-Dec):
When a fight is not finished by a submission, KO, or TKO, the judges deliver their scorecards for a final decision. 


Нокаут ( англ. КО ) : боец оказывается в бессознательном состоянии в результате одного или нескольких точных ударов из положения стоя, после которого (ых) бой немедленно остановлен рефери, без возможности добивания соперника на земле.



Технический нокаут (англ. TKO): остановка боя третьим лицом в результате потери одним из бойцов способности продолжать бой. Технические нокауты могут подразделяться на три категории:

Остановка рефери (англ. Referee stoppage). Рефери решает, что один из бойцов не может осмысленно защищаться, и останавливает бой.

Остановка врачом (врач, присутствующий возле ринга, решает, что дальнейшее участие одного из бойцов ставит жизнь или здоровье этого участника под угрозу. Например, травмы или обильное кровотечение)

Остановка «углом» (англ. Corner stoppage). Угловой секундант бойца сигнализирует об остановке боя.

DQ - дисквалификация. Использование запрещённого приема в поединке.
Болевые и удушающие относятся к ТКО, то есть это добровольная сдача бойца или остановка боя арбитром.

Submission (SUB): 
Physical or verbal tap out by one of dozens of methods of submission. 

In [1130]:
# Look at the distinct values of the disciplines column.
fighters['disciplines'].unique()

In [1131]:
# List the names of the weight categories.
fighters['weightCategory.name'].unique()

Наилегчайший вес — от 53 до 57 килограммов.
Легчайший вес — от 57 до 61кг
Полулёгкий вес — от 61 до 66кг
Лёгкий вес — от 66 до 70кг
Полусредний вес — от 70 до 77кг
Средний вес — от 77 до 84 кг
Полутяжёлый вес — от 84 до 93 кг
Тяжёлый вес — от 93 до 120кг

Женский минимальный вес (Strawweight) — до 115 фунтов (52 кг);
Наилегчайший вес (Flyweight) — до 125 фунтов (57 кг);
Легчайший вес (Bantamweight) — до 135 фунтов (61 кг);
Полулёгкий вес (Featherweight) — до 145 фунтов (66 кг).

In [1132]:
# Посмотрим имена категорий веса по id
weight_id_name = fighters.loc[:,['weightCategory.id', 'weightCategory.name']]
weight_id_name.drop_duplicates().sort_values(by=['weightCategory.id'])

In [1133]:
# Среднее значение по весовой категории
fighters.groupby(by='weightCategory.id').mean()['weight']

In [1134]:
# Количество бойцов в каждой весовой категории
fighters['weightCategory.name'].value_counts()

In [1135]:
# Bar chart: number of fighters in each weight category, with the count
# printed above every bar.
fig, ax = plt.subplots(figsize=(10, 5))
# The column is passed by keyword: current seaborn releases do not take the
# first positional argument as `x`.
sns.countplot(x=fighters['weightCategory.name'], ax=ax)
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylim(1)
for p in ax.patches:
    # Bar heights are floats; format them without decimals for the label.
    ax.text(p.get_x() + p.get_width() / 2., p.get_height(), f"{p.get_height():.0f}",
            fontsize=10, color='black', ha='center', va='bottom')
plt.show()

In [1136]:
# Rename the category column to snake_case. `new_f` remains an alias of
# `fighters`, as before, so later cells see the renamed column under both names.
fighters = fighters.rename(columns={'weightCategory.name': 'weight_category_name'})
new_f = fighters
# Rows whose category name contains 'Жен' are counted as women. na=False makes
# fighters without a category a plain False instead of a NaN that cannot be
# used as a boolean mask.
women = new_f.weight_category_name.str.contains('Жен', na=False)

women1 = int(women.sum())
men = len(new_f) - women1

In [1137]:
# Share of men and women among the fighters, as a pie chart.
labels = 'Men', 'Women'
sizes = [men,women1]
# Pull the second ("Women") slice slightly out of the pie.
explode = (0, 0.1,)  

fig1, ax1 = plt.subplots(figsize=(10,8))
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90 )
# Equal aspect ratio so the pie is drawn as a circle.
ax1.axis('equal') 

plt.show()

In [1138]:
# Number of fighters per country (top 50).
(fighters
 .pivot_table(index='country', values='id', aggfunc='count')
 .sort_values('id', ascending=False)).head(50)

In [1139]:
# Look at the distinct values of the methods.winMethods column.
fighters['methods.winMethods'].unique()

In [1140]:
# Countries with the largest total number of wins (top 50).
(fighters
 .pivot_table(index='country', values='wins', aggfunc='sum')
 .sort_values('wins', ascending=False)).head(50)

# Исследование данных о боях

In [1141]:
# Preview the first rows of the events (fights) table.
events.head()

In [1142]:
# Column names of the events table.
events.columns

In [1143]:
# Column dtypes and non-null counts.
events.info()

Пропуски в city, country, duration, rounds, timezone, winnerId

In [1144]:
# Summary statistics of the events table.
events.describe()

In [1145]:
# Convert eventDate.date to datetime.
events['eventDate.date'] = pd.to_datetime(events['eventDate.date'])

In [1146]:
# First values of the converted event date.
events['eventDate.date'].head()

In [1147]:
# Look at what the cells of events['fighters'] contain.
events['fighters'].head(2)

In [1148]:
# Full content of the `fighters` cell of the row labelled 0.
events.loc[0, 'fighters']

In [1149]:
# Look at what the cells of events['avgOdds'] contain.
events['avgOdds'].unique()

In [1150]:
# How many avgOdds cells hold an empty list.
events[events['avgOdds'] == '[]']['avgOdds'].count()

In [1151]:
# How many avgOdds cells hold a non-empty list.
events[events['avgOdds'] != '[]']['avgOdds'].count()

In [1152]:
# First few non-empty avgOdds cells.
events[events['avgOdds'] != '[]']['avgOdds'].head()

In [1153]:
# Full content of the avgOdds cell of the row labelled 1558.
events.loc[1558, 'avgOdds']

# Извлекаем данные из колонок avgOdds и fighters

**Парсим колонку avgOdds**

In [1154]:
def parse_odds(row: pd.Series) -> pd.Series:
    """Extract the two fighters' average odds from one event row.

    Parameters
    ----------
    row : pd.Series
        Must provide ``avgOdds`` (the string form of a list of dicts with the
        keys ``fighterId`` and ``value``) and ``fighterId_1``.

    Returns
    -------
    pd.Series
        The ``value`` entries ordered so that the entry of ``fighterId_1``
        comes first. Two NaN values when ``avgOdds`` is missing or empty.
    """
    avg_odds = row["avgOdds"]
    # `x == np.nan` is always False, so missing values are detected with
    # pd.isnull before the string is parsed.
    if pd.isnull(avg_odds) or avg_odds == "[]":
        return pd.Series([np.nan] * 2)
    avg_odds = ast.literal_eval(avg_odds)
    if not avg_odds:
        # An empty list written with different spacing, e.g. "[ ]".
        return pd.Series([np.nan] * 2)
    if avg_odds[0]["fighterId"] != row["fighterId_1"]:
        avg_odds = avg_odds[::-1]
    return pd.Series([f.get("value", np.nan) for f in avg_odds])

In [1155]:
# Expand avgOdds into one odds column per fighter, then discard the raw column.
odds_inputs = ["avgOdds", "fighterId_1", "fighterId_2"]
events[["f1_odds", "f2_odds"]] = events[odds_inputs].apply(parse_odds, axis=1)
events = events.drop(columns="avgOdds")
events.head()

**Парсим колонку fighters**

In [1156]:
# Per-round statistics read for every fighter from the `roundStats` dicts.
fighter_stats_keys = [
    "hitsTotal",
    "hitsSuccessful",
    "takedownTotal",
    "takedownSuccessful",
    "submissionAttempts",
    "takeovers",
    "accentedHitsTotal",
    "accentedHitsSuccessful",
    "knockdowns",
    "protectionPassage",
    "hitsHeadTotal",
    "hitsHeadSuccessful",
    "hitsBodyTotal",
    "hitsBodySuccessful",
    "hitsLegsTotal",
    "hitsLegsSuccessful",
    "accentedHitsPositionDistanceTotal",
    "accentedHitsPositionDistanceSuccessful",
    "accentedHitsPositionClinchTotal",
    "accentedHitsPositionClinchSuccessful",
    "accentedHitsPositionParterTotal",
    "accentedHitsPositionParterSuccessful",
]


def get_fighter_stats_cols() -> List[str]:
    """Return the output column names: every stats key prefixed with ``f1_``, then with ``f2_``."""
    return [f"f{i}_{k}" for i in range(1, 3) for k in fighter_stats_keys]


def sum_round_stats(stats: List[Dict[str, int]]) -> List[int]:
    """Add up one fighter's statistics over all rounds of a fight.

    Parameters
    ----------
    stats : list of dict
        One dict per round, keyed by the names in ``fighter_stats_keys``.
        A key that is absent from a round (or whose value is None) counts as 0.

    Returns
    -------
    list
        The totals in the order of ``fighter_stats_keys``; a list of NaN of the
        same length when ``stats`` is empty.
    """
    if len(stats) == 0:
        return [np.nan] * len(fighter_stats_keys)
    totals = dict.fromkeys(fighter_stats_keys, 0)
    for round_stats in stats:
        for key in totals:
            # Accumulate across rounds. Plain assignment here would keep only
            # the last round's value.
            totals[key] += round_stats.get(key) or 0
    return list(totals.values())


def parse_fight_data(row: pd.Series) -> pd.Series:
    """Build the flat per-fight statistics for both fighters of one event row.

    Parameters
    ----------
    row : pd.Series
        Must provide ``fighters`` (the string form of a list of dicts with the
        keys ``fighterId`` and ``roundStats``) and ``fighterId_2``.

    Returns
    -------
    pd.Series
        The summed statistics of each listed fighter, concatenated in list
        order; the list is reversed when its first entry is ``fighterId_2``.
        When ``fighters`` is missing or empty, NaN for every column of both
        fighters.
    """
    raw = row["fighters"]
    # `x == np.nan` is always False, so missing values are detected with
    # pd.isnull. The NaN row covers both fighters, matching
    # get_fighter_stats_cols().
    if pd.isnull(raw) or raw == "[]":
        return pd.Series([np.nan] * (2 * len(fighter_stats_keys)))
    # A local name that does not shadow the module-level `fighters` frame.
    fight_sides = ast.literal_eval(raw)
    if fight_sides[0]["fighterId"] == row["fighterId_2"]:
        fight_sides = fight_sides[::-1]
    cols = []
    for side in fight_sides:
        cols.extend(sum_round_stats(side["roundStats"]))
    return pd.Series(cols)

In [1157]:
# Expand the `fighters` cell into one statistics column per fighter and key,
# then discard the raw column.
stats_inputs = ["fighters", "fighterId_1", "fighterId_2"]
events[get_fighter_stats_cols()] = events[stats_inputs].apply(parse_fight_data, axis=1)
events = events.drop(columns="fighters")
events.head()

**Добавляем данные о бойцах в датафрейм с боями**

In [1158]:
# Attach each fighter's profile to the fight rows, once per corner.
# DataFrame.join(on=...) matches the given column against the *index* of the
# right-hand frame. `fighters` is read without an index column, so its index
# is just the row number; the profiles are therefore indexed by fighter id
# first. drop=False keeps `id` as a column, so f1_id / f2_id are still created.
# NOTE(review): assumes fighters['id'] is unique - duplicated ids would
# duplicate fight rows; confirm.
fighter_data_cols = fighters.drop(
    columns=["weightCategory.id"]
).columns
fighter_profiles = fighters[fighter_data_cols].set_index("id", drop=False)
events = events.join(fighter_profiles.add_prefix("f1_"), on="fighterId_1")
events = events.join(fighter_profiles.add_prefix("f2_"), on="fighterId_2")
events.head(5)

**Добавляем признак age**

In [1159]:
def add_age(row: pd.Series) -> pd.Series:
    """Compute both fighters' ages, in completed years, on the event date.

    Parameters
    ----------
    row : pd.Series
        Must provide ``eventDate.date``, ``f1_dateOfBirth`` and
        ``f2_dateOfBirth`` as datetime-like values.

    Returns
    -------
    pd.Series
        Two values (fighter 1, fighter 2). NaN where the event date or the date
        of birth is missing or is not a datetime-like value.
    """
    event_date = row.get("eventDate.date")
    result = []
    for prefix in ["f1_", "f2_"]:
        birth = row.get(prefix + "dateOfBirth")
        if pd.isnull(event_date) or pd.isnull(birth):
            result.append(np.nan)
            continue
        try:
            # The plain year difference overstates the age by one when the
            # birthday has not been reached yet in the event year.
            birthday_pending = (event_date.month, event_date.day) < (birth.month, birth.day)
            age = event_date.year - birth.year - int(birthday_pending)
        except (AttributeError, TypeError):
            # Value is not datetime-like (e.g. an unparsed string).
            age = np.nan
        result.append(age)
    return pd.Series(result)

In [1160]:
# Add one age column per fighter, computed row by row from the event date and
# the two dates of birth.
age_inputs = ["eventDate.date", "f1_dateOfBirth", "f2_dateOfBirth"]
events[["f1_age", "f2_age"]] = events[age_inputs].apply(add_age, axis=1)

### Некоторые участники получились слишком юными для UFC

In [1161]:
# Summary statistics of fighter 1's age.
events['f1_age'].describe()

In [1162]:
# Summary statistics of fighter 2's age.
events['f2_age'].describe()

In [1163]:
# Fights where fighter 1 is younger than 18.
events[events['f1_age'] < 18]['f1_age'].count()

In [1164]:
# Fights where fighter 1 is older than 55.
events[events['f1_age'] > 55]['f1_age'].count()

In [1165]:
# Fights where fighter 1's age is missing.
events['f1_age'].isnull().sum()

In [1166]:
# Fights where fighter 2 is younger than 18.
events[events['f2_age'] < 18]['f2_age'].count()

In [1167]:
# Fighter 2 ages above 55 (the values themselves, not a count).
events[events['f2_age'] > 55]['f2_age']

In [1168]:
# Fights where fighter 2's age is missing.
events['f2_age'].isnull().sum()

In [1169]:
# There are few outliers and missing ages, so drop them: keep only fights
# where both fighters are at least 18. A missing age fails the comparison,
# so those rows are removed as well.
events = events.loc[events['f1_age'].ge(18) & events['f2_age'].ge(18)]
events.head()

## Посмотрим на завершенность боёв

In [1170]:
# Distinct values of the `completed` flag.
events['completed'].unique()

In [1171]:
# Number of fights that are not completed.
events[events['completed'] == False]['completed'].count()

In [1172]:
# Drop the fights that are not completed.
events = events[events['completed'] == True]

In [1173]:
# Check: number of not-completed fights left.
events[events['completed'] == False]['completed'].count()

## Создадим dataframe, на котором будем обучать модель

In [1175]:
# Columns used for modelling. Left out on purpose (they were commented out):
# winMethods, f*_hitsTotal, f*_accentedHitsSuccessful, f*_knockdowns.
# NOTE(review): duration, rounds and the f*_hits*/takedown*/submission*/takeovers
# columns are parsed from the statistics of the fight being predicted - confirm
# they are known before the fight, otherwise they leak the outcome.
final_df_columns = [
    'duration',
    'eventDate.date',
    'fighterId_1',
    'fighterId_2',
    'id',
    'rounds',
    'weightCategory.id',
    'winnerId',
    "f1_hitsSuccessful",
    "f1_takedownTotal",
    "f1_takedownSuccessful",
    "f1_submissionAttempts",
    "f1_takeovers",
    "f1_accentedHitsTotal",
    "f2_hitsSuccessful",
    "f2_takedownTotal",
    "f2_takedownSuccessful",
    "f2_submissionAttempts",
    "f2_takeovers",
    "f2_accentedHitsTotal",
    "f1_weight",
    "f2_weight",
    "f1_age",
    "f2_age",
]
# .copy() makes final_df an independent frame, so the columns added to it
# later are not written into a slice of `events`.
final_df = events[final_df_columns].copy()

In [1176]:
# When fighterID_1 wins, then Winner == 1 
final_df['winner'] = final_df['winnerId'] == final_df['fighterId_1']

In [1177]:
final_df.head()

In [1178]:
final_df.drop(columns="winnerId", inplace=True)

In [1179]:
# Split by event year: 2020 is the test set, 2019 the validation set, and
# every other year is used for training.
# NOTE(review): "every other year" would also include years after 2020 if the
# data contains any - confirm.
final_df["year"] = final_df["eventDate.date"].dt.year
df_test = final_df.loc[final_df['year'].eq(2020)]
df_valid = final_df.loc[final_df['year'].eq(2019)]
df_train = final_df.loc[~final_df['year'].isin([2019, 2020])]
df_test.head()

## Корреляционная карта параметров бойцов

In [1180]:
# Check for collinearity between variables.
# Only numeric and boolean columns are correlated: final_df also holds the
# datetime column eventDate.date, which has no meaningful place in a
# correlation matrix.
corr = final_df.select_dtypes(include=['number', 'bool']).corr()

display(corr)
plt.figure(figsize=(19, 15))
# vmin/vmax pin the colour scale to the full [-1, 1] correlation range.
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, cmap='RdBu', annot=True, fmt='.2f', vmin=-1.0, vmax=1.0);

# Пробуем модель CatBoostClassifier

In [1181]:
# Separate the features from the `winner` target for each of the three splits.
X_train, y_train = df_train.drop(columns="winner"), df_train["winner"]
X_valid, y_valid = df_valid.drop(columns="winner"), df_valid["winner"]
X_test, y_test = df_test.drop(columns="winner"), df_test["winner"]

In [1182]:
# Gradient-boosted classifier, monitored on the 2019 validation split.
# random_seed is fixed so that repeated runs start from the same state
# (the random forest in this notebook is seeded the same way).
# NOTE(review): task_type='GPU' requires a GPU-enabled runtime.
clf = CatBoostClassifier(iterations=1000,
                         depth=4,
                         learning_rate=0.01,
                         loss_function='Logloss',
                         colsample_bylevel=1,
                         subsample=0.5,
                         random_strength=1,
                         l2_leaf_reg=30,
                         eval_metric='Accuracy',
                         bootstrap_type='Bernoulli',
                         max_ctr_complexity=2,
                         task_type='GPU',
                         random_seed=0,
                         )

clf.fit(X_train, y_train, plot=True, eval_set=(X_valid, y_valid), verbose=False)

In [1183]:
# Predict from the feature matrix only. df_test still contains the `winner`
# label (and, after this cell has run once, `y_pred`), neither of which the
# model was trained on.
y_pred = clf.predict(X_test)
df_test['y_pred'] = y_pred
df_test.head()

In [1184]:
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

In [1185]:
# Distinct predicted labels.
df_test['y_pred'].unique()

In [1186]:
# Column dtypes of the test frame, including the new y_pred column.
df_test.info()

In [1187]:
y_pred = np.apply_along_axis(lambda x : x == "True", 0, y_pred)

In [1188]:
# Точность модели CatBoostClassifie
f1_score(y_test, y_pred)


# Пробуем модель RandomForestClassifier

In [1189]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 
from sklearn.pipeline import Pipeline

# Random forest baseline with a fixed seed.
rfc = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=0,
)

# The forest takes neither datetimes nor missing values: drop the event date
# and every row that has a gap.
dataset_train = df_train.drop(columns=['eventDate.date']).dropna()
dataset_valid = df_valid.drop(columns=['eventDate.date']).dropna()

X_new_train, y_new_train = dataset_train.drop(columns=['winner']), dataset_train['winner']
X_new_valid, y_new_valid = dataset_valid.drop(columns=['winner']), dataset_valid['winner']

# Fit on the training years and predict the 2019 validation split.
model = Pipeline([('random_forest', rfc)])
model.fit(X_new_train, y_new_train)
y_pred_forest = model.predict(X_new_valid)



In [1190]:
# Accuracy of the RandomForestClassifier on the 2019 validation split.
metrics.accuracy_score(y_new_valid, y_pred_forest)