### Import necessary modules

In [93]:
import ast
import time

import joblib
import requests
import numpy as np
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report, accuracy_score

### Data Collection: Fetching Match Details from the API

In this section, we collect detailed match data using the <a href="https://docs.opendota.com">OpenDota API</a>. Specifically, we retrieve the following information for each match:

- **Match ID**: A unique identifier for each match.
- **Radiant Team Heroes**: A list of heroes picked by the Radiant team.
- **Dire Team Heroes**: A list of heroes picked by the Dire team.
- **Winning Team**: The team that won the match (Radiant or Dire).

In [45]:
# URL of the OpenDota API endpoint for recent public matches
MATCHES_URL = 'https://api.opendota.com/api/publicMatches'

In [43]:
# Accumulator for the matches collected from the API: starts empty and
# each fetched batch is concatenated onto it.
matches_df = pd.DataFrame()

In [96]:
# Poll the public-matches endpoint several times, pausing between requests so
# that later responses contain matches that were not in earlier ones.
N_REQUESTS = 5                 # how many times the endpoint is queried
REQUEST_INTERVAL_SECONDS = 60  # pause between consecutive requests
REQUEST_TIMEOUT_SECONDS = 30   # give up on a request that hangs

# Collect each batch in a list and concatenate once at the end, instead of
# growing the DataFrame inside the loop.
fetched_frames = []
for request_number in range(N_REQUESTS):
    response = requests.get(MATCHES_URL, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on an HTTP error instead of parsing an error payload as match data.
    response.raise_for_status()
    fetched_frames.append(pd.DataFrame(response.json()))

    # No need to wait after the final request.
    if request_number < N_REQUESTS - 1:
        time.sleep(REQUEST_INTERVAL_SECONDS)

# Append the new batches to whatever has already been collected.
matches_df = pd.concat([matches_df, *fetched_frames], axis=0)

Drop duplicate matches (the same match can appear in more than one API response).

In [97]:
# Keep a single row per match id
matches_df = matches_df.drop_duplicates(subset=['match_id'])

Save the collected data to disk.

In [99]:
# Write the collected matches to CSV; index=False leaves the row index out of the file
matches_df.to_csv('../data/matches.csv', index=False)

# Read Data

In [2]:
# Load the saved matches from disk; the rest of the notebook works on this frame
matches = pd.read_csv('../data/matches.csv')

Remove the columns we do not need, keeping only the team compositions and the match outcome.

In [3]:
# The outcome flag and the two team line-ups are the only columns we model
columns_to_keep = ['radiant_win', 'radiant_team', 'dire_team']

# Everything that is not in columns_to_keep gets dropped
columns_to_drop = matches.columns.difference(columns_to_keep)
matches = matches.drop(columns=columns_to_drop)

Create a more readable column for the winning team.

In [4]:
# Turn the radiant_win flag into a team label: 'radiant' where the flag
# equals 1, 'dire' everywhere else.
winner_team = matches['radiant_win']
radiant_won = winner_team == 1
matches['winner'] = np.where(radiant_won, 'radiant', 'dire')

# The flag is now redundant
matches = matches.drop(columns=['radiant_win'])

### Now let's look at our dataframe

- **radiant_team**: List of the hero ids picked by the Radiant team.
- **dire_team**: List of the hero ids picked by the Dire team.
- **winner**: Name of the winning team.

In [5]:
# Preview the first five rows of the reduced frame
matches.head(5)

Unnamed: 0,radiant_team,dire_team,winner
0,"[102, 138, 19, 47, 101]","[18, 53, 59, 129, 35]",radiant
1,"[39, 4, 102, 71, 27]","[21, 29, 40, 46, 82]",radiant
2,"[42, 113, 136, 11, 79]","[14, 34, 80, 49, 18]",dire
3,"[8, 13, 53, 14, 68]","[5, 96, 71, 93, 39]",radiant
4,"[51, 114, 20, 30, 44]","[5, 97, 41, 113, 57]",radiant


### Preprocessing

Next we need to handle the two team columns, each of which contains a list of hero ids.

The idea is to one-hot encode these lists. For example:

column radiant_(X) = 1 means the hero with id X was picked by the Radiant team

column radiant_(X) = 0 means the hero with id X was NOT picked by the Radiant team (it may have been picked by Dire, or not picked at all)

We use the MultiLabelBinarizer class from sklearn.preprocessing to transform the list columns.

In [6]:
# Show the Python type of the first value in each team column
for team_column in ('dire_team', 'radiant_team'):
    print(type(matches[team_column][0]))

<class 'str'>
<class 'str'>


As can be seen, the values in these columns are strings.

In the cell below we convert them to actual lists so that we can use MultiLabelBinarizer.

In [7]:
def _parse_hero_list(raw_value):
    """Turn the text form of a hero-id list (as read from the CSV) into a list.

    Values that are not strings are returned unchanged, so running this cell a
    second time on already-parsed columns leaves them as they are.
    """
    if isinstance(raw_value, str):
        # literal_eval accepts only Python literals, so the CSV text is parsed
        # as data rather than evaluated as an expression.
        return ast.literal_eval(raw_value)
    return raw_value


matches['dire_team'] = matches['dire_team'].apply(_parse_hero_list)
matches['radiant_team'] = matches['radiant_team'].apply(_parse_hero_list)

In [8]:
# Show the Python type of the first value in each team column
for team_column in ('dire_team', 'radiant_team'):
    print(type(matches[team_column][0]))

<class 'list'>
<class 'list'>


In [9]:
# A single binarizer is fitted for both sides so that the Radiant and Dire
# encodings share the same set of hero classes.
mlb = MultiLabelBinarizer()

# Adding the two columns joins the lists row by row, giving every hero picked in each match
all_picks = matches['radiant_team'] + matches['dire_team']
mlb.fit(all_picks)

# One column label per hero class, prefixed with the side it belongs to
radiant_columns = ['radiant_' + str(hero) for hero in mlb.classes_]
dire_columns = ['dire_' + str(hero) for hero in mlb.classes_]

# Binary indicator matrices for each side
radiant_matrix = mlb.transform(matches['radiant_team'])
dire_matrix = mlb.transform(matches['dire_team'])
radiant_encoded = pd.DataFrame(radiant_matrix, columns=radiant_columns)
dire_encoded = pd.DataFrame(dire_matrix, columns=dire_columns)

# Final frame: winner label followed by the Radiant and Dire indicator columns
matches_encoded = pd.concat([matches['winner'], radiant_encoded, dire_encoded], axis=1)

Now we create target column by mapping winner column

In [10]:
# Binary target: 1 when Radiant won, 0 when Dire won
winner_map = {'radiant': 1, 'dire': 0}

# Map from the original string labels in `matches` rather than from the
# column being overwritten: re-running the cell then produces the same 0/1
# column instead of mapping already-numeric values to NaN.
matches_encoded['winner'] = matches['winner'].map(winner_map)

In [11]:
# Preview the first five rows of the encoded frame (target plus hero indicator columns)
matches_encoded.head(5)

Unnamed: 0,winner,radiant_1,radiant_2,radiant_3,radiant_4,radiant_5,radiant_6,radiant_7,radiant_8,radiant_9,...,dire_120,dire_121,dire_123,dire_126,dire_128,dire_129,dire_135,dire_136,dire_137,dire_138
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Splitting data into train and test

In [13]:
# Target is the winner column; features are every other column
y = matches_encoded['winner']
X = matches_encoded.drop(columns=['winner'])

# Hold out 20% of the matches for testing, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

To build the model we can use support vector machines (SVM) or random forests.

## Random Forest model

First, let's feed our data to a random forest model, using grid search to tune the hyperparameters.

In [88]:
# Empty dictionary for saving different model accuracy
# Maps model name -> test-set accuracy; filled in as each model is evaluated
models_accuracy = {}

In [34]:
# Hyperparameter grid explored for the random forest
param_grid = {'max_depth': [20, 40, 60, 80],
              'min_samples_split': [10, 30, 50, 70],
              'max_leaf_nodes': [20, 30, 40, 50]}

# Seed the forest (same seed as the train/test split) so that the search
# gives the same result every time the notebook is run.
rf_gs = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=param_grid)
rf_gs.fit(X_train, y_train)

# Show the best estimator found by the search
rf_gs.best_estimator_

In [89]:
# Predict on the held-out matches with the tuned random forest
y_pred = rf_gs.predict(X_test)

# Record the accuracy for the final comparison, then print the per-class report
rf_accuracy = accuracy_score(y_test, y_pred)
models_accuracy['RandomForest'] = rf_accuracy
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.53      0.23      0.32       181
           1       0.60      0.85      0.70       243

    accuracy                           0.58       424
   macro avg       0.56      0.54      0.51       424
weighted avg       0.57      0.58      0.54       424



## SVM model

We start with LinearSVC, which is faster than SVC with a linear kernel.

In [58]:
# Initialize the base models
param_grid = {
    'C': [1, 10, 100, 1000]
}

linear_svc_gs = GridSearchCV(LinearSVC(), param_grid=param_grid)
linear_svc_gs.fit(X_train, y_train)
linear_svc_gs.best_estimator_

In [90]:
# Predict on the held-out matches with the tuned LinearSVC
y_pred = linear_svc_gs.predict(X_test)

# Record the accuracy for the final comparison, then print the per-class report
linear_svc_accuracy = accuracy_score(y_test, y_pred)
models_accuracy['LinearSVC'] = linear_svc_accuracy
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.57      0.49      0.53       181
           1       0.66      0.72      0.69       243

    accuracy                           0.62       424
   macro avg       0.61      0.61      0.61       424
weighted avg       0.62      0.62      0.62       424



Now let's check SVC with an RBF kernel.

In [76]:
# Grid over the C and gamma parameters of the RBF-kernel SVM
param_grid = {'C': [1, 10, 100], 'gamma': [0.01, 0.001, 0.0001]}

rbf_svc = SVC(kernel='rbf')
svc_kernel_gs = GridSearchCV(estimator=rbf_svc, param_grid=param_grid)
svc_kernel_gs.fit(X_train, y_train)

# Show the best estimator found by the search
svc_kernel_gs.best_estimator_

In [91]:
# Predict on the held-out matches with the tuned RBF-kernel SVC
y_pred = svc_kernel_gs.predict(X_test)

# Record the accuracy for the final comparison, then print the per-class report
svc_kernel_accuracy = accuracy_score(y_test, y_pred)
models_accuracy['SVC_kernel'] = svc_kernel_accuracy
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.52      0.49      0.51       181
           1       0.64      0.66      0.65       243

    accuracy                           0.59       424
   macro avg       0.58      0.58      0.58       424
weighted avg       0.59      0.59      0.59       424



## All models accuracy

In [92]:
# Test-set accuracy of each evaluated model, keyed by model name
models_accuracy

{'RandomForest': 0.5849056603773585,
 'LinearSVC': 0.6226415094339622,
 'SVC_kernel': 0.589622641509434}

### Saving models

In [98]:
# Persist each fitted grid-search object to its own file
fitted_searches = [
    (rf_gs, '../models/random_forest_model.joblib'),
    (linear_svc_gs, '../models/linear_svc_model.joblib'),
    (svc_kernel_gs, '../models/svc_kernel_model.joblib'),
]
for fitted_search, model_path in fitted_searches:
    joblib.dump(fitted_search, model_path)

print('Saved')

Saved
