In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from time import time

LOGISTIC REGRESSION

In [2]:
def sigmoid(z):
    """Numerically stable sigmoid function."""
    z = np.clip(z, -500, 500)  # Limit z to avoid overflow
    return np.where(z >= 0,
                    1 / (1 + np.exp(-z)),
                    np.exp(z) / (1 + np.exp(z)))

def cross_entropy(y, y_pred):
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -np.mean(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))

def compute_gradients(X, y, y_pred):
    diff = y_pred - y
    return np.dot(X.T, diff) / len(y)

def initialize_weights(size):
    std_dev = np.sqrt(2 / (size + 1))
    return np.random.randn(size) * std_dev

class LogisticRegression:
    def __init__(self, lr=0.01, epochs=1000):
        self.lr = lr
        self.epochs = epochs

    def fit(self, X_train, y_train):
        self.weights = initialize_weights(X_train.shape[1])
        self.losses = []

        for _ in range(self.epochs):
            y_pred = sigmoid(np.dot(X_train, self.weights))
            loss = cross_entropy(y_train, y_pred)
            self.losses.append(loss)

            gradients = compute_gradients(X_train, y_train, y_pred)
            self.weights -= self.lr * gradients

    def predict_probabilities(self, X):
        return sigmoid(np.dot(X, self.weights))
        
    def predict(self, X, threshold=0.5):
        return self.predict_probabilities(X) >= threshold

KNN

In [3]:
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))
class KNN:
    def __init__(self, k=5):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, x):
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        most_common = np.bincount(k_nearest_labels).argmax()
        return most_common
    
    def predict_probabilities(self, x):
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        probs = np.bincount(k_nearest_labels, minlength=np.max(self.y_train)+1) / self.k
        return probs

INCARCARE SETURI DE DATE DE ANTRENARE

In [4]:
df1 = pd.read_csv('../data/match_data.csv')
X_train_1 = df1.drop(columns=['Date', 'Team1', 'Team2', 'Score'])
y_train_1 = df1['Score']
ds1_name = "match_data"

df2 = pd.read_csv('../data/cleaned_rounds_data.csv')
df2 = df2.drop(columns=[f'player_{i}_{suffix}' for i in range(1, 11) for suffix in ['team_name', 'name']])
X_train_2 = df2.drop(['round_winner'], axis=1)
y_train_2 = df2['round_winner']
ds2_name = "rounds_data"

df3 = pd.read_csv('../data/cleaned_rounds_data_with_stats.csv')
df3 = df3.drop(columns=[f'player_{i}_{suffix}' for i in range(1, 11) for suffix in ['team_name', 'name']])
X_train_3 = df3.drop(columns=['round_winner'])
y_train_3 = df3['round_winner']
ds3_name = "rounds_data_with_stats"

PARAMETRII OPTIMI AI FIECARUI MODEL

In [15]:
# learning_rate, epochs
best_params_logistic_regression = [[0.1, 500], [0.3, 2000], [0.1, 600]]

# learning_rate, dropout_rate, l2_lambda, batch_size, epochs
best_params_neural_network = [[0.001, 0.5, 0.0001, 64, 500], [0.0005, 0.3, 0.001, 64, 1000], [0.0001, 0.3, 0.01, 128, 300]]

# max_depth, learning_rate, gamma, reg_lambda, n_estimators
best_params_xgboost = [[3, 0.3, 0, 0.2, 7], [9, 0.05, 0.2, 0.2, 146], [9, 0.1, 0.3, 0, 62]]

# learning_rate, max_depth, n_estimators
best_params_adaboost = [[0.05, 2, 150], [0.1, 9, 22], [0.05, 10, 52]]

# k
best_params_knn = [[5], [45], [40]]

# max_depth, n_estimators, min_samples_split
best_params_random_forest = [[9, 300, 5], [40, 150, 3], [25, 300, 7]]

INCARCARE SETURI DE DATE DE TEST

In [5]:
df1 = pd.read_csv('demo_data/test_10_matches_data.csv')
X_test_1 = df1.drop(columns=['Date', 'Team1', 'Team2', 'Score'])
y_test_1 = df1['Score']
ds1_name = "match_data"

df2 = pd.read_csv('demo_data/test_10_rounds_data.csv')
df2 = df2.drop(columns=[f'player_{i}_{suffix}' for i in range(1, 11) for suffix in ['team_name', 'name']])
X_test_2 = df2.drop(['round_winner'], axis=1)
y_test_2 = df2['round_winner']
ds2_name = "rounds_data"

df3 = pd.read_csv('demo_data/test_10_rounds_data_with_stats.csv')
df3 = df3.drop(columns=[f'player_{i}_{suffix}' for i in range(1, 11) for suffix in ['team_name', 'name']])
X_test_3 = df3.drop(columns=['round_winner'])
y_test_3 = df3['round_winner']
ds3_name = "rounds_data_with_stats"

In [6]:
datasets = [(X_train_1, y_train_1, X_test_1, y_test_1, ds1_name), (X_train_2, y_train_2, X_test_2, y_test_2, ds2_name), (X_train_3, y_train_3, X_test_3, y_test_3, ds3_name)]

In [7]:
def PrintMessages(idx, y_test):
    if idx == 0:
        print("Actual Match Winner")
    else:
        print("Actual Round Winner")
    print(y_test.values)

    if idx == 0:
        print("Predicting Match Winner")
    else:
        print("Predicting Round Winner")

LOGISTIC REGRESSION TRAIN AND PREDICT

In [8]:
def LogisticRegressionModel(X_train, y_train, X_test, y_test, learning_rate, epochs):
        start_time = time()
        model = LogisticRegression(lr=learning_rate, epochs=epochs)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        duration = time() - start_time
        print(f"{y_pred.astype(int)} predicted with Logistic Regression Duration: {duration:.2f}s Accuracy: {accuracy_score(y_test, y_pred):.2f}")

NEURAL NETWORK TRAIN AND PREDICT

In [9]:
def NeuralNetworkModel(X_train, y_train, X_test, y_test, learning_rate, dropout_rate, l2_lambda, batch_size, epochs):
        start_time = time()
        model = Sequential()
        model.add(Input(shape=(X_train.shape[1],)))
        model.add(Dense(128, activation='relu', kernel_regularizer=l2(l2_lambda)))
        model.add(Dropout(dropout_rate))
        model.add(Dense(64, activation='relu', kernel_regularizer=l2(l2_lambda)))
        model.add(Dropout(dropout_rate))
        model.add(Dense(32, activation='relu', kernel_regularizer=l2(l2_lambda)))
        model.add(Dropout(dropout_rate))
        model.add(Dense(1, activation='sigmoid'))
        optimizer = Adam(learning_rate=learning_rate)
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10)
        history = model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=batch_size, epochs=epochs, callbacks=[es], verbose=0)
        
        y_probs = model.predict(X_test)
        y_pred = (y_probs > 0.5)

        duration = time() - start_time
        print(f"{y_pred.astype(int).ravel()} predicted with Neural Network Duration: {duration:.2f}s Accuracy: {accuracy_score(y_test, y_pred):.2f}")

XGBOOST TRAIN AND PREDICT

In [10]:
def XGBoostModel(X_train, y_train, X_test, y_test, max_depth, learning_rate, gamma, reg_lambda, n_estimators):
        start_time = time()
        model = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, gamma=gamma, reg_lambda=reg_lambda, n_estimators=n_estimators)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        duration = time() - start_time
        print(f"{y_pred} predicted with XGBoost Duration: {duration:.2f}s Accuracy: {accuracy_score(y_test, y_pred):.2f}")

ADABOOST TRAIN AND PREDICT

In [11]:
def AdaBoostModel(X_train, y_train, X_test, y_test, learning_rate, max_depth, n_estimators):
        start_time = time()
        model = AdaBoostClassifier(algorithm="SAMME", estimator = DecisionTreeClassifier(max_depth=max_depth), learning_rate=learning_rate, n_estimators=n_estimators)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        duration = time() - start_time
        print(f"{y_pred} predicted with AdaBoost Duration: {duration:.2f}s Accuracy: {accuracy_score(y_test, y_pred):.2f}")

KNN TRAIN AND PREDICT

In [12]:
def KNNModel(X_train, y_train, X_test, y_test, k):
        start_time = time()
        model = KNN(k=k)
        model.fit(X_train, y_train)
        y_pred = [model.predict(x) for x in X_test]
        duration = time() - start_time
        y_pred_str = ' '.join(map(str, y_pred))
        print(f"[{y_pred_str}] predicted with KNN Duration: {duration:.2f}s Accuracy: {accuracy_score(y_test, y_pred):.2f}")


RANDOM FOREST TRAIN AND PREDICT

In [13]:
def RandomForestModel(X_train, y_train, X_test, y_test, max_depth, n_estimators, min_samples_split):
        start_time = time()
        model = RandomForestClassifier(criterion="log_loss", n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, n_jobs=-1)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        duration = time() - start_time
        print(f"{y_pred} predicted with RandomForest Duration: {duration:.2f}s Accuracy: {accuracy_score(y_test, y_pred):.2f}")

In [16]:
for idx, (X_train, y_train, X_test, y_test, ds_name) in enumerate(datasets):
    X_train = StandardScaler().fit_transform(X_train)
    X_test = StandardScaler().fit_transform(X_test)

    print(f"Dataset: {ds_name}")

    PrintMessages(idx, y_test)

    LogisticRegressionModel(X_train, y_train, X_test, y_test, learning_rate=best_params_logistic_regression[idx][0], epochs=best_params_logistic_regression[idx][1])

    NeuralNetworkModel(X_train, y_train, X_test, y_test, learning_rate=best_params_neural_network[idx][0], dropout_rate=best_params_neural_network[idx][1], l2_lambda=best_params_neural_network[idx][2], batch_size=best_params_neural_network[idx][3], epochs=best_params_neural_network[idx][4])

    XGBoostModel(X_train, y_train, X_test, y_test, max_depth=best_params_xgboost[idx][0], learning_rate=best_params_xgboost[idx][1], gamma=best_params_xgboost[idx][2], reg_lambda=best_params_xgboost[idx][3], n_estimators=best_params_xgboost[idx][4])

    AdaBoostModel(X_train, y_train, X_test, y_test, learning_rate=best_params_adaboost[idx][0], max_depth=best_params_adaboost[idx][1], n_estimators=best_params_adaboost[idx][2])

    KNNModel(X_train, y_train, X_test, y_test, k=best_params_knn[idx][0])

    RandomForestModel(X_train, y_train, X_test, y_test, max_depth=best_params_random_forest[idx][0], n_estimators=best_params_random_forest[idx][1], min_samples_split=best_params_random_forest[idx][2])

    print("\n\n\n\n")


Dataset: match_data
Actual Match Winner
[1 1 1 1 1 1 0 0 1 1]
Predicting Match Winner
[0 0 1 0 1 1 0 1 1 1] predicted with Logistic Regression Duration: 0.38s Accuracy: 0.60
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[0 1 1 0 1 1 0 1 1 1] predicted with Neural Network Duration: 2.28s Accuracy: 0.70
[0 0 1 0 1 0 0 1 0 1] predicted with XGBoost Duration: 0.01s Accuracy: 0.40
[0 0 0 0 1 0 0 1 0 1] predicted with AdaBoost Duration: 0.83s Accuracy: 0.30
[1 1 0 0 1 1 0 1 1 1] predicted with KNN Duration: 0.11s Accuracy: 0.70
[0 1 1 0 1 1 0 1 1 1] predicted with RandomForest Duration: 0.44s Accuracy: 0.70





Dataset: rounds_data
Actual Round Winner
[1 1 0 0 1 0 0 1 1 0]
Predicting Round Winner
[1 1 1 0 0 0 0 1 1 1] predicted with Logistic Regression Duration: 7.87s Accuracy: 0.70
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1 1 0 0 0 0 0 1 1 0] predicted with Neural Network Duration: 11.59s Accuracy: 0.90
[1 1 0 0 0 1 0 0 1 0] predict