In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
def sigmoid(z):
    """Numerically stable sigmoid function."""
    z = np.clip(z, -500, 500)  # Limit z to avoid overflow
    return np.where(z >= 0,
                    1 / (1 + np.exp(-z)),
                    np.exp(z) / (1 + np.exp(z)))

In [4]:
def cross_entropy(y, y_pred):
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -np.mean(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))


In [5]:
def compute_gradients(X, y, y_pred):
    diff = y_pred - y
    return np.dot(X.T, diff) / len(y)

In [6]:
def initialize_weights(size):
    std_dev = np.sqrt(2 / (size + 1))
    return np.random.randn(size) * std_dev

In [8]:
class LogisticRegression:
    def __init__(self, lr=0.01, epochs=1000):
        self.lr = lr
        self.epochs = epochs

    def fit(self, X_train, y_train):
        self.weights = initialize_weights(X_train.shape[1])
        self.losses = []

        for _ in range(self.epochs):
            y_pred = sigmoid(np.dot(X_train, self.weights))
            loss = cross_entropy(y_train, y_pred)
            self.losses.append(loss)

            gradients = compute_gradients(X_train, y_train, y_pred)
            self.weights -= self.lr * gradients

    def predict_probabilities(self, X):
        return sigmoid(np.dot(X, self.weights))
        
    def predict(self, X, threshold=0.5):
        return self.predict_probabilities(X) >= threshold

FOR THE MATCH_DATA BEST PARAMETERS ARE EPOCHS = 4000 AND LR=0.01

In [11]:
df1 = pd.read_csv('../data/match_data.csv')
X1 = df1.drop(columns=['Date', 'Team1', 'Team2', 'Score'])
y1 = df1['Score']
ds1_name = "match_data"

df2 = pd.read_csv('../data/cleaned_rounds_data.csv')
df2 = df2.drop(columns=[f'player_{i}_{suffix}' for i in range(1, 11) for suffix in ['team_name', 'name']])
X2 = df2.drop(['round_winner'], axis=1)
y2 = df2['round_winner']
ds2_name = "rounds_data"

df3 = pd.read_csv('../data/cleaned_rounds_data_with_stats.csv')
df3 = df3.drop(columns=[f'player_{i}_{suffix}' for i in range(1, 11) for suffix in ['team_name', 'name']])
X3 = df3.drop(['round_winner'], axis=1)
y3 = df3['round_winner']
ds3_name = "rounds_data_with_stats"

datasets = [(X1, y1, ds1_name), (X2, y2, ds2_name), (X3, y3, ds3_name)]

In [13]:
EPOCHS = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000]
LR = [0.0001, 0.001, 0.01, 0.1, 1]
best_params = []
for X, y, ds_name in datasets:
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)
    best_accuracy = 0
    best_lr = 0
    best_epochs = 0
    for epochs in EPOCHS:
        for lr in LR:
            model = LogisticRegression(lr=lr, epochs=epochs)
            model.fit(X, y)
            y_pred = model.predict(X_val)
            accuracy = accuracy_score(y_val, y_pred)
            print(f"For dataset: {ds_name}, lr: {lr}, epochs: {epochs}, accuracy: {accuracy}")
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_lr = lr
                best_epochs = epochs
    
    model = LogisticRegression(lr=best_lr, epochs=best_epochs)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"For dataset: {ds_name}, Best lr: {best_lr}, Best epochs: {best_epochs}, Accuracy: {accuracy_score(y_test, y_pred)}")
    best_params.append((ds_name, best_lr, best_epochs, accuracy_score(y_test, y_pred)))

For dataset: match_data, lr: 0.0001, epochs: 100, accuracy: 0.4931506849315068
For dataset: match_data, lr: 0.001, epochs: 100, accuracy: 0.5283757338551859
For dataset: match_data, lr: 0.01, epochs: 100, accuracy: 0.5068493150684932
For dataset: match_data, lr: 0.1, epochs: 100, accuracy: 0.6046966731898239
For dataset: match_data, lr: 1, epochs: 100, accuracy: 0.6125244618395304
For dataset: match_data, lr: 0.0001, epochs: 200, accuracy: 0.48532289628180036
For dataset: match_data, lr: 0.001, epochs: 200, accuracy: 0.4500978473581213
For dataset: match_data, lr: 0.01, epochs: 200, accuracy: 0.5401174168297456
For dataset: match_data, lr: 0.1, epochs: 200, accuracy: 0.5949119373776908
For dataset: match_data, lr: 1, epochs: 200, accuracy: 0.6066536203522505
For dataset: match_data, lr: 0.0001, epochs: 300, accuracy: 0.49510763209393344
For dataset: match_data, lr: 0.001, epochs: 300, accuracy: 0.5283757338551859
For dataset: match_data, lr: 0.01, epochs: 300, accuracy: 0.5499021526418

In [14]:
for ds_name, lr, epochs, accuracy in best_params:
    print(f"Dataset: {ds_name}, Best lr: {lr}, Best epochs: {epochs}, Accuracy: {accuracy}")

Dataset: match_data, Best lr: 0.01, Best epochs: 1000, Accuracy: 0.6281800391389433
Dataset: rounds_data, Best lr: 0.01, Best epochs: 4000, Accuracy: 0.7612323491655969
Dataset: rounds_data_with_stats, Best lr: 0.1, Best epochs: 800, Accuracy: 0.7578091570389388
