# PART 3: Prediction of the winner of an NBA game

The goal of this exercise is to predict the winner of an NBA game from the provided feature data.

----

## Studying the data

In [10]:
"""
Import section
"""

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
"""
Let's now load the different datasets
"""

# Features: one array per split, read from paths relative to the working directory
X_train, X_test = (np.load(path) for path in ('X_train.npy', 'X_test.npy'))
# Labels: loaded in the same train/test order as the features
y_train, y_test = (np.load(path) for path in ('y_train.npy', 'y_test.npy'))

In [5]:
# Report the dimensions of every loaded array, features first and then labels,
# so the sample and feature counts are known before any analysis.
for shape_caption, loaded_array in (
    ("Training set features shape:", X_train),
    ("Test set features shape:", X_test),
    ("Training set labels shape:", y_train),
    ("Test set labels shape:", y_test),
):
    print(shape_caption, loaded_array.shape)

Training set features shape: (500, 50)
Test set features shape: (500, 50)
Training set labels shape: (500,)
Test set labels shape: (500,)


---
We now know that each feature set (training and test) contains 500 samples described by 50 features.

Each label set contains 500 labels, one per sample.

Now let's look at some basic statistics of the training features and labels.

---

In [6]:

# Column-wise (axis=0) summary statistics of the training features:
# each reducer yields one value per feature column.
for stat_caption, stat_reducer in (
    ("Features Mean: ", np.mean),
    ("Features Std Dev: ", np.std),
    ("Features Min: ", np.min),
    ("Features Max: ", np.max),
):
    print(stat_caption, stat_reducer(X_train, axis=0))

# Class balance: how many training samples carry each distinct label value
unique_elements, counts_elements = np.unique(y_train, return_counts=True)
label_distribution = {
    label_value: label_count
    for label_value, label_count in zip(unique_elements, counts_elements)
}
print("Label Distribution: ", label_distribution)

# The same four statistics, reduced over the whole training label vector
for stat_caption, stat_reducer in (
    ("Labels Mean: ", np.mean),
    ("Labels Std Dev: ", np.std),
    ("Labels Min: ", np.min),
    ("Labels Max: ", np.max),
):
    print(stat_caption, stat_reducer(y_train))

Features Mean:  [-0.09394964  0.29162237  0.01315227 -0.10866658  0.29876652 -0.04194446
 -0.27063616 -0.01383477 -0.01765614  0.12838712  0.04927599 -0.04291361
 -0.13972603 -0.04933297  0.10869843 -0.04044432 -0.14213375  0.01072076
  0.09881601  0.13244445  0.05321012  0.03624287  0.10451249  0.25611365
  0.10852024 -0.1103403   0.10137301  0.10943872  0.07175347  0.21231416
 -0.04600741  0.07674707  0.08265513 -0.00691493 -0.03347023 -0.0244771
  0.06785085  0.06928688  0.19024655 -0.09204261  0.08997888  0.14340439
 -0.14356795  0.04574898  0.10034907  0.02223365  0.03351004  0.14728103
  0.24826058  0.21036759]
Features Std Dev:  [2.874549   4.20182198 4.16172174 2.88975104 2.84355854 3.02342542
 2.97198526 2.84598783 2.83291862 4.23691427 2.78059374 2.85605665
 2.95017511 2.92699069 2.88631273 2.92642427 2.86696164 2.95628739
 2.80475421 4.01297811 4.10906207 2.87031161 2.93980393 4.01251758
 4.0906219  2.85806035 2.86522846 2.93501005 2.97859072 2.90846665
 4.16658949 2.9539349

## Scaling the dataset

In [9]:
# Learn the scaling statistics from the training features only, then apply the
# very same fitted scaler to both splits so the test set is scaled consistently.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check on the first 5 features of the scaled training set
scaled_train_mean = X_train_scaled.mean(axis=0)
scaled_train_std = X_train_scaled.std(axis=0)
print("Mean of scaled training features:", scaled_train_mean[:5])
print("Std of scaled training features:", scaled_train_std[:5])

Mean of scaled training features: [ 2.48689958e-17 -2.88657986e-17  1.03250741e-17  1.77635684e-17
 -3.99680289e-18]
Std of scaled training features: [1. 1. 1. 1. 1.]


## Training the models

### Method 1: Logistic Regression Model

In [None]:
# Build a logistic regression classifier with scikit-learn's default settings
# and fit it on the scaled training features (fit returns the estimator itself).
log_reg = LogisticRegression().fit(X_train_scaled, y_train)

# Predictions on the data the model was fitted on (training performance)
# and on the separate test split.
y_train_pred, y_test_pred = (
    log_reg.predict(scaled_features)
    for scaled_features in (X_train_scaled, X_test_scaled)
)

# Accuracy of the predictions against the true labels of each split
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Logistic Regression Training Accuracy:", train_accuracy)
print("Logistic Regression Test Accuracy:", test_accuracy)