In [2]:
import pandas as pd

df = pd.read_excel('/content/encoded_allstats.xlsx')

necessary = ['+/-', 'OFFRTG', 'DEFRTG', 'NETRTG', 'PIE', 'PTS', 'FGM', 'FTM', '3PM', 'FGA', 'FTA', '3PA', 'TEAM_encoded', 'OPPONENT_encoded', 'AST.1']

columns_to_remove = necessary + ['OREB', 'DREB', 'REB', 'TOV', 'EFG%', 'AST', 'TS%']

df = df.drop(columns_to_remove, axis=1)

In [3]:
# import necessary libraries
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [4]:
# Make the train-test split in 80:20 ratio
X = df.drop('W/L', axis = 1)
y = df['W/L']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

# Normalize the X_train and X_test datasets by the min and max values
# scaler = MinMaxScaler()
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# **SVM**

In [5]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Initialize the SVM Classifier
model = SVC(kernel='linear')  # You can choose other kernels like 'rbf', 'poly', etc.

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8252032520325203


***Apply K-Fold CV ***

In [6]:
from sklearn.model_selection import cross_val_score
kfold = cross_val_score(model, X, y, cv=5)
kfold

array([0.8495935 , 0.84349593, 0.83943089, 0.83130081, 0.82520325])

In [7]:
kfold.mean()

0.8378048780487806

**Apply Stratified K-fold CV**

In [8]:
from sklearn.model_selection import StratifiedKFold
strat_kfold = cross_val_score(model, X, y, cv=StratifiedKFold())
print(strat_kfold)

[0.8495935  0.84349593 0.83943089 0.83130081 0.82520325]


In [9]:
strat_kfold.mean()

0.8378048780487806

**Hyperparameter Tuning**

In [10]:
from sklearn.model_selection import GridSearchCV

# Define the parameter ranges to be tested
C_range = [0.1, 1, 10, 100]              # Regularization parameter
kernel_range = ['linear', 'rbf', 'poly'] # Kernel functions to be tested
gamma_range = ['scale', 'auto']          # Kernel coefficient for 'rbf' and 'poly'

# Create the parameter grid
param_grid = {
    'C': C_range,
    'kernel': kernel_range,
    'gamma': gamma_range
}

# Create the GridSearchCV object
grid = GridSearchCV(estimator=SVC(),
                    param_grid=param_grid,
                    cv=5,
                    scoring='accuracy',
                    refit=True,  # Setting grid with estimator
                    verbose=1)

# Fit the grid
grid.fit(X_train, y_train)

# Print the best parameters and best score
print("Accuracy: ", grid.best_score_)
print(grid.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Accuracy:  0.846021105384844
{'C': 1, 'gamma': 'scale', 'kernel': 'linear'}


# **RANDOM FOREST**

In [11]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train,y_train)
y_pred = rfc.predict(X_test)

In [12]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7357723577235772


***Apply K-Fold CV ***

In [13]:
from sklearn.model_selection import cross_val_score
kfold = cross_val_score(rfc, X, y, cv=5)
kfold

array([0.77235772, 0.78658537, 0.76219512, 0.75      , 0.73780488])

In [14]:
kfold.mean()

0.7617886178861789

**Apply Stratified K-fold CV**

In [15]:
from sklearn.model_selection import StratifiedKFold
strat_kfold = cross_val_score(rfc, X, y, cv=StratifiedKFold())
print(strat_kfold)

[0.79268293 0.80284553 0.75609756 0.75203252 0.74186992]


In [16]:
strat_kfold.mean()

0.7691056910569105

**Hyperparameter Tuning**

In [17]:
from sklearn.model_selection import GridSearchCV

criteria = ["gini", "entropy"]              #criteria to be tested
min_sample_split_range = [2,10, 20]         #min sample split to be tested
max_depth_range = [None, 2, 5, 10]          #max depth to be tested
min_samples_leaf_range = [1, 5, 10]         #min samples in the leaf to be tested
min_leaf_nodes_range = [None, 5, 10, 20]    #min leaf nodes to be tested
# Added parameters
n_estimators = [100, 300, 600]              #n_estimators to be tested#

param_grid = {"criterion": criteria,
              "min_samples_split": min_sample_split_range,
              "max_depth": max_depth_range,
              "min_samples_leaf": min_samples_leaf_range,
              #"max_leaf_nodes": min_leaf_nodes_range,
              "n_estimators" : n_estimators
                }

grid = GridSearchCV(estimator=RandomForestClassifier(),
                    param_grid=param_grid,
                    cv = 5,
                    scoring='accuracy',
                    refit=True, #setting grid with estimator
                    verbose=1)

grid.fit(X_train, y_train)
print("Accuracy:", grid.best_score_)
print(grid.best_params_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Accuracy: 0.7814856434300772
{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}


# **Naive Bayes**

In [18]:
from sklearn.naive_bayes import GaussianNB
gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
y_pred = gaussian.predict(X_test)

In [19]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7317073170731707


***Apply K-Fold CV ***

In [20]:
from sklearn.model_selection import cross_val_score
kfold = cross_val_score(gaussian, X, y, cv=5)
kfold

array([0.76626016, 0.79674797, 0.72560976, 0.73170732, 0.74593496])

In [21]:
kfold.mean()

0.7532520325203252

**Apply Stratified K-fold CV**

In [22]:
from sklearn.model_selection import StratifiedKFold
strat_kfold = cross_val_score(gaussian, X, y, cv=StratifiedKFold())
print(strat_kfold)

[0.76626016 0.79674797 0.72560976 0.73170732 0.74593496]


In [23]:
strat_kfold.mean()

0.7532520325203252

**Hyperparameter Tuning**

In [24]:
from sklearn.model_selection import GridSearchCV

# Define the parameter ranges to be tested
var_smoothing_range = [1e-9, 1e-7, 1e-5, 1e-3, 1e-1]

# Create the parameter grid
param_grid = {
    'var_smoothing': var_smoothing_range
}

# Create the GridSearchCV object
grid = GridSearchCV(estimator=GaussianNB(),
                    param_grid=param_grid,
                    cv=5,
                    scoring='accuracy',
                    refit=True,  # Setting grid with estimator
                    verbose=1)

# Fit the grid
grid.fit(X_train, y_train)

# Print the best parameters and best score
print("Accuracy: ", grid.best_score_)
print(grid.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Accuracy:  0.761160408674649
{'var_smoothing': 0.1}
