## Load the dataset

In [21]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.model_selection import PredefinedSplit
from sklearn import preprocessing

seed = 42

FILENAME = "datasets/train_players.csv"

# prepare the players dataset
features = ["games","minutes","assists","cards_yellow","cards_red","xg","npxg","xa" ,"value"]

#Prepare train data
df1 = pd.read_csv(FILENAME, encoding='ISO-8859-1', sep=",")
print("EX1) #Righe: " + str(df1.shape[0])+ " #Colonne: "+str(df1.shape[1]))
df1 = df1.dropna()
X = df1[features].to_numpy()
y = df1["position"].to_numpy()
le = preprocessing.LabelEncoder()
le.fit(y)
y = le.transform(y)

indices = np.arange(X.shape[0])
train_idx, val_idx = train_test_split(indices, test_size=0.1, stratify=y, random_state=seed)

fold = np.zeros(X.shape[0])
fold[train_idx] = -1

ps = PredefinedSplit(fold)
ps.get_n_splits()

# for i, (train_index, test_index) in enumerate(ps.split()):
#     print(f"Fold {i}:")
#     print(f"  Train: index={train_index}")
#     print(f"  Test:  index={test_index}")

X_val = X[val_idx,:]
y_val = y[val_idx]
X_train = X[train_idx,:]
y_train = y[train_idx]

EX1) #Righe: 1995 #Colonne: 399


## Preprocess the dataset

In [22]:
from sklearn.preprocessing import MinMaxScaler
import pickle

file = open("scaler.save","wb")
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X = scaler.transform(X)
pickle.dump(scaler, file)

## Apply Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression(random_state=seed).fit(X_train, y_train)
print("Performance:",reg.score(X_val,y_val))
file = open("lr.save","wb")
pickle.dump(reg, file)
file.close()

Performance: 0.225


## Apply Support Vector Classifier with HP tuning

In [24]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV


param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ["linear", "poly", "rbf", "sigmoid"]}  

  
grid = GridSearchCV(SVC(random_state=seed), param_grid, cv=ps)
grid.fit(X, y)
file = open("svc.save","wb")
pickle.dump(grid.best_estimator_, file)
print("Best hyperparameters: ",grid.best_estimator_)
print("Best performance:",  grid.best_score_)
file.close()


Best hyperparameters:  SVC(C=1000, gamma=1, kernel='linear', random_state=42)
Best performance: 0.555
