In [247]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import make_pipeline
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
import pickle

In [168]:
# Load the dataset from a CSV file
data = pd.read_csv('corrected_3P.csv')
df = data.copy()
df

Unnamed: 0,GP,MIN,PTS,FG%,3P Made,3P%,FTA,FT%,REB,AST,STL,BLK,TOV,TARGET_5Yrs,Efficiency,Efficiency_per_Minute
0,36,27.4,7.4,34.7,0.5,25.0,2.3,69.9,4.1,1.9,0.4,0.4,1.3,0.0,0.616667,0.022506
1,35,26.9,7.2,29.6,0.7,23.5,3.4,76.5,2.4,3.7,1.1,0.5,1.6,0.0,0.558140,0.020749
2,74,15.3,5.2,42.2,0.4,24.4,1.3,67.0,2.2,1.0,0.5,0.3,1.0,0.0,0.675325,0.044139
3,58,11.6,5.7,42.6,0.1,22.6,1.3,68.9,1.9,0.8,0.6,0.1,1.0,1.0,0.780822,0.067312
4,48,11.5,4.5,52.4,0.0,0.0,1.9,67.4,2.5,0.3,0.3,0.4,0.8,1.0,0.900000,0.078261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1331,80,15.8,4.3,43.3,0.0,14.3,1.5,79.2,1.2,2.5,0.6,0.2,0.8,0.0,0.811321,0.051349
1332,68,12.6,3.9,35.8,0.1,16.7,1.0,79.4,1.5,2.3,0.8,0.0,1.3,1.0,0.672414,0.053366
1333,43,12.1,5.4,55.0,0.0,0.0,1.6,64.3,3.8,0.3,0.3,0.4,0.9,0.0,0.981818,0.081142
1334,52,12.0,4.5,43.9,0.0,10.0,1.8,62.5,0.7,2.2,0.4,0.1,0.8,1.0,0.775862,0.064655


In [169]:
def split_data(df, target_column, test_size=0.2, random_state=0):
    """
    Function to split the dataset into training and testing sets.
    
    Parameters:
    - df: The dataframe containing the dataset.
    - target_column: The name of the column to be used as the target (dependent variable).
    - test_size: The proportion of the dataset to be used for testing.
    - random_state: Random seed for reproducibility.
    
    Returns:
    - X_train: Features for the training set.
    - X_test: Features for the testing set.
    - y_train: Target variable for the training set.
    - y_test: Target variable for the testing set.
    """
    
    # Define the feature columns (X) and target column (y)
    X = df.drop(columns=[target_column])  # Features (all columns except the target)
    y = df[target_column]  # Target variable
    
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split_data(df = df.drop(columns=['Efficiency_per_Minute',"3P Made"]), target_column='TARGET_5Yrs')

#  Check the distribution of the target variable in both sets
print(y_train.value_counts())
print(y_test.value_counts())

TARGET_5Yrs
1.0    673
0.0    395
Name: count, dtype: int64
TARGET_5Yrs
1.0    155
0.0    113
Name: count, dtype: int64


In [179]:
# Define the preprocessor pipeline (Polynomial features and SelectKBest)
preprocessor = make_pipeline(SelectKBest(f_classif, k=8))


In [182]:
# Define the models to be used in a dictionary, with scaling where needed
list_of_models = {
    'RandomForest': make_pipeline(preprocessor, RandomForestClassifier(random_state=0)),
    'XGBoost': make_pipeline(preprocessor, XGBClassifier(random_state=0)),
    'SVM': make_pipeline(preprocessor, StandardScaler(), SVC(random_state=0)),
    'KNN': make_pipeline(preprocessor, StandardScaler(), KNeighborsClassifier()),
    'LogisticRegression': make_pipeline(preprocessor, LogisticRegression(random_state=0)),
    'AdaBoost': make_pipeline(preprocessor, AdaBoostClassifier(random_state=0)) }

In [183]:
def evaluation(model):
    
    model.fit(X_train, y_train)
    ypred = model.predict(X_test)
    
    # Print confusion matrix and classification report
    print(confusion_matrix(y_test, ypred))
    print(classification_report(y_test, ypred))

    # Plot the learning curve using Plotly
    N, train_score, val_score = learning_curve(model, X_train, y_train,
                                               cv=4, scoring='f1',
                                               train_sizes=np.linspace(0.1, 1, 10))
    
    # Create a figure for learning curve
    fig = go.Figure()
    
    # Add train and validation scores to the plot
    fig.add_trace(go.Scatter(x=N, y=train_score.mean(axis=1), mode='lines', name='Train Score'))
    fig.add_trace(go.Scatter(x=N, y=val_score.mean(axis=1), mode='lines', name='Validation Score'))
    
    # Update layout
    fig.update_layout(
        title="Learning Curve",
        xaxis_title="Training Set Size",
        yaxis_title="F1 Score",
        template="plotly_dark"
    )
    
    # Show the learning curve plot
    fig.show()



In [220]:
for name, model in list_of_models.items():
    print(f"Evaluating model: {name}")
    evaluation(model)

Evaluating model: RandomForest
[[ 63  50]
 [ 30 125]]
              precision    recall  f1-score   support

         0.0       0.68      0.56      0.61       113
         1.0       0.71      0.81      0.76       155

    accuracy                           0.70       268
   macro avg       0.70      0.68      0.68       268
weighted avg       0.70      0.70      0.70       268



Evaluating model: XGBoost
[[ 63  50]
 [ 34 121]]
              precision    recall  f1-score   support

         0.0       0.65      0.56      0.60       113
         1.0       0.71      0.78      0.74       155

    accuracy                           0.69       268
   macro avg       0.68      0.67      0.67       268
weighted avg       0.68      0.69      0.68       268



Evaluating model: SVM
[[ 58  55]
 [ 17 138]]
              precision    recall  f1-score   support

         0.0       0.77      0.51      0.62       113
         1.0       0.72      0.89      0.79       155

    accuracy                           0.73       268
   macro avg       0.74      0.70      0.71       268
weighted avg       0.74      0.73      0.72       268



Evaluating model: KNN
[[ 61  52]
 [ 30 125]]
              precision    recall  f1-score   support

         0.0       0.67      0.54      0.60       113
         1.0       0.71      0.81      0.75       155

    accuracy                           0.69       268
   macro avg       0.69      0.67      0.68       268
weighted avg       0.69      0.69      0.69       268



Evaluating model: LogisticRegression
[[ 57  56]
 [ 25 130]]
              precision    recall  f1-score   support

         0.0       0.70      0.50      0.58       113
         1.0       0.70      0.84      0.76       155

    accuracy                           0.70       268
   macro avg       0.70      0.67      0.67       268
weighted avg       0.70      0.70      0.69       268




lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to th





Evaluating model: AdaBoost
[[ 63  50]
 [ 27 128]]
              precision    recall  f1-score   support

         0.0       0.70      0.56      0.62       113
         1.0       0.72      0.83      0.77       155

    accuracy                           0.71       268
   macro avg       0.71      0.69      0.69       268
weighted avg       0.71      0.71      0.71       268





















































































In [217]:
# Retrieve the models from the dictionary
svm_model = list_of_models['SVM']
adaboost_model = list_of_models['AdaBoost']
print(adaboost_model.get_params())


{'memory': None, 'steps': [('pipeline', Pipeline(steps=[('selectkbest', SelectKBest(k=8))])), ('adaboostclassifier', AdaBoostClassifier(random_state=0))], 'verbose': False, 'pipeline': Pipeline(steps=[('selectkbest', SelectKBest(k=8))]), 'adaboostclassifier': AdaBoostClassifier(random_state=0), 'pipeline__memory': None, 'pipeline__steps': [('selectkbest', SelectKBest(k=8))], 'pipeline__verbose': False, 'pipeline__selectkbest': SelectKBest(k=8), 'pipeline__selectkbest__k': 8, 'pipeline__selectkbest__score_func': <function f_classif at 0x00000118B0165A60>, 'adaboostclassifier__algorithm': 'SAMME.R', 'adaboostclassifier__estimator': None, 'adaboostclassifier__learning_rate': 1.0, 'adaboostclassifier__n_estimators': 50, 'adaboostclassifier__random_state': 0}


In [243]:
# Define the parameter grid to search
param_grid = {
    'adaboostclassifier__learning_rate': [1,1e-2, 1e-3, 1e-4, 1e-5],
    'adaboostclassifier__n_estimators': [1, 10,100, 1000],
    'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R'],
    'adaboostclassifier__random_state':[0]
}

# Set up the GridSearchCV
grid_search = GridSearchCV(estimator=adaboost_model, param_grid=param_grid, cv=4, scoring='f1')
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

y_pred = grid_search.predict(X_test)

print(classification_report(y_test, y_pred))



































































































































































{'adaboostclassifier__algorithm': 'SAMME', 'adaboostclassifier__learning_rate': 0.01, 'adaboostclassifier__n_estimators': 1000, 'adaboostclassifier__random_state': 0}
              precision    recall  f1-score   support

         0.0       0.72      0.51      0.60       113
         1.0       0.71      0.86      0.78       155

    accuracy                           0.71       268
   macro avg       0.72      0.69      0.69       268
weighted avg       0.71      0.71      0.70       268



In [226]:
evaluation(grid_search.best_estimator_)

[[ 58  55]
 [ 22 133]]
              precision    recall  f1-score   support

         0.0       0.72      0.51      0.60       113
         1.0       0.71      0.86      0.78       155

    accuracy                           0.71       268
   macro avg       0.72      0.69      0.69       268
weighted avg       0.71      0.71      0.70       268



In [245]:
param_grid_svm = {
    'svc__C': [0.1, 1, 10],  # Réduisez le nombre de valeurs
    'svc__kernel': ['linear', 'rbf'],  # Testez uniquement les kernels principaux
    'svc__gamma': ['scale', 'auto'],  # Limitez à 2 options pour gamma
    'svc__random_state': [0]  # Gardez la reproductibilité
}
# Set up the GridSearchCV
grid_search_svm = GridSearchCV(estimator=svm_model, param_grid=param_grid_svm, cv=4, scoring='f1', n_jobs=-1)

# Fit the model
grid_search_svm.fit(X_train, y_train)

# Print the best parameters found by GridSearchCV
print("Best Parameters for SVM:", grid_search_svm.best_params_)

# Make predictions on the test set
y_pred_svm = grid_search_svm.predict(X_test)

# Print the classification report for the SVM model
print(classification_report(y_test, y_pred_svm))


Best Parameters for SVM: {'svc__C': 0.1, 'svc__gamma': 'scale', 'svc__kernel': 'linear', 'svc__random_state': 0}
              precision    recall  f1-score   support

         0.0       0.74      0.56      0.64       113
         1.0       0.73      0.86      0.79       155

    accuracy                           0.73       268
   macro avg       0.73      0.71      0.71       268
weighted avg       0.73      0.73      0.72       268



In [246]:
evaluation(grid_search_svm)

[[ 63  50]
 [ 22 133]]
              precision    recall  f1-score   support

         0.0       0.74      0.56      0.64       113
         1.0       0.73      0.86      0.79       155

    accuracy                           0.73       268
   macro avg       0.73      0.71      0.71       268
weighted avg       0.73      0.73      0.72       268



In [248]:
# Save the best model found by GridSearchCV
with open('best_svm_model.pkl', 'wb') as file:
    pickle.dump(grid_search_svm.best_estimator_, file)

print("Model saved as 'best_svm_model.pkl'")

Model saved as 'best_svm_model.pkl'
