In [1]:
# Python interpreter version used to run this notebook (recorded for reproducibility)
import sys
sys.version

'3.10.15 (main, Oct  3 2024, 07:27:34) [GCC 11.2.0]'

In [2]:
# PyCaret version used by the modelling cells (recorded for reproducibility)
import pycaret
pycaret.__version__

'3.3.2'

In [3]:
# Working directory: the dataset path below is relative to the current working directory.
import os
# os.chdir()  # disabled; set a directory here if the notebook is started from elsewhere

# Load the raw dataset from the Excel workbook and print a summary of its columns
import pandas as pd
data = pd.read_excel("dataset/base.xlsx")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550 entries, 0 to 549
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   N°                       550 non-null    int64  
 1   R/P/T                    550 non-null    object 
 2   sexe                     550 non-null    object 
 3   Nom et Prénoms           550 non-null    object 
 4   prof père                483 non-null    object 
 5   prof mère                517 non-null    object 
 6   nb frère                 549 non-null    float64
 7   nb sœur                  549 non-null    float64
 8   commune d'origine        550 non-null    object 
 9   COLLEGE d'ORIGINE        550 non-null    object 
 10  Habite avec les parents  549 non-null    object 
 11  electricité              549 non-null    object 
 12  conn sur les options     550 non-null    object 
 13  MLG                      547 non-null    float64
 14  FRS                      5

In [4]:
# Number of rows per class of the target column 'Opt'
opt_counts = data['Opt'].value_counts()
opt_counts

Opt
L      269
S      213
OSE     68
Name: count, dtype: int64

In [5]:
# Columns excluded from the modelling data set
dropped_columns = ['N°', 'Nom et Prénoms', 'prof père', 'prof mère', 'Rang', 'RANG']
data = data.drop(columns=dropped_columns)

In [6]:
# (rows, columns) of the frame after the column drop
data.shape

(550, 22)

In [7]:
# Categorical variable transform
def encoding(data, col):
    """Return the integer category codes of one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to encode; it is not modified.
    col : str
        Name of the column to encode.

    Returns
    -------
    pandas.Series
        Integer codes aligned on ``data``'s index. Codes follow the sorted
        order of the distinct values; missing values get the code -1.
    """
    as_category = data[col].astype("category")
    return as_category.cat.codes

In [8]:
# Missing-value handling: rows containing any missing value are dropped (nothing is filled)
def imputation(data):
    """Return ``data`` without the rows that contain at least one missing value.

    The input frame is left untouched and the surviving rows keep their
    original index labels.
    """
    complete_rows = data.dropna(axis="index", how="any")
    return complete_rows

# Drop incomplete rows, then show the remaining (rows, columns)
data = imputation(data)
data.shape

(452, 22)

In [32]:
def preprocessing(data, cat_var_list):
    """Return a copy of ``data`` with the listed columns integer-encoded.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is NOT modified: the encoding is applied to a copy,
        which avoids writing into a frame that may itself be a slice of
        another one (e.g. the result of ``dropna``).
    cat_var_list : iterable of column names
        Columns to replace by their category codes. Names that are not
        columns of ``data`` are ignored.

    Returns
    -------
    pandas.DataFrame
        Same columns, order and index as ``data``; every listed column holds
        integer category codes (missing values become -1).
    """
    to_encode = set(cat_var_list)
    encoded = data.copy()
    for col in encoded.columns:
        if col in to_encode:
            # Same transform as `encoding`, inlined so this function does not
            # depend on another cell having been executed first.
            encoded[col] = encoded[col].astype("category").cat.codes
    return encoded

# Names of the columns stored as text (dtype 'object'); these are the ones to encode
object_columns = data.select_dtypes(include='object').columns
cat_var_list = object_columns.tolist()

# Encode those columns and summarise the transformed frame
data = preprocessing(data, cat_var_list)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452 entries, 0 to 451
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   sexe                     452 non-null    int8   
 1   nb frère                 452 non-null    float64
 2   nb sœur                  452 non-null    float64
 3   commune d'origine        452 non-null    int8   
 4   Habite avec les parents  452 non-null    int8   
 5   electricité              452 non-null    int8   
 6   conn sur les options     452 non-null    int8   
 7   MLG                      452 non-null    float64
 8   FRS                      452 non-null    float64
 9   ANG                      452 non-null    float64
 10  HG                       452 non-null    float64
 11  SES                      452 non-null    float64
 12  MATHS                    452 non-null    float64
 13  PC                       452 non-null    float64
 14  SVT                      4

In [10]:
# Feature selection (for importance of the variable in the estimator)
def selector(estimator):
    """Fit cross-validated recursive feature elimination with ``estimator``.

    Reads the notebook-level ``data`` frame: every column except 'Opt' is a
    candidate feature and 'Opt' is the target. One feature is removed per
    step, at least 9 features are kept, and 5-fold cross-validation is used.

    Returns the fitted RFECV object.
    """
    rfecv = RFECV(estimator, step=1, min_features_to_select=9, cv=5)
    features = data.drop('Opt', axis=1)
    target = data['Opt']
    rfecv.fit(features, target)
    return rfecv

In [11]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import RFECV
# NOTE(review): this rebinds the name `selector` from the function to the fitted
# RFECV object it returns. Re-running this cell without first re-running the cell
# that defines the function fails, because the fitted object is not callable.
# The name is kept because later cells read `selector.support_`.
selector = selector(SGDClassifier(random_state=0))
# ranking_: one rank per candidate feature; support_: boolean mask of the kept features
print(selector.ranking_)
print(selector.support_)

[2 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[False  True  True  True  True False  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True]


In [12]:
# Variables selected by this method
X = data.drop(columns='Opt')
selected_mask = selector.support_
X.columns[selected_mask]

Index(['sexe', 'nb frère', 'nb sœur', 'commune d'origine',
       'Habite avec les parents', 'electricité', 'conn sur les options', 'MLG',
       'FRS', 'ANG', 'HG', 'SES', 'MATHS', 'PC', 'SVT', 'EPS', '1°S', '2°S',
       'MOY AN'],
      dtype='object')

In [13]:
# Keep only the selected features plus the target column.
# Direct column selection replaces the previous self-merge on the index: it gives
# the same columns and row order, and cannot duplicate rows if index labels repeat.
# reset_index(drop=True) reproduces the fresh 0..n-1 index the merge used to return.
selected_columns = list(X.columns[selector.support_]) + ['Opt']
data = data[selected_columns].reset_index(drop=True)
data.head()

Unnamed: 0,sexe,nb frère,nb sœur,commune d'origine,Habite avec les parents,electricité,conn sur les options,MLG,FRS,ANG,HG,SES,MATHS,PC,SVT,EPS,1°S,2°S,MOY AN,Opt
0,0,2.0,2.0,15,1,1,1,20.0,19.0,20.5,24.0,23.0,21.0,24.0,26.0,28.0,11.55,11.7,11.6,0
1,0,0.0,3.0,98,1,1,1,26.0,24.0,32.0,36.0,26.0,25.0,16.0,34.0,20.0,13.272727,12.318182,12.954545,2
2,0,2.0,3.0,44,0,0,0,22.0,21.0,12.0,26.0,23.0,19.0,7.0,29.0,20.0,10.136364,12.454545,10.909091,2
3,1,2.0,2.0,44,1,0,0,22.0,19.0,19.0,24.0,25.0,24.0,25.0,38.0,24.0,12.4,11.1,11.966667,2
4,1,1.0,2.0,44,1,0,1,30.0,22.5,31.0,23.5,27.517,19.0,24.0,22.5,21.0,11.90085,11.295455,11.699052,2


In [14]:
# Class distribution of the target after dropping incomplete rows ('Opt' is now integer-coded)
data['Opt'].value_counts()

Opt
0    230
2    182
1     40
Name: count, dtype: int64

In [15]:
# Create the PyCaret workspace
from pycaret.classification import setup

# Initialise the experiment: 70 % stratified training split, normalised features,
# 5 cross-validation folds and a fixed session id so that runs are repeatable
session = setup(
    data,
    target='Opt',
    train_size=0.7,
    data_split_stratify=True,
    normalize=True,
    fold=5,
    session_id=1032,
)

# Print the experiment object
print(session)

Unnamed: 0,Description,Value
0,Session id,1032
1,Target,Opt
2,Target type,Multiclass
3,Original data shape,"(452, 20)"
4,Transformed data shape,"(452, 20)"
5,Transformed train set shape,"(316, 20)"
6,Transformed test set shape,"(136, 20)"
7,Numeric features,19
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment object at 0x7f1cacbca560>


In [16]:
# List the algorithms available for this problem
algos = session.models()
print(algos)

                                     Name  \
ID                                          
lr                    Logistic Regression   
knn                K Neighbors Classifier   
nb                            Naive Bayes   
dt               Decision Tree Classifier   
svm                   SVM - Linear Kernel   
rbfsvm                SVM - Radial Kernel   
gpc           Gaussian Process Classifier   
mlp                        MLP Classifier   
ridge                    Ridge Classifier   
rf               Random Forest Classifier   
qda       Quadratic Discriminant Analysis   
ada                  Ada Boost Classifier   
gbc          Gradient Boosting Classifier   
lda          Linear Discriminant Analysis   
et                 Extra Trees Classifier   
xgboost         Extreme Gradient Boosting   
lightgbm  Light Gradient Boosting Machine   
catboost              CatBoost Classifier   
dummy                    Dummy Classifier   

                                                  Refe

In [17]:
# Compare the candidate models (CatBoost excluded) and keep the three most accurate
from pycaret.classification import compare_models
top_models = compare_models(
    n_select=3,
    sort='Accuracy',
    exclude=['catboost'],
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.6519,0.7222,0.6519,0.6397,0.624,0.3467,0.3636,0.098
gbc,Gradient Boosting Classifier,0.6329,0.0,0.6329,0.6295,0.615,0.3242,0.3331,0.198
rf,Random Forest Classifier,0.6296,0.7246,0.6296,0.5956,0.5974,0.3016,0.3138,0.116
knn,K Neighbors Classifier,0.6108,0.706,0.6108,0.5632,0.5786,0.2655,0.2762,0.61
xgboost,Extreme Gradient Boosting,0.6106,0.7139,0.6106,0.5897,0.594,0.2879,0.2916,0.106
lr,Logistic Regression,0.6075,0.0,0.6075,0.5904,0.5912,0.281,0.2868,0.9
ridge,Ridge Classifier,0.6044,0.0,0.6044,0.5674,0.5761,0.2608,0.2713,0.022
lda,Linear Discriminant Analysis,0.6043,0.0,0.6043,0.5891,0.5898,0.2807,0.2864,0.022
lightgbm,Light Gradient Boosting Machine,0.5981,0.7065,0.5981,0.5697,0.5778,0.2607,0.2642,100.302
nb,Naive Bayes,0.5791,0.6813,0.5791,0.591,0.5774,0.2604,0.2665,0.024


In [18]:
# The three best models returned by compare_models
print(top_models)

[ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     monotonic_cst=None, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=1032, verbose=0,
                     warm_start=False), GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='log_loss', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_samples_leaf=1,
                           min_samples_split=2, min_weight_fraction_leaf=0.0,
                           n_estimators=100, n_iter_no_change=None,
                           rand

In [19]:
# Train an Extra Trees classifier ("et"), the top model of the comparison (not KNN)
et = session.create_model("et")
print(et)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6406,0.728,0.6406,0.6654,0.6224,0.3318,0.3376
1,0.5714,0.6056,0.5714,0.5393,0.5532,0.2092,0.2114
2,0.6508,0.737,0.6508,0.6141,0.6133,0.3275,0.354
3,0.6984,0.743,0.6984,0.6307,0.6617,0.4375,0.4469
4,0.6984,0.7975,0.6984,0.7489,0.6696,0.4275,0.4682
Mean,0.6519,0.7222,0.6519,0.6397,0.624,0.3467,0.3636
Std,0.0468,0.0632,0.0468,0.0684,0.0415,0.0828,0.0915


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     monotonic_cst=None, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=1032, verbose=0,
                     warm_start=False)


In [20]:
# Evaluation: predict with the trained model (no data passed, so PyCaret scores the test split)
pred = session.predict_model(et)

# Show the true class, the predicted class and the prediction score side by side
result_columns = ['Opt', 'prediction_label', 'prediction_score']
print(pred[result_columns])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.6103,0.7124,0.6103,0.6383,0.5864,0.2707,0.2768


     Opt  prediction_label  prediction_score
380    2                 2              0.63
212    2                 0              0.57
28     0                 0              0.60
333    2                 0              0.56
112    0                 0              0.68
..   ...               ...               ...
58     0                 0              0.63
88     0                 2              0.52
124    2                 2              0.52
175    0                 2              0.64
145    2                 2              0.67

[136 rows x 3 columns]


In [21]:
# Hyperparameter tuning of the Extra Trees model, optimising accuracy.
# As the log below states, the original model is returned when tuning does not beat it.
best_model = session.tune_model(et, optimize='Accuracy')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5938,0.7272,0.5938,0.5551,0.5474,0.2091,0.2369
1,0.5397,0.5816,0.5397,0.5002,0.5042,0.1135,0.1227
2,0.619,0.6989,0.619,0.6083,0.5627,0.2533,0.3095
3,0.6825,0.7731,0.6825,0.6347,0.6364,0.3919,0.4296
4,0.5873,0.7397,0.5873,0.5719,0.5271,0.1931,0.2425
Mean,0.6045,0.7041,0.6045,0.574,0.5556,0.2322,0.2683
Std,0.0467,0.0657,0.0467,0.0462,0.0449,0.0917,0.1006


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [22]:
# Display the model returned by tune_model
best_model

In [23]:
# Check the model returned by tune_model on the test data
session.predict_model(best_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.6103,0.7124,0.6103,0.6383,0.5864,0.2707,0.2768


Unnamed: 0,sexe,nb frère,nb sœur,commune d'origine,Habite avec les parents,electricité,conn sur les options,MLG,FRS,ANG,...,MATHS,PC,SVT,EPS,1°S,2°S,MOY AN,Opt,prediction_label,prediction_score
380,1,0.0,1.0,99,1,1,1,28.0,24.0,28.5,...,32.0,20.0,36.0,26.0,13.725000,13.525000,13.658334,2,2,0.63
212,1,3.0,2.0,3,0,1,1,24.0,18.0,36.0,...,14.0,14.0,27.0,20.0,13.550000,10.181818,12.427273,2,0,0.57
28,1,1.0,1.0,26,1,1,1,30.0,22.0,26.0,...,11.0,12.0,26.0,20.0,10.295455,9.509091,10.033334,0,0,0.60
333,0,2.0,1.0,44,1,1,1,28.5,32.5,23.5,...,20.0,15.0,38.0,25.0,13.204545,11.575000,12.661364,2,0,0.56
112,1,0.0,1.0,70,0,1,1,27.5,24.0,20.5,...,32.0,12.0,25.0,26.0,12.204545,11.818182,12.075758,0,0,0.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,0,2.0,4.0,85,1,1,1,15.0,21.5,16.5,...,12.0,9.0,16.5,26.0,10.175000,8.950000,9.766666,0,0,0.63
88,1,2.0,1.0,44,1,0,1,15.0,18.5,16.0,...,24.0,20.0,29.5,12.0,10.100000,9.125000,9.775000,0,2,0.52
124,0,1.0,0.0,44,1,0,1,23.0,18.0,29.0,...,22.0,11.0,26.0,12.0,12.700000,10.897727,12.099242,2,2,0.52
175,0,1.0,1.0,92,1,1,1,32.0,21.5,30.0,...,16.0,24.0,27.5,26.0,12.931818,11.704545,12.522727,0,2,0.64


In [24]:
# Feature importance plot (currently disabled)
#session.plot_model(best_model, plot='feature')

In [None]:
# Coefficients (disabled: only valid for models exposing `coef_`, which the Extra Trees model does not)
#coef = pd.DataFrame({'variables': best_model.feature_names_in_, 'coefficients':best_model.coef_[0]})
#coef

In [26]:
# Final model: finalize the Extra Trees model.
# NOTE(review): this passes `et` rather than `best_model`; tuning returned the
# original model in this run, so confirm which one is intended if tuning ever improves.
final_model = session.finalize_model( et)

# Display the pipeline of the final model
final_model

### Deployment of the model

In [27]:
# Create the API script for deployment; as the message below says, it is generated but not started
my_api = session.create_api(final_model, "optionclass_api")

API successfully created. This function only creates a POST API, it doesn't run it automatically. To run your API, please run this command --> !python optionclass_api.py
