# Importation des librairies

In [2]:
import BankModel
import header
import ProcessRakuten

import numpy as np
import pandas as pd
import spacy

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

# Importation des données

In [3]:
X_train = pd.read_csv(
    "/Users/welto/Library/CloudStorage/OneDrive-CentraleSupelec/2A/CASA/RakutenPjct/data/X_train_update.csv",
    sep=','
)

Y_train = pd.read_csv(
    "/Users/welto/Library/CloudStorage/OneDrive-CentraleSupelec/2A/CASA/RakutenPjct/data/Y_train_CVw08PX.csv",
    sep=','
)

# Preprocessing

### Supression des colonnes inutilisées

In [80]:
X_train = X_train.drop(
    labels=['Unnamed: 0', 'imageid', 'description']
    , axis=1
)
Y_train = Y_train.drop(
    labels='Unnamed: 0',
    axis=1
)

In [81]:
X_train.shape, Y_train.shape

((84916, 2), (84916, 1))

### Séparation train-test

In [82]:
X_train, X_test = train_test_split(
    X_train,
    test_size=0.2,
    random_state=42
)
Y_train, Y_test = train_test_split(
    Y_train,
    test_size=0.2,
    random_state=42
)

In [83]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((67932, 2), (16984, 2), (67932, 1), (16984, 1))

### Conversion en listes

In [84]:
Y_train = Y_train['prdtypecode'].tolist()
Y_test = Y_test['prdtypecode'].tolist()

### Tokenisation et cleaning

In [86]:
X_train_raw_designation = X_train['designation'].tolist()
X_test_raw_designation = X_test['designation'].tolist()

In [87]:
spacy_nlp = spacy.load("fr_core_news_sm")

X_train_raw_designation_clean = []
X_test_raw_designation_clean = []

a = len(X_train_raw_designation)
b = len(X_test_raw_designation)

header.progress_bar(
    0,
    a,
    prefix='Progress:',
    suffix='Complete',
    length=50
)

for k in range(a):
    X_train_raw_designation_clean.append(
        header.raw_to_tokens(
            X_train_raw_designation[k],
            spacy_nlp
        )
    )
    header.progress_bar(
        k + 1,
        a,
        prefix='X_train_raw_designation_clean:',
        suffix='Complete',
        length=50
    )

for k in range(b):
    X_test_raw_designation_clean.append(
        header.raw_to_tokens(
            X_test_raw_designation[k],
            spacy_nlp
        )
    )
    header.progress_bar(
        k + 1,
        b,
        prefix='X_test_raw_designation_clean:',
        suffix='Complete',
        length=50
    )

X_train_raw_designation_clean: |██████████████████████████████████████████████████| 100.0% Complete
X_test_raw_designation_clean: |██████████████████████████████████████████████████| 100.0% Complete


### Vectorisation

In [88]:
tfidf = TfidfVectorizer()

X_train_tfidf = tfidf.fit_transform(X_train_raw_designation_clean)
X_test_tfidf = tfidf.transform(X_test_raw_designation_clean)

print("X_train_tfidf shape:", X_train_tfidf.shape)
print("X_test_tfidf shape:", X_test_tfidf.shape)

X_train_tfidf shape: (67932, 69951)
X_test_tfidf shape: (16984, 69951)


In [89]:
X_train_tfidf.todense(), X_test_tfidf.todense()

(matrix([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
 matrix([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]))

# Entraînement

In [90]:
fast_coeff = 10

## Gradient Boosting (ne fonctionne pas)

In [55]:
params = {
    'n_estimators': [100, 150, 200, 250, 300],
    'learning_rate': [0.02, 0.03, 0.04, 0.05, 0.1]
}

gb = GradientBoostingClassifier()

gb_grid = GridSearchCV(
    estimator=gb,
    param_grid=params,
    scoring='accuracy',
    cv=10, n_jobs=-1,
    verbose=10
)

gb_grid.fit(
    X_train_tfidf,
    Y_train
)

Y_pred_gb = gb_grid.predict(X_test_tfidf)
print(
    "Accuracy score:",
    accuracy_score(
        Y_test,
        Y_pred_gb
    )
)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


ValueError: 
All the 100 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'n_estimators' parameter of GradientBoostingClassifier must be an int in the range [1, inf). Got 1.0 instead.

--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'n_estimators' parameter of GradientBoostingClassifier must be an int in the range [1, inf). Got 56.44444444444444 instead.

--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'n_estimators' parameter of GradientBoostingClassifier must be an int in the range [1, inf). Got 111.88888888888889 instead.

--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'n_estimators' parameter of GradientBoostingClassifier must be an int in the range [1, inf). Got 167.33333333333331 instead.

--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'n_estimators' parameter of GradientBoostingClassifier must be an int in the range [1, inf). Got 222.77777777777777 instead.

--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'n_estimators' parameter of GradientBoostingClassifier must be an int in the range [1, inf). Got 278.22222222222223 instead.

--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'n_estimators' parameter of GradientBoostingClassifier must be an int in the range [1, inf). Got 333.66666666666663 instead.

--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'n_estimators' parameter of GradientBoostingClassifier must be an int in the range [1, inf). Got 389.1111111111111 instead.

--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'n_estimators' parameter of GradientBoostingClassifier must be an int in the range [1, inf). Got 444.55555555555554 instead.

--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'n_estimators' parameter of GradientBoostingClassifier must be an int in the range [1, inf). Got 500.0 instead.


# SVM

13min

In [None]:
svm = SVC()

svm.fit(
    X_train_tfidf[:X_train_tf],
    Y_train
)

Y_pred_svm = svm.predict(X_test_tfidf)

In [78]:
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'gamma': [10, 1, 0.1, 0.01],
    'kernel': ['rbf']
}

grid = GridSearchCV(
    SVC(),
    param_grid,
    refit=True,
    verbose=2
)
grid.fit(
    X_train_tfidf[:X_train_tfidf.shape[0]//fast_coeff],
    Y_train[:len(Y_train)//fast_coeff]
)

Y_pred_svm = grid.predict(X_test_tfidf[:X_test_tfidf.shape[0]//fast_coeff])

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END .......................C=0.01, gamma=10, kernel=rbf; total time=   4.2s
[CV] END .......................C=0.01, gamma=10, kernel=rbf; total time=   4.2s
[CV] END .......................C=0.01, gamma=10, kernel=rbf; total time=   4.2s
[CV] END .......................C=0.01, gamma=10, kernel=rbf; total time=   4.2s
[CV] END .......................C=0.01, gamma=10, kernel=rbf; total time=   4.3s
[CV] END ........................C=0.01, gamma=1, kernel=rbf; total time=   4.1s
[CV] END ........................C=0.01, gamma=1, kernel=rbf; total time=   4.2s
[CV] END ........................C=0.01, gamma=1, kernel=rbf; total time=   4.1s
[CV] END ........................C=0.01, gamma=1, kernel=rbf; total time=   4.1s
[CV] END ........................C=0.01, gamma=1, kernel=rbf; total time=   4.1s
[CV] END ......................C=0.01, gamma=0.1, kernel=rbf; total time=   3.5s
[CV] END ......................C=0.01, gamma=0.1

KeyboardInterrupt: 

In [73]:
grid.best_params_

{'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}

In [75]:
print(
    "f1 score :",
    f1_score(
        Y_test[:X_test_tfidf.shape[0]//fast_coeff],
        Y_pred_svm,
        average='macro'
    )
)

f1 score : 0.007734063505788539
