# Importation des librairies

In [25]:
import BankModel
import header
import ProcessRakuten

import numpy as np
import pandas as pd
import spacy

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

# Importation des données

In [3]:
X_train = pd.read_csv(
    "/Users/welto/Library/CloudStorage/OneDrive-CentraleSupelec/2A/CASA/RakutenPjct/data/X_train_update.csv",
    sep=','
)
X_test = pd.read_csv(
    "/Users/welto/Library/CloudStorage/OneDrive-CentraleSupelec/2A/CASA/RakutenPjct/data/X_test_update.csv",
    sep=','
)
Y_train = pd.read_csv(
    "/Users/welto/Library/CloudStorage/OneDrive-CentraleSupelec/2A/CASA/RakutenPjct/data/Y_train_CVw08PX.csv",
    sep=','
)

# Preprocessing

### Supression des colonnes inutilisées

In [4]:
X_train = X_train.drop(
    labels=['Unnamed: 0', 'imageid', 'description']
    , axis=1
)
X_test = X_test.drop(
    labels=['imageid',
            'description', 'Unnamed: 0'],
    axis=1
)
Y_train = Y_train.drop(
    labels='Unnamed: 0',
    axis=1
)

In [10]:
X_train.shape, X_train.shape, Y_train.shape

((84916, 2), (84916, 2), (84916, 1))

### Séparation train-test

In [11]:
Y_train, Y_test = train_test_split(Y_train, test_size=0.2, random_state=42)

In [12]:
Y_train.shape, Y_test.shape

((67932, 1), (16984, 1))

### Conversion en listes

In [13]:
Y_train = Y_train['prdtypecode'].tolist()
Y_test = Y_test['prdtypecode'].tolist()

### Tokenisation et cleaning

In [15]:
X_train_raw_designation = [k for k in X_train['designation'].tolist()[:len(Y_train)]] # On s'assure que X et Y sont de même dimension
X_test_raw_designation = [k for k in X_test['designation'].tolist()[:len(Y_test)]]

In [16]:
spacy_nlp = spacy.load("fr_core_news_sm")

X_train_raw_designation_clean = []
X_test_raw_designation_clean = []

a = len(X_train_raw_designation)
b = len(X_test_raw_designation)

header.progress_bar(
    0,
    a,
    prefix='Progress:',
    suffix='Complete',
    length=50
)

for k in range(a):
    X_train_raw_designation_clean.append(
        header.raw_to_tokens(
            X_train_raw_designation[k],
            spacy_nlp
        )
    )
    header.progress_bar(
        k + 1,
        a,
        prefix='X_train_raw_designation_clean:',
        suffix='Complete',
        length=50
    )

for k in range(b):
    X_test_raw_designation_clean.append(
        header.raw_to_tokens(
            X_test_raw_designation[k],
            spacy_nlp
        )
    )
    header.progress_bar(
        k + 1,
        b,
        prefix='X_test_raw_designation_clean:',
        suffix='Complete',
        length=50
    )

X_train_raw_designation_clean: |██████████████████████████████████████████████████| 100.0% Complete
X_test_raw_designation_clean: |██████████████████████████████████████████████████| 100.0% Complete


### Vectorisation

In [17]:
tfidf = TfidfVectorizer()

X_train_tfidf = tfidf.fit_transform(X_train_raw_designation_clean)
X_test_tfidf = tfidf.transform(X_test_raw_designation_clean)

print("X_train_tfidf shape:", X_train_tfidf.shape)
print("X_test_tfidf shape:", X_test_tfidf.shape)

X_train_tfidf shape: (67932, 69739)
X_test_tfidf shape: (13812, 69739)


In [24]:
X_train_tfidf.todense(), X_test_tfidf.todense()

(matrix([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
 matrix([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]))

# Entraînement

## Gradient Boosting (ne fonctionne pas)

In [None]:
params = {
    'n_estimators': [100, 150, 200, 250, 300],
    'learning_rate': [0.02, 0.03, 0.04, 0.05, 0.1]
}

gb = GradientBoostingClassifier()

gb_grid = GridSearchCV(
    estimator=gb,
    param_grid=params,
    scoring='accuracy',
    cv=10, n_jobs=-1,
    verbose=10
)

gb_grid.fit(
    X_train_tfidf,
    Y_train
)

Y_pred_gb = gb_grid.predict(X_test_tfidf)
print(
    "Accuracy score:",
    accuracy_score(
        Y_test,
        Y_pred_gb
    )
)

# SVM

In [None]:
svm = make_pipeline(SVC())

svm.fit(
    X_train_tfidf,
    Y_train
)

Y_pred_svm = svm.predict(X_test_tfidf)

print(
    "Accuracy score :",
    accuracy_score(
        Y_test,
        Y_pred_svm
    )
)