In [8]:
import os
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from datetime import datetime

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score,
    f1_score, accuracy_score, classification_report
)
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve, validation_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

from itertools import product


# NLTK resources (run once per environment; `import nltk` must be added first,
# since only nltk submodules are imported above)
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('punkt')

In [9]:
# Consolidated target label -> raw lower-cased category tokens that collapse
# into it. Group order and token order define the insertion order of
# CATEGORY_MAPPING below.
_CATEGORY_GROUPS = {
    'Fiction': (
        'fiction',
        'english fiction',
        'american fiction',
        'detective and mystery stories',
        # This key contains literal double quotes; it is a different key from
        # the unquoted 'childrens stories' listed under 'Juvenile'.
        '"childrens stories"',
    ),
    'Juvenile': (
        'juvenile fiction',
        'juvenile nonfiction',
        'young adult nonfiction',
        'childrens stories',
    ),
    'Religion & Spirituality': (
        'religion',
        'spirit',
        'bibles',
        'bible',
    ),
    'Biography': (
        'biography',
        'autobiography',
        'true crime',
    ),
    'Health & Wellness': (
        'body, mind',
        'health',
        'fitness',
        'relationships',
        'family',
        'self-help',
        'medical',
    ),
    'Business & Economics': (
        'business',
        'economics',
    ),
    'Social Sciences': (
        'social science',
        'political science',
        'philosophy',
        'psychology',
        'disciplines',
        'law',
    ),
    'History': (
        'history',
        'great britain',
    ),
    'Computers & Tech': (
        'computers',
        'technology',
        'engineering',
    ),
    'Science & Nature': (
        'science',
        'nature',
        'mathematics',
        'animals',
    ),
    'Cooking': (
        'cooking',
    ),
    'Recreation & Sports': (
        'recreation',
        'sports',
        'games',
        'hobbies',
        'crafts',
    ),
    'Education & Language': (
        'language arts',
        'education',
        'study aids',
        'foreign language study',
    ),
    'Arts & Entertainment': (
        'art',
        'music',
        'performing arts',
        'photography',
        'humor',
        'architecture',
        'design',
        'antiques',
        'collectibles',
    ),
    'Travel & Lifestyle': (
        'travel',
        'home',
        'house',
        'gardening',
        'pets',
        'activities',
        'transportation',
    ),
    'Reference': (
        'reference',
    ),
}

# Flat lookup: raw category token -> consolidated label.
CATEGORY_MAPPING = {
    raw_token: label
    for label, raw_tokens in _CATEGORY_GROUPS.items()
    for raw_token in raw_tokens
}


In [None]:
class ModelClassifier_TFIDF_NAIVEBAYES:
    """TF-IDF text classifier that predicts a book category from its description.

    NOTE: despite the ``NAIVEBAYES`` suffix in the class name and in the default
    model path, the estimator configured here is ``LogisticRegression``. The
    name is kept so existing callers and saved model paths keep working.

    Typical workflow (see ``run_all``): ``load_data`` -> ``preprocess`` ->
    ``prepare_data`` -> ``train_test_split`` -> ``load_or_train`` ->
    ``evaluate`` -> ``plot_performance``.
    """

    def __init__(self, csv_path, model_path=None):
        """Store paths and build the (unfitted) estimators.

        Args:
            csv_path: Path of the CSV file read by ``load_data``.
            model_path: Where the tuned pipeline is saved/loaded with joblib.
                Defaults to ``models/ModelClassifier_TFIDF_NAIVEBAYES.joblib``.
        """
        self.csv_path = csv_path
        self.model_path = model_path or "models/ModelClassifier_TFIDF_NAIVEBAYES.joblib"
        self.dataset = None
        # Stand-alone vectorizer/model pair used by train(), evaluate() and
        # predict_text() when no tuned pipeline (self.best_model) exists.
        self.vectorizer = TfidfVectorizer()
        self.model = LogisticRegression()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        # Pipeline template tuned by GridSearchCV.
        self.pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('classifier', LogisticRegression())
        ])
        self.param_grid = {
            'vectorizer__ngram_range': [(1, 1), (1, 2)],
            'vectorizer__max_df': [0.75],
            'vectorizer__min_df': [1],
            'classifier__C': [0.1, 1],
            'classifier__solver': ['saga'],
            'classifier__penalty': ['l2'],
            'classifier__class_weight': ['balanced'],
            'classifier__max_iter': [500]
        }

    def load_data(self):
        """Read ``csv_path`` and drop duplicate rows and rows with any missing value."""
        self.dataset = pd.read_csv(self.csv_path).drop_duplicates().dropna()

    def clean_text(self, sentence):
        """Normalise a description into a space-joined string of lemmatised tokens.

        Lower-cases, removes every character that is not ``a-z`` or whitespace,
        tokenises, removes English stop words and lemmatises each token as a verb.

        Args:
            sentence: Raw text. Non-string input (e.g. NaN) yields ``''``.

        Returns:
            The cleaned text.
        """
        if not isinstance(sentence, str):
            return ''
        sentence = sentence.strip().lower()
        sentence = re.sub(r"[^a-z\s]", '', sentence)
        tokens = word_tokenize(sentence)
        filtered = [w for w in tokens if w not in self.stop_words]
        lemmatized = [self.lemmatizer.lemmatize(w, pos='v') for w in filtered]
        return ' '.join(lemmatized)

    def clean_target(self, category):
        """Split a raw category cell into lower-cased category tokens.

        Square brackets and single quotes are removed, then the text is split
        on ``&``. Double quotes are NOT removed by the pattern below.

        Args:
            category: Raw category cell.

        Returns:
            List of stripped, lower-cased tokens, or ``["unknown"]`` when the
            cell is missing or not a string.
        """
        if pd.isna(category) or not isinstance(category, str):
            return ["unknown"]
        category = re.sub(r"[\[\]']+", "", category)
        return [g.strip().lower() for g in category.split("&")]

    @staticmethod
    def _extract_year(dates):
        """Return the 4-digit year (as text) of ``YYYY-MM-DD`` dates; NaN when unparsable."""
        # %m is the month directive; the previous '%M' means *minutes*.
        parsed = pd.to_datetime(dates, format='%Y-%m-%d', errors='coerce')
        return parsed.dt.strftime('%Y')

    def preprocess(self, min_category_count=100):
        """Clean descriptions and categories and keep only the modelling columns.

        Steps: clean the description, split categories into one row per
        category, map raw categories through ``CATEGORY_MAPPING``, drop
        categories with fewer than ``min_category_count`` rows, convert
        ``publishedDate`` to a year string and drop rows with missing values.

        Args:
            min_category_count: Minimum number of rows a category needs to be kept.
        """
        # Work on a copy so the loaded frame is only replaced once, at the end.
        data = self.dataset.copy()
        data['clean_description'] = data['description'].apply(self.clean_text)
        data['clean_categories'] = data['categories'].apply(self.clean_target)
        # NOTE(review): a book with several categories becomes several rows
        # sharing one description, so a later random split can place the same
        # text in both the training and the test set - confirm this is intended.
        data = data.explode('clean_categories')
        data['clean_categories'] = data['clean_categories'].apply(
            lambda x: CATEGORY_MAPPING.get(x.lower(), x))

        category_counts = data['clean_categories'].value_counts()
        valid_categories = category_counts[category_counts >= min_category_count].index
        # .copy() so the column assignment below does not write into a slice.
        data = data[data['clean_categories'].isin(valid_categories)].copy()
        top_categories = data['clean_categories'].value_counts().head(30)
        print(f'Liste des 30 categories les plus frequentes : {top_categories}')

        data['publishedDate'] = self._extract_year(data['publishedDate'])
        data = data.dropna()
        self.dataset = data[['clean_description', 'publishedDate', 'clean_categories']]

    def prepare_data(self):
        """Expose the feature column as ``self.X`` and the target column as ``self.y``."""
        self.X = self.dataset['clean_description']
        self.y = self.dataset['clean_categories']

    def train_test_split(self, test_size=0.2, valid_size=0.05, random_state=42):
        """Split ``X``/``y`` into train, validation and test sets and encode the labels.

        Args:
            test_size: Share of the full data held out as ``X_test``/``y_test``.
            valid_size: Share of the remaining data held out as
                ``X_valid``/``y_valid``.
            random_state: Seed passed to both splits.
        """
        # Fit on every label so test/validation labels that happen to be
        # absent from the training split can still be encoded.
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(self.y)

        # First hold out the test set, then carve the validation set out of
        # what is left (the two sizes were previously applied the other way round).
        self.X_temp, self.X_test, self.y_temp, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=random_state)
        self.X_train, self.X_valid, self.y_train, self.y_valid = train_test_split(
            self.X_temp, self.y_temp, test_size=valid_size, random_state=random_state)

        self.y_train = self.label_encoder.transform(self.y_train)
        self.y_test = self.label_encoder.transform(self.y_test)
        self.y_valid = self.label_encoder.transform(self.y_valid)

    def train(self):
        """Fit the stand-alone vectorizer and model on the training split."""
        X_train_vec = self.vectorizer.fit_transform(self.X_train)
        self.model.fit(X_train_vec, self.y_train)

    def evaluate(self):
        """Print a classification report on the test split.

        Uses ``self.best_model`` when it exists, otherwise the stand-alone
        vectorizer/model pair fitted by ``train``.
        """
        if hasattr(self, 'best_model'):
            y_pred = self.best_model.predict(self.X_test)
        else:
            X_test_vec = self.vectorizer.transform(self.X_test)
            y_pred = self.model.predict(X_test_vec)

        print(classification_report(self.y_test, y_pred))

    def grid_search(self, param_grid=None, cv=5, scoring='accuracy'):
        """Tune ``self.pipeline`` on the training split and keep the best estimator.

        Args:
            param_grid: Grid to search; defaults to ``self.param_grid``.
            cv: Number of cross-validation folds.
            scoring: Scoring name passed to ``GridSearchCV``.
        """
        print("Running Grid Search...")
        if param_grid is None:
            param_grid = self.param_grid

        grid = GridSearchCV(self.pipeline, param_grid, cv=cv, scoring=scoring, n_jobs=-2)
        grid.fit(self.X_train, self.y_train)

        print(f"Best Parameters: {grid.best_params_}")
        print(f"Best CV Score: {grid.best_score_:.4f}")

        self.best_model = grid.best_estimator_

    def evaluate_best_model(self):
        """Print a classification report for ``self.best_model`` on the test split."""
        if hasattr(self, 'best_model'):
            y_pred = self.best_model.predict(self.X_test)
            print(classification_report(self.y_test, y_pred))
        else:
            print("Best model not found. Run grid_search() first.")

    def predict_text(self, text, return_proba=False):
        """Predict the category of a raw description.

        Args:
            text: Raw description; it is cleaned with ``clean_text`` first.
            return_proba: When true and the classifier supports
                ``predict_proba``, also return per-class probabilities.

        Returns:
            The predicted category, decoded through ``self.label_encoder`` when
            one exists (otherwise the classifier's raw prediction). With
            ``return_proba`` a ``(label, {class: probability})`` tuple whose
            keys are decoded the same way.
        """
        cleaned = self.clean_text(text)

        if hasattr(self, 'best_model'):
            vectorizer = self.best_model.named_steps['vectorizer']
            model = self.best_model.named_steps['classifier']
        else:
            vectorizer = self.vectorizer
            model = self.model

        X_input = vectorizer.transform([cleaned])
        prediction = model.predict(X_input)[0]

        # Decode in both return paths so callers always get category names.
        encoder = getattr(self, 'label_encoder', None)
        if encoder is not None:
            label = encoder.inverse_transform([prediction])[0]
        else:
            label = prediction

        if return_proba and hasattr(model, "predict_proba"):
            proba = model.predict_proba(X_input)[0]
            if encoder is not None:
                classes = encoder.inverse_transform(model.classes_)
            else:
                classes = model.classes_
            return label, dict(zip(classes, proba))

        return label

    def save_model(self):
        """Dump ``self.best_model`` to ``self.model_path``, creating its directory if needed."""
        model_dir = os.path.dirname(self.model_path)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)
        joblib.dump(self.best_model, self.model_path)
        print(f"💾 Modèle sauvegardé dans {self.model_path}")

    def load_or_train(self, X, y, force_retrain=False, cv=10):
        """Load the saved pipeline, or tune a new one with ``GridSearchCV`` and save it.

        Only the pipeline is persisted, not ``self.label_encoder``; a loaded
        model is decoded with whatever encoder the current session fitted.

        Args:
            X: Training texts.
            y: Encoded training labels.
            force_retrain: Retrain even when a saved model exists.
            cv: Number of cross-validation folds for the grid search.
        """
        if os.path.exists(self.model_path) and not force_retrain:
            print(f"🔁 Chargement du modèle depuis {self.model_path}")
            # joblib.load unpickles the file: only load model files you trust.
            self.best_model = joblib.load(self.model_path)
        else:
            print("🛠️  Entraînement du modèle avec GridSearchCV...")
            grid_search = GridSearchCV(self.pipeline, self.param_grid, cv=cv, scoring='accuracy', n_jobs=-2, verbose=1)
            grid_search.fit(X, y)
            self.best_model = grid_search.best_estimator_
            self.save_model()
            print("✅ Modèle entraîné et sauvegardé.")

    def plot_performance(self, param_name=None, param_range=None, cv=5):
        """Plot learning/validation curves and report test-set metrics.

        Args:
            param_name: Pipeline parameter for the validation curve; the curve
                is skipped unless ``param_range`` is given too.
            param_range: Values of ``param_name`` to evaluate.
            cv: Number of cross-validation folds for both curves.

        Raises:
            ValueError: If the training split is missing.
        """
        if not hasattr(self, 'X_train') or not hasattr(self, 'y_train'):
            raise ValueError("Données d'entraînement manquantes.")

        has_best_model = hasattr(self, 'best_model')
        pipeline = self.best_model if has_best_model else self.pipeline
        X = self.X_train
        y = self.y_train

        # Learning curve
        train_sizes, train_scores, test_scores = learning_curve(
            pipeline, X, y, cv=cv, n_jobs=-2, scoring='accuracy',
            train_sizes=np.linspace(0.1, 1.0, 5)
        )
        plt.figure(figsize=(8, 5))
        plt.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', label="Entraînement")
        plt.plot(train_sizes, np.mean(test_scores, axis=1), 'o-', label="Validation")
        plt.title("Courbe d'apprentissage")
        plt.xlabel("Taille d'entraînement")
        plt.ylabel("Exactitude")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

        # Validation curve
        if param_name and param_range is not None:
            train_scores, valid_scores = validation_curve(
                pipeline, X, y, param_name=param_name,
                param_range=param_range, cv=cv, scoring="accuracy", n_jobs=-2
            )
            plt.figure(figsize=(8, 5))
            plt.plot(param_range, np.mean(train_scores, axis=1), 'o-', label="Entraînement")
            plt.plot(param_range, np.mean(valid_scores, axis=1), 'o-', label="Validation")
            plt.title(f"Courbe de validation pour {param_name}")
            plt.xlabel(param_name)
            plt.ylabel("Exactitude")
            plt.legend()
            plt.grid(True)
            plt.tight_layout()
            plt.show()

        # Confusion matrix and metrics
        if hasattr(self, 'X_test') and hasattr(self, 'y_test'):
            if not has_best_model:
                # The curves above only fit clones of the estimator, so the
                # default pipeline must be fitted before it can predict.
                pipeline.fit(X, y)
            y_pred = pipeline.predict(self.X_test)
            cm = confusion_matrix(self.y_test, y_pred)
            disp = ConfusionMatrixDisplay(confusion_matrix=cm)
            fig, ax = plt.subplots(figsize=(6, 5))
            disp.plot(ax=ax, cmap=plt.cm.Blues, colorbar=False)
            plt.title("Matrice de confusion")
            plt.tight_layout()
            plt.show()

            precision = precision_score(self.y_test, y_pred, average='weighted', zero_division=0)
            recall = recall_score(self.y_test, y_pred, average='weighted', zero_division=0)
            f1 = f1_score(self.y_test, y_pred, average='weighted', zero_division=0)
            accuracy = accuracy_score(self.y_test, y_pred)

            print("📊 **Métriques de classification sur les données de test**")
            print(f" - Accuracy  : {accuracy:.6f}")
            print(f" - Precision : {precision:.6f}")
            print(f" - Recall    : {recall:.6f}")
            print(f" - F1-Score  : {f1:.6f}")
        else:
            print("⚠️ Données de test non disponibles pour l'évaluation finale.")

    def run_all(self, test_size=0.2, valid_size=0.05, force_retrain=False):
        """Run the whole pipeline: load, preprocess, split, train/load, evaluate, plot.

        Args:
            test_size: Passed to ``train_test_split``.
            valid_size: Passed to ``train_test_split``.
            force_retrain: Passed to ``load_or_train``.
        """
        print("📥 Chargement des données...")
        self.load_data()

        # self.dataset = self.dataset.head(1000)  # Limit to 1000 rows for quick tests

        print("🧹 Nettoyage et prétraitement...")
        self.preprocess()

        print("🛠️ Préparation des variables X et y...")
        self.prepare_data()

        print("✂️ Séparation en jeu d'entraînement et de test...")
        self.train_test_split(test_size=test_size, valid_size=valid_size)

        print("🔁 Chargement ou entraînement du modèle...")
        self.load_or_train(self.X_train, self.y_train, force_retrain=force_retrain)

        print("🧪 Évaluation sur le jeu de test...")
        self.evaluate()

        print("📈 Affichage des courbes de performance...")
        self.plot_performance()

        print("🏁 Fin du pipeline.")


In [29]:
# Build the classifier from the books CSV and run the full pipeline.
# force_retrain=True makes load_or_train() ignore any model already saved at
# model_path and rerun the grid search.
modelLR = ModelClassifier_TFIDF_NAIVEBAYES(csv_path="data/books_data.csv")
modelLR.run_all(test_size=0.2, valid_size=0.05, force_retrain=True)

📥 Chargement des données...
🧹 Nettoyage et prétraitement...
Liste des 30 categories les plus frequentes : clean_categories
Fiction                    11148
Biography                   4792
Juvenile                    4386
Religion & Spirituality     3584
Health & Wellness           3561
Social Sciences             2743
History                     2470
Business & Economics        2120
Arts & Entertainment        1745
Recreation & Sports         1635
Computers & Tech            1415
Science & Nature            1282
Education & Language        1027
Travel & Lifestyle           942
Cooking                      600
young adult fiction          442
literary criticism           411
poetry                       379
comics                       300
graphic novels               300
Reference                    263
drama                        234
literary collections         192
Name: count, dtype: int64
🛠️ Préparation des variables X et y...
✂️ Séparation en jeu d'entraînement et de test...


array([ 7, 14,  6, ..., 15,  7,  6])

(23130,)
🔁 Chargement ou entraînement du modèle...
🛠️  Entraînement du modèle avec GridSearchCV...
Fitting 10 folds for each of 4 candidates, totalling 40 fits


KeyboardInterrupt: 

In [44]:
run_text = modelLR.predict_text('Long ago, legions of creatures called Kaiju came out of the sea, bringing war. To fight the Kaiju, humanity creates giant robots called Jaegers, designed to be driven by two humans locked together in a neutral bridge. On the other hand, even the Jaegers are not enough to defeat the Kaiju, and humanity is on the verge of defeat. The last hope now rests on an ex-pilot, a trainee without experience and an old obsolete Jaeger.')
run_text

# predict_text() already maps the class index back to its category name through
# label_encoder, so inverse_transform must not be applied a second time (it
# raises ValueError on a string label). Keep the 1-element array shape.
decoded_preds = np.array([run_text], dtype=object)

In [47]:
run_text = modelLR.predict_text('Duke Leto Atreides of House Atreides, ruler of the ocean world Caladan, is assigned by the Padishah Emperor Shaddam IV to serve as fief ruler of the planet Arrakis. Although Arrakis is a harsh and inhospitable desert planet, it is of enormous importance because it is the only planetary source of melange, or the "spice", a unique and incredibly valuable substance that extends human youth, vitality and lifespan. It is also through the consumption of spice that Spacing Guild Navigators are able to effect safe interstellar travel through a limited ability to see into the future. The Emperor is jealous of the Duke s rising popularity in the Landsraad, the council of Great Houses, and sees House Atreides as a potential rival and threat. He conspires with House Harkonnen, the former stewards of Arrakis and the longstanding enemies of the Atreides, to destroy Leto and his family after their arrival. Leto is aware his assignment is a trap of some kind, but is compelled to obey the Emperor s orders anyway.')
run_text

# predict_text() already maps the class index back to its category name through
# label_encoder, so inverse_transform must not be applied a second time (it
# raises ValueError on a string label). Keep the 1-element array shape.
decoded_preds = np.array([run_text], dtype=object)

In [49]:
run_text = modelLR.predict_text('I Know Why the Caged Bird Sings follows Marguerite s (called My or Maya by her brother) life from the age of three to seventeen and the struggles she faces—particularly with racism and self-affirmation—in the Southern United States. Abandoned by their parents, Maya and her older brother Bailey are sent to live with their paternal grandmother (Momma) and disabled uncle (Uncle Willie) in Stamps, Arkansas. Maya and Bailey are haunted by their parents abandonment throughout the book—they travel alone and are labeled like baggage.')
# predict_text() already maps the class index back to its category name through
# label_encoder, so inverse_transform must not be applied a second time (it
# raises ValueError on a string label). Keep the 1-element array shape.
decoded_preds = np.array([run_text], dtype=object)

In [50]:
# Display the decoded prediction assigned in the preceding prediction cell.
decoded_preds

array(['juvenile fiction'], dtype=object)

In [46]:
# Inspect the frame currently held by the classifier (preprocess() replaces it
# with the cleaned modelling columns).
modelLR.dataset

Unnamed: 0,clean_description,publishedDate,category_encoded
31,twentyfive years ago height counterculture mov...,2012,1
31,twentyfive years ago height counterculture mov...,2012,0
33,bismarck perhaps famous notorious warship ever...,2018,12
45,lebron jam sixfooteight gift basketball heaven...,2003,28
45,lebron jam sixfooteight gift basketball heaven...,2003,21
...,...,...,...
212374,want lose weight diet doesnt seem work ive try...,2005,11
212374,want lose weight diet doesnt seem work ive try...,2005,10
212394,grace father believe science build daughter do...,2015,9
212399,school trip ellis island dominick avaro tenyea...,2000,13
