In [19]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.preprocessing import LabelEncoder
from custom_layers import AttentionLayer
from custom_losses import FocalLoss
from text_preprocessing import preprocess_text
import pickle

In [2]:
ERROR_CSV = r'C:\Study\Python\WebsiteClassification\bot\classification_errors.csv'
MODEL_PATH = r'C:\Study\Python\WebsiteClassification\bot\text_classification_model.keras'
TOKENIZER_PATH = r'C:\Study\Python\WebsiteClassification\bot\tokenizer.pkl'
LABEL_ENCODER_PATH = r'C:\Study\Python\WebsiteClassification\bot\label_encoder.pkl'
UPDATED_MODEL_PATH = 'updated_text_classification_model.keras'
UPDATED_LABEL_ENCODER_PATH = 'updated_label_encoder.pkl'

In [3]:
def load_tokenizer_and_encoder():
    with open(TOKENIZER_PATH, 'rb') as f:
        tokenizer = pickle.load(f)
    with open(LABEL_ENCODER_PATH, 'rb') as f:
        label_encoder = pickle.load(f)

    return tokenizer, label_encoder


def prepare_data(errors_csv):
    df = pd.read_csv(errors_csv, sep=';', encoding='utf-8-sig')
    X_texts = (df['title'].fillna('') + " " + df['summary'].fillna('')).tolist()
    y_labels = df['true_topic'].tolist()

    return X_texts, y_labels

In [20]:
def retrain_model(X_train, y_train, model_path, tokenizer, label_encoder, maxlen=128):
    model = tf.keras.models.load_model(
        model_path,
        custom_objects={'AttentionLayer': AttentionLayer, 'FocalLoss': FocalLoss}
    )

    sequences = tokenizer.texts_to_sequences(X_train)
    X_padded = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')
    y_encoded = label_encoder.transform(y_train)

    model.compile(
        optimizer=Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6)

    history = model.fit(
        X_padded,
        y_encoded,
        epochs=50,
        batch_size=2,
        validation_split=0.1,
        callbacks=[early_stop, reduce_lr]
    )

    loss, accuracy = model.evaluate(X_padded, y_encoded, verbose=0)
    print(f"Accuracy: {accuracy:.2f}, Loss: {loss:.2f}")


    model.save(UPDATED_MODEL_PATH)
    with open(UPDATED_LABEL_ENCODER_PATH, 'wb') as f:
        pickle.dump(label_encoder, f)

    return history

In [5]:
X_error_texts, y_error_topics = prepare_data(ERROR_CSV)

In [6]:
tokenizer, label_encoder = load_tokenizer_and_encoder()

In [7]:
X_cleaned = [preprocess_text(text) for text in X_error_texts]
valid_indices = [i for i, text in enumerate(X_cleaned) if len(text.split()) >= 5]
X_cleaned = [X_cleaned[i] for i in valid_indices]
y_train_filtered = [y_error_topics[i] for i in valid_indices]

In [21]:
history = retrain_model(X_cleaned, y_train_filtered, MODEL_PATH, tokenizer, label_encoder)

  saveable.load_own_variables(weights_store.get(inner_path))


Epoch 1/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 154ms/step - accuracy: 0.0734 - loss: 2.3974 - val_accuracy: 0.0000e+00 - val_loss: 2.2999 - learning_rate: 1.0000e-05
Epoch 2/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 153ms/step - accuracy: 0.1999 - loss: 2.1505 - val_accuracy: 0.0000e+00 - val_loss: 2.2710 - learning_rate: 1.0000e-05
Epoch 3/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 164ms/step - accuracy: 0.2176 - loss: 2.0346 - val_accuracy: 0.1333 - val_loss: 2.2545 - learning_rate: 1.0000e-05
Epoch 4/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 136ms/step - accuracy: 0.2673 - loss: 2.0154 - val_accuracy: 0.2000 - val_loss: 2.2415 - learning_rate: 1.0000e-05
Epoch 5/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 122ms/step - accuracy: 0.3832 - loss: 1.8810 - val_accuracy: 0.1333 - val_loss: 2.2451 - learning_rate: 1.0000e-05
Epoch 6/50
[1m64/64[0m [32m━━━━━━━━━