# Librairies Importation

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Preprocessing

In [12]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
# Charger les données
X_train = pd.read_csv('kaggle_data/train_x.csv', index_col=0)
y_train = pd.read_csv('kaggle_data/train_y.csv')
X_test = pd.read_csv('kaggle_data/test_x.csv')
X_val = pd.read_csv('kaggle_data/val_x.csv')
y_val = pd.read_csv('kaggle_data/val_y.csv')

#Change type of string column to string
X_train['string'] = X_train['string'].astype(str)
X_test['string'] = X_test['string'].astype(str)
X_val['string'] = X_val['string'].astype(str)


#Sample dataset
X_train_sample = X_train[:10000]
y_train_sample = y_train[:10000]
X_val_sample = X_val[:10000]
y_val_sample = y_val[:10000]




In [13]:
X_train_sample['string'].shape

(10,)

In [26]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

vocab_size = 20000  # C'est une estimation. Vous devrez compter le nombre réel de mots uniques dans vos données.
embedding_dim = 100  # Une valeur couramment utilisée. Vous pouvez l'ajuster en fonction de vos besoins.
max_seq_length = 1500  # Vous avez mentionné que c'est la longueur maximale d'un commentaire.
num_classes = 2  # Par exemple, si vous faites une classification binaire (positif/négatif). Ajustez en fonction du nombre réel de classes dans vos données.

def train_lstm_model(X_train, y_train, target, vocab_size, embedding_dim, max_seq_length, num_classes):
    # Tokenize the input data
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(X_train['string'])
    X_train_sequences = tokenizer.texts_to_sequences(X_train['string'])
    X_train_padded = pad_sequences(X_train_sequences, maxlen=max_seq_length)

    # Encode the target values as one-hot vectors
    y_train_encoded = to_categorical(y_train[target], num_classes=num_classes)

    # Define the model
    model = Sequential()

    # Add an embedding layer
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_length))

    # Add LSTM layers
    model.add(LSTM(units=128, return_sequences=True))
    model.add(LSTM(units=64))

    # Add dropout layer
    model.add(Dropout(0.5))

    # Add a dense layer
    model.add(Dense(units=64, activation='relu'))

    # Add output layer
    model.add(Dense(units=num_classes, activation='softmax'))

    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train_padded, y_train_encoded, epochs=2, batch_size=200)

    # Set the name of the model
    model_name = f"model_LSTM_{target}"

    return model, model_name


In [24]:
import os
# Create the "models_lstm" directory if it doesn't exist
if not os.path.exists("models_lstm"):
    os.makedirs("models_lstm")

toxicity_categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions','black', 'white']

trained_models = {}

for category in toxicity_categories:
    model, model_name = train_lstm_model(X_train, y_train, category, vocab_size, embedding_dim, max_seq_length, num_classes)
    trained_models[model_name] = model
    model.save(f"models_lstm/V2/{model_name}.h5", save_format='h5')
    
trained_models

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


  saving_api.save_model(


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/4
  16/1346 [..............................] - ETA: 2:10:15 - loss: 0.4298 - accuracy: 0.8981

KeyboardInterrupt: 

In [27]:
import os
# Create the "models_lstm" directory if it doesn't exist
if not os.path.exists("models_lstm"):
    os.makedirs("models_lstm")

toxicity_categories = ['christian', 'muslim', 'other_religions','black', 'white']

trained_models = {}

for category in toxicity_categories:
    model, model_name = train_lstm_model(X_train_sample, y_train_sample, category, vocab_size, embedding_dim, max_seq_length, num_classes)
    trained_models[model_name] = model
    model.save(f"models_lstm/V2/{model_name}.h5", save_format='h5')
    
trained_models

Epoch 1/2
Epoch 2/2


  saving_api.save_model(


Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2


{'model_LSTM_christian': <keras.src.engine.sequential.Sequential at 0x39384f0a0>,
 'model_LSTM_muslim': <keras.src.engine.sequential.Sequential at 0x39101b6d0>,
 'model_LSTM_other_religions': <keras.src.engine.sequential.Sequential at 0x2d4574220>,
 'model_LSTM_black': <keras.src.engine.sequential.Sequential at 0x38160f100>,
 'model_LSTM_white': <keras.src.engine.sequential.Sequential at 0x39142fdc0>}

In [28]:
import os
from tensorflow.keras.models import load_model

def load_models(directory):
    models = {}
    for filename in os.listdir(directory):
        if filename.endswith(".h5"):
            model_name = os.path.splitext(filename)[0]
            model_path = os.path.join(directory, filename)
            model = load_model(model_path)
            models[model_name] = model
    return models

models = load_models("models_lstm/V2/")
models

{'model_LSTM_black': <keras.src.engine.sequential.Sequential at 0x3809feeb0>,
 'model_LSTM_female': <keras.src.engine.sequential.Sequential at 0x298b84700>,
 'model_LSTM_other_religions': <keras.src.engine.sequential.Sequential at 0x2d68065b0>,
 'model_LSTM_christian': <keras.src.engine.sequential.Sequential at 0x2d6a0a460>,
 'model_LSTM_white': <keras.src.engine.sequential.Sequential at 0x2d6b558e0>,
 'model_LSTM_male': <keras.src.engine.sequential.Sequential at 0x2d6ddfd90>,
 'model_LSTM_LGBTQ': <keras.src.engine.sequential.Sequential at 0x2d706d280>,
 'model_LSTM_muslim': <keras.src.engine.sequential.Sequential at 0x2d72b7df0>}

In [30]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Change type of string column to string
X_val['string'] = X_val['string'].astype(str)

X_val_sample = X_val[:1000]
y_val_sample = y_val[:1000]

 # Tokenize the input data
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_val_sample['string'])
X_val_sequences = tokenizer.texts_to_sequences(X_val_sample['string'])
X_val_padded = pad_sequences(X_val_sequences, maxlen=max_seq_length)



categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions','black', 'white']

# Iterate over the trained models
evaluation_results = {}
for (model_name, model), category in zip(models.items(), categories):
   # Encode the target values as one-hot vectors
    y_val_encoded = to_categorical(y_val_sample[category], num_classes=num_classes)

    # Evaluate the model
    loss, accuracy = model.evaluate(X_val_padded, y_val_encoded)
    print(loss, accuracy)

    # Calculate evaluation metrics
    predictions = model.predict(X_val_padded)
    predicted_labels = np.argmax(predictions, axis=1)
    targets = np.argmax(y_val_encoded, axis=1)
    precision = precision_score(targets, predicted_labels, zero_division=0)
    recall = recall_score(targets, predicted_labels)
    f1 = f1_score(targets, predicted_labels)
    
    # Store the evaluation results
    evaluation_results[model_name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1': f1}

evaluation_results


0.6711398363113403 0.8809999823570251
1.0146141052246094 0.8309999704360962
0.6712818741798401 0.9580000042915344
0.6660277247428894 0.9459999799728394
0.6688703298568726 0.9380000233650208
0.43176040053367615 0.890999972820282
0.5748865604400635 0.9089999794960022
0.6707823276519775 0.8790000081062317


{'model_LSTM_black': {'Accuracy': 0.8809999823570251,
  'Precision': 0.0,
  'Recall': 0.0,
  'F1': 0.0},
 'model_LSTM_female': {'Accuracy': 0.8309999704360962,
  'Precision': 0.3425925925925926,
  'Recall': 0.2740740740740741,
  'F1': 0.3045267489711934},
 'model_LSTM_other_religions': {'Accuracy': 0.9580000042915344,
  'Precision': 0.0,
  'Recall': 0.0,
  'F1': 0.0},
 'model_LSTM_christian': {'Accuracy': 0.9459999799728394,
  'Precision': 0.0,
  'Recall': 0.0,
  'F1': 0.0},
 'model_LSTM_white': {'Accuracy': 0.9380000233650208,
  'Precision': 0.0,
  'Recall': 0.0,
  'F1': 0.0},
 'model_LSTM_male': {'Accuracy': 0.890999972820282,
  'Precision': 0.03260869565217391,
  'Recall': 0.13043478260869565,
  'F1': 0.05217391304347826},
 'model_LSTM_LGBTQ': {'Accuracy': 0.9089999794960022,
  'Precision': 0.05714285714285714,
  'Recall': 0.03333333333333333,
  'F1': 0.042105263157894736},
 'model_LSTM_muslim': {'Accuracy': 0.8790000081062317,
  'Precision': 0.0,
  'Recall': 0.0,
  'F1': 0.0}}

In [35]:
def predict_values(X_test):
    # Create an empty dataframe to store the predictions
    predictions_df = pd.DataFrame(columns=['ID', 'Prediction'])

    # Change type of string column to string
    X_test['string'] = X_test['string'].astype(str)

    # Tokenize the input data
    # Tokenize the input data
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(X_test['string'])
    X_test_sequences = tokenizer.texts_to_sequences(X_test['string'])
    X_test_padded = pad_sequences(X_test_sequences, maxlen=max_seq_length)
    
    # Iterate over the trained models
    for model_name, model in models.items():
        # Make predictions
        predictions = model.predict(X_test_padded)
        predicted_labels = np.argmax(predictions, axis=1)
        
        # Update the predictions dataframe
        predictions_df[model_name] = predicted_labels
    
     # Calculate the final prediction
    predictions_df['Prediction'] = predictions_df.iloc[:, 2:].max(axis=1)
    
    # Add the ID column
    predictions_df['ID'] = X_test['index']
    
    # Return the predictions dataframe
    return predictions_df[['ID', 'pred']]

pred = predict_values(X_test)
    


