Deep Learning MBIM

In [None]:
import os
import pandas as pd
import numpy as np
import string
import spacy
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.models import load_model
from deep_translator import GoogleTranslator

# Disable warnings (kept commented out)
# warnings.filterwarnings("ignore")

# spaCy English pipeline; preprocess_text runs every text through it.
nlp = spacy.load("en_core_web_sm")

# Keras tokenizer shared by the training and prediction code below.
tokenizer = Tokenizer()

def preprocess_text(text):
    """Strip HTML-like tags from ``text`` and return its filtered lemmas.

    The cleaned text is run through the module-level spaCy pipeline ``nlp``.
    Tokens flagged as punctuation, stop words or digits are dropped, and the
    lemmas of the remaining tokens are joined with single spaces.
    """
    without_tags = re.sub(r"<.*?>", "", text)
    # NOTE(review): "tagger" is disabled here; confirm that the lemmatizer of
    # the loaded model still yields real lemmas without it.
    parsed = nlp(without_tags, disable=["parser", "tagger", "ner", "textcat"])

    kept_lemmas = []
    for token in parsed:
        if token.is_punct or token.is_stop or token.is_digit:
            continue
        kept_lemmas.append(token.lemma_)

    return " ".join(kept_lemmas)

from keras.preprocessing.text import tokenizer_from_json

# A saved model is only usable with the exact vocabulary it was trained on, so
# the fitted tokenizer is persisted next to the model and both are restored
# together. If either file is missing the model is retrained.
model_path = "sentiment.keras"
tokenizer_path = "sentiment_tokenizer.json"

# Used by both training and prediction, so it must exist on either branch.
max_sequence_length = 100

# Held-out data only exists when the model is trained during this run.
X_test_padded = None
y_test_encoded = None

if os.path.exists(model_path) and os.path.exists(tokenizer_path):
    model = load_model(model_path)
    with open(tokenizer_path, encoding="utf-8") as tokenizer_file:
        tokenizer = tokenizer_from_json(tokenizer_file.read())
else:
    label_encoder = LabelEncoder()
    data = pd.read_csv("../data/train.csv")

    data["review"] = data["review"].apply(preprocess_text)
    data["sentiment"] = label_encoder.fit_transform(data["sentiment"])

    X_train, X_test, y_train, y_test = train_test_split(data["review"], data["sentiment"], test_size=0.2, random_state=42)

    # The vocabulary is learned from the training split only.
    tokenizer.fit_on_texts(X_train)
    X_train_sequences = tokenizer.texts_to_sequences(X_train)
    X_test_sequences = tokenizer.texts_to_sequences(X_test)

    X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
    X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

    # "balanced" weights compensate for unequal class frequencies; keys are
    # the integer class ids produced by the LabelEncoder.
    class_weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
    class_weight_dict = dict(enumerate(class_weights))

    model = tf.keras.Sequential([
        Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length),
        LSTM(64),
        Dense(3, activation='softmax')
    ])

    # A 3-way softmax trained on one-hot targets needs categorical (not
    # binary) cross-entropy, otherwise both the loss and the reported
    # accuracy are computed per output unit instead of per sample.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    y_train_encoded = tf.keras.utils.to_categorical(y_train, num_classes=3)
    model.fit(
        X_train_padded,
        y_train_encoded,
        epochs=10,
        batch_size=32,
        validation_split=0.1,
        class_weight=class_weight_dict,  # previously computed but never used
    )

    y_test_encoded = tf.keras.utils.to_categorical(y_test, num_classes=3)

    model.save(model_path)
    with open(tokenizer_path, "w", encoding="utf-8") as tokenizer_file:
        tokenizer_file.write(tokenizer.to_json())

comment = "You are so beautiful !"
comment = preprocess_text(comment)
# comment = GoogleTranslator(source='auto', target='en').translate(comment)
comment_sequence = tokenizer.texts_to_sequences([comment])
comment_padded = pad_sequences(comment_sequence, maxlen=max_sequence_length, padding='post')

predictions = model.predict(comment_padded)
# NOTE(review): LabelEncoder numbers classes in sorted label order; confirm
# that this list follows the same order as the labels in train.csv.
class_labels = ['Négatif', 'Positif', 'Neutre']
predicted_class = class_labels[np.argmax(predictions)]
print(predictions)
print(f"Première prédiction classé comme: {predicted_class}")

# The test split is only available when the model was trained in this run.
if X_test_padded is not None:
    accuracy = model.evaluate(X_test_padded, y_test_encoded)[1]
    print(f"Model accuracy: {accuracy * 100:.2f}%")
else:
    print("Model restored from disk: no held-out split in this run, evaluation skipped.")


Deep Learning Toxic Only

In [None]:
import os
import pandas as pd
import numpy as np
import string
import spacy
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.models import load_model
from deep_translator import GoogleTranslator

# Disable warnings (kept commented out)
# warnings.filterwarnings("ignore")

# spaCy English pipeline (only referenced by the commented-out preprocessing in this cell).
nlp = spacy.load("en_core_web_sm")
# Keras tokenizer shared by the training and prediction code below.
tokenizer = Tokenizer()

def preprocess_text(text):
    """Return ``text`` unchanged.

    Text normalisation is switched off in this cell. The disabled pipeline
    (HTML-tag stripping, then spaCy lemmatisation with punctuation, stop-word
    and digit filtering) is kept below as a reference.
    """
    # text = re.sub(r"<.*?>", "", text)
    # doc = nlp(text, disable=["parser", "tagger", "ner", "textcat"])
    # simplified_text = " ".join([token.lemma_ for token in doc if not token.is_punct and not token.is_stop and not token.is_digit])
    return text

def clear_data(data, var):
    """Build a balanced, shuffled training frame for the binary label ``var``.

    Args:
        data: DataFrame with a ``comment_text`` column and a ``var`` column.
        var: Name of the label column; rows where it equals 1 are treated as
            positive and rows where it equals 0 as negative.

    Returns:
        A new DataFrame with the same number of positive and negative rows
        (at most 50000 each), shuffled, whose ``comment_text`` has been passed
        through ``preprocess_text``. The input frame is not modified.
    """
    # Drop missing comments *before* casting to str: the cast turns NaN into
    # the literal text "nan", which a later dropna can no longer detect.
    cleaned = data.dropna(subset=["comment_text"]).copy()
    cleaned["comment_text"] = cleaned["comment_text"].astype(str)
    cleaned = cleaned.drop_duplicates(subset="comment_text")

    toxic_samples = cleaned[cleaned[var] == 1]
    non_toxic_samples = cleaned[cleaned[var] == 0]

    # Cap by the smaller class as well, otherwise sample() raises when there
    # are fewer negative than positive rows.
    per_class = min(50000, len(toxic_samples), len(non_toxic_samples))
    toxic_samples = toxic_samples.sample(n=per_class, random_state=42)
    non_toxic_samples = non_toxic_samples.sample(n=per_class, random_state=42)

    # Shuffle so the two classes are interleaved rather than stacked.
    balanced = pd.concat([toxic_samples, non_toxic_samples]).sample(frac=1, random_state=42)
    balanced["comment_text"] = balanced["comment_text"].apply(preprocess_text)

    return balanced

from keras.preprocessing.text import tokenizer_from_json

# A saved model is only usable with the exact vocabulary it was trained on, so
# the fitted tokenizer is persisted next to the model and both are restored
# together. If either file is missing the model is retrained.
model_path = "toxic.keras"
tokenizer_path = "toxic_tokenizer.json"

# Used by both training and prediction, so it must exist on either branch.
max_sequence_length = 100

if os.path.exists(model_path) and os.path.exists(tokenizer_path):
    model = load_model(model_path)
    with open(tokenizer_path, encoding="utf-8") as tokenizer_file:
        tokenizer = tokenizer_from_json(tokenizer_file.read())
else:
    data = pd.read_csv("../data/train_1M.csv")
    data = clear_data(data, "toxic")

    X_train, X_test, y_train, y_test = train_test_split(data["comment_text"], data["toxic"], test_size=0.2, random_state=42)

    # The vocabulary is learned from the training split only.
    tokenizer.fit_on_texts(X_train)
    X_train_sequences = tokenizer.texts_to_sequences(X_train)
    X_test_sequences = tokenizer.texts_to_sequences(X_test)

    X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
    X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

    # NOTE(review): the label is binary, yet the head has three outputs to
    # line up with the three entries of class_labels below; the third output
    # never receives a positive target.
    model = tf.keras.Sequential([
        Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length),
        LSTM(64),
        Dense(3, activation='softmax')
    ])

    # A softmax head trained on one-hot targets needs categorical (not binary)
    # cross-entropy, otherwise loss and accuracy are computed per output unit.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    y_train_encoded = tf.keras.utils.to_categorical(y_train, num_classes=3)
    model.fit(X_train_padded, y_train_encoded, epochs=10, batch_size=32, validation_split=0.1)

    y_test_encoded = tf.keras.utils.to_categorical(y_test, num_classes=3)

    model.save(model_path)
    with open(tokenizer_path, "w", encoding="utf-8") as tokenizer_file:
        tokenizer_file.write(tokenizer.to_json())

comment = "You are so beautiful !"
comment = preprocess_text(comment)
# comment = GoogleTranslator(source='auto', target='en').translate(comment)
comment_sequence = tokenizer.texts_to_sequences([comment])
comment_padded = pad_sequences(comment_sequence, maxlen=max_sequence_length, padding='post')

predictions = model.predict(comment_padded)
class_labels = ['Négatif', 'Positif', 'Neutre']
predicted_class = class_labels[np.argmax(predictions)]
# print(predictions)
print(f"Commentaire classé comme: {predicted_class}")

# Evaluation is disabled; it would also require the test split, which only
# exists when the model was trained in this run.
# accuracy = model.evaluate(X_test_padded, y_test_encoded)[1]
# print(f"Model accuracy: {accuracy * 100:.2f}%")

Deep Learning All Variables

In [None]:
import os
import pandas as pd
import numpy as np
import string
import spacy
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.models import load_model
from deep_translator import GoogleTranslator

In [None]:
# Disable warnings (kept commented out)
# warnings.filterwarnings("ignore")

# spaCy English pipeline; preprocess_text runs every text through it.
nlp = spacy.load("en_core_web_sm")
# Keras tokenizer used by the training loop and the prediction cell.
tokenizer = Tokenizer()
# Label columns; the training loop builds one model file per entry.
variables = ["toxic", "obscene", "insult", "identity_hate"]

In [None]:
def preprocess_text(text):
    """Remove HTML-like tags from ``text`` and reduce it to filtered lemmas.

    The module-level spaCy pipeline ``nlp`` is applied with the parser, NER
    and text-categoriser components disabled. Punctuation, stop-word and
    digit tokens are discarded; the lemmas of the remaining tokens are
    returned as one space-separated string.
    """
    stripped = re.sub(r"<.*?>", "", text)
    doc = nlp(stripped, disable=["parser", "ner", "textcat"])

    lemmas = (
        token.lemma_
        for token in doc
        if not (token.is_punct or token.is_stop or token.is_digit)
    )
    return " ".join(lemmas)

def clear_data(data, var):
    """Build a class-balanced training frame for the binary label ``var``.

    Args:
        data: DataFrame with a ``comment_text`` column and a ``var`` column.
        var: Name of the label column; rows where it equals 1 are treated as
            positive and rows where it equals 0 as negative.

    Returns:
        A new DataFrame with the same number of positive and negative rows
        (at most 50000 each, positives first), whose ``comment_text`` has been
        passed through ``preprocess_text``. The input frame is not modified.
    """
    # Drop missing comments *before* casting to str: the cast turns NaN into
    # the literal text "nan", which a later dropna can no longer detect.
    cleaned = data.dropna(subset=["comment_text"]).copy()
    cleaned["comment_text"] = cleaned["comment_text"].astype(str)
    cleaned = cleaned.drop_duplicates(subset="comment_text")

    toxic_samples = cleaned[cleaned[var] == 1]
    non_toxic_samples = cleaned[cleaned[var] == 0]

    # Cap by the smaller class as well, otherwise sample() raises when there
    # are fewer negative than positive rows.
    per_class = min(50000, len(toxic_samples), len(non_toxic_samples))
    toxic_samples = toxic_samples.sample(n=per_class, random_state=42)
    non_toxic_samples = non_toxic_samples.sample(n=per_class, random_state=42)

    balanced = pd.concat([toxic_samples, non_toxic_samples])
    balanced["comment_text"] = balanced["comment_text"].apply(preprocess_text)

    return balanced

In [None]:
from keras.preprocessing.text import tokenizer_from_json

# Shared by training and prediction; defined outside the loop so it exists
# even when every model is restored from disk.
max_sequence_length = 100

# The CSV is loaded lazily and at most once, only if some model needs training.
raw_data = None
# One fitted tokenizer per label: each model must be paired with the exact
# vocabulary it was trained on.
tokenizers = {}

for toxic_var in variables:
    model_path = f"{toxic_var}.keras"
    tokenizer_path = f"{toxic_var}_tokenizer.json"

    # A cached model is only reused when its vocabulary was saved with it.
    if os.path.exists(model_path) and os.path.exists(tokenizer_path):
        model = load_model(model_path)
        with open(tokenizer_path, encoding="utf-8") as tokenizer_file:
            tokenizers[toxic_var] = tokenizer_from_json(tokenizer_file.read())
        continue

    if raw_data is None:
        raw_data = pd.read_csv("../data/train_1M.csv")
    data = clear_data(raw_data, toxic_var)

    X_train, X_test, y_train, y_test = train_test_split(data["comment_text"], data[toxic_var], test_size=0.2, random_state=42)

    # Fresh tokenizer per label: refitting a shared one would keep growing and
    # re-ranking its word index, invalidating the models trained earlier.
    var_tokenizer = Tokenizer()
    var_tokenizer.fit_on_texts(X_train)
    X_train_sequences = var_tokenizer.texts_to_sequences(X_train)
    X_test_sequences = var_tokenizer.texts_to_sequences(X_test)

    X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
    X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

    model = tf.keras.Sequential([
        Embedding(input_dim=len(var_tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length),
        LSTM(64),
        Dense(2, activation='softmax')
    ])

    # Softmax head + one-hot targets: categorical cross-entropy is the
    # matching loss.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    y_train_encoded = tf.keras.utils.to_categorical(y_train, num_classes=2)
    model.fit(X_train_padded, y_train_encoded, epochs=15, batch_size=32, validation_split=0.1)

    y_test_encoded = tf.keras.utils.to_categorical(y_test, num_classes=2)

    model.save(model_path)
    with open(tokenizer_path, "w", encoding="utf-8") as tokenizer_file:
        tokenizer_file.write(var_tokenizer.to_json())
    tokenizers[toxic_var] = var_tokenizer

# The prediction cell scores with a "toxic" model through the global
# `tokenizer`, so point it at the vocabulary of the "toxic" label.
tokenizer = tokenizers.get("toxic", tokenizer)

In [None]:
model = load_model("../static/model/toxic.keras")

# Must equal the padding length used at training time (100 in the training
# loop). Set explicitly because the training branch, which used to be the only
# place defining it, does not run when the models already exist on disk.
max_sequence_length = 100

# An unfitted tokenizer maps every text to an empty sequence, so the model
# would silently score an all-padding input; fail loudly instead.
if not tokenizer.word_index:
    raise RuntimeError(
        "The tokenizer has no vocabulary: fit or load the tokenizer that was "
        "used to train the model before predicting."
    )

comment = "Wow"
comment = preprocess_text(comment)
# comment = GoogleTranslator(source='auto', target='en').translate(comment)
comment_sequence = tokenizer.texts_to_sequences([comment])
comment_padded = pad_sequences(comment_sequence, maxlen=max_sequence_length, padding='post')

predictions = model.predict(comment_padded)
class_labels = ['Négatif', 'Positif', 'Neutre']
predicted_class = class_labels[np.argmax(predictions)]
# print(predictions)
print(f"Commentaire classé comme: {predicted_class}")

# accuracy = model.evaluate(X_test_padded, y_test_encoded)[1]
# print(f"Model accuracy: {accuracy * 100:.2f}%")
