# Import des bilbiothèques nécessaires

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score
from gensim.models import Word2Vec, FastText
from sklearn.model_selection import train_test_split
import re
import mlflow

MLFLOW_TRACKING_URI = "file:///G:/Mon Drive/Documents/Apprentissage/OpenClassroom/Projet_7_Analyse_de_sentiments/mlruns_local"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
directory_path = "G:/Mon Drive/Documents/Apprentissage/OpenClassroom/Projet_7_Analyse_de_sentiments/"

In [2]:
# Load preprocessed data
from sklearn.model_selection import train_test_split
import numpy as np
subset_y = np.load(
    'G:/Mon Drive/Documents/Apprentissage/OpenClassroom/Projet_7_Analyse_de_sentiments/subset_y.npy')
data_lemastem = np.load(
    'G:/Mon Drive/Documents/Apprentissage/OpenClassroom/Projet_7_Analyse_de_sentiments/preprocessed_text.npy')


X_temp, X_dont_use, y_temp, y_dont_use = train_test_split(
    data_lemastem, subset_y, test_size=0.3, random_state=42)
X_train, X_temp, y_train, y_temp = train_test_split(
    X_temp, y_temp, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42)

## Utils

In [8]:
# Fonctions

def preprocess_text(text):
    """
    Fonction pour le nettoyage de base du texte des tweets.
    """
    # Suppression des URL
    text = re.sub(r"http/S+|www/S+|https/S+", '', text, flags=re.MULTILINE)
    # Suppression des mentions et hashtags
    text = re.sub(r'/@/w+|/#', '', text)
    # Suppression des caractères spéciaux et numériques
    text = re.sub(r'/d+', '', text)
    text = re.sub(r'/W+', ' ', text, flags=re.MULTILINE)
    # Minuscules
    text = text.lower()
    return text

# Fonction pour générer des vecteurs moyens à partir des plongements pour chaque tweet


def vectorize_texts(model, sentences):
    """
    Convertit les textes en vecteurs moyens en utilisant le modèle de plongements donné.
    """
    vectorized = []
    for words in tqdm(sentences):
        vector = np.mean([model.wv[word] for word in words if word in model.wv] or [
                         np.zeros(model.vector_size)], axis=0)
        vectorized.append(vector)
    return np.array(vectorized)

# Fonction pour entraîner et évaluer le modèle, et enregistrer les résultats avec MLflow


def train_evaluate(X_train, X_test, y_train, y_test, model_name):
    """
    Entraîne une régression logistique sur les vecteurs fournis et évalue la performance.
    Log les résultats avec MLflow.
    """
    # Initialisation de MLflow
    mlflow.set_experiment("Tweet Sentiment Analysis")

    with mlflow.start_run():
        # Entraînement du modèle
        model = LogisticRegression(max_iter=1000)
        mlflow.tensorflow.autolog(registered_model_name='model')
        model.fit(X_train, y_train)

        # Prédictions et évaluation
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        recall = recall_score(y_test, predictions)

        # Log des paramètres, métriques et modèle
        mlflow.log_params(
            {"model_type": "Logistic Regression", "embedding_type": model_name})
        mlflow.log_metrics({"accuracy": accuracy, "recall": recall})
        mlflow.sklearn.log_model(model, f"model_{model_name}")

        print(
            f"Results for {model_name}: Accuracy = {accuracy:.4f}, Recall = {recall:.4f}")

# Data preprocessing

In [9]:
# Import des données
import pandas as pd
data = pd.read_csv(directory_path+'sentiment140 (1)/training.1600000.processed.noemoticon.csv',
                   names=['target', 'id', 'date',
                          'statut', 'usertag', 'content'],
                   sep=',',
                   encoding_errors='ignore')

data = data[['target', 'content']]
data['gt'] = data['target'].map({0: 0, 4: 1})
# np.save(directory_path+'labels.npy', data['gt'])
# Preprocessing
data['text_processed'] = data['content'].apply(lambda x: preprocess_text(x))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   target   1600000 non-null  int64 
 1   content  1600000 non-null  object
 2   gt       1600000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 36.6+ MB


## Lemmatisation et Stemmatisation

In [None]:
# Subset and split datas
from sklearn.model_selection import train_test_split

X = data['content']
y = data['gt']

# Subsetting the data to avoid OOM issue
subset_x, do_not_use_x, subset_y, do_not_use_y = train_test_split(
    X, y, train_size=0.2, random_state=42)

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
livrables_path = "/content/drive/MyDrive/Documents/Apprentissage/OpenClassroom/Projet_7_Analyse_de_sentiments/"

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


stop_words = set(stopwords.words('english'))

# Fonction pour le prétraitement du texte avec lemmatisation et stemming


def preprocess_text_advanced(text, stop_words):
    # Tokenisation
    words = nltk.word_tokenize(text)

    # Suppression des stopwords
    words = [word for word in words if word not in stop_words]

    # Lemmatisation
    words = [lemmatizer.lemmatize(word) for word in words]

    # Stemming
    words = [stemmer.stem(word) for word in words]

    # Retourne le texte prétraité
    return " ".join(words)


data_lemastem = subset_x.apply(
    lambda x: preprocess_text_advanced(x, stop_words))
np.save(livrables_path + 'subset_y', list(subset_y))
# np.save(livrables_path + 'preprocessed_text', list(data_lemastem) )

## Embedding with Word2Vec and FastText

In [None]:
sentences = [row.split() for row in data['text_processed']]
# Word2Vec
model_w2v = Word2Vec(sentences, vector_size=100,
                     window=5, min_count=1, workers=4)
model_w2v.save(directory_path+'w2v_model.model')

# FastText
model_ft = FastText(sentences, vector_size=100,
                    window=5, min_count=1, workers=4)

In [None]:
# Convert text to vector
w2v_vectors = vectorize_texts(model_w2v, sentences)
ft_vectors = vectorize_texts(model_ft, sentences)

# Save the built vectors

# Sauvegarde des vecteurs Word2Vec
np.save('/content/drive/MyDrive/Documents/Apprentissage/OpenClassroom/Projet_7_Analyse_de_sentiments/w2v_vectors.npy', w2v_vectors)

# Sauvegarde des vecteurs FastText
np.save('/content/drive/MyDrive/Documents/Apprentissage/OpenClassroom/Projet_7_Analyse_de_sentiments/cft_vectors.npy', ft_vectors)

# Save target array
# np.save('/kaggle/input/gt.np', data['gt'])

## Création de la matrice d'embedding

In [None]:
# Create embedding matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from gensim.models import Word2Vec

# Et que `data_texts` est votre liste de textes.

text_processed = np.load(
    '/content/drive/MyDrive/Documents/Apprentissage/OpenClassroom/Projet_7_Analyse_de_sentiments/text_processed.npy', data['text_processed'])
# Création et adaptation d'un tokenizer Keras
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(data['text_processed'])
vocab_size = len(tokenizer.word_index) + 1

# Taille des vecteurs Word2Vec
embedding_dim = model_w2v.vector_size

# Initialisation de la matrice d'embedding avec des zéros
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Remplissage de la matrice d'embedding
for word, i in tokenizer.word_index.items():
    if word in model_w2v.wv:
        embedding_vector = model_w2v.wv[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
np.save('/content/drive/MyDrive/Documents/Apprentissage/OpenClassroom/Projet_7_Analyse_de_sentiments/embedding_matrix.npy', embedding_matrix)

## Embedding via USE

In [None]:
from sklearn.model_selection import train_test_split

text, text_trash, labels, labels_trash = train_test_split(
    data['text_processed'], data['gt'], train_size=0.3, random_state=42)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

# Chargez le modèle Universal Sentence Encoder de TensorFlow Hub
use = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")


def generate_embeddings(sentences):
    """
    Génère des embeddings pour chaque phrase en utilisant USE et les convertit en arrays numpy.

    :param sentences: Liste de phrases (strings).
    :return: Numpy array des embeddings.
    """
    embeddings = use(sentences)
    embeddings_np = np.array(embeddings)
    return embeddings_np


# Exemple d'utilisation
if __name__ == "__main__":
    # data_lemastem = np.load('/content/drive/MyDrive/Documents/Apprentissage/OpenClassroom/Projet_7_Analyse_de_sentiments/preprocessed_text.npy')
    # sentences = [" ".join(tokens) for tokens in data_lemastem]
    sentences = text
    embeddings_np = generate_embeddings(sentences)
    print("Shape of embeddings:", embeddings_np.shape)

# Entrainement des modèles

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GlobalMaxPooling1D, LSTM, Conv1D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow as tf
import numpy as np

directory_path = "G:/Mon Drive/Documents/Apprentissage/OpenClassroom/Projet_7_Analyse_de_sentiments/"
vocab_size = 10000
labels = np.load(directory_path+'labels.npy')
embedding_matrix = np.load(directory_path+'embedding_matrix.npy')
w2v_vectors = np.load(directory_path+'w2v_vectors.npy')

# Séparation en ensembles d'entraînement et de test
X_train, X_do_not_use, y_train, y_do_not_use = train_test_split(
    w2v_vectors, labels, train_size=0.2, random_state=42)
X_train, X_temp, y_train, y_temp = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42)

X_train = np.maximum(X_train, 0)
X_test = np.maximum(X_test, 0)
X_val = np.maximum(X_val, 0)

embedding_vector_length = X_train.shape[1]
maxlen = 100
vocab_size = embedding_matrix.shape[0]
embedding_dim = embedding_matrix.shape[1]

## Construction du modèle baseline

In [None]:
# Reprise du modèle avec les meilleures performances
keras_model = Sequential()
keras_model.add(Dense(128, activation='relu',
                input_dim=embedding_vector_length))
keras_model.add(Dropout(0.2))
keras_model.add(Dense(64, activation='relu'))
keras_model.add(Dropout(0.2))
# Pour la classification binaire, utilisez 'softmax' pour la classification multiclasse
keras_model.add(Dense(1, activation='sigmoid'))

early_stopping = EarlyStopping(patience=3,
                               monitor='val_loss',
                               restore_best_weights=True)
checkpoint = ModelCheckpoint(monitor='val_loss',
                             filepath=(directory_path+'Models/baseline.keras'),
                             save_best_only=True,
                             save_weights_only=False)

# Compilation du modèle
keras_model.compile(optimizer='adam', loss='binary_crossentropy',
                    metrics=['accuracy', tf.keras.metrics.AUC()])

In [None]:
# Entraînement du modèle
mlflow.set_experiment('models_training')
mlflow.autolog()
keras_model.fit(X_train, y_train, validation_data=(X_test, y_test),
                epochs=10,
                callbacks=[early_stopping, checkpoint],
                batch_size=32)

# Sauvegarde du modèle

# tf.keras.models.save_model(keras_model, directory_path+'keras_model.keras')

In [None]:
keras_model.summary()

In [None]:
# Evaluation
keras_model.evaluate(X_val, y_val, return_dict=True)

## Keras embedding seul

In [None]:
# Construction du modèle

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    input_length=maxlen,
                    trainable=False))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu',
                input_dim=embedding_vector_length))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

early_stopping = EarlyStopping(patience=3,
                               monitor='val_loss',
                               restore_best_weights=True)
checkpoint = ModelCheckpoint(monitor='val_loss',
                             filepath=(directory_path +
                                       'Models/embedding_model.keras'),
                             save_best_only=True,
                             save_weights_only=False)

# Compilation du modèle
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy', tf.keras.metrics.AUC()])

In [None]:

# Entraînement du modèle
mlflow.autolog()
model.fit(X_train, y_train, validation_data=(X_test, y_test),
          epochs=10,
          callbacks=[early_stopping, checkpoint],
          batch_size=32)

# Sauvegarde du modèle

# tf.keras.models.save_model(model, directory_path+'model_embedding_new.keras')

In [None]:
# Evaluation

model.evaluate(X_val, y_val, return_dict=True)

## Keras Embedding + LSTM

In [None]:
# Chargement des embeddings
from tensorflow.keras.layers import Dense, Dropout, Bidirectional, LSTM
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import tensorflow as tf


# modèle avec LSTM
model_LSTM = Sequential()
model_LSTM.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[
               embedding_matrix], input_length=maxlen, trainable=False))
model_LSTM.add(Conv1D(128, 5, activation='relu'))
model_LSTM.add(LSTM(32))
model_LSTM.add(Dropout(0.5))
model_LSTM.add(Dense(64, activation='relu'))
model_LSTM.add(Dropout(0.5))
model_LSTM.add(Dense(1, activation='sigmoid'))

# Compilation du modèle
model_LSTM.compile(optimizer='adam',
                   loss='binary_crossentropy',
                   metrics=['accuracy', tf.keras.metrics.AUC()])


checkpoint = ModelCheckpoint(monitor='val_loss',
                             filepath=(directory_path +
                                       'baseline_new_LSTM.keras'),
                             save_best_only=True,
                             save_weights_only=False)

In [None]:

mlflow.autolog()
model_LSTM.fit(X_train, y_train, validation_data=(X_test, y_test),
               epochs=10,
               callbacks=[early_stopping, checkpoint],
               batch_size=32)

# Sauvegarde du modèle

tf.keras.models.save_model(
    model_LSTM, directory_path+'model_embedding_LSTM_new.keras')

In [None]:
model_LSTM.evaluate(X_val, y_val, return_dict=True)

# Fine-tuning BERT


In [3]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
from transformers import InputExample, InputFeatures
from sklearn.model_selection import train_test_split
import pandas as pd




In [13]:
!pip -qq install torch
df = data[['text_processed', 'gt']]

# Séparation en ensembles d'entraînement et de test
train_examples, trash_examples = train_test_split(
    df, train_size=0.2, random_state=42)
train_examples, temp_examples = train_test_split(
    train_examples, test_size=0.3, random_state=42)
test_examples, validation_examples = train_test_split(
    temp_examples, test_size=0.5, random_state=42)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['fit_denses.0.weight', 'fit_denses.3.bias', 'fit_denses.3.weight', 'fit_denses.0.bias', 'fit_denses.1.bias', 'fit_denses.2.bias', 'fit_denses.1.weight', 'fit_denses.2.weight', 'fit_denses.4.bias', 'fit_denses.4.weight']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classif

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [14]:
# Fonction de prétraitement pour BERT
def convert_example_to_feature(review):
    return tokenizer.encode_plus(review,
                                 add_special_tokens=True,
                                 max_length=160,
                                 pad_to_max_length=True,
                                 return_attention_mask=True,
                                 )


def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    return {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_masks,
    }, label


def encode_examples(ds, limit=-1):
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []

    for index, row in ds.iterrows():
        bert_input = convert_example_to_feature(row['text_processed'])
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([row['gt']])
    return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

In [15]:
# Prétraitement des données pour BERT et import du modèle
train_data = encode_examples(train_examples)
validation_data = encode_examples(validation_examples)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [16]:

# Chargement du modèle et du tokenizer correspondant
model_name = 'huawei-noah/TinyBERT_General_4L_312D'
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=1, from_pt=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, from_pt=True)


model.compile(optimizer='adam', loss='binary_crossentropy',










              metrics=['accuracy'])  # tf.keras.metrics.AUC()


print('Model compiled')


Model compiled


In [20]:
# Entrainement du modèle
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

checkpoint = ModelCheckpoint(monitor='val_loss',
                             filepath=(directory_path +
                                       'Models/Tiny_bert.keras'),
                             save_best_only=True,
                             save_weights_only=False,
                             save_format='tf')
early_stopping = EarlyStopping(patience=3, verbose=1, monitor='val_loss')

In [21]:
# Fine-tuning
mlflow.autolog()
model.fit(train_data.shuffle(100).batch(32),
          epochs=10,
          batch_size=32,
          validation_data=validation_data.batch(32),
          callbacks=[early_stopping, checkpoint])

2024/04/09 10:46:51 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2024/04/09 10:46:51 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.
2024/04/09 10:46:54 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/04/09 10:46:55 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '96931c825cf34b56b0c4678e4077746b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current tensorflow workflow


Epoch 1/10


 725/7000 [==>...........................] - ETA: 4:01:59 - loss: 0.7083 - accuracy: 0.4984

KeyboardInterrupt: 

In [23]:
# Evaluation
model.evaluate(
    validation_data.batch(32), return_dict=True)


# model.save_pretrained("./Models/bert_finetuned.keras")



{'loss': 0.6986682415008545, 'accuracy': 0.5036458373069763}