In [None]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# Load the dataset.
# The CSV is read without a header row and its six columns are named here.
# Only the label and the tweet text are used afterwards, so `usecols` keeps
# the four unused columns (id, date, flag, user) from being loaded at all
# instead of reading them and dropping them right away.
print("Chargement du dataset...")
data = pd.read_csv(
    "E:/sentement140/training.1600000.processed.noemoticon.csv",
    encoding='ISO-8859-1',
    header=None,
    names=["target", "id", "date", "flag", "user", "text"],
    usecols=["target", "text"],
)
print("Dataset chargé avec succès.")

# Convert the numeric target codes into sentiment names.
# `replace` leaves any code that is not in the mapping unchanged.
data['sentiment'] = data['target'].replace({0: "negative", 4: "positive", 2: "neutral"})
print("Conversion des valeurs cibles en sentiments effectuée.")

# Text-cleaning helper.
# The patterns are compiled once here because clean_text is applied to every
# row of the dataset.
_URL_RE = re.compile(r'http\S+')
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\w+')
_NON_WORD_RE = re.compile(r'[^\w\s]')


def clean_text(text):
    """Return a lower-cased copy of *text* stripped of tweet noise.

    Removed, in this order: URLs starting with ``http``, ``@mentions``,
    ``#hashtags`` (the whole tag, including its word), and every remaining
    character that is neither a word character nor whitespace.  Whitespace
    left behind by a removed token is not collapsed.

    Args:
        text: The raw tweet.  Any non-string value (for example ``None`` or
            the float NaN that pandas uses for an empty cell) is treated as
            an empty tweet.

    Returns:
        The cleaned string; ``""`` when *text* is not a string.
    """
    # re.sub raises TypeError on non-strings; a missing cell should not
    # abort the cleaning of the whole column.
    if not isinstance(text, str):
        return ""
    text = _URL_RE.sub('', text)       # drop URLs
    text = _MENTION_RE.sub('', text)   # drop @mentions
    text = _HASHTAG_RE.sub('', text)   # drop #hashtags
    text = _NON_WORD_RE.sub('', text)  # drop punctuation and other symbols
    return text.lower()

# Run clean_text over every raw tweet and store the result as a new column
print("Nettoyage du texte...")
cleaned_column = data['text'].apply(clean_text)
data['cleaned_text'] = cleaned_column
print("Nettoyage du texte effectué.")

# Split each cleaned tweet into a list of tokens
print("Tokenisation des textes...")
token_column = data['cleaned_text'].apply(word_tokenize)
data['tokens'] = token_column
print("Tokenisation effectuée.")

# Train/test split: 20 % of the rows are held out, with a fixed seed.
# Labels are encoded as negative -> 0, positive -> 1.
print("Séparation du dataset en Train et Test...")
label_ids = data['sentiment'].map({'negative': 0, 'positive': 1})
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_text'],
    label_ids,
    test_size=0.2,
    random_state=42,
)
print("Séparation effectuée.")

# 1. **Model A - Logistic Regression**
# Bag-of-words counts with the vocabulary capped at 5000 entries; the
# vocabulary is fitted on the training split only and reused for the test split.
print("Application de CountVectorizer...")
vectorizer = CountVectorizer(max_features=5000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# With the default iteration limit the solver stops before converging on
# this data (scikit-learn reports "TOTAL NO. of ITERATIONS REACHED LIMIT"),
# so it is given a larger iteration budget.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_vect, y_train)
y_pred_log_reg = log_reg.predict(X_test_vect)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print(classification_report(y_test, y_pred_log_reg))

# 2. **Model B - LSTM**
# Word-index vocabulary is fitted on the training texts only.
print("Tokenisation pour LSTM...")
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Texts -> integer sequences, for the train split then the test split
X_train_seq, X_test_seq = [
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
]

# Integer sequences -> arrays of fixed length 100
X_train_pad, X_test_pad = [
    pad_sequences(sequences, maxlen=100) for sequences in (X_train_seq, X_test_seq)
]

# Embedding -> LSTM -> dropout -> single sigmoid unit, added layer by layer
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=5000, output_dim=64, input_length=100))
lstm_model.add(LSTM(64))
lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(1, activation='sigmoid'))

lstm_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
lstm_model.fit(
    X_train_pad,
    y_train,
    validation_data=(X_test_pad, y_test),
    epochs=3,
    batch_size=32,
)

# Threshold the sigmoid outputs at 0.5 to obtain 0/1 class predictions
lstm_probabilities = lstm_model.predict(X_test_pad)
y_pred_lstm = (lstm_probabilities > 0.5).astype("int32")
print("LSTM Accuracy:", accuracy_score(y_test, y_pred_lstm))
print(classification_report(y_test, y_pred_lstm))

# 3. **Model C - BERT**
print("Chargement du modèle BERT...")
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

train_encodings = tokenizer_bert(list(X_train), truncation=True, padding=True, max_length=128, return_tensors="tf")
test_encodings = tokenizer_bert(list(X_test), truncation=True, padding=True, max_length=128, return_tensors="tf")

# Feed every tensor the tokenizer produced, not just input_ids: without the
# attention mask the model attends to the padding tokens added above.
train_inputs = dict(train_encodings)
test_inputs = dict(test_encodings)

# Plain NumPy label arrays to pair with the dict-of-tensors inputs.
y_train_bert = np.asarray(y_train)
y_test_bert = np.asarray(y_test)

bert_model.compile(
    # Fine-tuning a pretrained transformer needs a much smaller step than
    # Adam's default learning rate.
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    # The classification head returns raw logits (see `.logits` below), so
    # the loss must apply the softmax itself; the string alias
    # 'sparse_categorical_crossentropy' assumes probabilities.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)
bert_model.fit(
    train_inputs,
    y_train_bert,
    validation_data=(test_inputs, y_test_bert),
    epochs=2,
    batch_size=16,
)

# Predicted class = index of the largest logit
y_pred_bert = np.argmax(bert_model.predict(test_inputs).logits, axis=1)
print("BERT Accuracy:", accuracy_score(y_test, y_pred_bert))
print(classification_report(y_test, y_pred_bert))


Chargement du dataset...
Dataset chargé avec succès.
Conversion des valeurs cibles en sentiments effectuée.
Nettoyage du texte...
Nettoyage du texte effectué.
Tokenisation des textes...
Tokenisation effectuée.
Séparation du dataset en Train et Test...
Séparation effectuée.
Application de CountVectorizer...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.789034375
              precision    recall  f1-score   support

           0       0.80      0.76      0.78    159494
           1       0.78      0.81      0.79    160506

    accuracy                           0.79    320000
   macro avg       0.79      0.79      0.79    320000
weighted avg       0.79      0.79      0.79    320000

Tokenisation pour LSTM...
Epoch 1/3
Epoch 2/3
Epoch 3/3
LSTM Accuracy: 0.8199375
              precision    recall  f1-score   support

           0       0.82      0.82      0.82    159494
           1       0.82      0.82      0.82    160506

    accuracy                           0.82    320000
   macro avg       0.82      0.82      0.82    320000
weighted avg       0.82      0.82      0.82    320000

Chargement du modèle BERT...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
