In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from gensim import corpora, models
from gensim.models import CoherenceModel, Phrases
from gensim.models.phrases import Phraser
from pprint import pprint

In [None]:
# NLTK data packages fetched up front for the tokenizer, stop-word list,
# lemmatizer and sentiment analyser used further down in the notebook.
NLTK_RESOURCES = (
    'punkt',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'punkt_tab',
    'vader_lexicon',
)

for resource_name in NLTK_RESOURCES:
    nltk.download(resource_name)

In [None]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) so the dataset location
# does not have to be hard-coded in the notebook.
load_dotenv()
CSV_PATH = os.getenv("CSV_PATH")
if not CSV_PATH:
    # Fail with an actionable message instead of letting read_csv choke on None.
    raise ValueError(
        "CSV_PATH is not set: define it in the environment or in a .env file "
        "before running this notebook."
    )

# Keep only rows that have a message and whose author is not "Inconnu", then
# drop exact duplicate rows. Written as one chain (no inplace=True) so that
# re-running the cell always rebuilds df from the file.
df = (
    pd.read_csv(CSV_PATH)
    .dropna(subset=["Message"])
    .loc[lambda frame: frame["Auteur"] != "Inconnu"]
    .drop_duplicates()
)
df.head(30)

In [None]:
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

def detect_lang(text):
    """Return the language code that langdetect assigns to ``text``.

    Falls back to the string "unknown" when langdetect raises
    ``LangDetectException``.
    """
    try:
        language = detect(text)
    except LangDetectException:
        language = "unknown"
    return language

# langdetect is non-deterministic by default; fixing the seed makes repeated
# runs classify the same message the same way, so the filtered dataset is
# reproducible.
DetectorFactory.seed = 0

# Add a 'lang' column holding the detected language of each message.
# assign() builds a new frame instead of writing into a filtered slice.
df = df.assign(lang=df["Message"].apply(detect_lang))

# Keep only the messages detected as English; copy() so later column
# assignments operate on an independent frame.
df = df[df["lang"] == "en"].copy()

In [None]:
# Extra stop-words applied on top of NLTK's English list. The grouping below
# is only for readability; the result is a single flat set.
custom_stopwords = {
    # greetings and ways of addressing people
    'hi', 'hello', 'hey', 'ya', 'everyone', 'guy', 'guys', 'anyone', 'someone',
    # short fragments (presumably contractions left without their apostrophe)
    'im', 'ive', 'dont', 'youre', 'd', 'st',
    # courtesy / request words
    'please', 'thank', 'thanks', 'help', 'question',
    # generic verbs
    'would', 'like', 'get', 'know', 'let', 'want', 'need', 'use', 'using',
    'look', 'looking', 'make', 'work', 'trying', 'take', 'find', 'talk',
    # fillers and low-information words
    'one', 'also', 'us', 'thing', 'things', 'still', 'well', 'maybe', 'good',
    'time', 'really', 'interested', 'something', 'he', 'great', 'new', 'fine',
    # French function words
    'un', 'ce', 'dans', 'le', 'tous',
    # presumably misspellings seen in the data -- verify against the corpus
    'chaneel', 'frind',
}

# Built once when the cell runs rather than on every call: re-reading the NLTK
# stop-word list and constructing a lemmatizer for each row is pure overhead.
# Re-run this cell after editing custom_stopwords to refresh the set.
_STOP_WORDS = frozenset(stopwords.words("english")).union(custom_stopwords)
_LEMMATIZER = WordNetLemmatizer()


def clean_text(text):
    """Normalise one message into a space-separated string of lemmatised tokens.

    Steps: collapse whitespace and lowercase; remove URLs (``http...``),
    ``@mentions`` and ``#hashtags``; drop every character that is not ``a-z``
    or whitespace; tokenise; discard stop-words and one-character tokens;
    lemmatise what remains.

    Args:
        text: the raw message; it is converted with ``str()`` first.

    Returns:
        The cleaned text, or ``""`` when fewer than three tokens survive the
        filtering.
    """
    text = " ".join(str(text).split()).lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    # This class also removes digits, so no separate \d+ pass is needed.
    text = re.sub(r"[^a-z\s]", "", text)

    tokens = [
        token
        for token in word_tokenize(text)
        if token not in _STOP_WORDS and len(token) > 1
    ]
    if len(tokens) < 3:
        return ""
    return " ".join(_LEMMATIZER.lemmatize(token) for token in tokens)

# Clean every message; clean_text returns "" for messages with too few usable
# tokens, and those rows are dropped. assign()/copy() keep df an independent
# frame so the column assignment below does not write into a filtered slice.
df = df.assign(cleaned=df["Message"].apply(clean_text))
df = df[df["cleaned"] != ""].copy()
# Author name with every '@' removed.
df["Autclean"] = df["Auteur"].str.replace("@", "", regex=False)
# A bare `df.shape` in the middle of a cell is never displayed; print it.
print(df.shape)
df.head(30)


In [None]:
# Preview of the raw and cleaned text for the first ten rows.
preview_columns = ['Message', 'cleaned']
print(df[preview_columns].head(10))

In [None]:
# 1. Most frequent emoji
# Count how often each value of the 'Emoji' column occurs.
emoji_counts = df['Emoji'].value_counts()
Emoji_le_plus_utilisé = emoji_counts.idxmax()

print("Emoji le plus utilisé :", Emoji_le_plus_utilisé)
# The ten most common values with their counts.
top_emoji_counts = emoji_counts.head(10)
print(top_emoji_counts)


# 2. Most active users
# Number of rows (messages) per cleaned author name.
user_counts = df['Autclean'].value_counts()
most_active_user = user_counts.idxmax()

top_users = user_counts.head(10)
plt.figure(figsize=(10, 4))
sns.barplot(x=top_users.index, y=top_users.values, palette="crest")
plt.title("Top 10 des utilisateurs les plus actifs")
plt.xlabel("Utilisateur")
plt.ylabel("Nombre de messages")
plt.xticks(rotation=45)
# Keep the rotated tick labels inside the figure, as the other plots do.
plt.tight_layout()
# Save BEFORE show(): once show() returns the figure is closed, so a later
# savefig() would write an empty image.
plt.savefig("Top 10 des utilisateurs les plus actifs")
plt.show()


In [None]:
import locale
# A French LC_TIME locale is selected so that %B in the format below can match
# French month names. Locale names are platform-specific: 'French_France' is
# the Windows spelling, 'fr_FR.UTF-8' / 'fr_FR' the usual POSIX ones, so try
# each in turn instead of failing outright on non-Windows machines.
for _locale_name in ("French_France", "fr_FR.UTF-8", "fr_FR"):
    try:
        locale.setlocale(locale.LC_TIME, _locale_name)
        break
    except locale.Error:
        continue
else:
    raise locale.Error(
        "No French LC_TIME locale available "
        "(tried French_France, fr_FR.UTF-8, fr_FR)"
    )

# Remove the word "à" from the raw date text and trim surrounding whitespace.
# NOTE(review): assumes 'Date' looks like "12 janvier 2024 à 14:30" -- confirm.
df["Date_clean"] = df["Date"].str.replace("à", "", regex=False).str.strip()

# Convert to datetime; values that do not match the format become NaT.
df["Date_clean"] = pd.to_datetime(df["Date_clean"], format="%d %B %Y %H:%M", errors="coerce")

df["heure"] = df["Date_clean"].dt.hour
# Count messages per hour and make every hour 0-23 explicit (0 when there is
# no message), so bar positions line up with the fixed 0-23 ticks set below.
frequence_par_heure = df["heure"].value_counts().reindex(range(24), fill_value=0)

# Plot
plt.figure(figsize=(10,5))
sns.barplot(x=frequence_par_heure.index, y=frequence_par_heure.values, palette="viridis")
plt.title("Nombre d'activités par heure")
plt.xlabel("Heure de la journée")
plt.ylabel("Nombre d'activités")
plt.xticks(range(24))
plt.grid(True)
plt.tight_layout()
plt.savefig("frequence_par_heure.png")
plt.show()


In [None]:
# 5. Usage d'emojis par utilisateur (top 10)
# Rows that really carry an emoji. NaN is truthy, so astype(bool) alone keeps
# rows whose 'Emoji' is missing; they must be excluded explicitly, otherwise
# users who never sent an emoji can show up in the ranking with a count of 0.
has_emoji = df['Emoji'].notna() & df['Emoji'].astype(bool)

# Number of emoji-bearing rows per user, ten largest first.
emoji_counts_by_user = (
    df[has_emoji]
      .groupby('Autclean')['Emoji']
      .count()
      .sort_values(ascending=False)
      .head(10)
)
plt.figure(figsize=(8,4))
sns.barplot(x=emoji_counts_by_user.values, y=emoji_counts_by_user.index)
plt.title("Top 10 des utilisateurs ayant envoyé le plus d'emojis")
plt.xlabel("Nombre d'emojis")
plt.ylabel("Utilisateur")
plt.tight_layout()
plt.savefig("emoji_par_utilisateur.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
from collections import Counter
from wordcloud import WordCloud

# How many of the most frequent words to list and plot. The title and the
# file name are derived from this value so they cannot drift from the data
# (the chart used to be labelled "Top 20" while showing 30 words).
N_TOP_WORDS = 30

# Every token of every cleaned message, in order.
all_words = " ".join(df["cleaned"]).split()

freq = Counter(all_words)

common_words = freq.most_common(N_TOP_WORDS)
print(common_words)

# Bar chart of the most frequent words
top_words_title = f"Top {N_TOP_WORDS} mots les plus fréquents"
plt.figure(figsize=(10, 6))
plt.bar([word for word, count in common_words], [count for word, count in common_words], color='skyblue')
plt.xticks(rotation=45)
plt.title(top_words_title)
plt.xlabel("Mots")
plt.ylabel("Fréquence")
plt.tight_layout()
# Save BEFORE show(): once show() returns the figure is closed, so a later
# savefig() would write an empty image.
plt.savefig(top_words_title)
plt.show()

# Word cloud built from all cleaned tokens.
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(" ".join(all_words))

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("Nuage de mots")
# Save BEFORE show(): once show() returns the figure is closed, so a later
# savefig() would write an empty image.
plt.savefig("Nuage de mots")
plt.show()


In [None]:
# One token list per cleaned message.
texts = []
for cleaned_message in df["cleaned"]:
    texts.append(cleaned_message.split())

# Bigram detection: learn the phrase model on the token lists, freeze it, then
# apply it to every document.
bigram = Phrases(texts, min_count=5, threshold=30)
bigram_mod = Phraser(bigram)
texts_bigram = []
for token_list in texts:
    texts_bigram.append(bigram_mod[token_list])

# Dictionary (with very rare / very common tokens filtered out) and the
# bag-of-words corpus derived from it.
id2word = corpora.Dictionary(texts_bigram)
id2word.filter_extremes(no_below=5, no_above=0.6)
corpus = list(map(id2word.doc2bow, texts_bigram))

# Fonction pour calculer la cohérence
def compute_coherence_values(dictionary, corpus, texts, start, limit, step):
    """Train one LDA model per candidate topic count and score each with c_v coherence.

    Args:
        dictionary: passed to ``LdaModel`` as ``id2word`` and to
            ``CoherenceModel`` as ``dictionary``.
        corpus: passed to ``LdaModel`` as the training corpus.
        texts: passed to ``CoherenceModel`` as ``texts``.
        start, limit, step: arguments of ``range``; they define the topic
            counts that are tried (``limit`` itself is excluded).

    Returns:
        ``(model_list, coherence_values)``: the trained models and their
        coherence scores, both in the order of the topic counts tried.
    """
    # Settings shared by every model; only num_topics varies between runs.
    lda_settings = dict(
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=False,
    )

    model_list, coherence_values = [], []
    for topic_count in range(start, limit, step):
        lda = models.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=topic_count,
            **lda_settings,
        )
        model_list.append(lda)

        scorer = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(scorer.get_coherence())

    return model_list, coherence_values

# Tester plusieurs nombres de topics
# Try every topic count in range(start, limit, step).
start, limit, step = 5, 30, 1
model_list, coherence_values = compute_coherence_values(id2word, corpus, texts_bigram, start, limit, step)

# Coherence score for each candidate topic count
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
plt.title("Coherence Score vs Number of Topics")
# Save BEFORE show(): once show() returns the figure is closed, so a later
# savefig() would write an empty image.
plt.savefig("Coherence Score vs Number of Topics")
plt.show()

# Meilleur modèle
# Pick the model with the highest coherence (the first one in case of a tie).
best_coherence = max(coherence_values)
best_index = coherence_values.index(best_coherence)
best_num_topics = start + best_index * step
best_model = model_list[best_index]
print(f" Meilleur nombre de topics : {best_num_topics} avec coherence = {coherence_values[best_index]:.4f}")
pprint(best_model.print_topics())


# Label every document with its highest-probability topic and store the
# labels on the DataFrame.
dominant_topics = []
for bow in corpus:
    topic_id, _probability = max(best_model[bow], key=lambda pair: pair[1])
    dominant_topics.append(topic_id)
df["topic"] = dominant_topics

In [None]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Prepare the pyLDAvis data for the selected model and render it in the cell
# output.
pyldavis_prepared = gensimvis.prepare(
    topic_model=best_model,
    corpus=corpus,
    dictionary=id2word,
)
pyLDAvis.display(pyldavis_prepared)

In [None]:
#sentiments
from nltk.sentiment import SentimentIntensityAnalyzer
# Score every original (uncleaned) message with the sentiment analyser; each
# call returns one dict of scores, which becomes one row of `vaders`.
sia = SentimentIntensityAnalyzer()
score_rows = []
for message in df['Message']:
    score_rows.append(sia.polarity_scores(message))
vaders = pd.DataFrame(score_rows)

# Put the scores next to the messages. The index of df is reset so that both
# frames line up row by row.
aligned_messages = df.reset_index(drop=True)
df_with_sentiment = pd.concat([aligned_messages, vaders], axis=1)
df_with_sentiment.head()

In [None]:
# Bar chart of the 'compound' sentiment score by topic.
sns.barplot(x="topic", y="compound", data=df_with_sentiment, palette="viridis")
plt.title("Sentiment moyen par Topic")
plt.xlabel("Topic")
plt.ylabel("Score de sentiment")
# Save BEFORE show(): once show() returns the figure is closed, so a later
# savefig() would write an empty image.
plt.savefig("Sentiment moyen par Topic")
plt.show()

# Box plot of the 'compound' sentiment score distribution within each topic.
sns.boxplot(x="topic", y="compound", data=df_with_sentiment, palette="viridis")
plt.title("Distribution des sentiments par topic")
plt.xlabel("Topic")
plt.ylabel("Score de sentiment")
# Save BEFORE show(): once show() returns the figure is closed, so a later
# savefig() would write an empty image.
plt.savefig("Distribution des sentiments par topic")
plt.show()