In [None]:
# Pre-processing part
import re 
import string
import pandas as pd 
import numpy as np
import spacy
import warnings
from gensim.models import CoherenceModel
from pprint import pprint
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
from gensim.utils import simple_preprocess
import matplotlib as plt
import os
from lingua import Language, LanguageDetectorBuilder
import sqlalchemy
from cleantext import clean
import chime
import nltk
from nltk.corpus import stopwords
#Statistical analysis
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
warnings.filterwarnings("ignore", category=DeprecationWarning)  # Silence DeprecationWarning output

# --- Shared resources and reproducibility settings ---
# spaCy English pipeline; clean_text() calls it to lemmatize reviews.
nlp_model = spacy.load('en_core_web_sm')
nlp_model.max_length = 1000000000  # raise spaCy's per-text length limit
SEED = 1962
np.random.seed(SEED)
# NOTE: PYTHONHASHSEED is read when an interpreter starts, so assigning it here
# cannot change string hashing in the already-running kernel; it is only
# inherited by processes launched afterwards.
os.environ["PYTHONHASHSEED"] = str(SEED)
# from_all_languages is a method and must be *called* before .build();
# accessing .build on the uncalled method raises AttributeError.
detector = LanguageDetectorBuilder.from_all_languages().build()
nltk.download('stopwords')

# English stop words plus domain-specific terms.
# NOTE: this rebinds the name `stopwords` from the nltk corpus loader to a set.
# NOTE(review): the multi-word entries ('video game', 'video games') can only
# match if a single token equals them — confirm they are useful.
stopwords = set(stopwords.words('english')).union(['game', 'videogame', 'video game', 'games', 'video games'])
# The connection URL can be overridden through the environment so credentials
# do not have to live in the notebook; the previous literal stays the default.
engine = sqlalchemy.create_engine(
    os.environ.get(
        "STEAMREVIEWS_DB_URL",
        "postgresql://user:password@localhost:5434/steamreviews",
    )
)

In [None]:
import logging


def clean_text(text, stopwords):
    """Normalize a raw review and return its list of lemma tokens.

    Args:
        text: The review; it is converted with ``str()`` and lower-cased.
        stopwords: Container of tokens to discard (tested with ``in``).

    Returns:
        list[str]: Lemmas produced by the module-level ``nlp_model``, without
        stop words, purely numeric tokens, or tokens of length <= 1.
    """
    text = str(text).lower()  # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)  # Remove [...] spans, e.g. "[+XYZ chars]"
    text = re.sub(r"\s+", " ", text)  # Collapse whitespace runs (incl. newlines)
    text = re.sub(r"\w+…|…", "", text)  # Remove the ellipsis char and the word it truncates
    text = re.sub(r"<a[^>]*>(.*?)</a>", r"\1", text)  # Unwrap <a> tags, keeping the link text
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # Remove punctuation
    # Deleting punctuation can leave runs of spaces behind ("a - - b" becomes
    # "a   b"); the tokenizer would turn those into whitespace tokens longer
    # than one character, which the length filter below does not catch.
    text = re.sub(r"\s+", " ", text).strip()
    doc = nlp_model(text)
    lemmas = (token.lemma_ for token in doc)
    # Drop stop words, digit-only tokens and tokens of length <= 1.
    return [
        lemma
        for lemma in lemmas
        if lemma not in stopwords and not lemma.isdigit() and len(lemma) > 1
    ]


def detect_lang(text):
    """Detect the language of ``text`` with the module-level ``detector``.

    Args:
        text: The text handed to ``detector.detect_language_of``.

    Returns:
        Whatever ``detector.detect_language_of`` returns, or ``pd.NA`` when
        the call raises an ordinary exception (the failure is logged).
    """
    try:
        return detector.detect_language_of(text)
    except Exception:
        # `except Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt / SystemExit propagate so a long .apply() can
        # still be interrupted; logging.exception also records the traceback.
        logging.exception('Failed to identify the language used')
        return pd.NA


# Load a sample of reviews flagged as English that contain at least five words
# and were written after more than 120 minutes of playtime.
# The query is a raw string so that the regex escape `\w` reaches the database
# verbatim instead of being an invalid Python escape sequence.
df = pd.read_sql(r"""
select recommendationid, review_text from games_reviews
where regexp_count(trim(review_text), '\w+')>=5 and playtime_at_review_minutes>120
and language = 'english'
LIMIT 1000
""", engine)
# Store the detection result per review. The previous code used the results to
# index `df` (df[series]) instead of assigning them to the new column.
df['detected_language'] = df['review_text'].apply(detect_lang)
# NOTE(review): `vectorizer` is not used in the visible cells; it is kept in
# case a later cell relies on it.
vectorizer = np.vectorize(lambda x: clean_text(x, stopwords))
df["tokens"] = df["review_text"].apply(lambda x: clean_text(x, stopwords))

In [None]:
import gensim
import gensim.models.phrases
import gensim.corpora as corpora

# Copy every token list: n-grams are appended below, and without a copy the
# lists stored in df['tokens'] would be mutated, making this cell give
# different results when it is re-run.
data_words = [list(tokens) for tokens in df['tokens']]
#del df  # Clear some memory

print("Création des n-grammes...")
# Train the bigram and trigram phrase models.
# No explicit `threshold` is passed, so gensim's default scoring threshold
# applies; the trigram model also uses the default `min_count`.
bigram = gensim.models.Phrases(data_words, min_count=10)
trigram = gensim.models.Phrases(bigram[data_words])

# Keep the single words and append the detected n-grams to each document.
for doc in data_words:
    unigrams = list(doc)   # snapshot taken before anything is appended
    seen = set(doc)        # constant-time duplicate check
    bigram_doc = bigram[unigrams]
    # The trigram model was trained on bigram-transformed documents, so it has
    # to be applied to the bigram output as well, not to the raw unigrams.
    trigram_doc = trigram[bigram_doc]
    for phrased in (bigram_doc, trigram_doc):
        for token in phrased:
            if '_' in token and token not in seen:
                seen.add(token)
                doc.append(token)

# Build the vocabulary, drop tokens present in fewer than 10 documents or in
# more than 20% of them, then convert every document to bag-of-words.
id2word = corpora.Dictionary(data_words)
id2word.filter_extremes(no_below=10, no_above=0.2)
corpus = [id2word.doc2bow(text) for text in data_words]

In [None]:
from gensim.models import LdaMulticore

def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3,
                             workers=4, passes=10):
    """Train one LDA model per candidate topic count and score its coherence.

    Args:
        dictionary: Token-id mapping handed to both the LDA model and the
            coherence model.
        corpus: Bag-of-words corpus used to train each model.
        texts: Tokenized documents used by the 'c_v' coherence measure.
        limit: Exclusive upper bound of the topic counts to evaluate.
        start: First topic count to evaluate.
        step: Increment between successive topic counts.
        workers: Worker count forwarded to ``LdaMulticore``.
        passes: Number of training passes forwarded to ``LdaMulticore``.

    Returns:
        list: One coherence value per topic count in
        ``range(start, limit, step)``, in that order.
    """
    coherence_values = []
    for num_topics in range(start, limit, step):
        # Use the loop's topic count and the `dictionary` argument; a fixed
        # num_topics=20 and the global id2word made every iteration train the
        # same model regardless of the arguments.
        model = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=SEED,
            workers=workers,
            passes=passes,
        )
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_val = coherencemodel.get_coherence()
        coherence_values.append(coherence_val)
        print("num_topics = ",num_topics, "has a coherence of:", coherence_val)

    return coherence_values

# `plt` is bound to the top-level matplotlib package by the notebook's import
# cell, and that package has no plot(); bind the pyplot interface here.
import matplotlib.pyplot as plt

# Topic counts to evaluate; the same range drives both the computation and the
# x axis so the plotted points line up with the models that were trained.
TOPIC_START, TOPIC_LIMIT, TOPIC_STEP = 2, 100, 5

coherence_values = compute_coherence_values(
    dictionary=id2word,
    corpus=corpus,
    texts=data_words,
    start=TOPIC_START,
    limit=TOPIC_LIMIT,
    step=TOPIC_STEP,
)

# Plotting the results
topic_counts = list(range(TOPIC_START, TOPIC_LIMIT, TOPIC_STEP))
plt.plot(topic_counts, coherence_values, label="c")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()