# NLP Analysis

In [1]:
!pip install nltk



In [2]:
import pandas as pd
import numpy as np
import nltk
import re
import string
import os
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Load the dataset.
# The data directory can be overridden with the RECSYS_DATA_DIR environment
# variable so the notebook can run on another machine; when the variable is
# unset, the original local directory is used.
PATH = os.environ.get(
    "RECSYS_DATA_DIR",
    "/Users/agathecauhape/EMLyon 2024-25/Canada/Recommender System/projet/data/",
)

file_path = os.path.join(PATH, "video_game_clean.csv")
df = pd.read_csv(file_path)

## Step 1: Cleaning Text

In [4]:
# Build the English stop-word set, downloading the NLTK corpus only when it is
# not already installed, so re-running the cell does not call nltk.download again.
from nltk.corpus import stopwords

try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # Corpus not found locally: fetch it once, then retry the lookup.
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/agathecauhape/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def clean_text(text, stop_word_set=None):
    """Normalise one review into lowercase, punctuation-free, stop-word-free text.

    Steps, in order: lowercase the text, remove ``<...>`` tags, remove runs of
    non-space characters beginning with ``http``, replace ASCII punctuation
    with spaces, split on whitespace and drop stop words.

    Parameters
    ----------
    text : object
        Raw review. Missing values (``None`` / ``NaN``) yield ``""``; any other
        non-string value is converted with ``str()`` before cleaning.
    stop_word_set : collection of str, optional
        Words to discard. When omitted, the notebook-level ``stop_words`` set
        is used.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if pd.isna(text):
        return ""
    if stop_word_set is None:
        # Looked up at call time so the notebook's global set is the default.
        stop_word_set = stop_words
    # str() keeps non-string cells (e.g. numbers) from failing on .lower().
    text = str(text).lower()
    # Each removal substitutes a space rather than an empty string so that
    # neighbouring words are not fused together ("game,but" -> "game but").
    # Contractions are therefore split as well ("don't" -> "don t").
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    return " ".join(w for w in text.split() if w not in stop_word_set)

In [6]:
# Derive the cleaned-review column by running clean_text over every raw review.
df['review_cleaned'] = df['user_review_text'].map(clean_text)

In [7]:
# Show raw and cleaned text side by side for the first five reviews.
preview_columns = ['user_review_text', 'review_cleaned']
df.loc[:, preview_columns].head()

Unnamed: 0,user_review_text,review_cleaned
0,"Solid game, but too many bugs.",solid game many bugs
1,"Solid game, but too many bugs.",solid game many bugs
2,"Great game, but the graphics could be better.",great game graphics could better
3,"Solid game, but the graphics could be better.",solid game graphics could better
4,"Great game, but too many bugs.",great game many bugs


## Step 2: TF-IDF Vectorization

In [8]:
# Configure the vectorizer: cap the vocabulary at 1000 terms and drop
# scikit-learn's built-in English stop words (max_df / min_df are not set here).
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)

# Learn the vocabulary from the cleaned reviews and compute the sparse TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(df['review_cleaned'])

# Dense DataFrame view: one row per review, one column per vocabulary term
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out(),
)

In [9]:
# Index the TF-IDF rows by game title. The titles are attached positionally
# (via .values), relying on tfidf_df having the same row order as df.
tfidf_df = (
    tfidf_df
    .assign(game_title=df['game_title'].values)
    .set_index('game_title')
)

In [10]:
# Sum TF-IDF scores per word across all documents.
# np.asarray(...).ravel() flattens the per-term column sums to a 1-D array; unlike
# the `.A1` attribute (which exists only on np.matrix) it also works when the
# sum is returned as a plain ndarray.
tfidf_sum = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
terms = tfidf.get_feature_names_out()
tfidf_scores = pd.DataFrame({'term': terms, 'score': tfidf_sum})

# Rank terms by their summed score and keep at most the 15 highest
top_terms = tfidf_scores.sort_values(by='score', ascending=False).head(15)

# Display top 15 words
print("🔠 Top 15 most important words across all reviews:")
print(top_terms)


🔠 Top 15 most important words across all reviews:
            term         score
4           game  13285.987512
0        amazing  12889.007573
2           bugs  10421.693617
1         better   8732.296397
6       graphics   8732.296397
5       gameplay   8728.329397
3  disappointing   7831.273147
7          great   7821.665762
8          solid   7813.900531


In [11]:
# Write the DataFrame to text_clean.csv in the data directory,
# leaving the row index out of the file.
output_path = os.path.join(PATH, "text_clean.csv")
df.to_csv(output_path, index=False)