In [None]:
import re

import nltk
import pandas as pd
import spacy
from langdetect import DetectorFactory, LangDetectException, detect
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on (tokenizer data and the
# stop-word corpus). nltk.download skips packages that are already current.
nltk.download('punkt_tab')
nltk.download('stopwords')

# langdetect's detector is randomised: short or ambiguous texts can be
# classified differently from one run to the next. Pinning the seed before
# the first detect() call keeps the English filter reproducible.
DetectorFactory.seed = 0

# Small English spaCy pipeline, used later for POS tagging and lemmatisation.
nlp = spacy.load('en_core_web_sm')

# Raw review data; later cells read the 'review' column from this frame.
df = pd.read_csv("/content/combined_rpg_reviews_20250623_171534.csv")

In [None]:
# Game titles (plus a few common abbreviations) whose mentions are detected
# in, and stripped from, the review text.
games_list = [
    "Baldur's Gate",
    "Black Desert Online",
    "Cyberpunk 2077",
    "Cyberpunk",
    "Dark Souls",
    "DS",
    "Divinity",
    "Dwarf Fortress",
    "ELDEN RING",
    "Fallout",
    "God of War",
    "Hades",
    "Hollow Knight",
    "Icewind Dale",
    "Kenshi",
    "Mass Effect",
    "Monster Hunter",
    "Mount & Blade",
    "Neverwinter Nights",
    "NieR: Automata",
    "Ori and the Blind Forest",
    "Ori",
    "Pathfinder",
    "Persona",
    "Pillars of Eternity",
    "Planescape",
    "Red Dead Redemption 2",
    "rdr2",
    "RimWorld",
    "Sekiro",
    "Slay the Spire",
    "Stardew Valley",
    "Tales of Arise",
    "Tales of Berseria",
    "Tales of Vesperia",
    "Tales of Zestiria",
    "The Binding of Isaac",
    "Elder Scrolls",
    "The Witcher",
    "Undertale",
]

# One whole-word alternation over all titles. Longer titles are listed first
# so that, for example, "Cyberpunk 2077" is tried before "Cyberpunk".
# Kept as a plain string because callers pass their own flags to re.search/re.sub.
game_pattern = r'\b({})\b'.format(
    '|'.join(map(re.escape, sorted(games_list, key=len, reverse=True)))
)

In [None]:
def mentions_game(text):
    """Tell whether ``text`` names at least one of the tracked games.

    Matching is case-insensitive and uses the module-level ``game_pattern``.
    Anything that is not a string (e.g. NaN from a missing review) gives False.
    """
    if isinstance(text, str):
        return re.search(game_pattern, text, re.IGNORECASE) is not None
    return False

def remove_game_mentions(text):
    """Delete every tracked-game mention from ``text``.

    Matching is case-insensitive and uses the module-level ``game_pattern``;
    each match is replaced by the empty string, so surrounding whitespace is
    left as it was. Non-string values are handed back unchanged.
    """
    if isinstance(text, str):
        return re.sub(game_pattern, '', text, flags=re.IGNORECASE)
    return text

# Annotate every review: does it name a tracked game, and what does the text
# look like once those names are stripped out?
df['mentions_game'] = df['review'].apply(mentions_game)
df['review_clean'] = df['review'].apply(remove_game_mentions)

# Work on an independent copy of the matching rows so that later column
# assignments do not touch the original frame.
df_filtered = df.loc[df['mentions_game']].copy()

print(f"Total reviews: {len(df)}")
print(f"Reviews mentioning specified games: {len(df_filtered)}")

In [None]:
def is_english(text):
    """Return True only when langdetect classifies ``text`` as English.

    Non-string values and strings that are empty or whitespace-only give
    False, as does any text for which detection raises LangDetectException.
    """
    if isinstance(text, str) and text.strip() != "":
        try:
            return detect(text) == 'en'
        except LangDetectException:
            # Detection failed for this text; treat it as not English.
            pass
    return False

In [None]:
def clean_text(text):
    """Normalise raw review text to lowercase letters and whitespace.

    Steps, in order:
      1. drop URLs (anything starting with ``http`` up to the next whitespace),
      2. drop e-mail addresses,
      3. drop every character that is not an ASCII letter or whitespace,
      4. strip leading/trailing whitespace and lowercase.

    Parameters
    ----------
    text : Any
        The review text. Non-string values (e.g. NaN) are accepted.

    Returns
    -------
    str
        The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):  # Handle non-string inputs (e.g., NaN)
        return ""
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    # E-mails must go BEFORE the symbol strip below: once '@' has been
    # deleted the e-mail pattern can no longer match, and the address would
    # survive as one fused pseudo-word (e.g. "johndoeexamplecom").
    text = re.sub(r'\S+@\S+', '', text)  # Remove emails
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    return text.strip().lower()

In [None]:
# English stop words as a set, for fast membership tests in preprocess().
stop_words = set(stopwords.words('english'))
def preprocess(text):
    """Tokenise ``text`` and discard uninformative tokens.

    The text is split with NLTK's ``word_tokenize``; tokens found in
    ``stop_words`` and tokens of two characters or fewer are dropped.
    The comparison is case-sensitive, so callers should pass lowercased text.

    Returns the surviving tokens as a list, in their original order.
    """
    return [
        tok
        for tok in word_tokenize(text)
        if len(tok) > 2 and tok not in stop_words
    ]

In [None]:
def lemmatize(tokens):
    """Lemmatise ``tokens`` and keep only nouns, adjectives and verbs.

    The tokens are re-joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``; the lemma of every resulting token
    whose coarse POS tag is NOUN, ADJ or VERB is returned, in order.
    An empty (or otherwise falsy) input yields an empty list without
    invoking spaCy.
    """
    if tokens:
        kept_pos = ('NOUN', 'ADJ', 'VERB')
        parsed = nlp(' '.join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in kept_pos]
    return []

In [None]:
# Language filter: keep only the reviews detected as English, judged on the
# text with game names already removed.
df_filtered['is_english'] = df_filtered['review_clean'].apply(is_english)
df_english = df_filtered.loc[df_filtered['is_english']].copy()

# Text pipeline, each stage feeding the next:
#   review_clean -> cleaned_review -> tokens -> lemmatized
df_english['cleaned_review'] = df_english['review_clean'].apply(clean_text)
df_english['tokens'] = df_english['cleaned_review'].apply(preprocess)
df_english['lemmatized'] = df_english['tokens'].apply(lemmatize)

In [None]:
# Persist the processed English reviews (without the index column).
output_path = "/content/processed_reviews_mentioned_game.csv"
df_english.to_csv(output_path, index=False)

# Summary counts, followed by a preview of each processing stage.
print(f"Total reviews: {len(df)}")
print(f"English reviews: {len(df_english)}")
print(df_english.loc[:, ['review', 'cleaned_review', 'tokens', 'lemmatized']].head())