## 2. Data Processing - Text Preparation

In [None]:
#environment
!pip install nltk==3.9.1
!pip install regex

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the tools imported above.
# WordNet data: used by WordNetLemmatizer.
nltk.download('wordnet')
# Stopword lists: read through nltk.corpus.stopwords.
nltk.download('stopwords')
# Punkt tokenizer data: presumably what word_tokenize relies on; both resource
# names are fetched (NOTE(review): confirm which one the pinned NLTK version needs).
nltk.download('punkt')
nltk.download('punkt_tab')

### 2.1 Data Cleaning

In [None]:
#import pandas
import pandas as pd

In [None]:
#import dataset
# NOTE(review): the absolute '/content/...' path presumably points at a Colab
# runtime upload location; confirm/adjust before running elsewhere.
df = pd.read_csv('/content/AC_Shadow_Reddit_Data.csv')

In [None]:
#check data structure
# `head` has to be *called*; printing the bare attribute only shows the
# bound-method repr instead of the first rows.
print(df.head())

# Percentage of rows whose 'content' field is missing.
nan_content_pct = 100 * (df['content'].isna().sum() / len(df))
print('NaN content portion is:' + str(nan_content_pct) + '%')

<bound method NDFrame.head of      post_id                                              title  \
0    1jhijvm  Assassin's Creed Shadows Hits 2 Million Player...   
1    1jg4v9i  Amid Japan Concern About Assassin's Creed Shad...   
2    1fp1isv  Ubisoft Cancels Press Previews of Assassin's C...   
3    1hca3d9  Assassin's Creed Shadows adds a "canon mode" t...   
4    1g9fdan  Ubisoft Cancels Assassin's Creed Shadows Early...   
..       ...                                                ...   
216   xhs3de                      games that make you feel cool   
217   191awa  [Killzone: Shadow Fall] What I saw and what my...   
218   v2z9k4      Games you keep going back to but never finish   
219   zjbofw  I’m still mad WB patented the nemesis system a...   
220   hvlct2  Does anyone else feel like Ghost of Tsushima i...   

                                               content  score  num_comments  \
0                                                  NaN  12844          3279   
1      

In [None]:
# Merge title, post body and comments into one text field.
# Missing pieces are replaced with an empty string before concatenation.
df['text'] = df['title'].fillna('') + ' ' + df['content'].fillna('') + ' ' + df['comments'].fillna('')

# Convert date: 'created_utc' is interpreted as seconds since the Unix epoch
df['date'] = pd.to_datetime(df['created_utc'], unit='s')

# Only select necessary columns.
# .copy() makes the subset an independent frame, so the column assignments in
# later cells (clean_text, ready_text) do not raise SettingWithCopyWarning.
# Note: this rebinds `df`, so the cell must be run on the freshly loaded frame.
df = df[['score', 'num_comments', 'date', 'text']].copy()

In [None]:
# Preview the first rows of the frame after the column selection
df.head()

Unnamed: 0,score,num_comments,date,text
0,12844,3279,2025-03-22 20:59:47,Assassin's Creed Shadows Hits 2 Million Player...
1,11557,1053,2025-03-21 01:00:58,Amid Japan Concern About Assassin's Creed Shad...
2,18138,2726,2024-09-25 10:52:29,Ubisoft Cancels Press Previews of Assassin's C...
3,11673,956,2024-12-12 01:44:30,"Assassin's Creed Shadows adds a ""canon mode"" t..."
4,15580,1317,2024-10-22 11:00:38,Ubisoft Cancels Assassin's Creed Shadows Early...


### 2.3 Text Processing with NLP Techniques

In [None]:
# Standardised abbreviations and slang.
# Keys must be lowercase: replace_slang lowercases a word before looking it up.
slang_dict = {
    "u": "you",
    "ur": "your",
    "thx": "thanks",
    "idk": "i don't know",
    "imo": "in my opinion",
    "btw": "by the way",
}

# Splits one whitespace-delimited token into (leading punctuation, a single
# word, trailing punctuation). Tokens with punctuation *inside* them
# (e.g. "U-turn", "u/name") do not match and are therefore left untouched.
_SLANG_TOKEN_RE = re.compile(r"^(\W*)(\w+)(\W*)$")


def replace_slang(text):
    """Expand known slang/abbreviations in ``text``.

    The text is split on whitespace; each token whose word part (ignoring case
    and any punctuation directly before/after it) is a key of ``slang_dict`` is
    replaced by the mapped phrase, keeping the surrounding punctuation.
    All other tokens are kept exactly as they were.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The tokens re-joined with single spaces (runs of whitespace collapse).
    """
    new_words = []
    for word in text.split():
        match = _SLANG_TOKEN_RE.match(word)
        if match:
            lead, core, trail = match.groups()
            # Case-insensitive lookup so "IDK" / "Thx" are expanded as well.
            replacement = slang_dict.get(core.lower())
            if replacement is not None:
                word = f"{lead}{replacement}{trail}"
        new_words.append(word)
    return " ".join(new_words)

# Expand slang row by row; the result is stored in a new 'clean_text' column
df['clean_text'] = df['text'].map(replace_slang)


In [None]:
# Text cleaning
def clean_text(text):
    """Normalise raw text before tokenisation.

    Steps, in order: drop URLs, drop HTML tags, replace punctuation with
    spaces, remove digits, collapse whitespace, lowercase and strip.

    Parameters
    ----------
    text : Any
        Text to clean; non-string values are converted with ``str()`` first.

    Returns
    -------
    str
        Lowercase text containing only word characters separated by single spaces.
    """
    if not isinstance(text, str):
        text = str(text)
    # URLs and tags are replaced by a space (not deleted) so that the words on
    # either side are not glued together, e.g. "one<br>two" -> "one two".
    text = re.sub(r'http\S+|www\S+', ' ', text)    # URL
    text = re.sub(r'<.*?>', ' ', text)              # HTML tag
    # Punctuation also becomes a space: deleting it fuses "day-one" into
    # "dayone" and "wasn't" into "wasnt", which then slips past stopword
    # removal. With a space, "wasn't" becomes "wasn" + "t".
    text = re.sub(r'[^\w\s]', ' ', text)            # punctuation mark
    text = re.sub(r'\d+', '', text)                 # remove numbers
    text = re.sub(r'\s+', ' ', text)                # collapse whitespace runs
    text = text.lower().strip()
    return text


def tokenize_text(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Remove stopwords
def remove_stopwords(tokens, custom_stopwords=None):
    """Return the tokens that are not stopwords.

    The exclusion set is NLTK's English stopword list, extended with
    ``custom_stopwords`` when that argument is given and non-empty.
    Matching is exact (case-sensitive) membership.
    """
    excluded = set(stopwords.words('english'))
    if custom_stopwords:
        excluded |= set(custom_stopwords)

    return list(filter(lambda token: token not in excluded, tokens))


def lemmatize_tokens(tokens):
    """Lemmatize every token with a fresh ``WordNetLemmatizer`` (default settings)."""
    to_lemma = WordNetLemmatizer().lemmatize
    return list(map(to_lemma, tokens))

# Union all processing function
def preprocess_text(text, custom_stopwords=None):
    """Run the whole text pipeline and return the result as one string.

    Order: clean_text -> tokenize_text -> lemmatize_tokens -> remove_stopwords.
    Lemmatization runs before stopword removal, so stopwords are compared
    against lemmas. The surviving tokens are joined with single spaces.
    """
    lemmas = lemmatize_tokens(tokenize_text(clean_text(text)))
    kept = remove_stopwords(lemmas, custom_stopwords)
    return " ".join(kept)


## Adjust custom stopwords
'''Keep adjusting the stopwords according to the topic-modelling output until the results are ideal.
'''
# Self-defined stop words, removed in addition to the NLTK English list
custom_stopwords = {
    'game', 'play', 'one', 'people',
    'assassin', 'creed', 'shadow', 'ac', 'feel',
}

# Run the full preprocessing pipeline on every row of 'clean_text'
df['ready_text'] = df['clean_text'].apply(preprocess_text, custom_stopwords=custom_stopwords)

In [None]:
#check cleaned text: compare the 'text', 'clean_text' and 'ready_text' columns on the first rows
df.head()


Unnamed: 0,score,num_comments,date,text,clean_text,ready_text
0,12844,3279,2025-03-22 20:59:47,Assassin's Creed Shadows Hits 2 Million Player...,Assassin's Creed Shadows Hits 2 Million Player...,hit million player day release ubisoft say sur...
1,11557,1053,2025-03-21 01:00:58,Amid Japan Concern About Assassin's Creed Shad...,Amid Japan Concern About Assassin's Creed Shad...,amid japan concern ubisoft confirms dayone pat...
2,18138,2726,2024-09-25 10:52:29,Ubisoft Cancels Press Previews of Assassin's C...,Ubisoft Cancels Press Previews of Assassin's C...,ubisoft cancel press preview bet reassuring go...
3,11673,956,2024-12-12 01:44:30,"Assassin's Creed Shadows adds a ""canon mode"" t...","Assassin's Creed Shadows adds a ""canon mode"" t...",add canon mode make choice fan spent year unsu...
4,15580,1317,2024-10-22 11:00:38,Ubisoft Cancels Assassin's Creed Shadows Early...,Ubisoft Cancels Assassin's Creed Shadows Early...,ubisoft cancel early access wasnt announced wa...


In [None]:
# Save processed text to 'ready_text.csv' (relative path); index=False leaves
# the DataFrame index out of the file.
df.to_csv('ready_text.csv', index=False)