<h2 style = "font-family: Serif; font-size: 35px; font-style: normal; letter-spacing: 3px; background-color: #f6f6f6; color: #0D0D0F; border-radius: 100px 100px; text-align: center;"> 2.3 DATA CLEANING & APPLYING NLTK </h2>

This file contains the code for cleaning our dataset `wukong-dataset.csv` and preprocessing its review text with NLTK. It is **Section 2.3** of the main file `end-to-end-sentiment-analysis.ipynb`.

# Import Libraries

In [1]:
# Import useful libraries
import pandas as pd
import re
import time

# Import NLTK
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize

# Import Dataset

In [2]:
# Load the scraped reviews from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='wukong-dataset.csv')

In [3]:
# Preview the first five rows of the raw dataset
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,ReviewText,RecommendedOrNot,ReviewLength,PlayHours,DatePosted
0,0,Monke,Recommended,5,20.7 hrs on record,Posted: 14 November
1,1,8/10,Recommended,4,51.6 hrs on record,Posted: 14 November
2,2,"this game is awesome, hard to track all moves ...",Recommended,95,7.3 hrs on record,Posted: 13 November
3,3,Pain,Recommended,4,5.1 hrs on record,Posted: 13 November
4,4,Monke,Recommended,5,30.9 hrs on record,Posted: 13 November


# Data Cleaning

In [4]:
# Clean the 'PlayHours' column

# Define a function to remove commas of all row entries
def convert_playhours(hours_str):
    '''
    Extract the numeric part of a 'PlayHours' entry as a string.

    Commas are removed and only the first whitespace-separated token is kept,
    e.g. '1,234.5 hrs on record' -> '1234.5'.

    Parameters
    ----------
    hours_str : str
        Raw entry such as '20.7 hrs on record'.

    Returns
    -------
    str or float
        The leading token without commas. Non-string input (e.g. NaN for a
        missing entry) is returned unchanged and a blank string yields NaN,
        so a later float conversion keeps both as missing values.
    '''
    # Missing values arrive as float NaN, which has no .replace(); pass them through
    if not isinstance(hours_str, str):
        return hours_str

    tokens = hours_str.replace(",", "").split()

    # A blank entry has no token to return; report it as missing instead of raising IndexError
    return tokens[0] if tokens else float("nan")

# Apply the function on the 'PlayHours' column & convert to float.
# convert_playhours keeps only the first whitespace-separated token, so the
# ' hrs on record' suffix is already gone and no extra regex pass is needed.
df['PlayHours'] = df['PlayHours'].apply(convert_playhours).astype(float)

# Check if this works
df.head()

Unnamed: 0.1,Unnamed: 0,ReviewText,RecommendedOrNot,ReviewLength,PlayHours,DatePosted
0,0,Monke,Recommended,5,20.7,Posted: 14 November
1,1,8/10,Recommended,4,51.6,Posted: 14 November
2,2,"this game is awesome, hard to track all moves ...",Recommended,95,7.3,Posted: 13 November
3,3,Pain,Recommended,4,5.1,Posted: 13 November
4,4,Monke,Recommended,5,30.9,Posted: 13 November


In [5]:
# Clean the 'DatePosted' column

# Year used for entries that do not state one. The rows shown in this notebook
# carry only a day and a month, and 2024 is the year the original code assumed.
DEFAULT_YEAR = '2024'

# Create a dictionary of months in numeric forms
month_numeric = {
    'January': '01',
    'February': '02',
    'March': '03',
    'April': '04',
    'May': '05',
    'June': '06',
    'July': '07',
    'August': '08',
    'September': '09',
    'October': '10',
    'November': '11',
    'December': '12'
}

# Remove the prefix 'Posted: ' (vectorised, so missing entries stay NaN instead of raising)
posted_text = df['DatePosted'].str.replace('Posted: ', '', regex=False)

# Extract each date part on its own so that both 'Day Month' and 'Month Day' orders work
posted_day = posted_text.str.extract(r'\b(\d{1,2})\b', expand=False)
posted_month = posted_text.str.extract(r'([A-Za-z]+)', expand=False).map(month_numeric)

# Use an explicit four-digit year when the entry has one, otherwise fall back to DEFAULT_YEAR
posted_year = posted_text.str.extract(r'\b(\d{4})\b', expand=False).fillna(DEFAULT_YEAR)

# Convert 'DatePosted' into date time format, stored as 'dd-mm-YYYY' text
df['DatePosted'] = pd.to_datetime(
    posted_day + '/' + posted_month + '/' + posted_year,
    format='%d/%m/%Y'
).dt.strftime('%d-%m-%Y')

# Check if this works
df.head(15)

Unnamed: 0.1,Unnamed: 0,ReviewText,RecommendedOrNot,ReviewLength,PlayHours,DatePosted
0,0,Monke,Recommended,5,20.7,14-11-2024
1,1,8/10,Recommended,4,51.6,14-11-2024
2,2,"this game is awesome, hard to track all moves ...",Recommended,95,7.3,13-11-2024
3,3,Pain,Recommended,4,5.1,13-11-2024
4,4,Monke,Recommended,5,30.9,13-11-2024
5,5,good,Recommended,4,116.9,13-11-2024
6,6,THIS GAME IS FREKAING PEAK THIS IS THE REASON ...,Recommended,97,6.3,13-11-2024
7,7,"I literally found myself saying out loud ""I do...",Not Recommended,486,5.7,13-11-2024
8,8,"It is my firs time playing a souls-like game, ...",Recommended,236,5.5,13-11-2024
9,9,"The first four chapters are really impressive,...",Recommended,286,39.5,13-11-2024


# Apply NLTK

In [6]:
%%time

# Initialize stopwords & lemmatizer once at cell level; clean_text below reads
# both for every review, so they are not rebuilt per row
stop_words = set(stopwords.words('english'))  # English stopword list held as a set for membership tests
lemmatizer = WordNetLemmatizer()

# Function to map POS tag to WordNet POS
def get_wordnet_pos(treebank_tag):
    """
    Convert a treebank POS tag to the matching WordNet POS constant.

    The decision is made on the first letter of the tag:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag (including an empty one) falls back to noun.

    Parameters
    ----------
    treebank_tag : str
        POS tag as produced by nltk.pos_tag, e.g. 'JJ', 'VBD', 'NNS', 'RB'.

    Returns
    -------
    The WordNet POS constant to pass to the lemmatizer.
    """
    # Adjective tags (JJ, JJR, JJS) need their own entry; without it they
    # would be lemmatized as nouns.
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(treebank_tag[:1], wordnet.NOUN)  # Default to noun if unsure

# Define a function to clean a single review text
def clean_text(text):
    '''
    Normalise one review text.

    The text is lowercased, stripped of punctuation (keeping "/"), tokenized,
    POS-tagged, filtered against the stopword set and lemmatized per token.
    Non-string input yields an empty string.
    '''
    # Missing or non-string values become an empty review
    if not isinstance(text, str):
        return ""

    # Lowercase, then drop every character that is not a word character, whitespace or "/"
    normalised = re.sub(r'[^\w\s/]', '', text.lower())

    # Tokenize and tag each token with its part of speech
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(normalised))

    # Keep non-stopwords, lemmatized with the WordNet POS derived from the tag
    kept = [
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
        for token, tag in tagged_tokens
        if token not in stop_words
    ]

    # Join tokens back into a single string
    return ' '.join(kept)

# Run clean_text over every entry of 'ReviewText' and store the result in a new column
df['CleanedReviewText'] = df['ReviewText'].map(clean_text)

CPU times: total: 13.3 s
Wall time: 38.7 s


In [7]:
# Preview the DataFrame with the new 'CleanedReviewText' column
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,ReviewText,RecommendedOrNot,ReviewLength,PlayHours,DatePosted,CleanedReviewText
0,0,Monke,Recommended,5,20.7,14-11-2024,monke
1,1,8/10,Recommended,4,51.6,14-11-2024,8/10
2,2,"this game is awesome, hard to track all moves ...",Recommended,95,7.3,13-11-2024,game awesome hard track move start muscle memo...
3,3,Pain,Recommended,4,5.1,13-11-2024,pain
4,4,Monke,Recommended,5,30.9,13-11-2024,monke


In [8]:
# Drop the leftover 'Unnamed: 0' column and the raw 'ReviewText'
# (the latter is replaced by 'CleanedReviewText')
df = df.drop(['Unnamed: 0', 'ReviewText'], axis=1)

# Create a 1-based 'Id' column from the row index
df['Id'] = df.index + 1

# Put 'Id' first and 'CleanedReviewText' second, keeping the remaining columns in their current order
leading_cols = ['Id', 'CleanedReviewText']
remaining_cols = [col for col in df.columns.tolist() if col not in leading_cols]
df = df[leading_cols + remaining_cols]

In [9]:
# Preview the final column layout of the cleaned DataFrame
df.head(n=5)

Unnamed: 0,Id,CleanedReviewText,RecommendedOrNot,ReviewLength,PlayHours,DatePosted
0,1,monke,Recommended,5,20.7,14-11-2024
1,2,8/10,Recommended,4,51.6,14-11-2024
2,3,game awesome hard track move start muscle memo...,Recommended,95,7.3,13-11-2024
3,4,pain,Recommended,4,5.1,13-11-2024
4,5,monke,Recommended,5,30.9,13-11-2024


In [10]:
# Write the cleaned dataset to a csv file, leaving out the DataFrame index
df.to_csv(path_or_buf='wukong-cleaned-dataset.csv', index=False)