In [424]:
import pandas as pd
import nltk
from gensim import corpora
import re
import spacy

nlp = spacy.load('en_core_web_lg')

In [425]:
# Download NLTK resources if you haven't already
nltk.download('punkt')


[nltk_data] Downloading package punkt to C:\Users\Jade
[nltk_data]     Rosales\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [427]:
def filter_spoiler_reviews(df):
    return df.drop(df[df['review'] == '[SPOILER ALERT: This review contains spoilers.]'].index)

# Function to categorize the ratings
def categorize_rating(rating):
    if rating <= 4:
        return  'Negative Review'
    elif rating > 7:
        return 'Positive Review'
    
# def text_cleaning(text):
#     text = re.sub(r'[^\w\s]', '',str(text))             #Punctuations
#     text=re.split("\W+",text)                           #Tokenizing
#     text=[word for word in text if word not in stopword]#Stop words
#     text = ' '.join(text)                              
#     return text


In [428]:
pokemon_df = {'scarlet_violet': None, 'sword_shield': None, 'sun_moon': None, 'black_white': None, 'black2_white2': None, 
            'platinum': None, 'emerald': None}

def process_df(df, filenames):
    df = pd.concat([pd.read_csv(filename) for filename in filenames])
    df = filter_spoiler_reviews(df)
    df['rating'] = df['rating'].apply(categorize_rating)
    return df

filenames_dict = {
    'scarlet_violet': ['dataset/scarlet.csv', 'dataset/violet.csv'],
    'sword_shield': ['dataset/sword.csv', 'dataset/shield.csv'],
    'sun_moon': ['dataset/sun.csv', 'dataset/moon.csv'],
    'x_y': ['dataset/x.csv', 'dataset/y.csv'],
    'black_white': ['dataset/black.csv', 'dataset/white.csv'],
    'black2_white2': ['dataset/black2.csv', 'dataset/white2.csv'],
    'platinum': ['dataset/platinum.csv'],
    'emerald': ['dataset/emerald.csv']
}

for df_name, filenames in filenames_dict.items():
    pokemon_df[df_name] = process_df(pd.DataFrame(), filenames)


In [429]:
len(pokemon_df['scarlet_violet']) 

2811

In [430]:
negative_reviews_dict = {}
positive_reviews_dict = {}

for df_name, df in pokemon_df.items():
    negative_reviews_dict[df_name] = df[df['rating'] == 'Negative Review']
    positive_reviews_dict[df_name] = df[df['rating'] == 'Positive Review']

In [431]:
print(len(negative_reviews_dict['scarlet_violet']))
print(len(positive_reviews_dict['scarlet_violet']))

1903
601


In [410]:
#nlp stopwords

spacy.lang.en.stop_words.STOP_WORDS |= {'spoiler', 'review', 'alert', 'contain'}

In [411]:
from nltk.corpus import opinion_lexicon

# Initialize positive and negative word sets
positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())

In [412]:
from gensim.models.keyedvectors import KeyedVectors

path='GoogleNews-vectors-negative300.bin.gz'
model = KeyedVectors.load_word2vec_format(path, binary=True)

In [413]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

porter = nltk.PorterStemmer()

words_to_check = ['pokemon', 'region', 'character', 'story', 'gameplay', 'music', 'direction', 'design', 'graphics', 'personality', 'competitive', 'generation', 'progress', 'mechanics', 'emotional', 'challenge', 'creative', 'memorable', 'level', 'strategy', 'fun', 'objective', 'surprise', 'reward', 'control', 'boring', 'fun', 'conflict', 'aesthetics', 'style', 'feature', 'theme', 'fair', 'unfair', 'time']

similar_words = []
for word in words_to_check:
    similar_words.append(model.most_similar(word, topn=10))

similar_words_list = [item[0] for sublist in similar_words for item in sublist]

similar_words_list.extend(words_to_check)

In [432]:
def tokenize(text):

    doc = nlp(text)

    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]

    stemmed_tokens = [porter.stem(token) for token in tokens]

    gram_tokens = nltk.trigrams(tokens)
    stemmed_grams = [' '.join(gram) for gram in gram_tokens]
    tokens = stemmed_tokens+stemmed_grams
    return tokens
    

def get_words(reviews_df, isPositive = False):

    # Tokenize the 'Review' column
    tokenized_reviews = reviews_df['review'].apply(tokenize)

    tokenized_reviews = tokenized_reviews.apply(list)


    dictionary = Dictionary(tokenized_reviews)

    corpus = [dictionary.doc2bow(text) for text in tokenized_reviews]

    # Train a TF-IDF model
    tfidf = TfidfModel(corpus)

    # Get TF-IDF weights for each word
    word_weights = tfidf[corpus]

    # Sort the words by their TF-IDF weights
    words = sorted([(dictionary[word_id], weight) for doc in word_weights for word_id, weight in doc], key=lambda x: -x[1])

    # Filter words out if not in similar words list
    words= [(word, weight) for word, weight in words 
                            if any(part in word.lower().split() for part in similar_words_list)]
    
    # Filter single words
    words = [(word, weight) for word, weight in words 
                            if len(word.split()) > 1]
    
    return words
    

def display_frequent_words(words):
    seen_words = set()
    for word, weight in words:
        if word not in seen_words:
            print(f"{word}: {weight}")
            seen_words.add(word)

In [439]:
test = get_words(negative_reviews_dict['scarlet_violet'])
display_frequent_words(test)

gameplay fps boring: 0.6390116191968441
terrible gameplay fps: 0.6390116191968441
good pokemon nice: 0.5778063278454522
nice good pokemon: 0.5778063278454522
horrible state fun: 0.526845997947128
state fun horrible: 0.526845997947128
potential story honestly: 0.5113439362322166
story honestly worth: 0.5113439362322166
fun low frame: 0.48747529844979665
good time play: 0.46337268274509397
time play sadly: 0.46337268274509397
gameplay good technical: 0.4549902359472829
away bad level: 0.45410920275066813
bad level bad: 0.45410920275066813
level bad man: 0.45410920275066813
Pokemon Company understand: 0.45342167732428573
bad Pokemon Company: 0.45342167732428573
Pokemon world bug: 0.4525919263586624
despite wonderful Pokemon: 0.4525919263586624
wonderful Pokemon world: 0.4525919263586624
bad pokemon value: 0.4441922617908039
pokemon value dollar: 0.4441922617908039
story go intro: 0.44345632240462973
bad hope pokemon: 0.44240001195944145
hope pokemon studio: 0.44240001195944145
pokemon stu

In [440]:
test = get_words(positive_reviews_dict['scarlet_violet'])
display_frequent_words(test)

despite glitch fun: 0.5201499931618709
fun despite glitch: 0.5201499931618709
lot fun despite: 0.5201499931618709
new formula Pokemon: 0.5134065236607397
problem good pokemon: 0.4655157734764425
experience story glitchy: 0.45233789641675803
explore experience story: 0.45233789641675803
fun explore experience: 0.45233789641675803
love fun explore: 0.45233789641675803
add fun story: 0.4485068560309065
buggy add fun: 0.4485068560309065
fun story excellent: 0.4485068560309065
fix fun enjoyable: 0.44333572290734435
fun enjoyable entertaining: 0.44333572290734435
issue fix fun: 0.44333572290734435
fun start buggy: 0.43967808882431386
love fun start: 0.43967808882431386
fun well gen: 0.43642118896209964
gen stop cry: 0.43642118896209964
well gen stop: 0.43642118896209964
pokemon scarlet violet: 0.4326511095275295
game gameplay storywise: 0.4290164864770631
issue good pokemon: 0.4290164864770631
pokemon game gameplay: 0.4290164864770631
fun way fun: 0.4273507232592662
issue fun way: 0.42735072

In [441]:
negative_reviews_results = {}
for dataset_name, negative_reviews_df in negative_reviews_dict.items():
    negative_reviews_results[dataset_name] = get_words(negative_reviews_df)

In [446]:
display_frequent_words(negative_reviews_results['black2_white2'])

atrocious plot pokemon: 0.15120575460055682
feature nice good: 0.15120575460055682
new feature nice: 0.15120575460055682
plot pokemon regret: 0.15120575460055682
pokemon regret buying: 0.15120575460055682
prequel new feature: 0.15120575460055682
bad pokemon game: 0.13906379646231107
character decade old: 0.13906379646231107
different ulgy character: 0.13906379646231107
horrible story ugly: 0.13906379646231107
mainline pokemon game: 0.13906379646231107
nostalgia mainline pokemon: 0.13906379646231107
play horrible story: 0.13906379646231107
pokemon game play: 0.13906379646231107
pokemon game well: 0.13906379646231107
story ugly handheld: 0.13906379646231107
ulgy character decade: 0.13906379646231107
boring singleplayer experience: 0.13389655280323554
dreadfully boring singleplayer: 0.13389655280323554
fun multiplayer ruin: 0.13389655280323554
multiplayer ruin requirement: 0.13389655280323554
play dreadfully boring: 0.13389655280323554
Pokemon HUD combat: 0.1322043592859321
Pokemon game i

In [443]:
positive_reviews_results = {}
for dataset_name, positive_reviews_df in positive_reviews_dict.items():
    positive_reviews_results[dataset_name] = get_words(positive_reviews_df, isPositive=True)

In [445]:
display_frequent_words(positive_reviews_results['black2_white2'])

favourite time play: 0.47861024044889416
perfection favourite time: 0.47861024044889416
time play regret: 0.47861024044889416
GOOD Pokemon game: 0.4148095464785463
Pokemon game see: 0.4148095464785463
game GOOD Pokemon: 0.4148095464785463
graphic story time: 0.3972907209423734
music play graphic: 0.3972907209423734
play graphic story: 0.3972907209423734
story time value: 0.3972907209423734
time value overall: 0.3972907209423734
Pokemon peak story: 0.3910285216561284
character music touch: 0.3910285216561284
music touch heart: 0.3910285216561284
peak story character: 0.3910285216561284
black good pokemon: 0.367008652641005
White good pokemon: 0.36681232055018925
character music world: 0.36185498608430794
good pokemon story: 0.36185498608430794
music world pokemon: 0.36185498608430794
pokemon love franchise: 0.36185498608430794
pokemon story character: 0.36185498608430794
world pokemon love: 0.36185498608430794
story character music: 0.33866521469868166
gen game recommend: 0.332495156481