In [243]:
import pandas as pd
import nltk
from gensim import corpora
import re
import spacy

nlp = spacy.load('en_core_web_lg')

In [244]:
# Download NLTK resources if you haven't already
nltk.download('punkt')


[nltk_data] Downloading package punkt to C:\Users\Jade
[nltk_data]     Rosales\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [245]:
def filter_spoiler_reviews(df):
    return df.drop(df[df['review'] == '[SPOILER ALERT: This review contains spoilers.]'].index)

# Function to categorize the ratings
def categorize_rating(rating):
    if rating <= 5:
        return  'Negative Review'
    else:
        return 'Positive Review'
    
# def text_cleaning(text):
#     text = re.sub(r'[^\w\s]', '',str(text))             #Punctuations
#     text=re.split("\W+",text)                           #Tokenizing
#     text=[word for word in text if word not in stopword]#Stop words
#     text = ' '.join(text)                              
#     return text


In [246]:
pokemon_df = {'scarlet_violet': None, 'sword_shield': None, 'sun_moon': None, 'black_white': None, 'black2_white2': None, 
            'platinum': None, 'emerald': None}

def process_df(df, filenames):
    df = pd.concat([pd.read_csv(filename) for filename in filenames])
    df = filter_spoiler_reviews(df)
    df['rating'] = df['rating'].apply(categorize_rating)
    return df

filenames_dict = {
    'scarlet_violet': ['dataset/scarlet.csv', 'dataset/violet.csv'],
    'sword_shield': ['dataset/sword.csv', 'dataset/shield.csv'],
    'sun_moon': ['dataset/sun.csv', 'dataset/moon.csv'],
    'black_white': ['dataset/black.csv', 'dataset/white.csv'],
    'black2_white2': ['dataset/black2.csv', 'dataset/white2.csv'],
    'platinum': ['dataset/platinum.csv'],
    'emerald': ['dataset/emerald.csv']
}

for df_name, filenames in filenames_dict.items():
    pokemon_df[df_name] = process_df(pd.DataFrame(), filenames)


In [247]:
len(pokemon_df['scarlet_violet']) 

2811

In [248]:
negative_reviews_dict = {}
positive_reviews_dict = {}

for df_name, df in pokemon_df.items():
    negative_reviews_dict[df_name] = df[df['rating'] == 'Negative Review']
    positive_reviews_dict[df_name] = df[df['rating'] == 'Positive Review']

In [249]:
print(len(negative_reviews_dict['scarlet_violet']))
print(len(positive_reviews_dict['scarlet_violet']))

2013
798


In [250]:
#nlp stopwords

spacy.lang.en.stop_words.STOP_WORDS |= {'spoiler', 'review', 'alert', 'contain'}

In [251]:
from nltk.corpus import opinion_lexicon

# Initialize positive and negative word sets
positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())

In [333]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

porter = nltk.PorterStemmer()

def tokenize(text):

    doc = nlp(text)

    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]

    stemmed_tokens = [porter.stem(token) for token in tokens]

    gram_tokens = nltk.trigrams(tokens)
    stemmed_grams = ['_'.join(gram) for gram in gram_tokens]
    tokens = stemmed_tokens+stemmed_grams
    return tokens
    

def get_words(reviews_df, isPositive = False):

    # Tokenize the 'Review' column
    tokenized_reviews = reviews_df['review'].apply(tokenize)

    tokenized_reviews = tokenized_reviews.apply(list)


    dictionary = Dictionary(tokenized_reviews)

    corpus = [dictionary.doc2bow(text) for text in tokenized_reviews]

    # Train a TF-IDF model
    tfidf = TfidfModel(corpus)

    # Get TF-IDF weights for each word
    word_weights = tfidf[corpus]

    # Sort the words by their TF-IDF weights
    sorted_words = sorted([(dictionary[word_id], weight) for doc in word_weights for word_id, weight in doc], key=lambda x: -x[1])

    filtered_sorted_words = [(word, weight) for word, weight in sorted_words 
                         if all(part not in (positive_words if not isPositive else negative_words) for part in word.lower().split())]
    
    return filtered_sorted_words
    

def display_frequent_words(words):
    seen_words = set()
    for word, weight in words:
        if word not in seen_words:
            print(f"{word}: {weight}")
            seen_words.add(word)

In [334]:
test = get_words(negative_reviews_dict['platinum'])
display_frequent_words(test)

hate: 0.5490236693461165
que: 0.35355339059327373
y: 0.35355339059327373
Pokemon_good_gen: 0.29288068630995373
boring_Pokemon_good: 0.29288068630995373
gen_play_ruby: 0.29288068630995373
good_gen_play: 0.29288068630995373
play_ruby_sapphire: 0.29288068630995373
rubi: 0.29288068630995373
ruby_sapphire_instead: 0.29288068630995373
sapphir: 0.29288068630995373
super_boring_Pokemon: 0.29288068630995373
game_like_wow: 0.27451183467305823
hate_slow_pokemon: 0.27451183467305823
like_wow_hate: 0.27451183467305823
pokemon_game_like: 0.27451183467305823
pokemon_pokemon_game: 0.27451183467305823
slow_pokemon_pokemon: 0.27451183467305823
wow_hate_pokemon: 0.27451183467305823
pokemon: 0.2479088892040267
difficulti: 0.21904977574038106
death: 0.21527947963794355
gen: 0.20471481458000448
instead: 0.20471481458000448
experi: 0.20434575174078617
favorit: 0.20209840419300193
GBA_fix_problem: 0.19350760705736977
Pokémon_design_concept: 0.19350760705736977
ambient: 0.19350760705736977
ambientation_bland_b

In [289]:
negative_reviews_results = {}
for dataset_name, negative_reviews_df in negative_reviews_dict.items():
    negative_reviews_results[dataset_name] = get_words(negative_reviews_df)

In [290]:
display_frequent_words(negative_reviews_results['sun_moon'])

t_r: 0.6557380799992074
 _h: 0.6241140562703504
n: 0.5744824351020308
d: 0.5444878174442518
u: 0.4929113529200641
k_e: 0.4707488644292069
f_o: 0.463760507746945
t: 0.4620775979774134
m: 0.4471955891967641
l_i: 0.4172178352278566
s_u: 0.41365269846969777
v: 0.4084697904515067
 _f: 0.40682133910464363
o: 0.3897689671709335
a: 0.3860804266425546
e_g: 0.3860804266425546
e: 0.38488531305078455
i_k: 0.38448212296975537
a_n: 0.3807231538368584
i_m: 0.37946002614007945
p_r: 0.37946002614007945
l_a: 0.3549179205200965
c: 0.35322088679233365
l: 0.3518701475482244
g_i: 0.33542880484565024
i_c: 0.33542880484565024
k: 0.33542880484565024
i_d: 0.3351925238926003
s_t: 0.3351925238926003
u_p: 0.3351925238926003
r_s: 0.3294304463121515
u_n: 0.3294304463121515
v_e: 0.3294304463121515
n_j: 0.3294304463121515
o_y: 0.3294304463121515
 : 0.3278120076982933
k_é: 0.319731074700283
p: 0.319731074700283
e_x: 0.3185442373169268
p_a: 0.3185442373169268
s: 0.3185442373169268
r: 0.31809185897461945
o_v: 0.310239523

In [None]:
positive_reviews_results = {}
for dataset_name, positive_reviews_df in positive_reviews_dict.items():
    positive_reviews_results[dataset_name] = get_words(positive_reviews_df, isPositive=True)

In [None]:
display_frequent_words(positive_reviews_results['sun_moon'])

treasur: 1.0
ye: 1.0
blank: 1.0
de: 0.4321245804614886
great pokemon great: 0.41300163362131137
great story single: 0.41300163362131137
single pokemon cut: 0.41300163362131137
story single pokemon: 0.41300163362131137
giochi: 0.3872017854528214
ass: 0.36801865302887543
ass variant pokemon: 0.36801865302887543
big ass variant: 0.36801865302887543
introduce big ass: 0.36801865302887543
variant pokemon respect: 0.36801865302887543
la: 0.3600448227103882
pokemon great story: 0.3517387609476907
ultim: 0.3514956009035569
pokémon: 0.3407129889998465
excellent competitive scene: 0.3388795788858557
far spectacular story: 0.3388795788858557
generation far spectacular: 0.3388795788858557
good generation far: 0.3388795788858557
spectacular story excellent: 0.3388795788858557
story excellent competitive: 0.3388795788858557
battle rng fun: 0.3204971923813408
fun overall recommend: 0.3204971923813408
overall recommend play: 0.3204971923813408
play pokemon entire: 0.3204971923813408
pokemon entire lif