In [1]:
import pandas as pd
import nltk
from gensim import corpora
import re
import spacy
import pyLDAvis
import pyLDAvis.gensim #dont skip this
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [2]:
# Download NLTK resources if you haven't already
nltk.download('punkt')


[nltk_data] Downloading package punkt to C:\Users\Jade
[nltk_data]     Rosales\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def filter_spoiler_reviews(df):
    return df.drop(df[df['review'] == '[SPOILER ALERT: This review contains spoilers.]'].index)

# Function to categorize the ratings
def categorize_rating(rating):
    if rating <= 5:
        return  'Negative Review'
    else:
        return 'Positive Review'
    
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text


  text = re.sub('\[.*?\]', '', text)
  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('\w*\d\w*', '', text)


In [4]:
game_df = {}

def process_df(df, filenames):
    df = pd.concat([pd.read_csv(filename) for filename in filenames])
    df = filter_spoiler_reviews(df)
    df['rating'] = df['rating'].apply(categorize_rating)
    df['review'] = df['review'].apply(clean_text)
    return df

filenames_dict = {
    'engage': ['dataset/engage.csv'],
    'threehouses': ['dataset/threehouses.csv'],
    'echoes': ['dataset/echoes.csv'],
    'conquest': ['dataset/conquest.csv'],
    'birthright': ['dataset/revelations.csv'],
    'revelations': ['dataset/birthrite.csv'],
    'awakening': ['dataset/awakening.csv'],
}

for df_name, filenames in filenames_dict.items():
    game_df[df_name] = process_df(pd.DataFrame(), filenames)


In [5]:
negative_reviews_dict = {}
positive_reviews_dict = {}

for df_name, df in game_df.items():
    negative_reviews_dict[df_name] = df[df['rating'] == 'Negative Review']
    positive_reviews_dict[df_name] = df[df['rating'] == 'Positive Review']

In [6]:
total=0
print('Overall')
for attr, df in game_df.items():  # Assuming game_df is a dictionary
    total += len(df)

print(total)

for attr, df in game_df.items():
    print(attr)
    print('Total Reviews: ',len(df))
    print('Negative Reviews: ',len(negative_reviews_dict[attr]))
    print('Positive Reviews: ',len(positive_reviews_dict[attr]))

Overall
2045
engage
Total Reviews:  468
Negative Reviews:  170
Positive Reviews:  298
threehouses
Total Reviews:  987
Negative Reviews:  82
Positive Reviews:  905
echoes
Total Reviews:  95
Negative Reviews:  8
Positive Reviews:  87
conquest
Total Reviews:  47
Negative Reviews:  9
Positive Reviews:  38
birthright
Total Reviews:  34
Negative Reviews:  8
Positive Reviews:  26
revelations
Total Reviews:  76
Negative Reviews:  12
Positive Reviews:  64
awakening
Total Reviews:  338
Negative Reviews:  21
Positive Reviews:  317


In [7]:
from nltk.corpus import opinion_lexicon

# # Initialize positive and negative word sets
positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())

In [8]:
# from gensim.models.keyedvectors import KeyedVectors

# path='GoogleNews-vectors-negative300.bin.gz'
# model = KeyedVectors.load_word2vec_format(path, binary=True)

In [9]:
from gensim.utils import simple_preprocess

# words_to_check = ['gameplay', 'objectives', 'story', 'aesthetics', 'strategy', 'constraints', 'fun',  'boring', 'interactive', 'music', 'audio', 'challenge', 'reward', 'graphics', 'animation', 'level', 'design', 'character', 'challenge', 'theme', 'creative', 'feature', 'style', 'emotional', 'memorable', 'personality', 'map', 'world', 'development']

# similar_words = []
# for word in words_to_check:
#     similar_words.append(model.most_similar(word, topn=10))

# similar_words_list = [item[0] for sublist in similar_words for item in sublist]

# similar_words_list.extend(words_to_check)

nlp = spacy.load('en_core_web_sm', disable=['parser','ner'])

In [10]:
#nlp stopwords
import string

from nltk.corpus import stopwords
stop_words= stopwords.words('english') + list(string.punctuation)
stop_words += list(['fire', 'emblem', 'game','awakening', 'conquest', 'fate', 'birthright', 'revelations', 'echo', 'shadow', 'valentia' ,'three', 'house', 'engage', 'play', 'buy', 'rpg'])
stop_words

# spacy.lang.en.stop_words.STOP_WORDS |= {'fire', 'emblem'}

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [11]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import TfidfModel
from gensim.models import CoherenceModel
from nltk.stem.snowball import SnowballStemmer
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import LsiModel

snow_stemmer = SnowballStemmer(language='english')


def lemmatization(texts,allowed_postags=['NOUN','ADJ','VERB','ADV']):
    """https://spacy.io/api/annotation"""
    data_words=simple_preprocess(str(texts))
    doc=nlp(' '.join(data_words))
    tokens = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [word for word in tokens if word not in positive_words]
    tokens = [word for word in tokens if word not in negative_words]
    # tokens = [snow_stemmer.stem(token) for token in tokens]

    # gram_tokens = nltk.bigrams(tokens)
    # stemmed_grams = ['_'.join(gram) for gram in gram_tokens]
    # tokens = stemmed_tokens+stemmed_grams
    return tokens

def create_corpus(df):
    # Tokenize the 'Review' column
    tokenized_reviews = df['review'].apply(lemmatization, allowed_postags=['NOUN'])

    # Add n-grams
    phrases = Phrases(tokenized_reviews, min_count=5, threshold=100)
    bigram = Phraser(phrases)
    tokenized_reviews = list(bigram[tokenized_reviews])

    dictionary = Dictionary(tokenized_reviews)

    corpus = [dictionary.doc2bow(text) for text in tokenized_reviews]

    return (corpus, dictionary, tokenized_reviews)

def calc_coherence_values(dictionary, corpus, texts, num_topics=10):
    # Train a LDA model
    model = LdaMulticore(corpus=corpus,id2word = dictionary, num_topics = num_topics, alpha=.1, eta=0.1, random_state = 42)
    print('model created')
    
    # Calculate for the coherence score
    coherencemodel = CoherenceModel(model = model, texts = texts, dictionary = dictionary, coherence = 'c_v')
    coherence_value = coherencemodel.get_coherence()
    return model, coherence_value


def tfidf(corpus, dictionary):
    # Train a TF-IDF model
    tfidf = TfidfModel(corpus=corpus)

    # Get TF-IDF weights for each word
    tfidf_corpus = tfidf[corpus]

    # Sort the words by their TF-IDF weights
    words = sorted([(dictionary[word_id], weight) for doc in tfidf_corpus for word_id, weight in doc], key=lambda x: -x[1])


    return tfidf_corpus, words
      

def display_frequent_words(words):
    seen_words = set()
    for word, weight in words:
        if word not in seen_words:
            print(f"{word}: {weight}")
            seen_words.add(word)

In [12]:
for attr, value in negative_reviews_dict.items():
    print('Results for ' + attr)
    corpus, dict, token = create_corpus(value)
    tfidf_corpus, words = tfidf(corpus, dict)
    lda_model, coherence_value = calc_coherence_values(dictionary = dict, corpus = tfidf_corpus, texts = token, num_topics=25)
    print(coherence_value)
    pyLDAvis.enable_notebook()
    vis=pyLDAvis.gensim.prepare(lda_model,corpus,dict,mds='mmds')
    pyLDAvis.save_html(vis, 'results/'+attr+'_negative.html')
    

Results for engage
model created


In [None]:
for attr, value in positive_reviews_dict.items():
    print('Results for ' + attr)
    corpus, dict, token = create_corpus(value)
    tfidf_corpus, words = tfidf(corpus, dict)
    lda_model, coherence_value = calc_coherence_values(dictionary = dict, corpus = tfidf_corpus, texts = token, num_topics=25)
    print(coherence_value)
    pyLDAvis.enable_notebook()
    vis=pyLDAvis.gensim.prepare(lda_model,corpus,dict,mds='mmds')
    pyLDAvis.save_html(vis, 'results/'+attr+'_positive.html')

Results for Unnamed: 0


KeyError: 'review'

In [None]:
all_df = pd.concat(game_df.values(), ignore_index=True)
negative_reviews_dict = all_df[all_df['rating'] == 'Negative Review']
positive_reviews_dict = all_df[all_df['rating'] == 'Positive Review']

In [None]:
print('Results for negative overall')
corpus, dict, token = create_corpus(all_df)
token

Results for negative overall


In [None]:
tfidf_corpus, words = tfidf(corpus, dict)
lda_model, coherence_value = calc_coherence_values(dictionary = dict, corpus = tfidf_corpus, texts = token, num_topics=25)
print(coherence_value)


model created
0.5718206735668736


In [None]:
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx} \nWords: {topic}')

Topic: 0 
Words: 0.008*"character" + 0.008*"strategy" + 0.006*"story" + 0.006*"route" + 0.005*"map" + 0.005*"battle" + 0.005*"time" + 0.004*"series" + 0.004*"graphic" + 0.004*"title"
Topic: 1 
Words: 0.005*"lot" + 0.005*"year" + 0.005*"ability" + 0.005*"replayability" + 0.005*"replay" + 0.004*"route" + 0.004*"fan" + 0.004*"story" + 0.004*"character" + 0.004*"gameplay"
Topic: 2 
Words: 0.008*"gameplay" + 0.008*"story" + 0.007*"character" + 0.007*"people" + 0.006*"switch" + 0.006*"series" + 0.005*"battle" + 0.005*"hour" + 0.005*"mechanic" + 0.004*"nintendo"
Topic: 3 
Words: 0.007*"switch" + 0.006*"story" + 0.006*"battle" + 0.005*"route" + 0.005*"character" + 0.005*"hour" + 0.005*"time" + 0.005*"part" + 0.005*"gameplay" + 0.004*"combat"
Topic: 4 
Words: 0.007*"character" + 0.006*"gameplay" + 0.005*"series" + 0.005*"one" + 0.005*"level" + 0.005*"story" + 0.004*"mechanic" + 0.004*"music" + 0.004*"switch" + 0.004*"graphic"
Topic: 5 
Words: 0.009*"character" + 0.008*"series" + 0.005*"story" +

In [None]:
pyLDAvis.enable_notebook()
vis=pyLDAvis.gensim.prepare(lda_model,corpus,dict,mds='mmds')
pyLDAvis.save_html(vis, 'results/all_negative.html')
vis



In [None]:
print('Results for positive overall')
corpus, dict, token = create_corpus(all_df)
token

In [None]:
tfidf_corpus, words = tfidf(corpus, dict)
lda_model, coherence_value = calc_coherence_values(dictionary = dict, corpus = tfidf_corpus, texts = token, num_topics=25)
print(coherence_value)

In [None]:
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx} \nWords: {topic}')

In [None]:
pyLDAvis.enable_notebook()
vis=pyLDAvis.gensim.prepare(lda_model,corpus,dict,mds='mmds')
pyLDAvis.save_html(vis, 'results/all_positive.html')
vis