In [17]:
import numpy as np
import pandas as pd
import random

import string

import gensim
from gensim import corpora
from gensim.models import CoherenceModel, LdaMulticore
from gensim.parsing.preprocessing import STOPWORDS

import os
%matplotlib inline

In [18]:
# read the file that is produced from the review cleaning process (cleaned dataset)
model_df = pd.read_csv('./dataframes/final_df.csv', index_col=0)
model_df.head()
len(model2_df)

21350

In [19]:
# remove punctuation for all columns of the clean dataset
model_df['clean_reviews'] = model_df['clean_reviews'].map(
    lambda x: ''.join(c for c in x if c == '_' or c not in string.punctuation).split())
model_df['3gram_reviews'] = model_df['3gram_reviews'].map(
    lambda x: ''.join(c for c in x if c == '_' or c not in string.punctuation).split())
model_df['3grams_nouns'] = model_df['3grams_nouns'].map(
    lambda x: ''.join(c for c in x if c == '_' or c not in string.punctuation).split())
model_df['3grams_nouns_verbs'] = model_df['3grams_nouns_verbs'].map(
    lambda x: ''.join(c for c in x if c == '_' or c not in string.punctuation).split())
model_df.head()

Unnamed: 0,timestamp_created,review,clean_reviews,3gram_reviews,3grams_nouns,3grams_nouns_verbs
0,1614854521,"It's kind of fun, maybe. I dunno. You punch pe...","[kind, fun, maybe, dunno, punch, people, punch]","[kind, fun, maybe, dunno, punch, people, punch]","[fun, dunno, punch, people]","[fun, dunno, punch, people, punch]"
1,1614775887,*Program opener simulator.*\nI don't want to w...,"[program, opener, simulator, want, waste, 20, ...","[program, opener, simulator, want, waste, 20_m...","[program, opener, simulator, waste, 20_minutes...","[program, opener, simulator, want, waste, 20_m..."
2,1614723370,Damn this game make me feel like kid again! \n...,"[damn, game, feel, like, kid, old, timer, 90, ...","[damn, game, feel_like, kid, old, timer, 90, p...","[game, feel_like, kid, timer, tekken, playstat...","[game, feel_like, kid, timer, play, tekken, pl..."
3,1614699638,One of my favorite game of all time \nit is qu...,"[favorite, game, time, balanced, exclude, dlc,...","[favorite, game, time, balanced, exclude, dlc_...","[game, time, exclude, dlc_characters, fahkumra...","[game, time, exclude, dlc_characters, break, f..."
4,1614650519,"Very nice, i think that most dlc characters ar...","[nice, think, dlc, characters, easier, rewardi...","[nice, think, dlc_characters, easier, rewardin...","[dlc_characters, beginner, game, character, sk...","[think, dlc_characters, beginner, game, realiz..."


In [20]:
# use this dictionary to convert numbers to text
num_dict = {'0': 'zero',
            '1': 'one',
            '2': 'two',
            '3': 'three',
            'ii': 'two',
            'iii': 'three'
            }


def number_2_words(d):
    if (len(d) == 1 and d in '0123') or (d in ['ii', 'iii']):
        word = num_dict[d]

    elif len(str(d)) == 1 and str(d) in '0123':
        word = num_dict(str(d))

    else:
        word = d

    return word


# added stopwords based on the output of the model. Meaningless words that generated noise were added progressively
# while training the model
english_stopwords = list(set(STOPWORDS))
english_stopwords.extend(
    ['good', 'better', 'great', 'lot', 'game', 'like', 'I', 'i', 'one', 'two', 'three', 'one_best_fighting_games',
     'fighting_games', 'fighting_games', 'thing', 'bit', 'street_fighter', 'time', '10_10', 'love', 'fun', 'play',
     'hour', 'sfv',
     'fighting', 'tekken', 'best_fighting', 'character', 'story_mode', 'fighter', 'fighting', 'street_fighter',
     'arcade_mode', 'try', 'come', 'better', 'learn', 'great',
     'best_fighting_game', 'lot', 'game', "tekken", "play", "fun", "love", "character", "fuck", "suck", "shit", 'thing',
     "capcom", 'krypt', "best_fighting",
     "fighting_games", "street_fighter", "tekken", "mortal_kombat", "mk", "mkx", "hate", "fighting_game",
     'street_fighter', 'towers_time', 'tower'])


# function that changes numbers to words by calling Number_2_words() and removes stopwords
def remove_stopwords(doc):
    words = [number_2_words(w) for w in doc if w != '' and w not in english_stopwords]
    return words


In [21]:
model_df['3grams_nouns'] = model_df['3grams_nouns'].map(lambda x: remove_stopwords(x))
model_df['3grams_nouns_verbs'] = model_df['3grams_nouns_verbs'].map(lambda x: remove_stopwords(x)) 

In [22]:
# build a dictionary & corpus based on the nouns column
documents = list(model_df['3grams_nouns'])
dictionary = gensim.corpora.Dictionary(documents)
dictionary.filter_extremes(no_below=10, no_above=0.5)
corpus = [dictionary.doc2bow(word) for word in documents]

In [23]:
# run the training in a loop since we can amass a large number of models and evaluate them later
for loopNum in range(2):
    # randomize the number of topics and passes within a reasonable range as the variation has a chance to produce an
    # effective model
    num_topics = random.randint(5, 8)
    passes = random.randint(100, 120)
    eval_every = None
    seed = np.random.randint(0, 999999)
    print("Seed:", seed, "\n")

    ldamodel = LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=passes, alpha='asymmetric',
                            eval_every=eval_every, workers=3, random_state=seed)

    # Check resulting topics.
    topic_list = ldamodel.print_topics(num_topics=num_topics, num_words=15)
    for index, i in enumerate(topic_list):
        str1 = str(i[1])
        for c in "0123456789+*\".":
            str1 = str1.replace(c, "")
        str1 = str1.replace("  ", " ")
        print(str1)

    # calculate & display perplexity
    print('\nPerplexity: ', ldamodel.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

    # calculate & display coherence
    coherence_model_lda = CoherenceModel(model=ldamodel, texts=documents, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    # assign a file name based on the loop number so that models aren't overridden during successive iterations.
    newpath = './models/both/nouns_only'
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    ldamodel.save(f'./models/both/nouns_only/model1-{loopNum}.model')

Seed: 461708 

CPU times: user 1min 34s, sys: 35.1 s, total: 2min 9s
Wall time: 3min 39s
story gameplay graphic series mechanic fan people content way franchise playing year roster experience opinion
people match _ player launch issue problem server connection netcode update day way release review
player combo match opponent way people mode mechanic fight attack skill tournament practice level option
friend ass man alot garbage people guy life balance fatal_blow god community buy combo season
pc issue problem controller console port crash work pc_version patch steam pc_port fix keyboard run
content money price fight_money costume dlc season_pass season stage buy base sale purchase currency way
story skin grind way item currency nrs fatality microtransaction fight player issue people gameplay gear

Perplexity:  -7.458187609887885

Coherence Score:  0.5751055030910919
Seed: 724546 

CPU times: user 1min 52s, sys: 42.6 s, total: 2min 35s
Wall time: 4min 18s
pc gameplay series graphic issu

In [24]:
# save the model dataframe for use in later sections.
model_df.to_csv('./dataframes/model_df.csv')