# Dark Souls II Reviews (2025)

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt
from get_data import *




In [2]:
# Load the review data. `get_data` is not defined in this notebook; presumably it is
# provided by the `from get_data import *` line in the import cell — confirm.
reviews = get_data()

In [3]:
# Sanity check: dimensions of the loaded review data.
reviews.shape

(66999, 6)

In [4]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [5]:
# TF-IDF vectorizer for the review text: word-level analysis, NLTK tokenization,
# English stop words removed, vocabulary capped at 4000 terms, sublinear tf scaling.
#
# token_pattern=None: the regex token pattern is ignored whenever a custom tokenizer is
# supplied, and leaving the default in place makes scikit-learn emit a UserWarning on
# every fit. Setting it to None documents the intent and silences the warning without
# changing the extracted features.
#
# NOTE(review): `stopwords` is not imported explicitly anywhere in this notebook;
# presumably it arrives through `from get_data import *` — confirm.
tfidf = TfidfVectorizer(sublinear_tf=True,
                        analyzer='word',
                        max_features=4000,
                        tokenizer=word_tokenize,
                        token_pattern=None,
                        stop_words=stopwords.words("english"))

In [6]:
# Vectorize every review with the TF-IDF vectorizer, keeping both the dense matrix
# and a DataFrame view of it whose columns are labelled with the vocabulary terms.
review_txt = reviews["review"].to_numpy().flatten()
tfidf_array = tfidf.fit_transform(review_txt).toarray()
tfidf_df = pd.DataFrame(tfidf_array, columns=tfidf.get_feature_names_out())



### Topic Modeling:
- Exploring the aspects of the game that people like
    - Also look for critiques of the game within positive reviews (there are bound to be some, given DS2's reputation in the community)

- Exploring why people don't like the game:
    - Also get positive aspects within this subset of the reviews
    
- Algorithms I can use to perform topic modeling:
    1. Latent Dirichlet Allocation (LDA) 
    2. Non-negative Matrix Factorization (NMF)

Splitting the reviews into those that recommend buying the game and those that don't:

In [7]:
# Partition the reviews on the `voted_up` flag: True -> positive, False -> negative.
pos_reviews = reviews.loc[reviews["voted_up"].eq(True)]
neg_reviews = reviews.loc[reviews["voted_up"].eq(False)]

In [8]:
# Compare the sizes of the two subsets (positive first, negative second).
pos_reviews.shape, neg_reviews.shape

((55777, 6), (11222, 6))

Function to display the output of the models:

In [9]:
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        ``components_`` is iterated row by row; each row holds one topic's
        per-term weights and must support ``argsort()`` and integer indexing
        (e.g. a NumPy array).
    feature_names : sequence
        Term label for each column index of ``components_``.
    no_top_words : int
        Number of top-weighted terms to list per topic.

    Returns
    -------
    pandas.DataFrame
        Two columns per topic, ``"Topic N words"`` and ``"Topic N weights"``
        (N starts at 1), with rows ordered from highest to lowest weight.
        Weights are strings formatted to one decimal place.
    """
    topic_dict = {}
    for topic_number, topic in enumerate(model.components_, start=1):
        # Rank the terms once per topic and reuse the indices for both columns;
        # the reversed slice yields the `no_top_words` largest weights, descending.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        topic_dict["Topic %d words" % topic_number] = [
            str(feature_names[i]) for i in top_indices
        ]
        topic_dict["Topic %d weights" % topic_number] = [
            '{:.1f}'.format(topic[i]) for i in top_indices
        ]
    return pd.DataFrame(topic_dict)

LDA: a probabilistic graphical model that takes CountVectorizer term counts as input

In [10]:
from sklearn.decomposition import LatentDirichletAllocation

In [11]:
count_vector = CountVectorizer()


def _count_terms(texts):
    """Refit the shared ``count_vector`` on ``texts``.

    Returns the resulting term-count matrix together with the vocabulary
    (feature names) learned during that fit.
    """
    counts = count_vector.fit_transform(texts)
    return counts, count_vector.get_feature_names_out()


# Term counts for all reviews, then for the positive and negative subsets.
# Each call refits `count_vector`, so the feature names are captured right away;
# after this cell the vectorizer itself holds the negative-review vocabulary.
tf, tf_feat_names = _count_terms(reviews.review)
pos_tf, pos_tf_feat_names = _count_terms(pos_reviews.review)
neg_tf, neg_tf_feat_names = _count_terms(neg_reviews.review)

In [12]:
# Five-topic LDA fitted on the term counts of all reviews; the seed is fixed so
# repeated runs give the same topics.
lda = LatentDirichletAllocation(
    random_state=42069,
    n_components=5,
)
lda.fit(tf)

In [13]:
# Number of terms listed per topic in every topic table below.
no_top_words = 10
display_topics(model=lda, feature_names=tf_feat_names, no_top_words=no_top_words)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights
0,die,8949.5,like,10727.1,good,10748.2,play,7852.8,best,2415.9
1,get,6783.7,enemy,8569.6,play,6204.3,love,5794.7,hate,2376.8
2,time,5106.0,make,8383.9,bad,4413.5,game,5167.4,death,1884.5
3,enemy,3779.6,boss,8254.8,buy,4196.3,one,4735.6,keep,1697.7
4,make,3745.5,good,8000.8,get,3740.6,fun,4287.4,trash,1559.8
5,like,3618.6,feel,7510.6,pc,3335.9,great,3907.4,iron,1434.8
6,go,3329.7,play,6975.5,version,3299.1,like,3781.1,gud,1301.3
7,play,3096.5,area,6739.5,still,2879.9,best,3469.4,git,828.2
8,kill,3064.9,one,6517.0,like,2460.5,ii,3202.8,de,744.2
9,player,2645.3,bad,6244.6,controller,2238.7,fuck,3073.4,seek,683.1


In [14]:
# Fit the same `lda` estimator again, this time on the positive-review term counts.
lda.fit(pos_tf)

In [15]:
# Top terms per topic for the positive reviews, labelled with the positive-review vocabulary.
display_topics(model=lda, feature_names=pos_tf_feat_names, no_top_words=no_top_words)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights
0,like,9665.4,play,6469.2,love,4485.8,die,6442.7,good,8425.5
1,play,8239.7,best,4941.4,hate,2258.4,get,3221.4,fun,2536.7
2,good,7586.5,one,3202.1,great,2026.8,enemy,2928.3,like,2343.0
3,one,6984.3,get,2615.5,ii,1661.5,player,2631.9,death,2208.4
4,still,6699.5,game,2556.5,play,1350.7,time,2516.5,yes,2063.7
5,make,6462.1,ever,2472.3,pc,1291.0,make,1931.1,bad,2022.3
6,boss,6065.4,time,2367.5,controller,1255.4,new,1930.5,hard,1852.2
7,game,6039.8,buy,2299.9,good,1210.4,go,1792.2,get,1676.9
8,get,5896.3,make,1672.9,keep,1170.9,kill,1745.3,still,1369.1
9,series,5559.7,hour,1659.4,fuck,1024.9,bos,1650.7,pretty,1146.8


In [16]:
# Fit the same `lda` estimator again, this time on the negative-review term counts.
lda.fit(neg_tf)

In [17]:
# Top terms per topic for the negative reviews, labelled with the negative-review vocabulary.
display_topics(model=lda, feature_names=neg_tf_feat_names, no_top_words=no_top_words)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights
0,enemy,4899.0,pc,1256.2,trash,1187.0,bad,2159.3,fuck,1486.1
1,like,4180.0,control,1148.4,hate,449.6,play,1625.3,version,1365.2
2,make,3832.3,play,926.5,pvp,336.2,shit,867.6,buy,1305.6
3,get,3494.0,controller,914.6,play,251.9,like,789.4,keep,734.8
4,play,3083.1,port,666.8,player,232.5,good,538.3,new,689.6
5,time,3075.6,keyboard,657.2,get,192.0,ever,516.7,play,667.0
6,one,2757.3,mouse,635.0,friend,164.6,dont,450.5,get,642.0
7,boss,2568.1,even,592.6,ban,140.8,suck,447.7,release,633.2
8,feel,2536.5,get,578.3,memory,128.0,game,447.1,iron,623.5
9,good,2391.1,like,537.6,fun,123.8,one,432.5,original,537.3


NMF: a linear-algebra (matrix factorization) method that takes the TF-IDF matrix as input

In [18]:
from sklearn.decomposition import NMF

In [19]:
# Five-component NMF fitted on the dense TF-IDF matrix of all reviews; the seed is
# fixed so repeated runs give the same factorization.
nmf = NMF(
    random_state=42069,
    n_components=5,
)
nmf.fit(tfidf_array)

In [20]:
# Top terms per NMF topic, labelled with the vocabulary currently held by `tfidf`.
display_topics(
    model=nmf,
    feature_names=tfidf.get_feature_names_out(),
    no_top_words=no_top_words,
)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights
0,good,7.5,play,1.9,best,5.5,die,4.9,great,4.5
1,pretty,0.3,like,1.6,ever,1.2,would,1.0,still,0.3
2,still,0.3,bad,1.5,one,0.4,prepare,0.3,series,0.1
3,really,0.1,get,1.3,series,0.3,time,0.2,challenge,0.1
4,bad,0.1,one,1.2,game,0.2,gud,0.2,pvp,0.1
5,elden,0.1,fun,1.1,play,0.1,lot,0.2,time,0.1
6,get,0.1,game,1.1,pvp,0.1,praise,0.1,gameplay,0.1
7,ring,0.0,make,1.0,make,0.1,git,0.1,story,0.1
8,actually,0.0,time,1.0,still,0.1,sun,0.1,recommend,0.1
9,hard,0.0,still,1.0,rpg,0.1,get,0.1,sun,0.1


In [21]:
# TF-IDF + NMF restricted to the positive reviews.
# NOTE: this refits the shared `tfidf` vectorizer, so from here on its vocabulary
# reflects only the positive reviews (it no longer matches the `tfidf_df` columns).
pos_txt = pos_reviews["review"].to_numpy().flatten()
pos_tfidf_array = tfidf.fit_transform(pos_txt).toarray()
nmf.fit(pos_tfidf_array)



In [22]:
# Top terms per NMF topic, labelled with the vocabulary currently held by `tfidf`.
display_topics(
    model=nmf,
    feature_names=tfidf.get_feature_names_out(),
    no_top_words=no_top_words,
)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights
0,good,7.2,play,1.9,best,5.2,die,4.8,great,4.8
1,pretty,0.3,like,1.7,ever,1.2,would,1.1,still,0.2
2,still,0.3,fun,1.4,one,0.4,prepare,0.3,challenge,0.1
3,bad,0.1,get,1.3,series,0.3,time,0.2,series,0.1
4,really,0.1,one,1.3,game,0.2,gud,0.2,time,0.1
5,elden,0.1,game,1.2,play,0.1,lot,0.2,gameplay,0.1
6,get,0.0,still,1.1,pvp,0.1,git,0.1,pvp,0.1
7,ring,0.0,bad,1.1,make,0.1,praise,0.1,story,0.1
8,actually,0.0,time,1.0,rpg,0.1,sun,0.1,atmosphere,0.0
9,stuff,0.0,love,1.0,trilogy,0.1,get,0.1,overall,0.0


In [23]:
# TF-IDF + NMF restricted to the negative reviews.
# NOTE: this refits the shared `tfidf` vectorizer, so from here on its vocabulary
# reflects only the negative reviews.
neg_txt = neg_reviews["review"].to_numpy().flatten()
neg_tfidf_array = tfidf.fit_transform(neg_txt).toarray()
nmf.fit(neg_tfidf_array)



In [24]:
# Top terms per NMF topic, labelled with the vocabulary currently held by `tfidf`.
display_topics(
    model=nmf,
    feature_names=tfidf.get_feature_names_out(),
    no_top_words=no_top_words,
)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights
0,like,1.0,bad,3.7,shit,3.7,suck,2.5,play,1.3
1,good,0.8,ever,0.3,piece,0.3,fuck,1.6,buy,1.3
2,enemy,0.8,one,0.1,dog,0.2,hate,0.1,pc,1.0
3,make,0.8,series,0.1,fuck,0.2,dick,0.1,control,0.7
4,get,0.8,game,0.1,ass,0.1,still,0.1,controller,0.6
5,one,0.7,play,0.1,like,0.1,trash,0.1,port,0.6
6,time,0.7,far,0.1,still,0.1,fucking,0.1,dont,0.6
7,feel,0.6,really,0.1,pile,0.1,beat,0.1,keyboard,0.5
8,boss,0.6,like,0.1,compare,0.1,ball,0.0,version,0.5
9,design,0.5,design,0.0,holy,0.1,dogshit,0.0,mouse,0.5
