# Dark Souls II Reviews (2025)

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt
from get_data import *




In [2]:
# Load the review data; get_data is presumably supplied by the star import
# from the get_data module above (it is not defined in this notebook).
reviews = get_data()

In [3]:
# Show the size of the loaded data as (rows, columns).
reviews.shape

(66999, 6)

In [4]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [5]:
# TF-IDF vectorizer used as the NMF input further down.
# - sublinear_tf=True applies sublinear (logarithmic) scaling to term frequencies.
# - max_features=4000 caps the vocabulary size.
# - word_tokenize replaces scikit-learn's built-in regex tokenizer.
tfidf = TfidfVectorizer(sublinear_tf=True,
                        analyzer='word',
                        max_features=4000,
                        tokenizer=word_tokenize,
                        # The default regex is ignored once a tokenizer is given;
                        # None makes that explicit and avoids the
                        # "token_pattern will not be used" warning on fit.
                        token_pattern=None,
                        stop_words=stopwords.words("english"))

In [6]:
# Fit the TF-IDF vectorizer on every review and densify the result.
# NOTE: .toarray() materialises one column per vocabulary term for every
# review, so this is the most memory-hungry step of the notebook.
review_txt = reviews["review"].values.flatten()
tfidf_array = tfidf.fit_transform(review_txt).toarray()

# Wrap the dense matrix in a frame whose column labels are the learned terms.
tfidf_df = pd.DataFrame(tfidf_array, columns=tfidf.get_feature_names_out())



### Topic Modeling:
- Exploring which aspects of the game people like
    - Also collect critiques of the game found in positive reviews (there are sure to be some, given DS2's reputation in the community)

- Exploring why people don't like the game:
    - Also get positive aspects within this subset of the reviews
    
- Algorithms I can use to perform topic modeling:
    1. Latent Dirichlet Allocation (LDA) 
    2. Non-negative Matrix Factorization (NMF)

Splitting the reviews into those that recommend buying the game and those that don't:

In [7]:
# Partition the reviews on the recommendation flag: rows where voted_up
# equals True are the positive reviews, rows where it equals False the negative.
pos_reviews = reviews.loc[reviews["voted_up"].eq(True)]
neg_reviews = reviews.loc[reviews["voted_up"].eq(False)]

In [8]:
# Compare the sizes of the two subsets as (rows, columns) pairs.
pos_reviews.shape, neg_reviews.shape

((55777, 6), (11222, 6))

Function to display the output of the models:

In [9]:
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        ``components_`` is iterated row by row; each row is treated as one
        topic's per-feature weights and must support ``argsort()`` and
        integer indexing (e.g. a 2-D NumPy array).
    feature_names : sequence
        Term for each column of ``model.components_``, looked up by position.
    no_top_words : int
        Number of terms to keep per topic. If it exceeds the number of
        features, every feature is listed.

    Returns
    -------
    pandas.DataFrame
        Two columns per topic, ``"Topic N words"`` and ``"Topic N weights"``
        (N starts at 1), with rows ordered from highest to lowest weight.
        Weights are strings formatted to one decimal place.
    """
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        # Rank the features once and reuse the ranking for both columns.
        # The reversed slice takes the last `no_top_words` entries of the
        # ascending argsort, i.e. the largest weights first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        label = "Topic %d" % (topic_idx + 1)
        topic_dict[label + " words"] = ['{}'.format(feature_names[i])
                                        for i in top_indices]
        topic_dict[label + " weights"] = ['{:.1f}'.format(topic[i])
                                          for i in top_indices]
    return pd.DataFrame(topic_dict)

LDA: a probabilistic graphical model that takes CountVectorizer term counts as input

In [10]:
from sklearn.decomposition import LatentDirichletAllocation

In [11]:
# A single CountVectorizer is refit on each subset in turn. Every refit
# replaces the learned vocabulary, so the feature names are captured
# immediately after each fit, before the next one overwrites them.
count_vector = CountVectorizer()

# Term counts over all reviews.
tf = count_vector.fit_transform(reviews["review"])
tf_feat_names = count_vector.get_feature_names_out()

# Term counts over the positive reviews only.
pos_tf = count_vector.fit_transform(pos_reviews["review"])
pos_tf_feat_names = count_vector.get_feature_names_out()

# Term counts over the negative reviews only.
neg_tf = count_vector.fit_transform(neg_reviews["review"])
neg_tf_feat_names = count_vector.get_feature_names_out()

In [12]:
# Three-topic LDA with a fixed random_state so repeated runs use the same seed.
lda = LatentDirichletAllocation(n_components=3, random_state=42069)
# Fit on the term counts built from all reviews.
lda.fit(tf)

In [13]:
# Number of top-weighted terms to list per topic; reused by every
# display_topics call below.
no_top_words = 10
display_topics(lda, tf_feat_names, no_top_words)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights
0,die,9038.3,like,13268.1,play,13240.6
1,get,5126.9,enemy,11332.5,good,12964.1
2,time,3326.2,make,10581.5,best,5973.3
3,death,3300.4,boss,9150.0,bad,5823.0
4,love,3259.4,play,8995.8,one,5590.2
5,fuck,2857.8,get,8628.7,game,5586.5
6,kill,2410.2,one,8610.0,get,5465.4
7,like,2379.1,feel,8566.1,like,5187.8
8,hard,2196.5,good,8189.4,still,5115.2
9,go,2130.3,time,7566.8,buy,4958.1


In [14]:
# Refit the same `lda` object, this time on the positive-review term counts.
lda.fit(pos_tf)

In [15]:
# Top terms per topic for the positive reviews, using the feature names
# captured when the positive counts were built.
display_topics(lda, pos_tf_feat_names, no_top_words)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights
0,like,11166.1,play,6384.9,good,8576.0
1,play,9862.8,get,5013.8,love,4311.9
2,good,8651.6,best,4550.8,die,4150.9
3,one,8484.9,like,3448.2,great,2718.2
4,make,8312.3,time,3156.9,still,2278.3
5,get,8307.3,die,2968.1,hate,2225.0
6,enemy,7308.8,one,2818.3,ii,1564.4
7,still,7095.7,buy,2698.8,fun,1441.9
8,time,6982.0,death,2655.9,play,1415.3
9,game,6905.9,ever,2429.7,fuck,1355.8


In [16]:
# Refit the same `lda` object, this time on the negative-review term counts.
lda.fit(neg_tf)

In [17]:
# Top terms per topic for the negative reviews, using the feature names
# captured when the negative counts were built.
display_topics(lda, neg_tf_feat_names, no_top_words)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights
0,enemy,4971.0,play,2416.2,trash,1188.6
1,like,4684.7,fuck,1718.9,hate,498.8
2,make,4054.5,buy,1713.8,pvp,475.7
3,play,3667.1,bad,1544.6,play,470.8
4,get,3638.9,pc,1437.5,creator,290.3
5,time,3183.9,version,1336.8,bad,270.7
6,bad,3099.7,get,1160.9,tyler,251.3
7,one,3000.2,control,1132.6,get,248.2
8,feel,2729.3,like,931.3,player,228.2
9,good,2689.3,controller,909.6,good,226.1


NMF: a linear-algebra matrix factorization that takes TF-IDF features as input

In [18]:
from sklearn.decomposition import NMF

In [19]:
# Three-topic NMF with a fixed random_state so repeated runs use the same seed.
nmf = NMF(n_components=3, random_state=42069)
# Fit on the dense TF-IDF matrix built from all reviews.
nmf.fit(tfidf_array)

In [20]:
# Feature names are read from `tfidf` at call time, so this cell must run
# before `tfidf` is refit on a subset in the cells below.
display_topics(nmf, tfidf.get_feature_names_out(), no_top_words)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights
0,good,7.5,play,1.8,best,5.5
1,pretty,0.3,like,1.5,ever,1.2
2,still,0.3,bad,1.3,one,0.4
3,really,0.1,get,1.2,series,0.4
4,bad,0.1,great,1.1,game,0.3
5,elden,0.1,one,1.1,play,0.2
6,get,0.1,fun,1.1,pvp,0.1
7,ring,0.0,game,1.0,make,0.1
8,actually,0.0,die,1.0,still,0.1
9,hard,0.0,time,1.0,rpg,0.1


In [21]:
# Refit the shared `tfidf` vectorizer on the positive reviews only.
# NOTE: this overwrites the vocabulary learned from all reviews, so
# tfidf.get_feature_names_out() no longer describes tfidf_array afterwards.
pos_txt = pos_reviews["review"].values.flatten()
pos_tfidf_array = tfidf.fit_transform(pos_txt).toarray()

# Refit the same `nmf` object on the positive-review TF-IDF matrix.
nmf.fit(pos_tfidf_array)



In [22]:
# Feature names come from `tfidf` as it is fitted at call time; this is
# meant to run right after the positive-review refit in the previous cell.
display_topics(nmf, tfidf.get_feature_names_out(), no_top_words)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights
0,good,7.1,play,1.8,best,5.3
1,pretty,0.3,like,1.5,ever,1.2
2,still,0.3,great,1.4,one,0.4
3,bad,0.1,die,1.3,series,0.3
4,really,0.1,fun,1.3,game,0.2
5,elden,0.1,get,1.2,play,0.2
6,get,0.1,one,1.1,pvp,0.1
7,ring,0.0,still,1.1,make,0.1
8,actually,0.0,game,1.1,rpg,0.1
9,hard,0.0,time,1.0,trilogy,0.1


In [23]:
# Refit the shared `tfidf` vectorizer on the negative reviews only.
# NOTE: this overwrites the vocabulary from the previous fit, so feature
# names read from `tfidf` afterwards describe only neg_tfidf_array.
neg_txt = neg_reviews["review"].values.flatten()
neg_tfidf_array = tfidf.fit_transform(neg_txt).toarray()

# Refit the same `nmf` object on the negative-review TF-IDF matrix.
nmf.fit(neg_tfidf_array)



In [24]:
# Feature names come from `tfidf` as it is fitted at call time; this is
# meant to run right after the negative-review refit in the previous cell.
display_topics(nmf, tfidf.get_feature_names_out(), no_top_words)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights
0,play,1.1,bad,3.7,shit,2.6
1,like,0.9,ever,0.3,suck,1.4
2,good,0.8,play,0.1,fuck,1.3
3,get,0.8,one,0.1,piece,0.2
4,make,0.8,series,0.1,trash,0.2
5,one,0.7,game,0.1,buy,0.2
6,enemy,0.7,far,0.1,dog,0.2
7,time,0.6,really,0.1,still,0.1
8,even,0.6,port,0.1,play,0.1
9,game,0.6,terrible,0.1,hate,0.1
