# Dark Souls II Reviews (2025)

In [1]:
import altair as alt
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords

# Kept last so that any names it exports keep taking precedence, exactly as before.
from get_data import *




## Steam Reviews as of 4/7/25:

In [2]:
# Load the review data via get_data(), which is only in scope through the star import
# of the local `get_data` module.
# NOTE(review): presumably returns a pandas DataFrame of Steam reviews (later cells use
# .shape, .review and ['voted_up']) — confirm the source and any caching in get_data.py.
reviews = get_data()

In [3]:
# Sanity check: (rows, columns) of the loaded reviews.
reviews.shape

(66999, 10)

In [4]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [5]:
# TF-IDF vectoriser: sublinear tf scaling, vocabulary capped at 4000 terms,
# NLTK word tokenisation and NLTK's English stop-word list.
# token_pattern=None: the default regex pattern is ignored as soon as a custom
# tokenizer is supplied, and scikit-learn otherwise warns about the unused
# parameter on every fit. The resulting vocabulary is unchanged.
# NOTE(review): the stop-word list is not run through word_tokenize, so scikit-learn
# may still warn that the stop words are inconsistent with the tokenizer — left as is
# so the extracted features stay the same.
tfidf = TfidfVectorizer(sublinear_tf=True,
                        analyzer='word',
                        max_features=4000,
                        tokenizer=word_tokenize,
                        token_pattern=None,
                        stop_words=stopwords.words("english"))

In [6]:
# Vectorise every review, densify the result and wrap it in a frame whose
# columns are the fitted vocabulary terms.
review_txt = reviews["review"].values.flatten()
tfidf_array = tfidf.fit_transform(review_txt).toarray()
tfidf_df = pd.DataFrame(tfidf_array, columns=tfidf.get_feature_names_out())



### Topic Modeling:
- Exploring why people like the game
    - Also capture critiques of the game within positive reviews (there are likely to be some, given DS2's reputation in the community)

- Exploring why people don't like the game:
    - Also get positive aspects within this subset of the reviews
    
- Algorithms I can use to perform topic modeling:
    1. Latent Dirichlet Allocation (LDA) 
    2. Non-negative Matrix Factorization (NMF)

Splitting the reviews by whether or not the reviewer recommends buying the game:

In [7]:
# Partition on the recommendation flag, comparing explicitly against True / False.
pos_reviews = reviews.loc[reviews['voted_up'].eq(True)]
neg_reviews = reviews.loc[reviews['voted_up'].eq(False)]

In [8]:
# Sizes of the two subsets, shown as a pair of (rows, columns) tuples.
pos_reviews.shape, neg_reviews.shape

((55777, 10), (11222, 10))

Function to display the output of the models:

In [9]:
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        ``components_`` is iterated row by row; each row is treated as the
        per-term weights of one topic.
    feature_names : sequence of str
        Terms indexed by the same positions as the entries of each topic row.
    no_top_words : int
        Number of top-weighted terms to list per topic.

    Returns
    -------
    pandas.DataFrame
        Two columns per topic, "Topic k words" and "Topic k weights"
        (k starts at 1), with rows ordered by descending weight. Weights are
        strings formatted to one decimal place.
    """
    topic_dict = {}
    for topic_no, topic in enumerate(model.components_, start=1):
        # Rank the terms once and reuse the ranking for both columns:
        # argsort is ascending, so the reversed tail holds the largest weights.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        topic_dict["Topic %d words" % topic_no] = [
            f"{feature_names[i]}" for i in top_indices
        ]
        topic_dict["Topic %d weights" % topic_no] = [
            f"{topic[i]:.1f}" for i in top_indices
        ]
    return pd.DataFrame(topic_dict)

LDA: a probabilistic graphical model that takes CountVectorizer output (raw term counts) as input

In [10]:
from sklearn.decomposition import LatentDirichletAllocation

In [11]:
# A single CountVectorizer is re-fitted three times below, so each vocabulary is
# read back immediately after its own fit. Once this cell has run, `count_vector`
# holds the vocabulary of the negative reviews only.
count_vector = CountVectorizer()

# All reviews
tf = count_vector.fit_transform(reviews["review"])
tf_feat_names = count_vector.get_feature_names_out()

# Recommended ("voted up") reviews
pos_tf = count_vector.fit_transform(pos_reviews["review"])
pos_tf_feat_names = count_vector.get_feature_names_out()

# Not-recommended reviews
neg_tf = count_vector.fit_transform(neg_reviews["review"])
neg_tf_feat_names = count_vector.get_feature_names_out()

In [12]:
# Three-topic LDA on the term counts of all reviews, with a fixed seed.
lda = LatentDirichletAllocation(
    n_components=3,
    random_state=42069,
)
lda.fit(X=tf)

In [13]:
# Number of highest-weighted terms listed per topic.
no_top_words = 10
display_topics(model=lda, feature_names=tf_feat_names, no_top_words=no_top_words)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights
0,die,9124.2,like,13136.3,play,13213.7
1,get,5370.0,enemy,11077.1,good,12905.2
2,time,3599.4,make,10375.1,best,5924.1
3,death,3374.8,boss,9102.9,bad,5758.8
4,love,3195.9,play,8839.0,one,5579.0
5,kill,2525.4,feel,8491.7,get,5535.7
6,like,2481.2,one,8461.2,game,5534.3
7,hard,2248.8,get,8315.3,like,5217.5
8,go,2245.4,good,8219.4,still,5054.2
9,make,2229.2,area,7496.9,buy,4968.6


- Interpreted topics that were identified:
    1. People saying how good the game is
    2. Bosses/enemies
    3. People expressing their likes or dislikes of the game

In [14]:
# Re-fit the same `lda` object, this time on the positive-review counts.
lda.fit(X=pos_tf)

In [15]:
# Top terms per topic for the positive-review LDA fit.
display_topics(model=lda, feature_names=pos_tf_feat_names, no_top_words=no_top_words)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights
0,like,11125.7,play,6483.8,good,8511.1
1,play,9799.5,get,5061.3,love,4314.9
2,good,8703.6,best,4570.0,die,4162.5
3,one,8458.4,like,3488.2,great,2649.9
4,make,8279.0,time,3184.6,still,2235.6
5,get,8268.8,die,2960.4,hate,2230.2
6,enemy,7305.3,one,2859.7,ii,1551.1
7,still,7135.6,buy,2727.7,fun,1413.1
8,time,6950.5,death,2659.5,fuck,1399.0
9,game,6886.0,ever,2439.9,play,1379.7


In [16]:
# Re-fit the same `lda` object, this time on the negative-review counts.
lda.fit(X=neg_tf)

In [17]:
# Top terms per topic for the negative-review LDA fit.
display_topics(model=lda, feature_names=neg_tf_feat_names, no_top_words=no_top_words)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights
0,enemy,4971.9,play,2539.0,trash,1186.1
1,like,4697.8,buy,1735.0,hate,478.2
2,make,4075.6,fuck,1726.8,pvp,431.4
3,play,3725.7,bad,1628.7,creator,290.3
4,get,3652.9,pc,1441.0,play,289.4
5,time,3191.5,version,1349.6,tyler,251.3
6,bad,3142.6,get,1221.1,player,180.8
7,one,3054.9,control,1143.7,memory,180.0
8,good,2729.2,like,989.0,get,174.1
9,feel,2727.1,even,945.2,friend,165.6


- Interpreted topics that were identified:
    1. Bosses/enemies
    2. Controls/PC port of the game
    3. Players commenting that it's the worst Dark Souls game they've played

NMF: a linear-algebra (matrix factorization) method that takes TF-IDF vectorizer output as input

In [18]:
from sklearn.decomposition import NMF

In [19]:
# Three-component NMF on the dense TF-IDF matrix of all reviews, with a fixed seed.
nmf = NMF(
    n_components=3,
    random_state=42069,
)
nmf.fit(X=tfidf_array)

In [20]:
# Top terms per NMF topic; the vocabulary is read from the currently fitted `tfidf`.
display_topics(
    model=nmf,
    feature_names=tfidf.get_feature_names_out(),
    no_top_words=no_top_words,
)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights
0,good,7.5,play,1.8,best,5.5
1,pretty,0.3,like,1.5,ever,1.2
2,still,0.3,bad,1.3,one,0.4
3,really,0.1,get,1.2,series,0.4
4,bad,0.1,great,1.1,game,0.3
5,elden,0.1,one,1.1,play,0.2
6,get,0.1,fun,1.1,pvp,0.1
7,ring,0.0,game,1.0,make,0.1
8,actually,0.0,die,1.0,still,0.1
9,hard,0.0,time,1.0,rpg,0.1


- Interpreted topics that were identified (TODO: interpretation not yet written):
    1. 
    2. 
    3. 

In [21]:
# Re-fit the shared `tfidf` vectoriser on the positive reviews only (this replaces
# its previously fitted vocabulary), then re-fit the same `nmf` object on the result.
pos_txt = pos_reviews["review"].values.flatten()
pos_tfidf_array = tfidf.fit_transform(pos_txt).toarray()
nmf.fit(X=pos_tfidf_array)



In [22]:
# Top terms per NMF topic; the vocabulary is read from the currently fitted `tfidf`.
display_topics(
    model=nmf,
    feature_names=tfidf.get_feature_names_out(),
    no_top_words=no_top_words,
)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights
0,good,7.1,play,1.8,best,5.3
1,pretty,0.3,like,1.5,ever,1.2
2,still,0.3,great,1.4,one,0.4
3,bad,0.1,die,1.3,series,0.3
4,really,0.1,fun,1.3,game,0.2
5,elden,0.1,get,1.2,play,0.2
6,get,0.1,one,1.1,pvp,0.1
7,ring,0.0,still,1.1,make,0.1
8,actually,0.0,game,1.1,rpg,0.1
9,hard,0.0,time,1.0,trilogy,0.1


- Interpreted topics that were identified:
    1. Positive outlooks on the game
    2. Similar to the 1st topic
    3. People expressing their opinion on the game, ranging from good to bad

In [23]:
# Re-fit the shared `tfidf` vectoriser on the negative reviews only (this replaces
# its previously fitted vocabulary), then re-fit the same `nmf` object on the result.
neg_txt = neg_reviews["review"].values.flatten()
neg_tfidf_array = tfidf.fit_transform(neg_txt).toarray()
nmf.fit(X=neg_tfidf_array)



In [24]:
# Top terms per NMF topic; the vocabulary is read from the currently fitted `tfidf`.
display_topics(
    model=nmf,
    feature_names=tfidf.get_feature_names_out(),
    no_top_words=no_top_words,
)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights
0,play,1.1,bad,3.7,shit,2.6
1,like,0.9,ever,0.3,suck,1.4
2,good,0.8,play,0.1,fuck,1.3
3,get,0.8,one,0.1,piece,0.2
4,make,0.8,series,0.1,trash,0.2
5,one,0.7,game,0.1,buy,0.2
6,enemy,0.7,far,0.1,dog,0.2
7,time,0.6,really,0.1,still,0.1
8,even,0.6,port,0.1,play,0.1
9,game,0.6,terrible,0.1,hate,0.1


- Interpreted topics that were identified:
    1. Vague, but concerned with enemies
    2. Very negative perspectives on the game
    3. Negative experience regarding bosses, hitboxes, and game design