# Dark Souls II Reviews (2025)

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt
from preproc import *




In [2]:
# Load the review tables. `get_data` is not defined in the visible cells
# (presumably it arrives via `from preproc import *` — TODO confirm).
# `vanilla` and `scholar` are not used by any cell shown in this section.
reviews, vanilla, scholar = get_data()

In [3]:
# Sanity check: (rows, columns) of the full review table.
reviews.shape

(66999, 6)

In [6]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords

In [7]:
# TF-IDF vectorizer shared by the NMF cells below.
# - sublinear_tf=True    : use 1 + log(tf) instead of raw term frequency.
# - max_features=4000    : keep only the 4000 most frequent terms.
# - tokenizer            : NLTK's word_tokenize replaces the built-in regex tokenizer.
# - token_pattern=None   : the default regex is never used once a tokenizer is
#                          supplied; passing None avoids scikit-learn's
#                          "token_pattern will not be used" warning.
# NOTE(review): the NLTK stop-word list is matched against word_tokenize output;
# scikit-learn may warn that the two are inconsistent (e.g. contractions are
# split differently) — confirm whether that matters for these reviews.
tfidf = TfidfVectorizer(sublinear_tf=True,
                        analyzer='word',
                        max_features=4000,
                        tokenizer=word_tokenize,
                        token_pattern=None,
                        stop_words=stopwords.words("english"))

In [8]:
# Vectorise every review with the shared `tfidf` vectorizer and keep both the
# dense matrix and a DataFrame view labelled with the learned vocabulary.
# Heads-up: the dense array has one row per review and up to `max_features`
# columns, so for the full corpus it is on the order of gigabytes in memory.
review_txt = reviews["review"].values.flatten()
tfidf_array = tfidf.fit_transform(review_txt).toarray()
tfidf_df = pd.DataFrame(tfidf_array, columns=tfidf.get_feature_names_out())



### Topic Modeling:
- Exploring which aspects make people like the game
    - Also surface critiques of the game within positive reviews (there are bound to be some, considering DS2's reputation in the community)

- Exploring why people don't like the game:
    - Also get positive aspects within this subset of the reviews
    
- Algorithms I can use to perform topic modeling:
    1. Latent Dirichlet Allocation (LDA) 
    2. Non-negative Matrix Factorization (NMF)

Splitting the reviews into those that recommend buying the game and those that don't:

In [9]:
# Partition the reviews on the `voted_up` flag: rows equal to True are the
# recommendations, rows equal to False are the non-recommendations.
pos_reviews = reviews.loc[reviews["voted_up"].eq(True)]
neg_reviews = reviews.loc[reviews["voted_up"].eq(False)]

In [10]:
# (rows, columns) of the recommended and not-recommended subsets, in that order.
pos_reviews.shape, neg_reviews.shape

((55777, 6), (11222, 6))

Function to display the output of the models:

In [11]:
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        ``components_`` is iterated row by row; each row is treated as one
        topic's per-term weights and must support ``argsort()`` and integer
        indexing (e.g. the fitted LDA / NMF estimators used in this notebook).
    feature_names : sequence
        Term label for each column index of ``model.components_``.
    no_top_words : int
        Number of top terms to list per topic.

    Returns
    -------
    pandas.DataFrame
        Two columns per topic, ``"Topic k words"`` and ``"Topic k weights"``
        (``k`` starts at 1), ordered from the heaviest term down. Weights are
        strings formatted to one decimal place.
    """
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        # Rank the terms once per topic and reuse the ranking for both columns.
        # argsort is ascending, so the reversed tail gives descending weight.
        top_idx = topic.argsort()[:-no_top_words - 1:-1]
        label = "Topic %d" % (topic_idx + 1)
        topic_dict[label + " words"] = ['{}'.format(feature_names[i]) for i in top_idx]
        topic_dict[label + " weights"] = ['{:.1f}'.format(topic[i]) for i in top_idx]
    return pd.DataFrame(topic_dict)

LDA: a probabilistic graphical model; it takes the term counts produced by CountVectorizer as input

In [12]:
from sklearn.decomposition import LatentDirichletAllocation

In [13]:
# One CountVectorizer instance is reused for all three corpora. Every
# fit_transform call relearns the vocabulary, so the feature names must be read
# immediately after the matching fit — do not reorder these statements.
count_vector = CountVectorizer()

# All reviews.
tf = count_vector.fit_transform(reviews.review)
tf_feat_names = count_vector.get_feature_names_out()

# Recommended reviews only (vocabulary is relearned from this subset).
pos_tf = count_vector.fit_transform(pos_reviews.review) 
pos_tf_feat_names = count_vector.get_feature_names_out()

# Not-recommended reviews only; `count_vector` is left fitted on this subset.
neg_tf = count_vector.fit_transform(neg_reviews.review)
neg_tf_feat_names = count_vector.get_feature_names_out()

In [14]:
# Five-topic LDA with a fixed random_state so repeated runs use the same seed.
lda = LatentDirichletAllocation(n_components=5, random_state=42069)
# First fit: term counts from the full review corpus.
lda.fit(tf)

In [15]:
# Number of top-weighted words listed per topic; the later display cells reuse it.
no_top_words = 10
# Topics of the LDA model fitted on all reviews.
display_topics(lda, tf_feat_names, no_top_words)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights
0,enemy,8329.3,bad,3548.0,play,15904.8,good,12418.8,love,4375.9
1,die,6312.7,hate,3483.5,good,7614.5,boss,9613.2,die,3278.7
2,boss,5852.9,fuck,3219.4,like,6959.8,like,5193.8,ii,2969.2
3,time,4855.5,like,2414.8,great,5528.6,area,4614.1,death,2666.0
4,like,4584.2,not,2394.6,time,5027.4,feel,4338.5,buy,2609.3
5,player,3532.8,good,2298.7,fun,4867.9,series,4291.3,play,2472.5
6,attack,3384.8,shit,2110.0,well,4757.4,design,4262.7,time,1812.6
7,level,3266.9,play,1923.5,bad,4109.8,enemy,3567.8,version,1783.0
8,weapon,3020.7,iron,1557.8,feel,3696.9,bad,3450.0,good,1776.2
9,kill,2918.1,suck,1404.0,recommend,3609.3,world,3048.8,like,1738.4


In [16]:
# Refit the same `lda` estimator on the recommended-review counts; this replaces
# the topics learned from the full corpus.
lda.fit(pos_tf)

In [17]:
# Topics of the LDA refit on recommended reviews, labelled with that subset's vocabulary.
display_topics(lda, pos_tf_feat_names, no_top_words)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights
0,good,11806.8,boss,10348.7,love,4214.8,play,8001.5,die,7736.3
1,bad,2546.0,like,9213.0,hate,2728.6,great,4767.7,time,3143.9
2,fun,1994.5,enemy,6832.3,like,2205.6,ii,3190.6,death,2707.0
3,series,1694.0,feel,6460.4,yes,1977.5,well,2745.8,play,2338.2
4,pretty,1042.9,good,5992.2,fuck,1434.4,like,2636.7,good,1702.4
5,play,886.9,play,5594.5,play,1250.9,recommend,2503.2,get,1697.9
6,rolin,726.2,area,5457.2,well,1044.2,version,2424.7,hard,1669.5
7,de,609.1,time,5210.2,iron,862.2,buy,2413.9,hour,1629.5
8,great,589.9,level,4602.8,not,839.9,new,2228.2,kill,1343.3
9,hard,549.8,lot,4566.4,shit,830.8,time,1929.9,buy,881.0


In [18]:
# Refit the same `lda` estimator on the not-recommended-review counts; the
# previous topics are overwritten.
lda.fit(neg_tf)

In [19]:
# Topics of the LDA refit on not-recommended reviews, labelled with that subset's vocabulary.
display_topics(lda, neg_tf_feat_names, no_top_words)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights
0,version,1264.6,fuck,1486.0,bad,1967.6,play,2033.6,boss,4755.2
1,buy,890.9,trash,1201.1,play,1550.1,pc,1392.1,enemy,4560.9
2,new,607.5,enemy,425.4,shit,991.4,control,1260.4,like,4217.9
3,release,513.4,hit,377.5,not,920.7,controller,915.2,feel,3119.6
4,dlc,303.1,attack,372.4,suck,518.2,like,913.3,play,2942.0
5,original,275.3,time,294.0,don,513.0,good,797.2,time,2610.4
6,hate,261.9,creator,290.0,fucking,422.9,buy,691.2,bad,2430.8
7,dx,241.2,tyler,251.2,like,420.7,don,684.9,design,2258.3
8,don,234.8,die,213.4,dogshit,387.2,port,666.2,level,2224.8
9,update,202.9,kill,190.1,fromsoft,374.4,keyboard,657.2,good,2179.0


NMF: a linear-algebra matrix factorization; it takes the TF-IDF matrix as input

In [20]:
from sklearn.decomposition import NMF

In [21]:
# Five-component NMF with the same fixed seed as the LDA model.
nmf = NMF(n_components=5, random_state=42069)
# First fit: dense TF-IDF matrix of the full review corpus.
nmf.fit(tfidf_array)

In [22]:
# At this point `tfidf` is still fitted on the full corpus, so its vocabulary
# lines up with the columns `nmf` was trained on.
# NOTE(review): later cells refit `tfidf`; re-running this cell after them
# would pair the topics with a different vocabulary.
display_topics(nmf, tfidf.get_feature_names_out(), no_top_words)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights
0,good,7.6,play,2.6,die,5.4,bad,5.4,great,5.1
1,pretty,0.2,like,2.0,time,0.3,people,0.3,series,0.1
2,series,0.2,fun,1.5,prepare,0.3,series,0.2,pvp,0.1
3,pvp,0.1,time,1.1,lot,0.2,well,0.1,time,0.1
4,play,0.1,well,1.1,gud,0.1,far,0.1,gameplay,0.1
5,rpg,0.0,boss,1.0,git,0.1,say,0.1,challenge,0.1
6,time,0.0,love,1.0,alot,0.1,design,0.1,recommend,0.1
7,trilogy,0.0,hard,1.0,sun,0.1,pretty,0.1,story,0.1
8,actually,0.0,feel,0.9,praise,0.1,actually,0.1,challenging,0.1
9,imo,0.0,enemy,0.8,kill,0.1,terrible,0.1,overall,0.1


In [23]:
# Recommended reviews only.
# NOTE: fit_transform refits the shared `tfidf` vectorizer, replacing the
# vocabulary learned from the full corpus, and `nmf.fit` overwrites the
# previously learned components.
pos_txt = pos_reviews.review.values.flatten()
pos_tfidf_array = tfidf.fit_transform(pos_txt).toarray()
nmf.fit(pos_tfidf_array)



In [24]:
# Uses the vocabulary from the most recent `tfidf` fit, which must be the
# recommended-review fit for the labels to match the NMF components.
display_topics(nmf, tfidf.get_feature_names_out(), no_top_words)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights
0,good,7.5,play,2.6,die,5.1,great,4.9,fun,4.6
1,pretty,0.2,like,2.0,time,0.3,time,0.1,hard,0.6
2,series,0.1,bad,1.5,prepare,0.2,series,0.1,lot,0.2
3,bad,0.1,well,1.3,lot,0.2,gameplay,0.1,pretty,0.2
4,pvp,0.1,love,1.2,gud,0.1,pvp,0.1,friend,0.2
5,play,0.0,time,1.1,try,0.1,challenge,0.1,challenging,0.1
6,rpg,0.0,boss,0.9,git,0.1,recommend,0.1,challenge,0.1
7,time,0.0,series,0.9,alot,0.1,story,0.1,time,0.1
8,trilogy,0.0,feel,0.8,want,0.1,challenging,0.1,pvp,0.1
9,imo,0.0,hard,0.8,kill,0.1,overall,0.1,difficult,0.1


In [25]:
# Not-recommended reviews only.
# NOTE: fit_transform refits the shared `tfidf` vectorizer again, so its
# vocabulary now comes from this subset, and `nmf.fit` overwrites the
# components learned from the recommended reviews.
neg_txt = neg_reviews.review.values.flatten()
neg_tfidf_array = tfidf.fit_transform(neg_txt).toarray()
nmf.fit(neg_tfidf_array)



In [26]:
# Uses the vocabulary from the most recent `tfidf` fit, which must be the
# not-recommended-review fit for the labels to match the NMF components.
display_topics(nmf, tfidf.get_feature_names_out(), no_top_words)

Unnamed: 0,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights
0,like,1.2,bad,3.6,shit,3.4,suck,3.0,play,2.3
1,good,1.1,series,0.1,fuck,0.4,fucking,0.6,buy,1.0
2,enemy,0.9,far,0.1,piece,0.3,ass,0.3,pc,0.7
3,boss,0.9,like,0.1,ass,0.2,fuck,0.2,instead,0.5
4,feel,0.8,play,0.1,dog,0.2,dick,0.1,controller,0.5
5,time,0.7,terrible,0.0,like,0.1,hate,0.1,control,0.5
6,design,0.6,design,0.0,fucking,0.1,beat,0.1,keyboard,0.4
7,level,0.6,port,0.0,trash,0.1,ball,0.1,version,0.4
8,hard,0.6,easily,0.0,holy,0.1,control,0.1,port,0.4
9,way,0.5,life,0.0,compare,0.1,adp,0.0,mouse,0.4
