# Dark Souls II Reviews (2025)

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt
import re

## Steam Reviews as of 3/30/24:

In [2]:
df = pd.read_csv('reviews.csv')
reviews = df.copy()
reviews = reviews.set_index('recommendationid')
reviews.drop(columns={'Unnamed: 0', 'in_early_access'}, inplace=True)

Converting date of review from unix:

In [3]:
reviews['month_name'] = pd.to_datetime(reviews.update_date, unit='s').dt.month_name()
reviews['month']      = pd.to_datetime(reviews.update_date, unit='s').dt.month
reviews['year']       = pd.to_datetime(reviews.update_date, unit='s').dt.year
reviews['day']        = pd.to_datetime(reviews.update_date, unit='s').dt.day

Focusing on just the English reviews:

In [4]:
reviews = reviews[reviews.language == 'english']

## Cleaning up the reviews

In [5]:
reviews = reviews.dropna(subset=['review'])
reviews.shape

(45563, 9)

In [6]:
reviews['review'] = reviews.review.str.lower()

In [7]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    words = word_tokenize(text)
    return ' '.join([word for word in words if word.lower() not in stop_words])

In [8]:
reviews['review'] = reviews.review.apply(remove_stopwords)

In [9]:
# Removing urls:
r = [re.sub(r'http\S+', '', review).lower().strip() if pd.notna(review) else review for review in reviews.review]

# Removing esc sequences, punctuation, and numbers:
    # There's some ASCII art in some of the reviews
r = [re.sub(r'[^a-z]', ' ', review).strip() if pd.notna(review) else review for review in r]

In [10]:
# Removing multiple and trailing whitespaces:
r = [re.sub(r' +', ' ', review).strip() if pd.notna(review) else review for review in r]

In [11]:
reviews['review'] = r

In [12]:
reviews

Unnamed: 0_level_0,review,language,init_date,update_date,voted_up,month_name,month,year,day
recommendationid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
190511148,dont give skeleton,english,1742267451,1742267451,True,March,3,2025,18
190504311,boia,english,1742259081,1742259081,True,March,3,2025,18
190502415,love game pieces s worst souls game ever made ...,english,1742256864,1742256864,True,March,3,2025,18
190501465,probably n t recommend awful iron passage frig...,english,1742255757,1742255757,True,March,3,2025,17
190500200,peak souls,english,1742254339,1742254339,True,March,3,2025,17
...,...,...,...,...,...,...,...,...,...
15162268,try tongue hole,english,1427932431,1428081346,True,April,4,2015,3
15162220,far good played mins far problems i k r gb gb ...,english,1427932153,1427932153,True,April,4,2015,1
15162161,still n t died bonedrinker rufus keep streak a...,english,1427931845,1427931845,True,April,4,2015,1
15162057,needs cow bell,english,1427931196,1427931196,True,April,4,2015,1


For sake of analysis specifically on the actual reviews, drop any rows that have no reviews:

In [13]:
reviews = reviews.dropna(subset=['review'])
reviews.shape

(45563, 9)

In [14]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [15]:
tfidf = TfidfVectorizer(sublinear_tf=True,
                        analyzer='word',
                        max_features=4000,
                        tokenizer=word_tokenize,
                        stop_words=stopwords.words("english"))

In [16]:
review_txt = reviews.review.values.flatten()
tfidf_array = tfidf.fit_transform(review_txt).toarray()
tfidf_df = pd.DataFrame(tfidf_array)
tfidf_df.columns = tfidf.get_feature_names_out()



- Most common word among the reviews isn't very informative - including some of the other popular words
    - Looking at subsets of the reviews could be useful

## Topic Modeling:
- Exploring certain aspects on why people like the game
    - Also get critiques of the game in positive reviews (if any but there sure is considering DS2's reputation in the community)

- Exploring why people don't like the game:
    - Also get positive aspects within this subset of the reviews

Splitting the reviews by how many do and don't recommend buying the game:

In [17]:
pos_reviews = reviews[reviews['voted_up'] == True]
neg_reviews = reviews[reviews['voted_up'] == False]

In [18]:
pos_reviews.shape, neg_reviews.shape

((37387, 9), (8176, 9))

### Top2Vec:

In [19]:
from top2vec import Top2Vec

In [20]:
docs = reviews['review'].tolist()
pos_docs = pos_reviews['review'].tolist()
neg_docs = neg_reviews['review'].tolist()

In [21]:
def topics(documents):
    mdl = Top2Vec(
        documents=documents,
        # embedding_model='universal-sentence-encoder',
        contextual_top2vec=True
    )
    return mdl

In [None]:
mdl = topics(docs)

2025-04-02 18:10:04,112 - top2vec - INFO - Pre-processing documents for training


2025-04-02 18:10:12,279 - top2vec - INFO - Creating vocabulary embedding
Embedding vocabulary:  14%|█▍        | 12/87 [42:00<7:29:34, 359.66s/it] 

In [None]:
pos_mdl = topics(pos_docs)

In [None]:
neg_mdl = topics(neg_docs)