# Dark Souls II Reviews (2025)

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt
import re

## Steam Reviews as of 3/30/24:

In [4]:
df = pd.read_csv('reviews.csv')
reviews = df.copy()
reviews = reviews.set_index('recommendationid')
reviews.drop(columns={'Unnamed: 0', 'in_early_access'}, inplace=True)

Converting date of review from unix:

In [5]:
reviews['month_name'] = pd.to_datetime(reviews.update_date, unit='s').dt.month_name()
reviews['month']      = pd.to_datetime(reviews.update_date, unit='s').dt.month
reviews['year']       = pd.to_datetime(reviews.update_date, unit='s').dt.year
reviews['day']        = pd.to_datetime(reviews.update_date, unit='s').dt.day

Focusing on just the English reviews:

In [6]:
reviews = reviews[reviews.language == 'english']

## Cleaning up the reviews

In [7]:
reviews = reviews.dropna(subset=['review'])
reviews.shape

(45563, 9)

In [8]:
reviews['review'] = reviews.review.str.lower()

In [9]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    words = word_tokenize(text)
    return ' '.join([word for word in words if word.lower() not in stop_words])

In [10]:
reviews['review'] = reviews.review.apply(remove_stopwords)

In [11]:
# Removing urls:
r = [re.sub(r'http\S+', '', review).lower().strip() if pd.notna(review) else review for review in reviews.review]

# Removing esc sequences, punctuation, and numbers:
    # There's some ASCII art in some of the reviews
r = [re.sub(r'[^a-z]', ' ', review).strip() if pd.notna(review) else review for review in r]

In [12]:
# Removing multiple and trailing whitespaces:
r = [re.sub(r' +', ' ', review).strip() if pd.notna(review) else review for review in r]

In [13]:
reviews['review'] = r

In [14]:
reviews

Unnamed: 0_level_0,review,language,init_date,update_date,voted_up,month_name,month,year,day
recommendationid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
190511148,dont give skeleton,english,1742267451,1742267451,True,March,3,2025,18
190504311,boia,english,1742259081,1742259081,True,March,3,2025,18
190502415,love game pieces s worst souls game ever made ...,english,1742256864,1742256864,True,March,3,2025,18
190501465,probably n t recommend awful iron passage frig...,english,1742255757,1742255757,True,March,3,2025,17
190500200,peak souls,english,1742254339,1742254339,True,March,3,2025,17
...,...,...,...,...,...,...,...,...,...
15162268,try tongue hole,english,1427932431,1428081346,True,April,4,2015,3
15162220,far good played mins far problems i k r gb gb ...,english,1427932153,1427932153,True,April,4,2015,1
15162161,still n t died bonedrinker rufus keep streak a...,english,1427931845,1427931845,True,April,4,2015,1
15162057,needs cow bell,english,1427931196,1427931196,True,April,4,2015,1


For sake of analysis specifically on the actual reviews, drop any rows that have no reviews:

In [15]:
reviews = reviews.dropna(subset=['review'])
reviews.shape

(45563, 9)

In [16]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [17]:
tfidf = TfidfVectorizer(sublinear_tf=True,
                        analyzer='word',
                        max_features=4000,
                        tokenizer=word_tokenize,
                        stop_words=stopwords.words("english"))

In [18]:
review_txt = reviews.review.values.flatten()
tfidf_array = tfidf.fit_transform(review_txt).toarray()
tfidf_df = pd.DataFrame(tfidf_array)
tfidf_df.columns = tfidf.get_feature_names_out()



- Most common word among the reviews isn't very informative - including some of the other popular words
    - Looking at subsets of the reviews could be useful

## Topic Modeling:
- Exploring certain aspects on why people like the game
    - Also get critiques of the game in positive reviews (if any but there sure is considering DS2's reputation in the community)

- Exploring why people don't like the game:
    - Also get positive aspects within this subset of the reviews

Splitting the reviews by how many do and don't recommend buying the game:

In [19]:
pos_reviews = reviews[reviews['voted_up'] == True]
neg_reviews = reviews[reviews['voted_up'] == False]

In [20]:
count_vector = CountVectorizer()

tf = count_vector.fit_transform(reviews.review)
tf_feat_names = count_vector.get_feature_names_out()

pos_tf = count_vector.fit_transform(pos_reviews.review) 
pos_tf_feat_names = count_vector.get_feature_names_out()

neg_tf = count_vector.fit_transform(neg_reviews.review)
neg_tf_feat_names = count_vector.get_feature_names_out()

In [21]:
pos_reviews.shape, neg_reviews.shape

((37387, 9), (8176, 9))

### LSA:

In [1]:
from sklearn.decomposition import TruncatedSVD

In [23]:
# Step 1: Convert text data to term-document matrix
count_vector = CountVectorizer()
tf = count_vector.fit_transform(reviews.review)  
tf_feat_names = count_vector.get_feature_names_out()

# Step 2: Apply Truncated SVD for dimensionality reduction
n_topics = 4

In [24]:
lsa = TruncatedSVD(n_components=n_topics, random_state=42)
lsa_matrix = lsa.fit_transform(tf)

# Step 3: Display top words per topic
def display_topics(model, feature_names, num_top_words=5):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]))

display_topics(lsa, tf_feat_names)

Topic 0:
death game souls dark love
Topic 1:
game souls love dark keep
Topic 2:
iron keep hate never death
Topic 3:
trash someone dino rex buy


In [25]:
# Repeat for positive and negative reviews separately
pos_tf = count_vector.fit_transform(pos_reviews.review)
pos_lsa_matrix = lsa.fit_transform(pos_tf)
pos_tf_feat_names = count_vector.get_feature_names_out()
print("\nPositive Reviews Topics:")
display_topics(lsa, pos_tf_feat_names)


Positive Reviews Topics:
Topic 0:
death game souls dark love
Topic 1:
game love souls dark first
Topic 2:
keep iron hate never go
Topic 3:
rolin love game souls dark


In [26]:
neg_tf = count_vector.fit_transform(neg_reviews.review)
neg_lsa_matrix = lsa.fit_transform(neg_tf)
neg_tf_feat_names = count_vector.get_feature_names_out()
print("\nNegative Reviews Topics:")
display_topics(lsa, neg_tf_feat_names)


Negative Reviews Topics:
Topic 0:
trash game like souls better
Topic 1:
keep iron game souls dark
Topic 2:
game souls dogshit dark like
Topic 3:
creator tyler souls dark series
