In [3]:
import steamreviews
import pandas as pd
import numpy as np
import re
import os

In [4]:
## For: Fetching reviews by titleID with Steam API ##
# Request parameters for the (currently disabled) batch download call at the
# bottom of this cell.
request_params = {'language': 'english'}

### AppIDs ###
# Reference list of known titles; only the ones listed in app_id are used.
# Elden Ring: 1245620
# Ember Knights: 1135230
app_id = [
    570940,  # Dark Souls, Remaster
    236430,  # Dark Souls 2
    335300,  # Dark Souls 2, Scholar
    374320,  # Dark Souls 3
    814380,  # Sekiro GOTY
]
#review_dict = steamreviews.download_reviews_for_app_id_batch(app_id, chosen_request_params=request_params)


In [None]:
# Reviews from SteamAPI are in JSON format
# List all JSON files in the 'data' directory. os.listdir returns names in
# arbitrary order, so sort them to make the row order of df reproducible.
json_files = sorted(name for name in os.listdir('data') if name.endswith('.json'))

# Read every file first and concatenate once. Calling pd.concat inside the
# loop copies all previously loaded rows again on each iteration.
frames = [pd.read_json(os.path.join('data', name)) for name in json_files]

# pd.concat raises on an empty list, so fall back to an empty DataFrame when
# the directory contains no JSON files.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [6]:
# Summarise the combined frame: columns, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338575 entries, 0 to 338574
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   reviews        338526 non-null  object
 1   query_summary  42 non-null      object
 2   cursors        7 non-null       object
dtypes: object(3)
memory usage: 7.7+ MB


In [7]:
# Only keep rows where 'reviews' is a dictionary.
# Filtering leaves gaps in the index, so reset it to 0..n-1. Otherwise the
# column-wise concat below (which aligns on index labels) would pair rows with
# the wrong review fields and pad the rest with NaN.
df = df[df['reviews'].apply(lambda x: isinstance(x, dict))].reset_index(drop=True)

# Flatten each review dict into its own columns (nested keys become dotted
# names such as 'author.steamid'). Passing a plain list guarantees the result
# has a fresh 0..n-1 index that lines up with df.
expanded_reviews_df = pd.json_normalize(df['reviews'].tolist())

# Attach the flattened columns side by side and drop the raw 'reviews'
# column, which is no longer needed.
df = pd.concat([df, expanded_reviews_df], axis=1).drop(columns=['reviews'])

# Save the flattened dataframe to a CSV
df.to_csv('data/reviews_from_json.csv')

In [11]:
# Preview the first rows of df
df.head()

Unnamed: 0,query_summary,cursors,recommendationid,language,review,timestamp_created,timestamp_updated,voted_up,votes_up,votes_funny,...,steam_china_location,author.steamid,author.num_games_owned,author.num_reviews,author.playtime_forever,author.playtime_last_two_weeks,author.playtime_at_review,author.last_played,timestamp_dev_responded,developer_response
5,,,163599277,english,Came in expecting a mid game. Come out being i...,1714104000.0,1714104000.0,True,0.0,0.0,...,,76561198283741884,0.0,18.0,434.0,434.0,218.0,1714184000.0,,
6,,,163582001,english,Would be nice if there where more maps,1714077000.0,1714077000.0,True,0.0,0.0,...,,76561198146019791,29.0,3.0,1369.0,1304.0,957.0,1714267000.0,,
7,,,163416228,english,i love these flamey lil guys :3,1713838000.0,1713838000.0,True,0.0,0.0,...,,76561198873912658,0.0,12.0,1870.0,1870.0,1420.0,1714183000.0,,
8,,,163359938,english,this game is so good it is hard but fun at the...,1713757000.0,1713757000.0,True,0.0,0.0,...,,76561198251814512,0.0,1.0,531.0,1.0,531.0,1713587000.0,,
9,,,163351625,english,"I got this game free at PAX 2024, not only did...",1713744000.0,1713744000.0,True,1.0,0.0,...,,76561198059454624,0.0,2.0,1298.0,874.0,924.0,1713826000.0,,


In [10]:
# Discard rows that have no review text
df = df.dropna(subset=['review'])

# Restrict to rows whose language is 'english'
df = df[df['language'] == 'english']

# Drops columns: query_summary, cursors, recommendationid, timestamp_created
# timestamp_updated, hidden_in_steam_china, steam_china_location, author.steamid
#df = df.drop(df.columns[[0, 1, 2, 5, 6, 15, 16, 17]], axis=1)


def _is_substantive(text):
    """Return True when the text has more than five word tokens and at least one ASCII letter."""
    text = str(text)
    word_count = len(re.findall(r'\b\w+\b', text))
    return word_count > 5 and bool(re.search('[a-zA-Z]', text))


# Boolean mask: True for reviews with more than five words and at least one alphabetic character
mask = df['review'].apply(_is_substantive)

# Keep only the reviews that pass the mask
df = df[mask]

In [12]:
### For: ABSA Data preparation tool ###
# Review text only (a Series taken from the 'review' column)
df_reviews_only = df['review']

# Draw up to 1000 reviews. Sampling without replacement raises ValueError when
# more rows are requested than exist, so cap the request at the available count.
# A fixed random_state makes the exported sample the same on every run.
reviews_only_sample = df_reviews_only.sample(
    n=min(1000, len(df_reviews_only)),
    random_state=42,
)
reviews_only_sample.to_csv('data/reviews_only_sample.csv', index=False)

In [19]:
## For: Annotation program ##

# Target number of reviews in the final sample
n = 15000

# Only take reviews of minimum length (at least 100 characters)
df_filtered = df[df['review'].str.len() >= 100]

# Of those, only take reviews with a helpful score > 1.
# This must filter df_filtered, not df; filtering df again would silently
# discard the minimum-length filter above.
# NOTE(review): 'helpful' and 'title' do not appear in the (truncated) df.head()
# preview earlier in the notebook - confirm they exist in the frame used here.
df_filtered = df_filtered[df_filtered['helpful'] > 1]

# Share of the filtered reviews that belongs to each title
title_distribution = df_filtered['title'].value_counts(normalize=True)

sampled_reviews = []
for title, proportion in title_distribution.items():
    title_reviews = df_filtered[df_filtered['title'] == title]
    # Proportional quota per title, capped at the rows actually available:
    # when fewer than n reviews survive the filters the quota would exceed the
    # pool and sampling without replacement would raise ValueError.
    n_samples = min(int(proportion * n), len(title_reviews))
    # replace=False ensures no review is drawn twice; the fixed random_state
    # makes the draw repeatable.
    sampled_reviews.append(title_reviews.sample(n=n_samples, replace=False, random_state=42))
final_sample = pd.concat(sampled_reviews)

# Shuffle so the reviews are not grouped by title in the output file
final_sample = final_sample.sample(frac=1, random_state=42)
final_sample.to_csv('data/sample_reviews.csv')

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the annotated reviews CSV, keeping the slice selected by "train[:70%]"
reviews_dataset = load_dataset(
    'csv',
    data_files='data/annotated_reviews.csv',
    split="train[:70%]",
)

In [None]:
## SetFitABSA test
!pip install "setfit[absa]"
!spacy download en_core_web_lg
from setfit import AbsaTrainer, AbsaModel

model = AbsaModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

trainer = AbsaTrainer(model, train_dataset=reviews_dataset)
trainer.train()

In [None]:
# Persist the trained model. It is composed of 2 submodels, so two output
# directories are given, one per submodel.
aspect_model_dir = "models/setfit-absa-model-aspect"
polarity_model_dir = "models/setfit-absa-model-polarity"
model.save_pretrained(aspect_model_dir, polarity_model_dir)

In [None]:
# E.g. run the model on a few hand-written example reviews
example_reviews = [
    "Amazing graphics and satisfying combat mechanics.",
    "Poor performance and sluggish controls make this a slog to play",
    "Unfortunately, the lazy developers only do the bare minimum and stuff the game full of horrible microtransactions",
]

preds = model.predict(example_reviews)