## Sentiment Analysis

In [18]:
import pandas as pd
import numpy as np

games = pd.read_csv('../data/processed/games_cleaned.csv')

In [35]:
from transformers import pipeline
classifier = pipeline(
    "text-classification", 
    model="j-hartmann/emotion-english-distilroberta-base", 
    top_k = None,
    truncation=True,  # Automatically truncates to fit the model
    device="cuda",
    framework="pt"
)

Device set to use cuda


In [36]:
sentences = games['about_the_game'][0].split(".")
predictions = classifier(sentences)
predictions[0]

[{'label': 'neutral', 'score': 0.7835099101066589},
 {'label': 'joy', 'score': 0.11311975121498108},
 {'label': 'surprise', 'score': 0.06659185886383057},
 {'label': 'disgust', 'score': 0.01789470575749874},
 {'label': 'anger', 'score': 0.009464814327657223},
 {'label': 'sadness', 'score': 0.005986660718917847},
 {'label': 'fear', 'score': 0.003432320198044181}]

In [37]:
emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
app_id = []
emotion_scores = {label: [] for label in emotion_labels}

def calculate_max_emotion_scores(predictions):
    per_emotion_scores = {label: [] for label in emotion_labels}
    for prediction in predictions:
        sorted_predictions = sorted(prediction, key=lambda x: x["label"])
        for index, label in enumerate(emotion_labels):
            per_emotion_scores[label].append(sorted_predictions[index]["score"])
    return {label: np.max(scores) for label, scores in per_emotion_scores.items()}

In [38]:
from tqdm import tqdm

emotion_labels = ["anger", "disgust", "fear", "joy", "sadness", "surprise", "neutral"]
app_id = []
emotion_scores = {label: [] for label in emotion_labels}

for i in tqdm(range(len(games))):
    app_id.append(games["app_id"][i])
    sentences = games['about_the_game'][i].split(".")
    predictions = classifier(sentences)
    max_scores = calculate_max_emotion_scores(predictions)
    for label in emotion_labels:
        emotion_scores[label].append(max_scores[label])

100%|██████████| 90130/90130 [1:09:49<00:00, 21.52it/s]  


In [39]:
emotions_df = pd.DataFrame(emotion_scores)
emotions_df["app_id"] = app_id
emotions_df

Unnamed: 0,anger,disgust,fear,joy,sadness,surprise,neutral,app_id
0,0.019282,0.017895,0.004198,0.816362,0.947172,0.023381,0.150551,20200
1,0.913658,0.104007,0.435311,0.198899,0.916418,0.743380,0.307450,655370
2,0.168709,0.020957,0.006915,0.549593,0.944257,0.007979,0.253707,1732930
3,0.095404,0.333308,0.816178,0.017172,0.886388,0.032531,0.167621,1355720
4,0.170290,0.104007,0.238217,0.920194,0.965194,0.200422,0.078766,1139950
...,...,...,...,...,...,...,...,...
90125,0.102202,0.708776,0.061535,0.947020,0.948472,0.149724,0.368772,3080940
90126,0.051505,0.006538,0.028494,0.935088,0.375939,0.014690,0.058292,2593970
90127,0.454001,0.301669,0.794739,0.187950,0.967745,0.111690,0.285848,3137150
90128,0.044632,0.160773,0.981339,0.942352,0.877451,0.340593,0.253688,3124670


In [None]:
games = pd.merge(games, emotions_df, on='app_id')

In [42]:
games.to_csv('../data/processed/games_with_emotions.csv', index=False)