In [7]:
!pip install torch



In [8]:
!pip install transformers



In [9]:
! pip install gensim



In [10]:
! pip install contractions



In [11]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel

import contractions
# For progress bars


In [66]:
from tqdm import tqdm

In [12]:
review_path = "data/raw_reviews.csv"

review_df = pd.read_csv(review_path)[["recommendationid", "Appname", "review_text"]]
review_df.head()

Unnamed: 0,recommendationid,Appname,review_text
0,212664845,ARC Raiders,Addictive. Stressful. Time waster.
1,212664820,ARC Raiders,fuak arc\r\n
2,212664759,ARC Raiders,W
3,212664729,ARC Raiders,awesome game!
4,212664705,ARC Raiders,If the Steam comments section is like every ot...


# Part 2: Feature Generation

In [87]:
output_path = 'data/review_tokenized.csv'

review_df = pd.read_csv(output_path)
review_df.head(5)

Unnamed: 0,recommendationid,Appname,review_cleaned
0,212664845,ARC Raiders,addictive stressful time waster
1,212664705,ARC Raiders,steam comment section like every comment secti...
2,212664692,ARC Raiders,like gathering sneak around arc pvp part peopl...
3,212664560,ARC Raiders,well make game every time hop experience somet...
4,212664471,ARC Raiders,think would sweaty honestly somehow stop playi...


## TF-IDF

In [16]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp312-cp312-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.7.2-cp312-cp312-macosx_12_0_arm64.whl (8.6 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scikit-learn
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [scikit-learn][0m [scikit-learn]
[1A[2KSuccessfully installed scikit-learn-1.7.2 threadpoolctl-3.6.0


In [17]:
# import tfidf helper
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
def compute_tfidf_sklearn(docs, max_features=300, ngram_range=(1,1), stop_words=None):
    """
    Compute TF-IDF scores for a list of documents.

    Args:
        docs (list of str): Input text documents.
        max_features (int): Maximum number of terms to keep (default=300).
        ngram_range (tuple): N-gram range, e.g. (1,2) for unigrams+bigrams.
        stop_words (str or list): 'english' for built-in stopwords, or a list of stopwords.

    Returns:
        pd.DataFrame: TF-IDF score matrix [n_docs x vocab_size].
    """
    vectorizer = TfidfVectorizer(max_features=max_features,
                                 ngram_range=ngram_range,
                                 stop_words=stop_words)

    X_tfidf = vectorizer.fit_transform(docs)
    features = vectorizer.get_feature_names_out()

    return pd.DataFrame(X_tfidf.toarray(), columns=features)

In [23]:
## compute tf idf for these docs
raw_review = review_df['review_cleaned'].to_list() ## need input to tfidf in list
tfidf_raw_sklearn = compute_tfidf_sklearn(raw_review, max_features=500, ngram_range=(1,1))
tfidf_raw_sklearn.head()

Unnamed: 0,ability,able,absolute,absolutely,access,achievement,act,action,actual,actually,...,world,worth,would,write,wrong,yeah,year,yes,yet,zombie
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074768,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.248872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.261403,0.0,0.220695,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
## merge reviews df with tfidf info
status_tfidf = pd.merge(left=review_df,
right=tfidf_raw_sklearn,
left_index=True,
right_index=True
)
print(status_tfidf.shape)

(48662, 503)


### Results

In [25]:
## Write tf-idf results
output_path = "data/tfidf.csv"
status_tfidf.to_csv(output_path, index=False)

status_tfidf = pd.read_csv(output_path)
status_tfidf.head(5)

Unnamed: 0,recommendationid,Appname,review_cleaned,ability,able,absolute,absolutely,access,achievement,act,...,world,worth,would,write,wrong,yeah,year,yes,yet,zombie
0,212664845,ARC Raiders,addictive stressful time waster,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,212664705,ARC Raiders,steam comment section like every comment secti...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,212664692,ARC Raiders,like gathering sneak around arc pvp part peopl...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.248872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,212664560,ARC Raiders,well make game every time hop experience somet...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,212664471,ARC Raiders,think would sweaty honestly somehow stop playi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.261403,0.0,0.220695,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## TF-IDF By Game

In [88]:
!pip install scikit-learn



In [89]:
# import tfidf helper
from sklearn.feature_extraction.text import TfidfVectorizer

In [90]:
def compute_tfidf_sklearn(docs, max_features=300, ngram_range=(1,1), stop_words=None):
    """
    Compute TF-IDF scores for a list of documents.

    Args:
        docs (list of str): Input text documents.
        max_features (int): Maximum number of terms to keep (default=300).
        ngram_range (tuple): N-gram range, e.g. (1,2) for unigrams+bigrams.
        stop_words (str or list): 'english' for built-in stopwords, or a list of stopwords.

    Returns:
        pd.DataFrame: TF-IDF score matrix [n_docs x vocab_size].
    """
    vectorizer = TfidfVectorizer(max_features=max_features,
                                 ngram_range=ngram_range,
                                 stop_words=stop_words)

    X_tfidf = vectorizer.fit_transform(docs)
    features = vectorizer.get_feature_names_out()

    return pd.DataFrame(X_tfidf.toarray(), columns=features)

In [94]:
game_df = review_df.groupby(['Appname'], as_index = False).agg({'review_cleaned': ' '.join})

In [95]:
## compute tf idf for these docs
raw_review = game_df['review_cleaned'].to_list() ## need input to tfidf in list
tfidf_raw_sklearn = compute_tfidf_sklearn(raw_review, max_features=500, ngram_range=(1,1))
tfidf_raw_sklearn.head()

Unnamed: 0,ability,able,absolute,absolutely,access,achievement,act,action,actual,actually,...,world,worth,would,write,wrong,yeah,year,yes,yet,zombie
0,0.002579,0.008978,0.0,0.015272,0.017722,0.014897,0.003287,0.008415,0.002805,0.017454,...,0.022535,0.029694,0.029455,0.008309,0.005223,0.0,0.077912,0.004544,0.004453,0.519725
1,0.008647,0.007525,0.001633,0.007315,0.007922,0.003996,0.006613,0.003762,0.007524,0.013166,...,0.01511,0.01991,0.042322,0.007428,0.014009,0.001811,0.008707,0.007617,0.005971,0.0
2,0.005507,0.026354,0.007797,0.009316,0.0,0.0,0.007018,0.002994,0.005989,0.030277,...,0.019245,0.018113,0.044918,0.005913,0.002788,0.005765,0.050825,0.007275,0.00713,0.0
3,0.0,0.01437,0.019485,0.010477,0.009455,0.0,0.0,0.0,0.00449,0.017461,...,0.028856,0.027159,0.067353,0.0,0.00836,0.004322,0.013856,0.0,0.017818,0.009453
4,0.004972,0.002163,0.014081,0.018928,0.014236,0.002872,0.0,0.0,0.010816,0.016825,...,0.060823,0.018401,0.05273,0.008009,0.0,0.002603,0.020862,0.01314,0.015022,0.0


In [96]:
## merge reviews df with tfidf info
status_tfidf = pd.merge(left=game_df,
right=tfidf_raw_sklearn,
left_index=True,
right_index=True
)
print(status_tfidf.shape)

(247, 502)


### Results

In [97]:
## Write tf-idf results
output_path = "data/tfidf_game.csv"
status_tfidf.to_csv(output_path, index=False)

status_tfidf = pd.read_csv(output_path)
status_tfidf.head(5)

Unnamed: 0,Appname,review_cleaned,ability,able,absolute,absolutely,access,achievement,act,action,...,world,worth,would,write,wrong,yeah,year,yes,yet,zombie
0,7 Days to Die,game bad animation sound possible like prototy...,0.002579,0.008978,0.0,0.015272,0.017722,0.014897,0.003287,0.008415,...,0.022535,0.029694,0.029455,0.008309,0.005223,0.0,0.077912,0.004544,0.004453,0.519725
1,A Total War Saga: TROY,good like mom one best total war title bad myt...,0.008647,0.007525,0.001633,0.007315,0.007922,0.003996,0.006613,0.003762,...,0.01511,0.01991,0.042322,0.007428,0.014009,0.001811,0.008707,0.007617,0.005971,0.0
2,ARC Raiders,addictive stressful time waster steam comment ...,0.005507,0.026354,0.007797,0.009316,0.0,0.0,0.007018,0.002994,...,0.019245,0.018113,0.044918,0.005913,0.002788,0.005765,0.050825,0.007275,0.00713,0.0
3,ARK: Survival Ascended,love quite easily play ase hour several map pl...,0.0,0.01437,0.019485,0.010477,0.009455,0.0,0.0,0.0,...,0.028856,0.027159,0.067353,0.0,0.00836,0.004322,0.013856,0.0,0.017818,0.009453
4,Abiotic Factor,one survival game understand fun enemy get har...,0.004972,0.002163,0.014081,0.018928,0.014236,0.002872,0.0,0.0,...,0.060823,0.018401,0.05273,0.008009,0.0,0.002603,0.020862,0.01314,0.015022,0.0


## GloVE

### Dictionary

In [27]:
# create a corpus (a list of strings) of all cleaned reviews (lemmas)
reviews = review_df['review_cleaned'].tolist()
print("The number of reviews: ", len(reviews))
print("The data type of reviews: ", type(reviews))
print("The data type of the first review:", type(reviews[0]))
reviews[0]

The number of reviews:  48662
The data type of reviews:  <class 'list'>
The data type of the first review: <class 'str'>


'addictive stressful time waster'

In [28]:
import gensim
from gensim import corpora

# create our list of lists corpus (a list of lists of tokens)
corpus = [doc.split() for doc in reviews]
print(*corpus[0])

addictive stressful time waster


In [29]:
# create a corpora dictionary
dictionary = gensim.corpora.Dictionary(corpus)

In [32]:
## get one-percent cutoff
len(reviews) / 100
one_percent_cutoff = 486

In [33]:
## filter tokens to just top one percent
token_list = [dictionary.get(token_id) for token_id, count in dictionary.dfs.items() if count > one_percent_cutoff]
print(len(token_list))
print(token_list[:3]) # See the list of tokens

406
['time', 'steam', 'like']


In [34]:
review_token_list = [word.split(' ') for word in reviews]
print(*review_token_list[0])

addictive stressful time waster


In [35]:
token_set = set(token_list)
# Use a list comprehension to process the entire list at once
reduced_token_list = [[word for word in item if word in token_set] for item in review_token_list]
## filtering out words that are in less than one percent of documents

In [37]:
reduced_token_list[0:10]

[['time'],
 ['steam',
  'like',
  'every',
  'game',
  'expect',
  'get',
  'review',
  'ruin',
  'solo',
  'play',
  'see',
  'genre',
  'maybe',
  'mode',
  'well',
  'hard',
  'say',
  'fix',
  'much',
  'make',
  'game',
  'interesting',
  'however',
  'game',
  'make',
  'long',
  'first',
  'anyone',
  'try',
  'every',
  'player',
  'run',
  'cool',
  'guy',
  'even',
  'free',
  'weapon',
  'game',
  'little',
  'feel',
  'like',
  'require',
  'strategy',
  'help',
  'many',
  'either',
  'never',
  'anything',
  'area',
  'open',
  'line',
  'fun',
  'every',
  'area',
  'want',
  'make',
  'fact',
  'anything',
  'amount',
  'time',
  'craft',
  'let',
  'alone',
  'item',
  'may',
  'well',
  'run',
  'free',
  'item',
  'actually',
  'matter',
  'many',
  'solo',
  'play',
  'game',
  'instead',
  'want',
  'map',
  'try',
  'others',
  'back',
  'gameplay'],
 ['like',
  'around',
  'part',
  'people',
  'kill',
  'nothing',
  'start',
  'level',
  'game',
  'make',
  'wor

### GloVe Word Embeddings

In [38]:
import gensim.downloader
# Show all available models in gensim-dataIn [7]:In [8]:In [ ]:
print(*list(gensim.downloader.info()['models'].keys()), sep='\n')

fasttext-wiki-news-subwords-300
conceptnet-numberbatch-17-06-300
word2vec-ruscorpora-300
word2vec-google-news-300
glove-wiki-gigaword-50
glove-wiki-gigaword-100
glove-wiki-gigaword-200
glove-wiki-gigaword-300
glove-twitter-25
glove-twitter-50
glove-twitter-100
glove-twitter-200
__testing_word2vec-matrix-synopsis


In [39]:
glove_vectors = gensim.downloader.load('glove-twitter-100')

[===-----------------------------------------------] 6.1% 23.4/387.1MB downloaded

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





In [40]:
# rename the object to be a model
glove_model = glove_vectors

In [41]:
# get pretrained vectors for words that are in the model
vector_list = [
glove_model[word] for word in token_list
if word in glove_model.key_to_index
]

In [42]:
## Get word list for words that are in the model
words_filtered = [
  word for word in token_list
  if word in glove_model.key_to_index
  ]
words_filtered_count = len(words_filtered)
print(words_filtered_count)
print(words_filtered)

406
['time', 'steam', 'like', 'every', 'game', 'expect', 'get', 'review', 'ruin', 'solo', 'play', 'see', 'genre', 'maybe', 'mode', 'well', 'hard', 'say', 'fix', 'much', 'make', 'interesting', 'however', 'long', 'first', 'anyone', 'try', 'player', 'run', 'cool', 'guy', 'even', 'free', 'weapon', 'little', 'feel', 'require', 'strategy', 'help', 'many', 'either', 'never', 'anything', 'area', 'open', 'line', 'fun', 'want', 'fact', 'amount', 'craft', 'let', 'alone', 'item', 'may', 'actually', 'matter', 'instead', 'map', 'others', 'back', 'gameplay', 'around', 'part', 'people', 'kill', 'nothing', 'start', 'level', 'worth', 'despite', 'hour', 'experience', 'something', 'completely', 'new', 'think', 'would', 'honestly', 'stop', 'playing', 'though', 'end', 'world', 'good', 'pick', 'incredible', 'graphic', 'almost', 'year', 'big', 'fan', 'must', 'one', 'best', 'ever', 'amazing', 'feature', 'since', 'come', 'wish', 'love', 'turn', 'cannot', 'could', 'bring', 'money', 'great', 'community', 'bad', '

In [43]:
# zip the vector list and the word list
glove_zip = zip(words_filtered, vector_list)
glove_zip

<zip at 0x39becfa00>

In [44]:
# turn the zip into a dataframe
glove_dict = dict(glove_zip)
df_word = pd.DataFrame.from_dict(glove_dict, orient='index')
print(df_word.head(5))

             0         1         2        3         4        5         6   \
time  -0.040258 -0.023896  0.739960 -0.67700  0.221240  0.66331  0.615000   
steam  0.084174 -0.007851 -0.044732 -0.11360  0.831620  0.52398 -0.618860   
like  -0.210630 -0.010992 -0.175520  0.47233 -0.217380  0.41698  0.159160   
every  0.080694 -0.033576  0.257730  0.22087  0.149360  0.23916  0.554930   
game   0.267110  0.306850  0.287650 -0.51790 -0.030231  1.00190  0.034737   

             7         8         9   ...        90        91        92  \
time  -0.184020  0.353980  0.206980  ...  0.001430  0.303030 -0.263860   
steam -0.347390  0.716310 -0.245970  ... -1.289600 -0.262710  0.045145   
like  -0.048584 -0.113570  0.252440  ...  0.075953  0.075035 -0.468350   
every  0.072094 -0.054858 -0.084905  ...  0.541120  0.163770  0.387860   
game   0.407150  0.135380  0.405360  ... -1.046000  0.144130 -0.894590   

             93       94        95        96        97       98        99  
time  -0.310200 

In [45]:
# calculates an average of all word vectors within a document
def document_vector(model, doc):
  return np.mean(model[doc], axis =0)

In [46]:
## filtering out words that aren't in the model
reviews_clean = [[word for word in status.split() if word in glove_model.key_to_index]
for status in reviews]
reviews_clean[0]

['addictive', 'stressful', 'time', 'waster']

In [47]:
## run each cleaned review through the model
## if the document has length of 0, replace that vector with a vector of 0s
## eventually, we'd prune out those rows with vectors of all zero because they are not useful
status_vectors_a = np.array(
[document_vector(glove_model, doc)
if  len(doc) > 0 else np.zeros(100)
for doc in reviews_clean])
# review_vectors_a

In [48]:
status_vectors_a

array([[ 0.003068  , -0.4952465 ,  0.27879751, ...,  0.162469  ,
        -0.13253775, -0.62061995],
       [ 0.081705  ,  0.075504  , -0.02672468, ..., -0.0807679 ,
         0.02408937,  0.01680799],
       [-0.05272999,  0.09562876,  0.08474381, ..., -0.12332678,
         0.12654392, -0.05947653],
       ...,
       [-0.0411425 , -0.09535751, -0.0294    , ...,  0.1378375 ,
        -0.02903075, -0.53593051],
       [ 0.01746473,  0.10137194,  0.16459236, ...,  0.02780686,
         0.03214663, -0.16177617],
       [ 0.15610944,  0.146854  ,  0.10446522, ..., -0.12248405,
         0.10969622, -0.14634635]], shape=(48662, 100))

In [49]:

## make a column for each element in the word embeddings

# 101 makes 100 vector labels
num_tokens = 101
labels = ['GLWE' + str(i) for i in range (1, num_tokens)]

# create df with index, then each other column is the doc embeddings
glwe_df = pd.DataFrame(status_vectors_a,
index=review_df.index.tolist(),
columns=labels)

# display
print(glwe_df.head())
print(glwe_df.shape)


      GLWE1     GLWE2     GLWE3     GLWE4     GLWE5     GLWE6     GLWE7  \
0  0.003068 -0.495246  0.278798 -0.299187  0.370652  0.436277  0.461215   
1  0.081705  0.075504 -0.026725  0.040429  0.044176  0.201207  0.229808   
2 -0.052730  0.095629  0.084744 -0.012630 -0.121921  0.384559  0.178765   
3  0.214481  0.091914  0.124746 -0.123716 -0.018185  0.440613  0.456368   
4 -0.110131  0.164658  0.092555 -0.114117 -0.185766  0.188016  0.214819   

      GLWE8     GLWE9    GLWE10  ...    GLWE91    GLWE92    GLWE93    GLWE94  \
0 -0.585365  0.123820 -0.023682  ... -0.033881  0.607760  0.147187  0.026437   
1 -0.152356  0.075710  0.060329  ... -0.216757  0.209657 -0.028605 -0.062276   
2 -0.093564 -0.078872  0.207186  ... -0.244610  0.192405  0.018402 -0.119586   
3 -0.210375  0.124840  0.027385  ... -0.097712  0.352841 -0.035521 -0.250624   
4  0.104832 -0.173306  0.298742  ... -0.009167  0.074861 -0.070553 -0.004708   

     GLWE95    GLWE96    GLWE97    GLWE98    GLWE99   GLWE100  
0 -0

In [50]:
## join the embeddings back to the reviews
status_glwe = pd.merge(left=review_df,
right=glwe_df,
left_index=True,
right_index=True
)

print(status_glwe.shape)
status_glwe.head(3)

(48662, 103)


Unnamed: 0,recommendationid,Appname,review_cleaned,GLWE1,GLWE2,GLWE3,GLWE4,GLWE5,GLWE6,GLWE7,...,GLWE91,GLWE92,GLWE93,GLWE94,GLWE95,GLWE96,GLWE97,GLWE98,GLWE99,GLWE100
0,212664845,ARC Raiders,addictive stressful time waster,0.003068,-0.495246,0.278798,-0.299187,0.370652,0.436277,0.461215,...,-0.033881,0.60776,0.147187,0.026437,-0.038797,0.221966,-0.063209,0.162469,-0.132538,-0.62062
1,212664705,ARC Raiders,steam comment section like every comment secti...,0.081705,0.075504,-0.026725,0.040429,0.044176,0.201207,0.229808,...,-0.216757,0.209657,-0.028605,-0.062276,0.069856,0.042705,-0.035662,-0.080768,0.024089,0.016808
2,212664692,ARC Raiders,like gathering sneak around arc pvp part peopl...,-0.05273,0.095629,0.084744,-0.01263,-0.121921,0.384559,0.178765,...,-0.24461,0.192405,0.018402,-0.119586,0.005132,0.136383,0.020331,-0.123327,0.126544,-0.059477


### Results

In [52]:
## write out results
output_path = "data/glove.csv"
status_glwe.to_csv(output_path, index=False)

status_glwe = pd.read_csv(output_path)
status_glwe.head(5)

Unnamed: 0,recommendationid,Appname,review_cleaned,GLWE1,GLWE2,GLWE3,GLWE4,GLWE5,GLWE6,GLWE7,...,GLWE91,GLWE92,GLWE93,GLWE94,GLWE95,GLWE96,GLWE97,GLWE98,GLWE99,GLWE100
0,212664845,ARC Raiders,addictive stressful time waster,0.003068,-0.495246,0.278798,-0.299187,0.370652,0.436277,0.461215,...,-0.033881,0.60776,0.147187,0.026437,-0.038797,0.221966,-0.063209,0.162469,-0.132538,-0.62062
1,212664705,ARC Raiders,steam comment section like every comment secti...,0.081705,0.075504,-0.026725,0.040429,0.044176,0.201207,0.229808,...,-0.216757,0.209657,-0.028605,-0.062276,0.069856,0.042705,-0.035662,-0.080768,0.024089,0.016808
2,212664692,ARC Raiders,like gathering sneak around arc pvp part peopl...,-0.05273,0.095629,0.084744,-0.01263,-0.121921,0.384559,0.178765,...,-0.24461,0.192405,0.018402,-0.119586,0.005132,0.136383,0.020331,-0.123327,0.126544,-0.059477
3,212664560,ARC Raiders,well make game every time hop experience somet...,0.214481,0.091914,0.124746,-0.123716,-0.018185,0.440613,0.456368,...,-0.097712,0.352841,-0.035521,-0.250624,-0.049312,-0.129814,0.115082,0.023404,-0.025352,-0.241096
4,212664471,ARC Raiders,think would sweaty honestly somehow stop playi...,-0.110131,0.164658,0.092555,-0.114117,-0.185766,0.188016,0.214819,...,-0.009167,0.074861,-0.070553,-0.004708,-0.205371,-0.006124,-0.075116,0.109899,0.2821,0.00584


## Longformer

### Setup the BERT Tokenizer

In [85]:
review_path = "data/raw_reviews.csv"

review_df = pd.read_csv(review_path)[["recommendationid", "Appname", "review_text"]]
review_df.head()

Unnamed: 0,recommendationid,Appname,review_text
0,212664845,ARC Raiders,Addictive. Stressful. Time waster.
1,212664820,ARC Raiders,fuak arc\r\n
2,212664759,ARC Raiders,W
3,212664729,ARC Raiders,awesome game!
4,212664705,ARC Raiders,If the Steam comments section is like every ot...


In [86]:
review_df[
    review_df["review_text"].apply(lambda x: x == "review_text")
    ]

Unnamed: 0,recommendationid,Appname,review_text
300,recommendationid,Appname,review_text
601,recommendationid,Appname,review_text
902,recommendationid,Appname,review_text
1203,recommendationid,Appname,review_text
1603,recommendationid,Appname,review_text
...,...,...,...
68055,recommendationid,Appname,review_text
68356,recommendationid,Appname,review_text
68358,recommendationid,Appname,review_text
68659,recommendationid,Appname,review_text


In [69]:
# review_df['review_text']
review_df['review_text'] = review_df['review_text'].astype('str')

In [70]:
# review_df.loc((review_df['review_text'].str.len() > 0)).shape
review_df = review_df[
    review_df["review_text"].apply(lambda x: len(x) > 20)
    ]
review_df.shape

(48713, 3)

In [71]:
import torch
from transformers import LongformerTokenizer, LongformerModel

In [72]:
# Load pre-trained model tokenizer
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
# Load pre-trained model
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')

### Calculate embeddings for longformer BERT

In [73]:
# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
# Move the model to the selected device
model.to(device)
# Ensure the model is in evaluation mode
model.eval()

Using device: cpu


LongformerModel(
  (embeddings): LongformerEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (position_embeddings): Embedding(4098, 768, padding_idx=1)
  )
  (encoder): LongformerEncoder(
    (layer): ModuleList(
      (0-11): 12 x LongformerLayer(
        (attention): LongformerAttention(
          (self): LongformerSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (query_global): Linear(in_features=768, out_features=768, bias=True)
            (key_global): Linear(in_features=768, out_features=768, bias=True)
            (value_global): Linear(in_features=768, out_features=768, bias=True)
          )
    

In [74]:
# For progress bars
# from tqdm import tqdm
def generate_embeddings_longformer(df, batch_size=16):
  embeddings = [] # To store embeddings
  # Process in batches
  for i in tqdm(range(0, len(df), batch_size)):
    batch = df[i:i+batch_size]
    inputs = tokenizer(
      batch.tolist(),
      padding=True, # Pad texts to the same length
      truncation=True, # Truncate to max_length
      add_special_tokens=True, # Add [CLS] and [SEP]
      return_tensors="pt", # Return PyTorch tensors
      max_length=4096, # set to longformer max length
      return_attention_mask=True #Generate attention masks
      )

    # Create a global attention mask with the same shape as the input_ids
    # Set global attention on the first token ([CLS] token)
    global_attention_mask = torch.zeros_like(inputs['input_ids'])
    global_attention_mask[:, 0] = 1
    # Move inputs to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}
    #the model is in inference mode, not training model
    # so no gradients are computed
    with torch.no_grad():
      outputs = model(**inputs) #unpack input

    # Use pooled_output for sentence-level embeddings
    batch_embeddings = outputs.pooler_output.detach().cpu().numpy()
    embeddings.append(batch_embeddings)

  # Concatenate all batch embeddings
  embeddings = np.concatenate(embeddings, axis=0)
  return embeddings

In [75]:
# load data with a different name, make sure using a different pd variable
review_df_long = review_df.copy()
review_df_long.head()

Unnamed: 0,recommendationid,Appname,review_text
0,212664845,ARC Raiders,Addictive. Stressful. Time waster.
4,212664705,ARC Raiders,If the Steam comments section is like every ot...
5,212664692,ARC Raiders,I like the gathering and sneaking around the A...
6,212664560,ARC Raiders,"Very well made game, every time I hop on I exp..."
7,212664471,ARC Raiders,I thought this would be too sweaty for me. Hon...


In [77]:
%%time
# gnerating embedings in batches
embeddings = generate_embeddings_longformer(review_df_long['review_text'],
batch_size=16)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3045/3045 [23:14:24<00:00, 27.48s/it]

CPU times: user 8h 25min 58s, sys: 5h 49s, total: 13h 26min 47s
Wall time: 23h 14min 24s





In [78]:
longformer_df = pd.DataFrame(embeddings)
longformer_df.shape

(48713, 768)

In [79]:
print(longformer_df.head(3))

        0         1         2         3         4         5         6    \
0  0.160998 -0.314331  0.128970 -0.002612  0.363178 -0.224010 -0.496514   
1  0.171792 -0.321767  0.087026 -0.013351  0.341068 -0.235470 -0.495652   
2  0.172855 -0.294004  0.098145 -0.008861  0.330265 -0.219347 -0.502270   

        7         8         9    ...       758       759       760       761  \
0 -0.436305 -0.123515 -0.263082  ... -0.242919  0.057376  0.166702  0.129182   
1 -0.378920 -0.132720 -0.231967  ... -0.243866  0.088027  0.174401  0.125420   
2 -0.400064 -0.099364 -0.246443  ... -0.233194  0.068008  0.172816  0.134450   

        762       763       764       765       766       767  
0 -0.262234  0.042280 -0.297840 -0.106902  0.149733  0.019978  
1 -0.211613  0.038621 -0.270587 -0.055174  0.198233  0.028517  
2 -0.217051  0.045791 -0.247482 -0.063542  0.169679  0.000969  

[3 rows x 768 columns]


### Save output and display

In [None]:
## pickle model in case
longformer_df.to_pickle(f"{midterm_path}/df_longformer.pkl")

In [80]:
## join the reviews to the embeddings
df_longformer = pd.concat(
[review_df_long.reset_index(drop=True), longformer_df],
axis=1)

In [81]:
print(df_longformer.head(3))

  recommendationid      Appname  \
0        212664845  ARC Raiders   
1        212664705  ARC Raiders   
2        212664692  ARC Raiders   

                                         review_text         0         1  \
0                Addictive.  Stressful. Time waster.  0.160998 -0.314331   
1  If the Steam comments section is like every ot...  0.171792 -0.321767   
2  I like the gathering and sneaking around the A...  0.172855 -0.294004   

          2         3         4         5         6  ...       758       759  \
0  0.128970 -0.002612  0.363178 -0.224010 -0.496514  ... -0.242919  0.057376   
1  0.087026 -0.013351  0.341068 -0.235470 -0.495652  ... -0.243866  0.088027   
2  0.098145 -0.008861  0.330265 -0.219347 -0.502270  ... -0.233194  0.068008   

        760       761       762       763       764       765       766  \
0  0.166702  0.129182 -0.262234  0.042280 -0.297840 -0.106902  0.149733   
1  0.174401  0.125420 -0.211613  0.038621 -0.270587 -0.055174  0.198233   
2  0.172

In [82]:
## write results
output_path = 'data/longformer.csv'
df_longformer.to_csv(output_path, index=False)

In [83]:
## read results back in case
df_longformer = pd.read_csv(output_path)
df_longformer.head()

Unnamed: 0,recommendationid,Appname,review_text,0,1,2,3,4,5,6,...,758,759,760,761,762,763,764,765,766,767
0,212664845,ARC Raiders,Addictive. Stressful. Time waster.,0.160998,-0.314331,0.12897,-0.002612,0.363178,-0.22401,-0.496514,...,-0.242919,0.057376,0.166702,0.129182,-0.262234,0.04228,-0.29784,-0.106902,0.149733,0.019978
1,212664705,ARC Raiders,If the Steam comments section is like every ot...,0.171792,-0.321767,0.087026,-0.013351,0.341068,-0.23547,-0.495652,...,-0.243866,0.088027,0.174401,0.12542,-0.211613,0.038621,-0.270587,-0.055174,0.198233,0.028517
2,212664692,ARC Raiders,I like the gathering and sneaking around the A...,0.172855,-0.294004,0.098145,-0.008861,0.330265,-0.219347,-0.50227,...,-0.233194,0.068008,0.172816,0.134451,-0.217051,0.045791,-0.247482,-0.063542,0.169679,0.000969
3,212664560,ARC Raiders,"Very well made game, every time I hop on I exp...",0.192783,-0.339317,0.094364,0.009529,0.344692,-0.200341,-0.490265,...,-0.261185,0.022664,0.213612,0.160738,-0.22349,0.02688,-0.2638,-0.075613,0.200625,0.042513
4,212664471,ARC Raiders,I thought this would be too sweaty for me. Hon...,0.166418,-0.311148,0.098709,-0.004635,0.343317,-0.196987,-0.491329,...,-0.209657,0.029193,0.16678,0.151393,-0.2231,0.029565,-0.26766,-0.06099,0.182768,0.028096


In [84]:
df_longformer[
    df_longformer["review_text"].apply(lambda x: x == "review_text")
    ]

Unnamed: 0,recommendationid,Appname,review_text,0,1,2,3,4,5,6,...,758,759,760,761,762,763,764,765,766,767


# Part 3: Reflection