In [1]:
from pathlib import Path

# Input data: the games CSV lives under the raw-data directory.
data_dir = Path("../data/raw")
csv_path = data_dir.joinpath("games.csv")

# Model artifacts: location of the saved Doc2Vec model.
models_path = Path("../models")
doc2vec_path = models_path.joinpath("doc2vec_trained")


import pandas as pd

# Read the whole games CSV into one DataFrame; the cells below cast columns
# of this frame and attach new vector columns to it.
df = pd.read_csv(csv_path)

In [6]:
from gensim.utils import simple_preprocess
from gensim.models import doc2vec
import string
import numpy as np

# Load the saved Doc2Vec model from disk; str() turns the pathlib.Path into a
# plain string path before handing it to gensim.
doc2vec_model = doc2vec.Doc2Vec.load(str(doc2vec_path))

# Translation table that deletes every character in string.punctuation.
# Built once here rather than on each call, because the function is applied
# to every row of the dataset.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def process_line_doc2vec(line: str) -> str:
    """Normalize raw text before it is tokenized for Doc2Vec.

    Args:
        line: Text to normalize.

    Returns:
        ``line`` with every ``string.punctuation`` character removed and the
        remainder lowercased. Punctuation is deleted, not replaced by a
        space, so ``"well-known"`` becomes ``"wellknown"``.
    """
    # Order matters: strip punctuation first, then lowercase. Lowercasing is
    # context-sensitive for some non-ASCII letters, so swapping the two steps
    # could change the result.
    return line.translate(_PUNCTUATION_TABLE).lower()

def desc_to_vector(desc: str) -> np.ndarray:
    """Embed a description string with the loaded Doc2Vec model.

    The text is normalized with ``process_line_doc2vec``, tokenized with
    gensim's ``simple_preprocess``, and the token list is passed to
    ``doc2vec_model.infer_vector``.

    Args:
        desc: Description text to embed.

    Returns:
        The vector returned by ``doc2vec_model.infer_vector`` for the tokens.
    """
    normalized = process_line_doc2vec(desc)
    tokens = simple_preprocess(normalized)
    # NOTE(review): Doc2Vec inference is an iterative procedure and may not
    # give bit-identical vectors across runs -- confirm against the gensim
    # docs if reproducibility of this column matters.
    return doc2vec_model.infer_vector(tokens)

In [12]:
from typing import Iterable
import json

def str_tags_to_set(tags : str)->Iterable[str]:
    """Split a comma-separated tag string into a set of distinct tags.

    Pieces are taken exactly as they appear between commas: no whitespace is
    trimmed, and an empty input yields a set containing the empty string.
    """
    unique_tags = set()
    unique_tags.update(tags.split(","))
    return unique_tags

# Load the tag lookup table. vectorize_str_tags indexes it by tag name and
# uses the stored value as a position in the tag vector.
# NOTE(review): this path is relative to the notebook's working directory,
# unlike the ../data and ../models paths used earlier -- confirm the file is
# meant to live next to the notebook.
with open("tag_dictionary.json", "r", encoding="utf-8") as f:
    tag_dict = json.load(f)

def vectorize_str_tags(tags: str, n_tags: int = 448) -> np.ndarray:
    """Multi-hot encode a comma-separated tag string.

    Args:
        tags: Comma-separated tag names. The string ``"nan"`` (any letter
            case) and empty or whitespace-only strings are treated as
            "no tags".
        n_tags: Length of the output vector. Defaults to 448, the size that
            was previously hard-coded; it must be larger than every index
            stored in ``tag_dict``.

    Returns:
        A float array of shape ``(n_tags,)`` holding 1 at ``tag_dict[tag]``
        for each distinct tag in ``tags`` and 0 everywhere else.

    Raises:
        KeyError: If a tag in ``tags`` is not a key of ``tag_dict``.
    """
    vec = np.zeros(n_tags)
    # A blank string would otherwise be split into the single tag "" and then
    # looked up in tag_dict; treat it like the "nan" placeholder instead.
    if not tags.strip() or tags.lower() == "nan":
        return vec
    for tag in str_tags_to_set(tags):
        vec[tag_dict[tag]] = 1
    return vec

In [9]:
# Cast both text columns to str so the per-row functions applied below can
# call string methods on every value. Missing values are stringified as well,
# which is presumably why vectorize_str_tags checks for the literal "nan".
df["About the game"] = df["About the game"].astype(str)
df["Tags"] = df["Tags"].astype(str)

In [13]:
# Add one vector per row for each feature: the Doc2Vec embedding of the
# description text and the multi-hot encoding of the tag string. Both run a
# Python function once per row.
df["Description_vector"] = df["About the game"].apply(desc_to_vector)
df["Tags_vector"] = df["Tags"].apply(vectorize_str_tags)

In [14]:
# Print the column list with non-null counts and dtypes as a sanity check
# after adding the vector columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85103 entries, 0 to 85102
Data columns (total 41 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   AppID                       85103 non-null  int64  
 1   Name                        85097 non-null  object 
 2   Release date                85103 non-null  object 
 3   Estimated owners            85103 non-null  object 
 4   Peak CCU                    85103 non-null  int64  
 5   Required age                85103 non-null  int64  
 6   Price                       85103 non-null  float64
 7   DLC count                   85103 non-null  int64  
 8   About the game              85103 non-null  object 
 9   Supported languages         85103 non-null  object 
 10  Full audio languages        85103 non-null  object 
 11  Reviews                     9743 non-null   object 
 12  Header image                85103 non-null  object 
 13  Website                     394

In [15]:
# Persist the frame, including the two new vector columns, to the working
# directory. Pickle keeps the per-row arrays as Python objects.
# NOTE(review): only load this file from a trusted location -- unpickling
# executes arbitrary code.
df.to_pickle("games_with_vectors.pickle")

In [18]:
# Display the first row to spot-check the stored values.
df.iloc[0]

AppID                                                                     20200
Name                                                           Galactic Bowling
Release date                                                       Oct 21, 2008
Estimated owners                                                      0 - 20000
Peak CCU                                                                      0
Required age                                                                  0
Price                                                                     19.99
DLC count                                                                     0
About the game                Galactic Bowling is an exaggerated and stylize...
Supported languages                                                 ['English']
Full audio languages                                                         []
Reviews                                                                     NaN
Header image                  https://cd