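This code removes unwanted entries from the original dataset. Each game is ranked by a score that sums its number of recommendations, its positive and negative votes, the midpoint of its estimated-owner range, and its tag values (the sum of the per-tag values when tags are a dictionary, or the number of tags when they are a list). The 100,000 highest-scoring games with unique (case-insensitive) names are kept, and only the name, categories, short description and tags of each are written out.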

In [None]:
import json
import operator
import pandas as pd

# Fields carried over into the trimmed dataset. The ranking inputs
# ('recommendations', 'positive', 'negative', 'estimated_owners') are not
# included in this list.
cat_list = ['name', 'categories', 'short_description', 'tags']

# Read the full games file into memory.
data = None
with open('data/games.json', mode='r', encoding='utf-8') as games_file:
    data = json.loads(games_file.read())

def _owners_midpoint(raw):
    """Return the score contribution of an 'estimated_owners' value.

    Args:
        raw: The stored value, expected to be text such as
            '20,000 - 50,000' (a range) or a single number.

    Returns:
        The midpoint of a two-number range, the number itself when only one
        is given, or 0 when the value is not text or cannot be parsed.
    """
    if not isinstance(raw, str):
        # A JSON null (or any non-text value) has nothing to parse; calling
        # .replace() on it would raise, so it simply contributes nothing.
        return 0
    try:
        # Thousands separators are stripped before converting to int.
        bounds = [int(part.strip()) for part in raw.replace(',', '').split('-')]
    except ValueError:
        # Not in the expected "<low> - <high>" format: best-effort, skip it.
        return 0
    if len(bounds) == 2:
        return sum(bounds) / 2
    if len(bounds) == 1:
        return bounds[0]
    return 0


def _tag_weight(tags):
    """Return the score contribution of a game's tags.

    A dictionary contributes the sum of its values, a list contributes its
    length, and anything else contributes 0.
    """
    if isinstance(tags, dict):
        return sum(tags.values())
    if isinstance(tags, list):
        return len(tags)
    return 0


def _popularity_score(game_data):
    """Return the ranking score for a single game record.

    The score is recommendations + positive + negative, plus the
    estimated-owners contribution, plus the tag contribution. Count fields
    that are missing or null are treated as 0.
    """
    score = sum(
        game_data.get(field) or 0
        for field in ('recommendations', 'positive', 'negative')
    )
    score += _owners_midpoint(game_data.get('estimated_owners', '0 - 0'))
    score += _tag_weight(game_data.get('tags', {}))
    return score


# Calculate a score and store it with each game's ID
scored_games = [
    (app_id, _popularity_score(game_data))
    for app_id, game_data in data.items()
]

# sort the games by score in descending order (the sort is stable, so games
# with equal scores keep their original relative order)
scored_games.sort(key=operator.itemgetter(1), reverse=True)


# Walk the ranking from the highest score down and keep the first game seen
# for each case-insensitive name, stopping once 100,000 games are collected.
# Only the fields named in cat_list are copied into the result.
filtered_data = {}
added_names = set()  # lowercase names that have already been kept
unique_game_count = 0

for app_id, _score in scored_games:
    entry = data[app_id]
    name_key = entry.get('name', '').lower()

    # A game with the same lowercase name was already kept: skip this one.
    if name_key in added_names:
        continue

    # The quota of 100,000 unique games has been reached: stop scanning.
    if not unique_game_count < 100000:
        break

    trimmed = {}
    for field in cat_list:
        if field in entry:
            trimmed[field] = entry[field]

    filtered_data[app_id] = trimmed
    added_names.add(name_key)
    unique_game_count += 1


# Persist the trimmed dataset as pretty-printed, non-ASCII-escaped JSON.
with open('data/modified_games.json', 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(filtered_data, indent=2, ensure_ascii=False))

In [None]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from scipy import sparse

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources quietly, in the same order as before.
for resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource, quiet=True)

# Shared by clean_text(): the English stop-word set and a lemmatizer instance.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise free text into a space-separated string of lemmatized words.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is removed, the result is split on whitespace, words found in
    the module-level ``stop_words`` set are dropped, and each remaining word
    is passed through the module-level ``lemmatizer``.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    kept = []
    for word in letters_only.split():
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))

    return ' '.join(kept)

# Load the trimmed dataset (one record per game).
dataset = None
with open('data/modified_games.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# game_index maps each lowercase game name to the position of that game's
# document in str_array; str_array holds one text document per game.
# NOTE(review): names are assumed to be unique after lowercasing; a repeated
# name would overwrite its earlier entry in game_index.
game_index = {}
str_array = []

for idx, game in enumerate(dataset.values()):
    # Bind the name BEFORE using it as the index key. Using it first raised
    # NameError on a fresh run, or silently reused a stale value otherwise.
    name = (game.get('name') or '').lower()
    game_index[name] = idx

    # Fields may be absent from a record, so fall back to empty values
    # instead of raising KeyError.
    parts = [name, clean_text(game.get('short_description') or '')]
    parts.extend(category.lower() for category in (game.get('categories') or []))

    # Keep at most the first five tag names. Tags may be a dictionary keyed
    # by tag name or a list; non-string list entries are ignored and do not
    # count towards the limit.
    tags = game.get('tags')
    if isinstance(tags, dict):
        tag_names = list(tags)
    elif isinstance(tags, list):
        tag_names = [tag for tag in tags if isinstance(tag, str)]
    else:
        tag_names = []
    parts.extend(tag.lower() for tag in tag_names[:5])

    # Document layout: name, cleaned description, categories, tags.
    str_array.append(" ".join(parts))

# Turn the per-game documents into TF-IDF vectors limited to 90 features,
# with English stop words removed by the vectorizer.
vectorizer = TfidfVectorizer(max_features=90, stop_words='english')
X = vectorizer.fit_transform(str_array)

# Build a dense table with one row per game (indexed via game_index) and one
# column per feature term, then save it as a parquet file.
feature_names = vectorizer.get_feature_names_out()
dense_vectors = X.toarray()
df = pd.DataFrame(dense_vectors, index=game_index, columns=feature_names)
df.to_parquet('data/games_vectors.parquet')

In [15]:
from collections import Counter

# Lowercase name of every game in the loaded dataset
game_names = [entry['name'].lower() for entry in dataset.values()]

# Tally how many times each lowercase name occurs
name_counts = Counter()
name_counts.update(game_names)

# Keep only the names that occur at least twice
repeated_names = {
    title: occurrences
    for title, occurrences in name_counts.items()
    if occurrences > 1
}

# Report the duplicates, or confirm that there are none
if not repeated_names:
    print("No repeated game names found after converting to lowercase.")
else:
    print("Repeated game names (lowercase) and their counts:")
    for name, count in repeated_names.items():
        print(f"- {name}: {count}")

No repeated game names found after converting to lowercase.
