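This cell ranks the games in the original dataset and keeps only the 100,000 highest-ranked, uniquely named entries. For each game we add the number of recommendations, the positive and negative vote counts, the estimated owner count (the midpoint of the reported range) and the tag total (the sum of the tag values, or the number of tags when they are stored as a list), then subtract a small penalty (0.1 per match) for non-ASCII text in the name and detailed description, and use the resulting score to rank the games. Non-English text is penalized because it is likely to interact poorly with the later text-processing algorithms. Unusual symbols (®, ™, etc.) are also stripped from the names, short descriptions, tags and categories before the result is saved.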

In [None]:
import json
import operator
import pandas as pd
import re

cat_list = ['name', 'categories', 'short_description', 'tags', 'metacritic_score', 'positive', 'negative', 'header_image']

# Regex pattern to remove unusual symbols (copyright, ®, ™, etc.)
# Keep basic punctuation, letters, digits, and whitespace
clean_pattern = re.compile(r'[^\w\s.,!?;:()\-\']')  # remove anything except word chars, whitespace, and common punctuation

# Matches exactly ONE non-ASCII character, so findall() returns one item per
# character (a trailing '+' would count whole runs instead of characters).
non_ascii_pattern = re.compile(r'[^\x00-\x7F]')

# Score deducted for every non-ASCII character in the name/description.
NON_ASCII_PENALTY = 0.1

# Maximum number of uniquely named games written to the output file.
MAX_GAMES = 100000


def count_non_ascii(text):
    """Return the number of non-ASCII characters in *text* (0 for non-strings)."""
    if not isinstance(text, str):
        return 0
    return len(non_ascii_pattern.findall(text))


def compute_score(game_data):
    """Return the ranking score of one raw game record.

    The score is recommendations + positive + negative votes, plus the
    midpoint of the ``estimated_owners`` range, plus the tag total (sum of
    the values for a dict, number of items for a list), minus
    ``NON_ASCII_PENALTY`` for every non-ASCII character found in ``name``
    and ``detailed_description``.  Missing or null fields count as zero.
    """
    score = sum(game_data.get(key) or 0 for key in ('recommendations', 'positive', 'negative'))

    # 'estimated_owners' looks like "20,000 - 50,000"; use the midpoint.
    estimated_owners_str = str(game_data.get('estimated_owners') or '0 - 0').replace(',', '')
    try:
        owners_range = [int(x.strip()) for x in estimated_owners_str.split('-')]
    except ValueError:
        # Unparseable owner estimate: deliberately contributes nothing.
        owners_range = []
    if len(owners_range) == 2:
        score += sum(owners_range) / 2
    elif len(owners_range) == 1:
        score += owners_range[0]

    tags = game_data.get('tags', {})
    if isinstance(tags, dict):
        score += sum(tags.values())
    elif isinstance(tags, list):
        score += len(tags)

    non_english_char_count = (
        count_non_ascii(game_data.get('name'))
        + count_non_ascii(game_data.get('detailed_description'))
    )
    score -= non_english_char_count * NON_ASCII_PENALTY
    return score


def clean_game_fields(game_data):
    """Strip unusual symbols from the text fields of *game_data* in place.

    Cleans ``name`` and ``short_description`` (when they are strings), the
    keys of a ``tags`` dict or the items of a ``tags`` list, and the items of
    a ``categories`` list.  Other fields are left untouched.
    """
    for key in ('name', 'short_description'):
        value = game_data.get(key)
        if isinstance(value, str):
            game_data[key] = clean_pattern.sub('', value)

    tags = game_data.get('tags')
    if isinstance(tags, dict):
        game_data['tags'] = {clean_pattern.sub('', tag): votes for tag, votes in tags.items()}
    elif isinstance(tags, list):
        game_data['tags'] = [clean_pattern.sub('', tag) for tag in tags]

    categories = game_data.get('categories')
    if isinstance(categories, list):
        game_data['categories'] = [clean_pattern.sub('', c) for c in categories]


def select_top_games(data, ranked_games, limit=MAX_GAMES):
    """Return up to *limit* games from *ranked_games*, skipping duplicate names.

    *ranked_games* is an iterable of ``(app_id, score)`` pairs already sorted
    best-first; *data* maps ``app_id`` to the game record.  Names are compared
    case-insensitively and only the first (highest ranked) game of each name
    is kept.  Each kept record is reduced to the keys listed in ``cat_list``.
    """
    selected = {}
    seen_names = set()
    for app_id, _score in ranked_games:
        if len(selected) >= limit:
            break
        game = data[app_id]
        name_key = (game.get('name') or '').lower()
        if name_key in seen_names:
            continue
        selected[app_id] = {key: game[key] for key in cat_list if key in game}
        seen_names.add(name_key)
    return selected


# Running the cell (or the file as a script) executes the pipeline exactly as
# before; the guard only keeps the helpers importable without touching disk.
if __name__ == "__main__":
    with open('data/games.json', 'r', encoding='utf-8') as file:
        data = json.load(file)

    scored_games = []
    for app_id, game_data in data.items():
        # Score on the raw text first, then clean the fields that get saved.
        score = compute_score(game_data)
        clean_game_fields(game_data)
        scored_games.append((app_id, score))

    # Sort and filter top 100k unique games
    scored_games.sort(key=operator.itemgetter(1), reverse=True)
    filtered_data = select_top_games(data, scored_games)

    with open('modified_games.json', 'w', encoding='utf-8') as file:
        json.dump(filtered_data, file, indent=2, ensure_ascii=False)


The following cell builds one text document per game (its name, short description, categories and up to five tags), vectorizes these documents with TF-IDF, and then reduces the dimensionality of the resulting feature matrix with truncated SVD.

In [None]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd
from scipy import sparse
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import tempfile
import os

# Directory prefix used below for the input JSON and the saved model files.
DATA_DIR = 'preprocessing/data/'

# Quietly fetch the NLTK resources requested by this notebook.
for _resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(_resource, quiet=True)

# Shared text-normalisation helpers used by clean_text().
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise *text* for vectorisation.

    Lowercases the input, deletes every character that is not an ASCII letter
    or whitespace, drops the words found in ``stop_words`` and lemmatizes the
    remaining words. Returns the surviving words joined by single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    kept_words = []
    for word in letters_only.split():
        if word in stop_words:
            continue
        kept_words.append(lemmatizer.lemmatize(word))
    return ' '.join(kept_words)

# Game names in the order their documents are written; used as the index of
# the output DataFrame (much smaller than keeping the full text in memory).
game_names = []

# How many times the name and the description are repeated in a document so
# that they outweigh categories and tags in the TF-IDF counts.
TEXT_WEIGHT = 3

# Maximum number of tags contributed by each game.
MAX_TAGS = 5


def build_game_document(game, clean=None):
    """Return the single-line text document that represents *game*.

    The document is made of the lowercased name and the cleaned short
    description (each repeated ``TEXT_WEIGHT`` times), the lowercased
    categories and the first ``MAX_TAGS`` tag names.  Missing or null fields
    are treated as empty.

    Args:
        game: One record of the filtered dataset (a dict).
        clean: Callable applied to the short description; defaults to the
            module-level ``clean_text``.

    Returns:
        The parts joined by single spaces, with every run of whitespace
        (including line breaks) collapsed so the document occupies one line.
    """
    if clean is None:
        clean = clean_text  # looked up at call time, after it is defined

    name = game.get('name') or ''
    text_parts = [name.lower()] * TEXT_WEIGHT

    # Clean once and repeat as separate items: repeating the raw string
    # ("abc def" * 3) would fuse the last and first words into "defabc".
    description = clean(game.get('short_description') or '')
    text_parts.extend([description] * TEXT_WEIGHT)

    text_parts.extend(c.lower() for c in game.get('categories') or [])

    tags = game.get('tags') or []
    if isinstance(tags, dict):
        tag_names = list(tags)
    elif isinstance(tags, list):
        tag_names = [t for t in tags if isinstance(t, str)]
    else:
        tag_names = []
    text_parts.extend(t.lower() for t in tag_names[:MAX_TAGS])

    # One document per line is required below, so no part may carry a newline.
    return ' '.join(' '.join(text_parts).split())


if __name__ == "__main__":
    temp_file_path = None
    try:
        with open(DATA_DIR + 'modified_games.json', 'r', encoding='utf-8') as file:
            dataset = json.load(file)

        print(f"Processing {len(dataset)} games...")

        # Write processed text to a temporary file, one document per line, so
        # the vectorizer can stream it instead of holding every document in a
        # list.  The `with` block closes the file even if a record fails.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, encoding='utf-8') as temp_file:
            temp_file_path = temp_file.name
            for idx, game in enumerate(dataset.values()):
                game_names.append(game.get('name') or '')
                temp_file.write(build_game_document(game) + '\n')

                if (idx + 1) % 10000 == 0:
                    print(f"Processed {idx + 1} games...")

        print("Vectorizing with TF-IDF...")
        # Limit vocabulary size to reduce memory
        vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=10000,  # Limit to top 10k features
            max_df=0.75,  # Ignore terms in >75% of docs
            min_df=2     # Ignore terms in <2 docs
        )

        # Iterating the file yields one line (= one game document) at a time.
        with open(temp_file_path, 'r', encoding='utf-8') as f:
            X = vectorizer.fit_transform(f)

        print(f"TF-IDF matrix shape: {X.shape}")
        print(f"TF-IDF matrix size: {X.data.nbytes / (1024**2):.2f} MB (sparse)")

        # Apply TruncatedSVD for dimensionality reduction
        print("Applying TruncatedSVD...")
        n_components = 600  # Reduce to 600 dimensions (adjust as needed)
        svd = TruncatedSVD(n_components=n_components, random_state=42)
        X_reduced = svd.fit_transform(X)

        print(f"Reduced matrix shape: {X_reduced.shape}")
        print(f"Explained variance ratio: {svd.explained_variance_ratio_.sum():.4f}")

        # Save the dense reduced matrix as parquet, indexed by game name
        df = pd.DataFrame(
            X_reduced,
            index=game_names,
            columns=[f'component_{i}' for i in range(n_components)]
        )

        print("Saving to parquet...")
        df.to_parquet('games_vectors.parquet')

        print(f"Done! Saved {len(game_names)} games with {n_components} dimensions")

    finally:
        # Clean up the temporary file
        if temp_file_path is not None and os.path.exists(temp_file_path):
            os.unlink(temp_file_path)

    # Save the vectorizer and svd objects.  This runs only after the pipeline
    # succeeded: inside `finally` it would raise NameError on an earlier
    # failure and hide the real exception.
    import joblib

    joblib.dump(vectorizer, DATA_DIR + 'tfidf_vectorizer.pkl')
    joblib.dump(svd, DATA_DIR + 'svd_model.pkl')
    joblib.dump(lemmatizer, DATA_DIR + 'lemmatizer.pkl')
    joblib.dump(stop_words, DATA_DIR + 'stop_words.pkl')

Processing 100000 games...
Processed 10000 games...
Processed 20000 games...
Processed 30000 games...
Processed 40000 games...
Processed 50000 games...
Processed 60000 games...
Processed 70000 games...
Processed 80000 games...
Processed 90000 games...
Processed 100000 games...
Vectorizing with TF-IDF...
TF-IDF matrix shape: (100000, 10000)
TF-IDF matrix size: 17.30 MB (sparse)
Applying TruncatedSVD...
Reduced matrix shape: (100000, 600)
Explained variance ratio: 0.4021
Saving to parquet...
Done! Saved 100000 games with 600 dimensions


Sanity checks:

In [None]:
from collections import Counter
import json

# Read back the filtered dataset produced by the first cell
with open('modified_games.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# Collect every game name in lowercase so the comparison ignores case
game_names = []
for entry in dataset.values():
    game_names.append(entry['name'].lower())

# Tally the names; the number of distinct keys is the number of unique names
name_counts = Counter(game_names)
num_unique_names = len(name_counts)

print(f"Number of unique game names: {num_unique_names}")

# Keep only the names seen more than once (none are expected, since the
# filtering step already skips names it has seen)
repeated_names = {}
for title, occurrences in name_counts.items():
    if occurrences > 1:
        repeated_names[title] = occurrences

# Report the duplicates, if any; nothing is printed when every name is unique
if repeated_names:
    print("\nRepeated game names (lowercase) and their counts:")
    for name, count in repeated_names.items():
        print(f"- {name}: {count}")

Number of unique game names: 100000


In [None]:

# Reload the saved vectors and report the table's (rows, columns) shape
df = pd.read_parquet('games_vectors.parquet')
vector_shape = df.shape
print(vector_shape)

(100000, 600)
