In [1]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel
import nltk
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import re
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv('every-game.csv')

In [3]:
data['Description'] = data['Description'].astype(str)
data['Tags'] = data['Tags'].astype(str) 

In [4]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available. Using GPU.")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

CUDA is available. Using GPU.


In [5]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base').to(device)
nltk.download('punkt')
stemmer = PorterStemmer()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ava01\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
def preprocess_text(text):
    text = text.lower()  # Приведение к нижнему регистру
    text = re.sub(r'[^\w\s]', '', text)  # Удаление знаков препинания
    tokens = nltk.word_tokenize(text)  # Токенизация текста
    stemmed_tokens = [stemmer.stem(token) for token in tokens]  # Стемминг токенов
    return ' '.join(stemmed_tokens)

In [7]:
def get_embeddings(texts):
    texts = [preprocess_text(text) for text in texts]
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
    encoded_input = {key: value.to(device) for key, value in encoded_input.items()}
    with torch.no_grad():
        model_output = model(**encoded_input)
    return model_output.last_hidden_state[:, 0, :].cpu().detach()

In [8]:
import pickle
def save_embeddings_to_pickle(embeddings, filename):
    with open(filename, 'wb') as f:
        pickle.dump(embeddings, f)

def load_embeddings_from_pickle(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

description_embeddings = load_embeddings_from_pickle('description_embeddings.pkl')

In [9]:
from collections import Counter

In [10]:
def find_similar_descriptions_and_suggest_tags(new_description, num_similar=10):
    new_description_emb = get_embeddings([new_description])
    similarities = cosine_similarity(new_description_emb, description_embeddings)[0]
    top_indices = np.argsort(similarities)[::-1][:num_similar]
    similar_data = data.iloc[top_indices]

    # Агрегация тегов из похожих описаний
    all_tags = []
    for tags in similar_data['Tags']:
        all_tags.extend(tags.split(','))  # Предполагаем, что теги разделены запятыми

    # Подсчёт частоты каждого тега
    tag_frequency = Counter(all_tags)
    most_common_tags = [tag.strip() for tag, _ in tag_frequency.most_common(20)]  # Выбираем топ-10 самых частых тегов

    return most_common_tags

In [11]:
all_tags = [
    'Party-Based RPG', 'Driving', 'Violent', 'Villain Protagonist', 'Medical Sim', '8-bit Music', 'Hentai', 'Visual Novel', 'Boxing', 'Sniper', 'Moddable', '2D',
    'Hobby Sim', 'Co-op Campaign', 'Procedural Generation', 'Quick-Time Events', 'Deckbuilding', 'Intentionally Awkward Controls', 'Psychedelic', 'Photo Editing',
    'Turn-Based Tactics', 'Western', 'Features: Online PvP', 'Score Attack', 'Lore-Rich', 'Capitalism', 'Steampunk', 'Vikings', 'RTS', 'Auto Battler', 'Political Sim',
    'Roguelike Deckbuilder', 'Asymmetric VR', 'Spectacle fighter', 'Immersive Sim', 'Tutorial', 'Logic', '3D Platformer', 'Ninja', 'Mini Golf', 'Early Access',
    'Philosophical', 'Time Management', 'Bullet Hell', 'Collectathon', 'Controller', 'Birds', 'Heist', 'Rhythm', 'Magic', 'Werewolves', 'Great Soundtrack',
    'Automobile Sim', 'Reboot', 'Baseball', 'Wargame', 'Post-apocalyptic', "Shoot 'Em Up", 'Cult Classic', 'Tower Defense', 'City Builder', 'Replay Value',
    'Based On A Novel', 'Trading', 'Assassin', 'Local Multiplayer', "1990's", 'Otome', 'Medieval', 'Detective', 'Drama', '6DOF', "Beat 'em up", 'Pixel Graphics',
    'Space', 'Sailing', 'Hand-drawn', 'Mythology', 'Naval Combat', 'Trivia', 'Addictive', 'Football (American)', '4X', 'Arcade', 'Instrumental Music', 'Science',
    'Choices Matter', 'Musou', 'Open World', 'Diplomacy', 'Illuminati', 'Aliens', 'Horses', 'Rock Music', 'Tile-Matching', 'Alternate History', 'Remake',
    'Simulation', '4 Player Local', 'Cute', 'Fishing', 'CRPG', 'Gore', 'Features: Stats', 'Features: Family Sharing', 'FMV', 'Economy', 'Minimalist',
    'Audio Production', 'Shooter', 'Nudity', 'Snow', 'Short', 'Pool', 'Third-Person Shooter', 'Side Scroller', "1990\\'s", 'Singleplayer', 'Text-Based', 'Crime',
    'Profile Features Limited', 'Mature', 'Combat', 'Romance', 'Real Time Tactics', "Shoot \\'Em Up", 'Skiing', 'LEGO', 'Software', 'Sports', 'Split Screen',
    'Creature Collector', 'Walking Simulator', 'Archery', 'Perma Death', 'Anime', 'Documentary', 'Action', 'Survival', 'Family Sharing', 'Transportation',
    'Benchmark', 'Jump Scare', 'Strategy RPG', 'Hex Grid', 'Cycling', 'Dinosaurs', 'Warhammer 40K', 'Elf', 'PvP', 'Design & Illustration', 'Strategy',
    'Hack and Slash', 'Card Game', 'Choose Your Own Adventure', 'GameMaker', 'Destruction', 'Top-Down', 'Nonlinear', 'Ambient', 'Education', 'Features: Single-player',
    'Character Action Game', 'Psychological Horror', 'Pirates', 'Retro', 'Kickstarter', 'Motocross', 'Tabletop', 'Puzzle Platformer', 'RPG', 'Third Person',
    'Character Customization', 'Volleyball', 'Building', 'Automation', 'Immersive', 'Class-Based', 'Trains', 'Satire', 'Souls-like', 'JRPG', 'Well-Written',
    'Voxel', 'Mod', 'Foreign', 'Extraction Shooter', 'Swordplay', 'Job Simulator', 'Jet', 'Dynamic Narration', 'Mechs', 'Space Sim', 'Blood', 'Sandbox',
    'Politics', 'Open World Survival Craft', 'Atmospheric', 'Casual', 'Minigames', 'Battle Royale', 'Stylized', 'Wholesome', 'Narrative', 'Survival Horror',
    'Hero Shooter', 'Shop Keeper', 'Supernatural', 'Time Attack', 'Narration', 'Clicker', 'Isometric', 'Arena Shooter', 'Fantasy', 'Cooking', 'Turn-Based Combat',
    'Rome', 'Submarine', 'Comic Book', 'Tennis', 'Free to Play', 'Social Deduction', 'Skateboarding', 'First-Person', 'America', 'Massively Multiplayer', 'Zombies',
    'Includes level editor', 'Bullet Time', 'Bikes', 'Match 3', 'World War II', 'Multiple Endings', 'Outbreak Sim', 'Management', 'Artificial Intelligence',
    'Comedy', 'Dark', 'ATV', 'Word Game', 'Physics', 'Roguelike', 'Lemmings', 'Golf', 'Difficult', 'Exploration', 'Action RPG', 'Point & Click', 'Trading Card Game',
    'Action RTS', 'Dungeons & Dragons', 'Lovecraftian', 'Historical', 'Time Travel', 'Escape Room', 'Grand Strategy', 'Voice Control', 'Runner', 'Boss Rush',
    'Soundtrack', 'Unforgiving', '360 Video', 'Indie', 'Female Protagonist', 'Silent Protagonist', 'Episodic', 'Sci-fi', 'Military', 'Farming', 'Boomer Shooter',
    'Noir', 'Dark Comedy', 'Spelling', 'Games Workshop', 'Conspiracy', 'Cozy', 'Conversation', 'Mahjong', 'FPS', 'Interactive Fiction', 'Inventory Management',
    'Mystery', 'Political', 'Platformer', 'Time Manipulation', 'Loot', 'Real-Time', 'Life Sim', 'Spaceships', 'Snooker', 'Wrestling', 'Experience', 'Colorful',
    'Coding', 'Hockey', 'Modern', 'Turn-Based Strategy', 'Game Development', 'Board Game', 'Beautiful', 'LGBTQ+', 'Gaming', 'Dating Sim', 'Cinematic', 'Puzzle',
    'Multiplayer', 'Typing', 'Metroidvania', 'Agriculture', 'Motorbike', 'Vampire', 'Robots', 'Relaxing', 'Competitive', 'Snowboarding', 'Electronic', 'Turn-Based',
    'Real-Time with Pause', '1980s', 'Vehicular Combat', 'Hidden Object', 'Underground', 'Classic', 'Feature Film', 'Investigation', 'Action-Adventure', 'Martial Arts',
    'Linear', 'Dungeon Crawler', 'Online Co-Op', 'Epic', 'Action Roguelike', '3D Fighter', 'Bowling', 'Sokoban', 'RPGMaker', 'Utilities', 'Resource Management',
    'Gun Customization', 'Cartoon', 'Gothic', 'eSports', 'Chess', 'Hunting', 'Touch-Friendly', 'Party Game', 'Fast-Paced', 'Traditional Roguelike', 'MOBA','Card Battler', 'Surreal', 'Tactical RPG', 'Base Building', 'Realistic', 'Thriller', 'Sexual Content', '3D', 'Roguevania', '3D Vision', 'War', 'Web Publishing',
    'Level Editor', 'Stealth', 'Superhero', 'Farming Sim', 'Parkour', 'Horror', 'Top-Down Shooter', 'Combat Racing', 'PvE', 'Sequel', 'Programming', 'Co-op', 'Party',
    'TrackIR', 'Movie', 'Team-Based', 'Hacking', 'Local Co-Op', 'Steam Machine', 'Memes', 'Hardware', 'Crafting', 'Cartoony', 'Mining', 'Racing', 'Dark Humor',
    'Mars', 'Fighting', 'Software Training', 'Old School', 'Family Friendly', 'Tactical', 'Roguelite', 'Skating', 'Transhumanism', 'Gambling', 'BMX', 'Story Rich',
    'Cyberpunk', 'Offroad', "Beat 'em up", 'Dystopian', 'Dark Fantasy', 'Pinball', 'Cricket', '2D Platformer', 'Colony Sim', 'Dog', 'Music-Based Procedural Generation',
    'Asynchronous Multiplayer', 'Tanks', 'Animation & Modeling', 'Mystery Dungeon', 'On-Rails Shooter', 'Faith', 'Basketball', 'Video Production', 'Electronic Music',
    'Nature', 'Solitaire', 'VR', 'Cats', 'Futuristic', 'Experimental', 'Demons', 'Naval', 'Twin Stick Shooter', 'Emotional', 'Adventure', 'God Game', 'Parody',
    'Flight', 'NSFW', 'MMORPG', '2.5D', 'Looter Shooter', 'Mouse only', 'Rugby', 'Funny', 'Idler', 'Dragons', 'Football (Soccer)', 'Music', 'Dwarf', '2D Fighter',
    'Psychological', 'Precision Platformer', 'Cold War', 'Underwater', 'Grid-Based Movement', 'Nostalgia', 'Crowdfunded', 'World War I', 'Abstract', 'Fox'
]


In [12]:
def get_random_game():
    # Выбор случайной строки из датасета
    random_game = data.sample()
    
    game_description = random_game['Description'].values[0]
    game_tags = random_game['Tags'].values[0]
    game_name = random_game['name'].values[0]
    
    return game_description, game_tags, game_name

In [13]:
import pickle
tag_embeddings = get_embeddings(all_tags)
with open('tag_embeddings.pkl', 'wb') as f:
    pickle.dump(tag_embeddings, f)

Новое описание и тэги для этого описания

In [14]:
new_description = 'In a frozen world where the last remnants of humanity struggle to survive, you are the leader of a desperate group of survivors, tasked with building the last city on Earth. Facing relentless cold, dwindling resources, and moral dilemmas, you must make tough decisions to ensure your peoples survival. Every choice, from how to manage your workforce to the laws you enact, will shape the fate of your society. Will you be a benevolent leader or rule with an iron fist? As the temperature drops and hope fades, how far will you go to ensure the survival of your city in this unforgiving world?'

In [15]:
suggested_tags = find_similar_descriptions_and_suggest_tags(new_description)

print("Suggested Tags based on similar descriptions:")
for tag in suggested_tags:
    print(f"{tag}")

Suggested Tags based on similar descriptions:
Singleplayer
Indie
Survival
Adventure
3D
Exploration
Crafting
Atmospheric
Simulation
Zombies
2D
Multiple Endings
Early Access
Post-apocalyptic
Colony Sim
City Builder
Resource Management
Choices Matter
Replay Value
Indie


Описание, тэги и название случайной игры из датасета

In [37]:
description, tags, name = get_random_game()
print("Description of the game:")
print(description)
print("\nTags associated with this game:")
print(tags)
print('\nName:')
print(name)

Description of the game:
The best way to track Santa on Steam! Read Santa's very own blog as he prepares for the big day and receive exclusive news stories from the North Pole. Use the Advent Calendar to find out where Santa has been spotted during the run up to Christmas. Is he working in the toy factory or going for a ride?

Tags associated with this game:
Casual, Simulation

Name:
Santa Tracker


In [17]:
def evaluate_tags():
    # Выбор случайной игры из датасета
    random_game = data.sample(1).iloc[0]
    real_tags = set([tag.strip() for tag in random_game['Tags'].split(', ')])
    predicted_tags = set(find_similar_descriptions_and_suggest_tags(random_game['Description'], num_similar=5))
    
    # Сравнение тегов
    correct_tags = predicted_tags.intersection(real_tags)
    if len(real_tags) > 0:
        accuracy = len(correct_tags) / len(real_tags) * 100
    else:
        accuracy = 0  # На случай если теги отсутствуют
    return accuracy, real_tags, predicted_tags, correct_tags

Оценка тегов слуачйной игры из датасета

In [19]:
accuracy, real_tags, predicted_tags, correct_tags = evaluate_tags()
print(f"Accuracy: {accuracy:.2f}%")
print(f"Real Tags: {real_tags}")
print(f"Predicted Tags: {predicted_tags}")
print(f"Correctly Predicted Tags: {correct_tags}")

Accuracy: 92.86%
Real Tags: {'Sexual Content', 'Text-Based', 'Adventure', 'Choose Your Own Adventure', 'Singleplayer', 'Choices Matter', 'Dating Sim', 'Story Rich', 'Hentai', 'Indie', 'Interactive Fiction', 'Visual Novel', 'RPG', 'Fantasy'}
Predicted Tags: {'Dark', 'Sexual Content', 'Third Person', "Shoot 'Em Up", 'Survival', '3D', 'Adventure', 'Choose Your Own Adventure', 'Dating Sim', 'Indie', 'Visual Novel', 'Fantasy', 'Text-Based', '2D', 'Choices Matter', 'Story Rich', 'Horror', 'RPG', 'Singleplayer', 'Interactive Fiction'}
Correctly Predicted Tags: {'Sexual Content', 'Singleplayer', 'Adventure', 'Choose Your Own Adventure', 'Text-Based', 'Choices Matter', 'Dating Sim', 'Story Rich', 'Indie', 'Interactive Fiction', 'Visual Novel', 'RPG', 'Fantasy'}
