In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from tqdm import tqdm
import torchutils as tu
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
torch.manual_seed(666)
import faiss

In [3]:
# Pick the compute device: CUDA when PyTorch reports a usable GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [33]:
# Read the game data table from CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="~/DS_Elbrus/123/game_data.csv")

In [45]:
from transformers import AutoTokenizer, AutoModel

# Load the pretrained encoder and the tokenizer that belongs to the same checkpoint.
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

In [46]:
def embed_bert_cls(text, model, tokenizer):
    """Embed ``text`` as a single L2-normalized vector.

    The text is tokenized (with special tokens), run through ``model`` without
    gradient tracking, and the hidden state at the first token position is
    taken as the sentence representation.

    Args:
        text: String to embed.
        model: Encoder whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer providing ``encode``.

    Returns:
        A 1-D NumPy array holding the normalized embedding.
    """
    # Truncate over-long inputs to the tokenizer's configured maximum so the
    # encoder is never fed more positions than it supports.
    tokens = tokenizer.encode(text, add_special_tokens=True, truncation=True)

    # Build the input on whatever device the model's weights live on, so the
    # function keeps working after ``model.to("cuda")``.
    model_device = next(model.parameters()).device
    input_ids = torch.tensor([tokens], device=model_device)

    with torch.no_grad():
        outputs = model(input_ids)

    # First-position hidden state (the [CLS] slot when special tokens are added).
    embeddings = outputs.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

# Sanity check: print the dimensionality of a single embedding.
print(embed_bert_cls(text="Hello, world", model=model, tokenizer=tokenizer).shape)

(312,)


In [36]:
# Display the loaded table (title, image_url, page_url, description).
df

Unnamed: 0,title,image_url,page_url,description
0,Wasteland,https://iwant.games/wp-content/uploads/wastela...,https://iwant.games/wasteland/,"Wasteland (рус. «Пустошь») — это одиночная, ро..."
1,The Need for Speed,https://iwant.games/wp-content/uploads/the-nee...,https://iwant.games/the-need-for-speed/,Road & Track Presents: The Need for Speed (рус...
2,System Shock,https://iwant.games/wp-content/uploads/system-...,https://iwant.games/system-shock/,System Shock – хоррор на выживание в жанре нау...
3,Warcraft: Orcs & Humans,https://iwant.games/wp-content/uploads/warcraf...,https://iwant.games/warcraft-orcs-and-humans/,Warcraft: Orcs and Humans — стратегия в реальн...
4,Warcraft 2: Tides of Darkness,https://iwant.games/wp-content/uploads/warcraf...,https://iwant.games/warcraft-2-tides-of-darkness/,Warcraft 2: Tides of Darkness — стратегия в ре...
...,...,...,...,...
1956,State of Decay 3,https://iwant.games/wp-content/uploads/state-o...,https://iwant.games/state-of-decay-3/,State of Decay 3 — постапокалиптический экшен ...
1957,The Elder Scrolls 6,https://iwant.games/wp-content/uploads/the-eld...,https://iwant.games/the-elder-scrolls-6/,The Elder Scrolls 6 — шестая часть популярной ...
1958,Mass Effect 5,https://iwant.games/wp-content/uploads/mass-ef...,https://iwant.games/mass-effect-5/,Mass Effect 5 — научно-фантастическая Action/R...
1959,Marvel’s Spider-Man 3,https://iwant.games/wp-content/uploads/marvels...,https://iwant.games/marvels-spider-man-3/,Marvel's Spider-Man 3 — приключенческий экшен ...


In [47]:
# Создание эмбеддингов для описаний
description_embeddings = np.vstack(
    df["description"].apply(lambda x: embed_bert_cls(x, model, tokenizer)).values
)

In [48]:
# Построение индекса FAISS
index = faiss.IndexFlatL2(description_embeddings.shape[1])
index.add(description_embeddings.astype(np.float32))

In [39]:
# Summarize column names, non-null counts and dtypes of the loaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1961 non-null   object
 1   image_url    1961 non-null   object
 2   page_url     1961 non-null   object
 3   description  1961 non-null   object
dtypes: object(4)
memory usage: 61.4+ KB


In [49]:
def find_shows(query, df, model, tokenizer, index, top_k=10):
    """Return the rows of ``df`` whose descriptions are closest to ``query``.

    The query is embedded with ``embed_bert_cls``, its nearest neighbours are
    looked up in ``index``, and the matching rows are returned with an added
    ``similarity`` column (cosine similarity between the query and the stored
    description embedding), sorted from most to least similar.

    Args:
        query: Free-text search string.
        df: DataFrame whose row positions correspond to the vectors stored in
            ``index`` and to the rows of the notebook-level
            ``description_embeddings`` array.
        model: Encoder forwarded to ``embed_bert_cls``.
        tokenizer: Tokenizer forwarded to ``embed_bert_cls``.
        index: FAISS index to search.
        top_k: Maximum number of rows to return. Values larger than the number
            of indexed vectors are clamped.

    Returns:
        A copy of the selected rows with a ``similarity`` column, sorted in
        descending order of similarity. Empty when the index holds no vectors.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    top_k = int(top_k)
    if top_k < 1:
        raise ValueError("top_k must be a positive integer")

    # Never request more neighbours than the index holds: FAISS pads missing
    # results with -1, and ``df.iloc[-1]`` would silently return the last row.
    k = min(top_k, index.ntotal)

    if k > 0:
        # Embed the query as a (1, dim) float32 matrix, the layout FAISS expects.
        query_embedding = embed_bert_cls(query, model, tokenizer).reshape(1, -1)
        _, hits = index.search(query_embedding.astype(np.float32), k)
        # Drop any remaining -1 placeholders.
        neighbor_ids = hits[0][hits[0] >= 0]
    else:
        neighbor_ids = np.empty(0, dtype=np.int64)

    # Pull the matching rows; copy so the caller's frame is left untouched.
    result = df.iloc[neighbor_ids].copy()
    if len(neighbor_ids) == 0:
        result["similarity"] = np.empty(0, dtype=np.float32)
        return result

    # Re-score the hits with cosine similarity so the output carries a readable score.
    similarities = cosine_similarity(
        query_embedding, description_embeddings[neighbor_ids]
    ).flatten()
    result["similarity"] = similarities

    return result.sort_values("similarity", ascending=False)

In [52]:
# Example search: ask for the ten games closest to a "first-person shooter" query.
query = "Шутер от 1 лица"
find_shows(query, df, model, tokenizer, index, top_k=10)

Unnamed: 0,title,image_url,page_url,description,similarity
616,Get Even,https://iwant.games/wp-content/uploads/Get-Eve...,https://iwant.games/get-even/,"Шутер от 1-го лица, события которого происходя...",0.670842
1846,Son and Bone,https://iwant.games/wp-content/uploads/son-and...,https://iwant.games/son-and-bone/,Son and Bone — шутер от 1-го лица во вселенной...,0.668715
813,Generation Zero,https://iwant.games/wp-content/uploads/generat...,https://iwant.games/generation-zero/,Generation Zero (рус. «Поколение Нулевых») - к...,0.666123
595,Sniper: Ghost Warrior 3,https://iwant.games/wp-content/uploads/sniper-...,https://iwant.games/sniper-ghost-warrior-3/,Sniper: Ghost Warrior 3 - тактический шутер от...,0.653031
476,The Witness,https://iwant.games/wp-content/uploads/the-wit...,https://iwant.games/the-witness/,The Witness - 3D видеоигра от 1-го лица голово...,0.628125
723,Agony,https://iwant.games/wp-content/uploads/agony-l...,https://iwant.games/agony/,Agony («Агония») - одиночная игра от 1-го лица...,0.61791
688,Kingdom Come: Deliverance,https://iwant.games/wp-content/uploads/kingdom...,https://iwant.games/kingdom-come-deliverance/,Kingdom Come: Deliverance - средневековые сраж...,0.605801
856,Wolfenstein: Youngblood,https://iwant.games/wp-content/uploads/wolfens...,https://iwant.games/wolfenstein-youngblood/,Wolfenstein Youngblood («Молодая кровь») - коо...,0.603804
573,theHunter: Call of the Wild,https://iwant.games/wp-content/uploads/thehunt...,https://iwant.games/thehunter-call-of-the-wild/,TheHunter: Call of the Wild - охотничий симуля...,0.601489
1765,Sniper Ghost Warrior Contracts 3,https://iwant.games/wp-content/uploads/sniper-...,https://iwant.games/sniper-ghost-warrior-contr...,Sniper Ghost Warrior Contracts 3 — третья част...,0.601481


In [51]:
from transformers import AutoModel, AutoTokenizer

# Reload the checkpoint's tokenizer and encoder.
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")

# Persist the model and the tokenizer side by side in one local directory.
model.save_pretrained("my_saved_model")
tokenizer.save_pretrained("my_saved_model")



('my_saved_model/tokenizer_config.json',
 'my_saved_model/special_tokens_map.json',
 'my_saved_model/vocab.txt',
 'my_saved_model/added_tokens.json',
 'my_saved_model/tokenizer.json')