In [5]:
%pip install pandas datasets

Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
from datasets import load_dataset
from collections import defaultdict

# Load the MovieLens ratings dataset (train split).
dataset = load_dataset("ashraq/movielens_ratings", split="train")

# Work on a small sample so the example stays fast; never request more rows
# than the split actually contains.
n = 2000
small_dataset = dataset.select(range(min(n, len(dataset))))
df = pd.DataFrame(small_dataset)

# Group every user's ratings together. A stable sort (mergesort) is requested
# so that, within one user, rows keep the order in which they appear in the
# dataset; the default quicksort does not guarantee the relative order of equal
# keys, which would make the history/validation split below an artifact of the
# sort algorithm.
# NOTE(review): only `user_id` is a sort key -- nothing here orders ratings by
# viewing time. If the dataset exposes a timestamp column, add it as a second
# key to get a truly chronological split.
df = df.sort_values(['user_id'], kind='mergesort')

# Share of each user's ratings held out as "movies to rate"; the remainder is
# shown to the model as "history".
val_share = 0.2

# user_id -> positional row numbers in the sorted frame (consumed by .iloc).
user2indices = defaultdict(list)
for idx, user_id in enumerate(df['user_id']):
    user2indices[user_id].append(idx)

user_prompts = []

for user_id, indices in user2indices.items():
    # Always keep at least one rating as history.
    n_hist = max(1, int(len(indices) * (1 - val_share)))
    history_idxs = indices[:n_hist]
    val_idxs = indices[n_hist:]
    if not val_idxs:
        continue  # Skip users who have history only (nothing left to rate).

    history_movies = df.iloc[history_idxs]
    validation_movies = df.iloc[val_idxs]

    # History part: one line per already-rated movie, rating included.
    history_prompt = "\n".join([
        "User previously watched and rated the following movies:",
        *(
            f"{title} | Genres: {genres} | Rating: {rating}"
            for title, genres, rating in zip(
                history_movies['title'],
                history_movies['genres'],
                history_movies['rating'],
            )
        ),
    ]).strip()

    # Movies the model has to rate: same line format, but without the rating.
    movies_to_rate_prompt = "\n".join([
        "Now, please provide your ratings (from 1 to 5) for the following movies:",
        *(
            f"{title} | Genres: {genres}"
            for title, genres in zip(
                validation_movies['title'],
                validation_movies['genres'],
            )
        ),
    ]).strip()

    prompt = history_prompt + "\n" + movies_to_rate_prompt

    # Ground-truth ratings of the held-out movies, kept for later evaluation.
    ground_truth = [
        {
            "title": title,
            "genres": genres,
            "true_rating": rating,
        }
        for title, genres, rating in zip(
            validation_movies['title'],
            validation_movies['genres'],
            validation_movies['rating'],
        )
    ]

    user_prompts.append({
        "user_id": user_id,
        "prompt": prompt,
        "ground_truth": ground_truth
    })

# Show one example prompt (guarded: the sample may contain no user with
# enough ratings to have a validation part).
if user_prompts:
    print("=== Пример GPT-промпта ===")
    print(user_prompts[0]["prompt"])
    print("--- Истинные оценки для валидации ---")
    print(user_prompts[0]["ground_truth"])
else:
    print("Нет пользователей, у которых есть и история, и фильмы для оценки.")

# user_prompts can now be fed to the GPT API: `prompt` holds the full history
# plus the new movies to rate, and `ground_truth` holds the real ratings used
# to check the result.


=== Пример GPT-промпта ===
User previously watched and rated the following movies:
Moana (2016) | Genres: Adventure|Animation|Children|Comedy|Fantasy | Rating: 4.5
Now, please provide your ratings (from 1 to 5) for the following movies:
Hotel Transylvania 2 (2015) | Genres: Animation|Comedy
--- Истинные оценки для валидации ---
[{'title': 'Hotel Transylvania 2 (2015)', 'genres': 'Animation|Comedy', 'true_rating': 3.5}]


In [21]:
import ast
import collections
import copy
import datetime
import json
import re
import six
import tqdm
import re
import requests

In [None]:
# Placeholder OAuth token; it is interpolated into the `Authorization` header
# built in get_gpt_response. Fill in a real token locally and never commit it.
TOKEN = '{{}}' # set your token here

def get_gpt_response(passage_prompt, timeout=120):
    """Send one prompt to the chat endpoint and return the model's reply text.

    Args:
        passage_prompt: The prompt to send as the single user message. A ``str``
            is sent verbatim; any other JSON-serializable object is serialized
            with ``json.dumps`` first.
        timeout: Seconds to wait for the HTTP request before ``requests``
            raises a timeout error. ``None`` waits indefinitely.

    Returns:
        The ``content`` of the first choice's message, or ``None`` when the
        response body is not JSON or does not contain a usable choice. In the
        failure case the offending payload is printed for inspection.

    Raises:
        requests.RequestException: On connection-level failures (invalid URL,
            network error, timeout); these are not swallowed.
    """
    # A plain-text prompt must not be JSON-encoded: json.dumps would wrap it in
    # quotes and turn real line breaks into literal "\n" sequences, so the
    # model would see an escaped string instead of the intended text.
    if isinstance(passage_prompt, str):
        content = passage_prompt
    else:
        content = json.dumps(passage_prompt, ensure_ascii=False)

    response = requests.post(
        url='',
        headers={
            'Authorization': f'OAuth {TOKEN}',
            'Ya-User': '', # Put your own login here
            'Content-Type': 'application/json',
        },
        json={
            "messages": [
                {"role": "user", "content": content}
            ],
            "model": "gpt-4o-2024-05-13"
        },
        timeout=timeout,
    )

    # Parse the body once; an error page (HTML, empty body) is not valid JSON.
    try:
        payload = response.json()
    except ValueError:
        print(response.text)
        return None

    # A missing/null "response" or "choices", an empty choice list or a choice
    # without a message all mean "no answer": report the payload, return None.
    try:
        return payload['response']['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        print(payload)
        return None

def apply_to(inputs):
    """Query the model for each prompt in order, stopping at the first failure.

    Args:
        inputs: A sized sequence of prompts passed one by one to
            ``get_gpt_response``.

    Returns:
        The list of answers collected before the first ``None`` answer (or for
        all prompts when none failed), in input order.
    """
    collected = []
    progress = tqdm.tqdm(inputs, total=len(inputs))

    for prompt_text in progress:
        reply = get_gpt_response(prompt_text)
        if reply is not None:
            collected.append(reply)
            continue
        # A failed request ends the run; answers gathered so far are kept.
        break

    return collected

In [32]:
# Collect only the prompt text of every per-user record, preserving order.
inputs = [
    record['prompt']
    for record in user_prompts
]