## Преобразование данных

In [None]:
import os
import zipfile

import gdown
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Google Drive direct-download link for the archive used in this section.
url = "https://drive.google.com/uc?export=download&confirm=no_antivirus&id=1uKwBqaqnuuKdJxIkfblsEONHMfW8KMrz"

# Download the archive and store it as "data.zip" (relative path).
gdown.download(url, "data.zip")

In [None]:
# IPython shell escape: extract the downloaded "data.zip" archive.
!unzip "data.zip"

In [None]:
# Input files passed to get_dataframe() below.
# NOTE(review): presumably these are produced by unpacking data.zip — confirm.
train_path = "/content/train"
test_path = "/content/test"

In [None]:
def get_dataframe(path):
    """Read a whitespace-separated interaction file into a long DataFrame.

    Every line of the file is treated as one user's sequence of items, and
    the zero-based line number is used as the user id.  Items on a line are
    numbered in descending order: the first token gets ``order == n`` and the
    last token gets ``order == 1``, where ``n`` is the number of tokens on
    that line.  Lines without tokens contribute no rows but still consume a
    user id.

    The total number of collected users, items and orders is printed before
    the frame is built.

    Args:
        path: Path to a text file with one user per line.

    Returns:
        pd.DataFrame with the columns ``user``, ``item`` (tokens kept as the
        strings read from the file) and ``order``; one row per token.
    """
    users = []
    items = []
    orders = []
    with open(path, "r") as file:
        # Stream the file line by line: this keeps memory low without
        # deleting from a list while iterating over it, which would skip
        # every other line.
        for user_id, line in enumerate(tqdm(file)):
            tokens = line.split()
            n_items = len(tokens)

            # One user id per token (not per character of the raw line), so
            # all three columns always have the same length.
            users.extend([user_id] * n_items)
            items.extend(tokens)
            orders.extend(range(n_items, 0, -1))
    print(len(users))
    print(len(items))
    print(len(orders))

    return pd.DataFrame({"user": users, "item": items, "order": orders})

In [None]:
# Parse the train and test files into (user, item, order) DataFrames.
train_df = get_dataframe(train_path)
test_df = get_dataframe(test_path)

In [None]:
# Stack the train and test rows into one frame.  DataFrame.append was removed
# in pandas 2.0; pd.concat is the equivalent and, like append, keeps the
# original index labels of both frames.
df = pd.concat([train_df, test_df])

# Задание 1 - Validation

In [None]:
# Load the prepared interactions table from CSV (this rebinds `df`).
df = pd.read_csv("/content/dataframe.csv/dataframe.csv")
# Remove the "rank" and "is_train" columns in place.
df.drop(["rank", "is_train"], axis=1, inplace=True)

In [None]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,user_id,track_id,reversed_rank
0,0,333396,53
1,0,267089,52
2,0,155959,51
3,0,353335,50
4,0,414000,49


In [None]:
class UsersKFoldPOut:
    """K-fold splitter over users with a "p-out" test part.

    The unique users are shuffled and partitioned into ``n_folds`` disjoint
    groups.  For each group the splitter yields a pair of boolean masks over
    the rows of the input frame:

    * ``train_mask`` - every row of the users that are NOT in the group;
    * ``test_mask``  - rows of the users in the group whose
      ``reversed_rank`` is in ``range(p)`` (i.e. 0 .. p - 1).

    Args:
        n_folds: Number of folds (user groups) to produce.
        p: Number of ``reversed_rank`` values, starting from 0, that go to
            the test part for each test user.
        random_seed: Seed for the user shuffle; the same seed always gives
            the same folds.
    """

    def __init__(self, n_folds, p, random_seed=23):
        self.n_folds = n_folds
        self.p = p
        self.random_seed = random_seed

    def split(self, df):
        """Yield ``(train_mask, test_mask)`` for every fold.

        Args:
            df: DataFrame with the columns ``user_id`` and ``reversed_rank``.

        Yields:
            Tuple of two boolean Series aligned with ``df``.
        """
        users = df["user_id"].unique()
        users_count = len(users)

        # A local RandomState yields the same permutation as seeding the
        # global generator, but does not clobber global NumPy random state.
        rng = np.random.RandomState(self.random_seed)
        rng.shuffle(users)

        # The first (users_count % n_folds) folds get one extra user.
        fold_sizes = np.full(self.n_folds, users_count // self.n_folds, dtype=int)
        fold_sizes[: users_count % self.n_folds] += 1

        p_range = list(range(self.p))
        is_test_rank = df["reversed_rank"].isin(p_range)

        current = 0
        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            test_fold_users = users[start:stop]
            is_test_user = df["user_id"].isin(test_fold_users)

            test_mask = is_test_rank & is_test_user
            train_mask = ~is_test_user

            yield train_mask, test_mask

            # Advance to the next group of users; without this every fold
            # would reuse the same leading slice of users.
            current = stop

In [None]:
# 3 user folds; rows with reversed_rank in range(5) of the test users form the test part.
cv = UsersKFoldPOut(n_folds=3, p=5)

# Materialise each fold and report the number of train / test rows.
# After the loop, `train` and `test` hold the rows of the last fold.
for i, (train_mask, test_mask) in enumerate(cv.split(df)):
    train = df[train_mask]
    test = df[test_mask]
    print(f"Fold#{i} | Train: {train.shape[0]}, Test: {test.shape[0]}")

Fold#0 | Train: 78277912, Test: 2417380
Fold#1 | Train: 78277912, Test: 2417380
Fold#2 | Train: 78277978, Test: 2417375


Проверим, есть ли общие пользователи

In [None]:
# Unique users of the train and test parts (`train` / `test` come from the
# last iteration of the fold loop).
train_users = train.user_id.unique()
test_users = test.user_id.unique()

# Users present in both parts; an empty array means no overlap.
np.intersect1d(test_users, train_users)

array([], dtype=int64)

Проверим, сколько треков в тесте для каждого пользователя

In [None]:
# Number of distinct reversed_rank values in the test part ...
print("Количество последних треков: ", len(test.reversed_rank.unique()))
# ... and the values themselves.
print("Ранг треков: ", *test.reversed_rank.unique())

Количество последних треков:  5
Ранг треков:  4 3 2 1 0


# Задание 2 - PFound

In [None]:
# Google Drive direct-download link for the archive used in Task 2.
url = "https://drive.google.com/uc?export=download&confirm=no_antivirus&id=1lEt7Lkbj3YFDnIkK7qKu9lzCBN3zEQ_w"

# Download the archive and store it as "yandex_cup.zip" (relative path).
gdown.download(url, "yandex_cup.zip")

In [None]:
# IPython shell escape: quietly (-q) extract the downloaded archive.
!unzip -q "/content/yandex_cup.zip"

In [None]:
# IPython shell escape: quietly (-q) extract open_task.zip.
# NOTE(review): presumably this archive comes from yandex_cup.zip — confirm.
!unzip -q "/content/open_task.zip"

Считываем данные

In [None]:
# The TSV files are read with explicit column names (names=...), so their
# first line is treated as data, not as a header.

# query id -> query text
qid_query = pd.read_csv("open_task/qid_query.tsv", sep="\t", names=["qid", "query"])
# (query id, url) -> rating
qid_url_rating = pd.read_csv(
    "open_task/qid_url_rating.tsv", sep="\t", names=["qid", "url", "rating"]
)
# host id <-> url
hostid_url = pd.read_csv("open_task/hostid_url.tsv", sep="\t", names=["hostid", "url"])

In [None]:
# Attach the host id to every rated (qid, url) pair by joining on "url"
# (pd.merge performs an inner join by default).
qid_url_rating_hostid = pd.merge(qid_url_rating, hostid_url, on="url")

In [None]:
def mean_pfound(df: pd.DataFrame, k: int, p_break: float = 0.15) -> float:
    """Return pFound@k averaged over all queries.

    For every ``qid`` the rating of a host is the maximum rating among its
    rows; hosts are sorted by that rating in descending order and the first
    ``k`` are kept.  With ratings ``r_1 .. r_k`` the score of a query is
    ``sum(r_i * p_look_i)`` where ``p_look_1 = 1`` and
    ``p_look_i = p_look_{i-1} * (1 - r_{i-1}) * (1 - p_break)``.

    Args:
        df: Frame with the columns ``qid``, ``hostid`` and ``rating``.
        k: Number of top hosts per query taken into account.
        p_break: Multiplier ``1 - p_break`` is applied at every position
            after the first; the default 0.15 reproduces the previously
            hard-coded factor 0.85.

    Returns:
        Mean of the per-query pFound values.
    """
    p_continue = 1 - p_break

    def pfound(group):
        # One rating per host: the best one.
        max_by_host = group.groupby("hostid")["rating"].max()
        top_k = max_by_host.sort_values(ascending=False).iloc[:k]
        # Cumulative product shifted by one position, so the first position
        # is always looked at with probability 1.
        p_look = ((1 - top_k) * p_continue).cumprod().shift(1, fill_value=1)
        return (top_k * p_look).sum()

    # Only the two columns the scorer needs are passed to apply; this avoids
    # operating on the grouping column itself.
    return df.groupby("qid")[["hostid", "rating"]].apply(pfound).mean()

In [None]:
# Mean pFound over all queries, using the top 10 hosts per query.
mean_pfound(qid_url_rating_hostid, 10)

0.4603173929969002

Решение через цикл

In [None]:
def plook(ind, rels):
    """Return the probability that position ``ind`` is looked at.

    ``plook(0) = 1`` and
    ``plook(i) = plook(i - 1) * (1 - rels[i - 1]) * (1 - 0.15)``.

    The value is computed iteratively, so large ``ind`` values do not hit
    Python's recursion limit.

    Args:
        ind: Zero-based position; must be non-negative.
        rels: Indexable sequence of relevance values; ``rels[0 .. ind - 1]``
            are read.

    Returns:
        ``1`` for ``ind == 0``, otherwise the product described above.

    Raises:
        ValueError: If ``ind`` is negative.
        IndexError: If ``rels`` has fewer than ``ind`` elements.
    """
    if ind < 0:
        raise ValueError(f"ind must be non-negative, got {ind}")

    look = 1
    for pos in range(ind):
        # Same multiplication order as the recursive definition.
        look = look * (1 - rels[pos]) * (1 - 0.15)
    return look


def pfound(group, k=10, p_break=0.15):
    """Return pFound for the rows of a single query, computed with a loop.

    Args:
        group: DataFrame with the columns ``hostid`` and ``rating``.
        k: Number of top hosts taken into account (default 10).
        p_break: ``1 - p_break`` is the multiplier applied when moving to the
            next position (default 0.15).

    Returns:
        Sum over the top-``k`` hosts of ``rating * p_look``, where ``p_look``
        is 1 for the first host and is multiplied by
        ``(1 - rating) * (1 - p_break)`` after every host.  ``0`` when the
        group has no hosts.
    """
    # Best rating of every host.
    max_by_host = group.groupby("hostid")["rating"].max()
    # Keep the k hosts with the highest rating.
    top_k = max_by_host.sort_values(ascending=False)[:k]

    total = 0
    p_look = 1
    for rating in top_k:
        total += rating * p_look
        # Running product instead of recomputing it from scratch for every
        # position.
        p_look = p_look * (1 - rating) * (1 - p_break)
    return total


qid_pfound = qid_url_rating_hostid.groupby("qid").apply(pfound) # Group by qid and compute pfound for every query
qid_pfound.mean() # Average pfound over all queries (the mean, not the maximum)


0.4603173929969002