In [44]:
import pickle
import pyarrow.parquet as pq
import pandas as pd
import tqdm

In [45]:
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier


def get_scores(X, y, x_predict, return_names=False, return_only_names=False):
    """Fit a logistic regression on (X, y) and score one feature vector.

    Args:
        X: Training features; must expose ``to_numpy()`` (e.g. a DataFrame).
        y: Training labels, one per row of ``X``.
        x_predict: A single feature vector to score.
        return_names: If false, return the raw ``predict_proba`` output.
            If true, return ``(class, probability)`` pairs sorted by
            descending probability.
        return_only_names: Only valid together with ``return_names``; return
            (and print) just the class labels, best first.

    Returns:
        The ``predict_proba`` result for the single sample, or the ranked
        list described above.

    Raises:
        ValueError: If ``return_only_names`` is set without ``return_names``.
    """
    if return_only_names and not return_names:
        raise ValueError("return_only_names cannot be True if return_names is False")
    clf = LogisticRegression(max_iter=3000).fit(X.to_numpy(), y)
    scores = clf.predict_proba([x_predict])
    if not return_names:
        return scores

    # Pair every class with its probability. The previous fixed range(10)
    # crashed with fewer than ten classes and ignored any class past the tenth.
    ranked = sorted(
        zip(clf.classes_, scores.tolist()[0]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    if return_only_names:
        ranked = [label for label, _ in ranked]
        print('результаты', ranked)
    return ranked


def get_scores_catboost(X, y, x_predict, return_names=False, return_only_names=False):
    """Fit a CatBoost classifier on (X, y) and score one feature vector.

    Args:
        X: Training features, passed to ``CatBoostClassifier.fit`` unchanged.
        y: Training labels, one per row of ``X``.
        x_predict: A single feature vector to score.
        return_names: If false, return the raw ``predict_proba`` output.
            If true, return ``(class, probability)`` pairs sorted by
            descending probability.
        return_only_names: Only valid together with ``return_names``; return
            (and print) just the class labels, best first.

    Returns:
        The ``predict_proba`` result for the single sample, or the ranked
        list described above.

    Raises:
        ValueError: If ``return_only_names`` is set without ``return_names``.
    """
    if return_only_names and not return_names:
        raise ValueError("return_only_names cannot be True if return_names is False")
    clf = CatBoostClassifier(iterations=1000, learning_rate=0.01, logging_level='Silent').fit(X, y)
    scores = clf.predict_proba([x_predict])
    if not return_names:
        return scores

    # Pair every class with its probability. The previous fixed range(10)
    # crashed with fewer than ten classes and ignored any class past the tenth.
    ranked = sorted(
        zip(clf.classes_, scores.tolist()[0]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    if return_only_names:
        ranked = [label for label, _ in ranked]
        print(ranked)
    return ranked

In [46]:
import random
import numpy as np
import pandas as pd
from pandas import DataFrame
from pynndescent import NNDescent
from sklearn import preprocessing
from tqdm import tqdm


def vid_long(videos: DataFrame, user_id: int, users: DataFrame) -> DataFrame:
    """Weight a user's watched videos by how long they were watched.

    The user's rows in ``users`` are joined to ``videos`` on ``item_id``. A
    view counts as "engaged" when more than 25% of the video was watched
    (videos longer than 300 s) or more than 30 s was watched (shorter videos);
    ``duration`` is divided by 1000 before comparing with ``watch_time``.
    The 100 title-embedding columns of engaged rows are doubled, then each
    column is min-max scaled.

    Args:
        videos: Video features with ``item_id``, ``duration`` and the title
            columns (``v_title_N`` or ``vid_title_N``).
        user_id: User whose history is processed.
        users: Watch log with ``user_id``, ``item_id`` and ``watch_time``.

    Returns:
        The joined history without ``user_id``, ``watch_time`` and
        ``duration``. Empty when the user has no matching history.
    """
    hist = users[users["user_id"] == user_id]
    hist = pd.merge(hist, videos, on='item_id').drop(columns=["user_id"])
    if hist.empty:
        # minmax_scale rejects empty input; there is nothing to weight anyway.
        return hist.drop(columns=["watch_time", "duration"])

    # The engagement rule does not depend on the feature column, so compute it
    # once for all rows instead of once per row per column.
    duration_s = hist["duration"] / 1000
    engaged = np.where(
        duration_s > 300,
        hist["watch_time"] / duration_s > 0.25,
        hist["watch_time"] > 30,
    )
    # Multiply by 2 or 1. The earlier `x * 2 if cond else 1` parsed as
    # `(x * 2) if cond else 1`, replacing unengaged features with the constant 1.
    weight = np.where(engaged, 2.0, 1.0)

    # Keep the legacy `v_title_` prefix when present; the feature table shown
    # in this notebook names these columns `vid_title_N`.
    prefix = "v_title_" if "v_title_0" in hist.columns else "vid_title_"
    for i in tqdm(range(100)):
        col = f"{prefix}{i}"
        hist[col] = preprocessing.minmax_scale(hist[col].to_numpy() * weight)
    return hist.drop(columns=["watch_time", "duration"])


def like(videos: pd.DataFrame, user_id: int, emotions: pd.DataFrame) -> pd.DataFrame:
    """Weight the videos a user reacted to by the kind of reaction.

    Videos are joined with the user's rows in ``emotions`` on ``item_id``.
    The 100 ``vid_title_N`` and 100 ``vid_descr_N`` columns are multiplied by
    2 for ``"pos_emotions"``, by 0.5 for ``"neg_emotions"`` and left as-is for
    any other type, then each column is min-max scaled.

    Args:
        videos: Video features with ``item_id`` and the embedding columns.
        user_id: User whose reactions are applied.
        emotions: Reaction log with ``user_id``, ``item_id`` and ``type``.

    Returns:
        The reacted-to videos with re-weighted embeddings and without the
        ``type`` column, or ``videos`` unchanged when the user has no
        reaction to any of them.
    """
    reactions = emotions.loc[emotions["user_id"] == user_id, ["item_id", "type"]]
    rated = videos.merge(reactions, on="item_id")
    if rated.empty:
        return videos

    # One multiplier per row, computed once. The earlier lambda parsed as
    # `(x * 2) if pos else (0.5 if neg else 1)`, which replaced the feature by
    # the constants 0.5 / 1 instead of scaling it.
    weight = (
        rated["type"]
        .map({"pos_emotions": 2.0, "neg_emotions": 0.5})
        .fillna(1.0)
        .to_numpy()
    )
    for i in tqdm(range(100)):
        for col in (f"vid_title_{i}", f"vid_descr_{i}"):
            # Scale the 1-D column directly; the previous `.T ... .T` round
            # trip produced a (1, n) array that cannot be assigned to a
            # column of n rows.
            rated[col] = preprocessing.minmax_scale(rated[col].to_numpy() * weight)
    return rated.drop(columns=["type"])


def make_vector(data: DataFrame, users: DataFrame, emotions: DataFrame, user_id: str):
    """Build a user profile as the mean feature vector of the videos they watched.

    Args:
        data: Video feature table with an ``item_id`` column.
        users: Watch log with ``user_id`` and ``item_id`` columns.
        emotions: Currently unused; kept so existing call sites keep working.
        user_id: User to build the profile for.

    Returns:
        A Series holding the column-wise mean of the numeric features of the
        user's videos. The Series is empty when none of the user's videos are
        present in ``data``, so callers can detect that case with
        ``len(result) == 0``; previously an all-NaN vector was returned and
        that check never fired.
    """
    video_ids = users.loc[users["user_id"] == user_id, "item_id"].to_numpy()
    watched = data.loc[data["item_id"].isin(video_ids)]
    if watched.empty:
        return pd.Series(dtype="float64")

    # TODO add weights
    # numeric_only=True is the public equivalent of the private _get_numeric_data().
    return watched.mean(numeric_only=True)


def get_video_corpus(data: DataFrame, vector: np.ndarray):
    """Look up the 10 nearest videos to ``vector`` in the global ``pickled_model``.

    Args:
        data: Video feature table; every column except ``item_id`` is treated
            as a feature, and row positions must match the positions the
            index returns.
        vector: Query vector with one value per feature column.

    Returns:
        A tuple ``(features, item_ids)``: the feature values of the matched
        rows and their ``item_id`` values, in the order returned by the index.
    """
    feature_names = data.columns.drop('item_id').to_list()
    query_frame = pd.DataFrame([vector], columns=feature_names)

    # query() returns a sequence whose first element holds the neighbour
    # positions, one row per query; a single query is sent, so take row 0.
    hits = pickled_model.query(query_frame, k=10)
    matched = data.iloc[hits[0][0]]

    item_ids = matched['item_id'].values
    features = matched.drop(columns=['item_id']).values
    return features, item_ids


def get_10_category(train_hist: pd.DataFrame, new_hist: pd.DataFrame = None) -> pd.Series:
    """Return the ten most frequent category columns in the watch history.

    Args:
        train_hist: History frame with indicator columns whose names start
            with ``cat``.
        new_hist: Optional extra history rows appended before counting.

    Returns:
        A Series indexed by category column name holding the column sums,
        sorted in descending order and truncated to ten entries.
    """
    if new_hist is not None:
        # DataFrame.append was removed in pandas 2.0; concat is the supported form.
        train_hist = pd.concat([train_hist, new_hist], ignore_index=True)
    cat_columns = [col for col in train_hist.columns if col.startswith('cat')]

    # Count the ones in every category column.
    counts = train_hist[cat_columns].sum()

    # Most frequent categories first; keep the top ten.
    return counts.sort_values(ascending=False).head(10)


def get_top_videos_in_cat(cat_name, train_hist: pd.DataFrame, new_hist: pd.DataFrame = None) -> str:
    """Return the most frequently occurring ``item_id`` within one category.

    Args:
        cat_name: Name of the indicator column selecting the category.
        train_hist: History frame with ``item_id`` and the category column.
        new_hist: Optional extra history rows appended before counting.

    Returns:
        The ``item_id`` with the most rows where ``cat_name`` equals 1.

    Raises:
        IndexError: If no row belongs to the category.
    """
    if new_hist is not None:
        # DataFrame.append was removed in pandas 2.0; concat is the supported form.
        train_hist = pd.concat([train_hist, new_hist], ignore_index=True)
    in_category = train_hist.loc[train_hist[cat_name] == 1, 'item_id']
    # value_counts sorts by frequency, so the first label is the most common.
    return in_category.value_counts().index[0]


def get_top_videos(train_hist: pd.DataFrame, new_hist: pd.DataFrame = None) -> list:
    """Return the most watched video of each of the ten most frequent categories.

    Args:
        train_hist: History frame with ``item_id`` and ``cat*`` columns.
        new_hist: Optional extra history rows to include.

    Returns:
        One ``item_id`` per top category, most frequent category first.
    """
    if new_hist is not None:
        # Merge once here instead of letting every helper call repeat it.
        train_hist = pd.concat([train_hist, new_hist], ignore_index=True)
    top_category = get_10_category(train_hist)
    # Iterate the index (category column names). Iterating the Series itself
    # yields the counts, which are not valid column names.
    return [get_top_videos_in_cat(name, train_hist) for name in tqdm(top_category.index)]


def get_popular() -> np.ndarray:
    """Return ten fallback recommendations drawn from ``top_cat.csv``.

    The file's ``item_id`` column is reshaped to 10 rows of 10 ids, one id is
    picked at random from every row, and the picks are shuffled.

    NOTE(review): this requires ``top_cat.csv`` to hold exactly 100 ids,
    presumably 10 consecutive ids per category; confirm against the file.

    Returns:
        An array of ten item ids in random order.
    """
    popular = pd.read_csv("top_cat.csv")
    # Read the ids from the table just loaded. The previous code overwrote it
    # with the global feature frame `data`, which cannot be reshaped to 10x10.
    grid = popular["item_id"].to_numpy().reshape(10, 10)
    picks = [row[random.randint(0, 9)] for row in grid]
    return np.random.permutation(picks)


def predict_user(user_id):
    """Recommend videos for one user from the notebook-level tables.

    Reads the global ``users``, ``data`` and ``emotions`` frames. Users with
    no history, or whose history yields no usable profile vector, get the
    popular fallback.

    Args:
        user_id: User to recommend for.

    Returns:
        The item ids returned by ``get_video_corpus``, or the result of
        ``get_popular()`` for cold users.
    """
    user_items = users.loc[users["user_id"] == user_id, "item_id"].values
    if len(user_items) == 0:
        return get_popular()
    # make_vector's signature is (data, users, emotions, user_id); the previous
    # call passed the item array in place of the users frame.
    x_predict = make_vector(data, users, emotions, user_id)
    # A mean over zero matching videos is either empty or all-NaN.
    if len(x_predict) == 0 or pd.isna(x_predict).all():
        return get_popular()
    _, target = get_video_corpus(data, x_predict)
    return target


def create_submission_file(path: str, data: DataFrame, users: DataFrame, emotions: DataFrame):
    """Write ``submission.csv`` with recommendations for every user listed in ``path``.

    Args:
        path: CSV file with a ``user_id`` column listing the users to score.
        data: Video feature table passed to ``make_vector`` / ``get_video_corpus``.
        users: Watch log with ``user_id`` and ``item_id`` columns.
        emotions: Reaction log forwarded to ``make_vector``.

    Side effects:
        Writes ``submission.csv`` (columns ``user_id``, ``recs``) to the
        current working directory.
    """
    test_file = pd.read_csv(path)
    user_ids = test_file["user_id"].values

    # make_vector filters `users` once per user; narrowing the log to the
    # requested users up front keeps each of those scans small.
    users = users[users["user_id"].isin(user_ids)]

    preds = []
    for user_id in tqdm(user_ids):
        x_predict = make_vector(data, users, emotions, user_id)
        # A mean over zero matching videos is either empty or all-NaN.
        if len(x_predict) == 0 or pd.isna(x_predict).all():
            # Call the fallback and keep going. Appending the function object
            # and breaking left `preds` shorter than `user_ids`.
            preds.append(get_popular())
            continue
        _, target = get_video_corpus(data, x_predict)
        preds.append(target)

    submission = pd.DataFrame({"user_id": user_ids, "recs": preds})
    submission.to_csv("submission.csv", index=False)

In [48]:
# Load the video feature table from CSV; a later cell drops its index and
# text/CTR columns before the table is used.
data = pd.read_csv("final_df_2.csv")

  data = pd.read_csv("final_df_2.csv")


In [None]:
# Load the prebuilt nearest-neighbour index used by get_video_corpus().
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('final_model_jaccard_2.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

In [55]:
# Drop the CSV index column and the text/CTR columns that are not used as
# features. Reassigning (no inplace) with errors="ignore" lets this cell be
# re-run without raising KeyError once the columns are already gone.
data = data.drop(
    columns=[
        'Unnamed: 0',
        'video_description', 'video_title', 'ctr.CTR_10days_01_08', 'author_title',
        'tv_title', 'ctr.CTR_10days_21_07', 'ctr.CTR_10days_10_08',
        'ctr.CTR_10days_21_08', 'tv_sub', 'season', 'publicated', 'category_title',
    ],
    errors="ignore",
)

In [6]:
# Load the player-start log from parquet into a pandas DataFrame; the helper
# functions above read its user_id and item_id columns.
users = pq.read_table('train_dataset_RUTUBE/player_starts_train.parquet').to_pandas()

In [5]:
# Load the reactions table; like() reads its user_id, item_id and type columns.
emotions = pd.read_csv("train_dataset_RUTUBE/emotions.csv")

In [56]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the feature-table rows; the fixed random_state makes the
# split repeatable.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [57]:
# Preview the first rows of the held-out split.
test.head()

Unnamed: 0,item_id,vid_title_0,vid_title_1,vid_title_2,vid_title_3,vid_title_4,vid_title_5,vid_title_6,vid_title_7,vid_title_8,...,cat_Юмор,duration,channel_sub,upld_year,upld_month,upld_day,upld_hour,upld_minute,upld_second,upld_dayofweek
1025987,video_594977,0.143496,0.564228,0.565773,0.133816,0.185969,0.739892,0.611547,0.355713,0.433471,...,0,103000,7,2021,7,19,14,0,11,0
468859,video_2000944,0.437278,0.502699,0.468497,0.903102,0.579554,0.583583,0.459427,0.325452,0.720823,...,0,638456,30,2022,3,15,20,21,22,1
1638170,video_947196,0.424399,0.463304,0.31054,0.133427,0.386383,0.410636,0.441294,0.590166,0.337766,...,0,30813,10,2023,4,15,10,52,2,5
423766,video_2283492,0.534073,0.397533,0.537404,1.0,0.53618,0.534906,0.333033,0.613686,0.615194,...,0,1516264,1,2023,7,18,13,46,18,1
1597720,video_1537558,0.58403,0.516683,0.733101,0.776581,0.642122,0.621102,0.447726,0.435635,0.504945,...,0,3840067,18,2023,2,5,8,47,24,6


In [58]:
# Query the loaded index with the feature columns of the first 10 held-out
# rows, asking for 100 neighbours each.
# NOTE(review): get_video_corpus() treats element [0] of the result as the
# neighbour row positions - confirm against the index type stored in the pickle.
neighbors = pickled_model.query(test.drop(['item_id'], axis=1).head(10), k=100)

In [59]:
# Build submission.csv for every user in the sample submission file.
# NOTE(review): the recorded output shows roughly 40 s per user and the run
# was interrupted by hand, so this cell did not finish as written.
create_submission_file("train_dataset_RUTUBE/sample_submission.csv", data, users, emotions)

  0%|          | 2/97240 [01:21<1095:05:56, 40.54s/it]


KeyboardInterrupt: 