In [120]:
import ast
import warnings

import numpy as np
import pandas as pd
from google.colab import drive


def _parse_embedding_cell(value):
  """Parse one stringified embedding; values that are not strings pass through unchanged."""
  if isinstance(value, str):
    return ast.literal_eval(value)
  return value


def load_and_process_data(file_path):
  """
  Load a CSV file and parse its stringified embedding columns.

  Google Drive is mounted at '/content/drive' before the file is read.
  The columns 'description_embedding', 'combined_embedding' and
  'category_embedding' are parsed with ast.literal_eval when they exist.
  Parsing is best-effort: a column that is missing is skipped, and a column
  that cannot be parsed is left as it was read and reported with a warning.

  Args:
    file_path: Path to the CSV file.

  Returns:
    DataFrame with the processed data.
  """
  drive.mount('/content/drive')
  df_stat = pd.read_csv(file_path)

  for column in ('description_embedding', 'combined_embedding', 'category_embedding'):
    if column not in df_stat.columns:
      continue
    try:
      # Only string cells are parsed, so a column is never half-converted by a
      # separate first-row step and an already-parsed value does not abort the pass.
      df_stat[column] = df_stat[column].apply(_parse_embedding_cell)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as error:
      # Keep the original best-effort behaviour (the column stays unparsed),
      # but say so instead of swallowing the failure.
      warnings.warn(f"Column '{column}' was left unparsed: {error}")

  return df_stat


# Load and preprocess the data:
file_path = '/content/drive/MyDrive/ColabFiles/Datasets/train_dataset_cold_start_train/video_stat_50k_emb.csv'
df_stat = load_and_process_data(file_path)
print(df_stat.head())


Unnamed: 0,video_id,v_pub_datetime,v_total_comments,v_year_views,v_month_views,v_week_views,v_day_views,v_likes,v_dislikes,v_duration,...,days_since_publication,categoty_embedding,description_embedding,combined_embedding,category_embedding,total_views_norm,week_views_norm,freshness_norm,ranking_score,cosine_similarity
0,973f24c0-15b2-434f-8740-4e2726f79c30,2024-06-15 22:58:03+03:00,0,556,2,0,0,0,0,7.733,...,105,"tensor([ 2.5785e-01, -2.0326e-01, -3.0456e-01,...","[-0.9560340642929077, 0.21739435195922852, 0.1...","[0.25784894824028015, -0.20325875282287598, -0...","[0.25784894824028015, -0.20325875282287598, -0...",5.7e-05,0.0,0.994749,0.24871,0.120359
1,1a3f527c-44b2-4f7e-9641-26f078edbf2e,2021-04-18 17:27:42+03:00,0,109,2,1,0,0,0,379.0,...,1259,"tensor([ 5.2154e-01, -1.3211e-01, -3.3299e-01,...","[0.20713642239570618, 0.054023392498493195, 0....","[0.5215436220169067, -0.13210757076740265, -0....","[0.5215436220169067, -0.13210757076740265, -0....",1.1e-05,0.0,0.937034,0.234263,0.102284
2,2f563d09-45ef-4e27-8da0-daa74235a2e7,2022-11-17 16:36:43+03:00,0,163,2,2,0,0,0,1020.767,...,681,"tensor([ 4.3666e-01, -6.3910e-02, -2.0521e-01,...","[0.5428599119186401, -0.9696580171585083, 0.86...","[0.4366607367992401, -0.06391023844480515, -0....","[0.4366607367992401, -0.06391023844480515, -0....",1.7e-05,0.0,0.965941,0.241492,0.092078
3,154fac0e-1794-4406-afa0-61f688e3a764,2010-01-01 00:00:01+03:00,0,10657,5227,1300,202,24,0,42.025,...,5385,"tensor([ 1.3409e-01, -3.0837e-01, 1.1327e-01,...","[0.07294316589832306, -0.38724276423454285, 0....","[0.13409094512462616, -0.30836614966392517, 0....","[0.13409094512462616, -0.30836614966392517, 0....",0.001102,0.025841,0.730683,0.192156,0.108834
4,470b4e3e-e06d-4370-80dc-34d6a78b22db,2023-11-29 16:49:46+03:00,0,154,45,13,1,0,0,695.647,...,304,"tensor([ 4.7319e-01, -4.0215e-01, -4.5259e-01,...","[-0.6243616342544556, -0.6120455265045166, 0.4...","[0.47319018840789795, -0.40214964747428894, -0...","[0.47319018840789795, -0.40214964747428894, -0...",1.6e-05,0.000178,0.984796,0.246268,0.095935


In [121]:
def _ratio_to_max(values):
    """Divide a numeric Series by its maximum; all zeros when the maximum is 0 or missing."""
    peak = values.max()
    if pd.isna(peak) or peak == 0:
        # Dividing by a zero maximum would turn every score into NaN.
        return pd.Series(0.0, index=values.index)
    return values / peak


def rank_videos(df, total_views_weight=0.4, week_views_weight=0.35, freshness_weight=0.25, top_n=10):
    """
    Rank videos by a weighted sum of normalised views and freshness.

    The columns 'total_views_norm', 'week_views_norm', 'freshness_norm' and
    'ranking_score' are written into ``df`` itself (the caller's frame is modified).

    :param df: DataFrame with 'video_id', 'v_year_views', 'v_long_views_7_days'
               and 'days_since_publication' columns
    :param total_views_weight: Weight of 'v_year_views' divided by its maximum
    :param week_views_weight: Weight of 'v_long_views_7_days' divided by its maximum
    :param freshness_weight: Weight of 1 - 'days_since_publication' divided by its maximum
    :param top_n: Number of rows to return
    :return: DataFrame with the 'video_id' and 'ranking_score' of the top_n rows,
             highest score first
    """
    # Normalise each criterion; a column whose maximum is 0 contributes 0
    # (and freshness becomes 1) instead of NaN.
    df['total_views_norm'] = _ratio_to_max(df['v_year_views'])
    df['week_views_norm'] = _ratio_to_max(df['v_long_views_7_days'])
    df['freshness_norm'] = 1 - _ratio_to_max(df['days_since_publication'])

    # Weighted score for every row
    df['ranking_score'] = (df['total_views_norm'] * total_views_weight +
                           df['week_views_norm'] * week_views_weight +
                           df['freshness_norm'] * freshness_weight)

    # Sort by the final score, best first
    df_sorted = df.sort_values(by='ranking_score', ascending=False)

    # Return the top-N rows
    return df_sorted.head(top_n)[['video_id', 'ranking_score']]

# Example usage
top_10_videos = rank_videos(df_stat)
print(top_10_videos)

                                   video_id  ranking_score
36150  ab3e67cf-7bce-4282-8148-ea816f80d40a       0.794079
12743  13cdb496-a252-4ced-bf7e-efb4a0275c4e       0.618739
37875  a8e1980f-614b-4ddd-a08c-4134f0e6eb81       0.601518
3744   5540694f-7499-4c6e-97d1-ba453bfa1d25       0.586449
21243  1efd3f7e-67ad-4edd-bc38-91cda229f1d6       0.514925
2090   104f448e-668b-4553-9557-65059249072d       0.487308
16069  77b8dd6c-4098-433c-8720-084564088391       0.463249
31551  6914161e-6b58-40fa-a682-714a87926c3d       0.460590
130    0c3ce65a-d7cf-4e1d-8e8b-b81f67ee24fc       0.430457
16105  4bb0b99e-eee3-4ea4-acc4-051b9c340145       0.430245


In [91]:
def update_interest_vector(user_vector, video_vector, feedback, learning_rate=0.1):
    """
    Update the user's interest vector from like/dislike feedback on one video.

    Both vectors may be any 1-D sequence of numbers (list, tuple, np.ndarray);
    they are converted to float arrays before the update.

    :param user_vector: User interest vector
    :param video_vector: Video vector, same length as user_vector
    :param feedback: User feedback (1 for a like, -1 for a dislike); 0 leaves
                     the vector unchanged because the step is multiplied by it
    :param learning_rate: Step size of the update
    :return: Updated user interest vector (np.ndarray)
    :raises AssertionError: If the two vectors differ in length
    """
    # Plain Python lists do not support element-wise subtraction, so coerce first.
    user_vector = np.asarray(user_vector, dtype=float)
    video_vector = np.asarray(video_vector, dtype=float)

    # The lengths must match; without this check numpy could broadcast silently.
    assert len(user_vector) == len(video_vector), "Векторы пользователя и видео должны быть одинаковой размерности"

    # Step toward the video on a like, away from it on a dislike
    updated_user_vector = user_vector + learning_rate * feedback * (video_vector - user_vector)

    # Normalise to unit length once the component sum reaches the dimension.
    # NOTE(review): the check uses the signed sum, so large negative components
    # cancel out and never trigger normalisation — confirm the intended threshold.
    if updated_user_vector.sum() >= len(updated_user_vector):
        updated_user_vector = updated_user_vector / np.linalg.norm(updated_user_vector)

    return updated_user_vector

In [32]:
def cosine_similarity(user_vector, video_vector):
    """
    Cosine similarity between a user interest vector and a video vector.

    :param user_vector: User interest vector
    :param video_vector: Video vector
    :return: Cosine similarity, or 0.0 when either vector has zero norm
    """
    # Work on numpy arrays regardless of the input container
    user_arr = np.array(user_vector)
    video_arr = np.array(video_vector)

    inner = np.dot(user_arr, video_arr)
    user_len = np.linalg.norm(user_arr)
    video_len = np.linalg.norm(video_arr)

    # Divide only when both norms are non-zero
    if user_len != 0 and video_len != 0:
        return inner / (user_len * video_len)

    return 0.0


In [None]:
# Find the N videos closest to the user's interests by cosine_similarity
def find_closest_videos(user_vector, df, n=10):
  """
  Return the N videos whose embeddings are most similar to the user's
  interest vector, measured by cosine similarity.

  A 'cosine_similarity' column is written into ``df`` itself.

  :param user_vector: User interest vector
  :param df: DataFrame with video data, including a 'combined_embedding' column
  :param n: Number of videos to return
  :return: DataFrame with the 'video_id' and 'cosine_similarity' of the N closest videos
  """
  def _similarity_to_user(embedding):
    return cosine_similarity(user_vector, embedding)

  df['cosine_similarity'] = df['combined_embedding'].apply(_similarity_to_user)
  ranked = df.sort_values(by='cosine_similarity', ascending=False)
  closest = ranked.head(n)
  return closest[['video_id', 'cosine_similarity']]

# Пример использования:
# closest_videos = find_closest_videos(user_vector, df_stat, n=10)
# print(closest_videos)


### Processing

In [110]:
# The interest vector has the same length as the first combined embedding.
LEN_USER_VECTOR = len(df_stat['combined_embedding'][0])

# Starting interest vector: every component set to 0.5.
user_vector = np.full(LEN_USER_VECTOR, 0.5)

In [111]:
# Video ids from the ranking result, in ranked order, as a plain list.
recommended_videos = list(top_10_videos['video_id'])

recommended_videos

['ab3e67cf-7bce-4282-8148-ea816f80d40a',
 '13cdb496-a252-4ced-bf7e-efb4a0275c4e',
 'a8e1980f-614b-4ddd-a08c-4134f0e6eb81',
 '5540694f-7499-4c6e-97d1-ba453bfa1d25',
 '1efd3f7e-67ad-4edd-bc38-91cda229f1d6',
 '104f448e-668b-4553-9557-65059249072d',
 '77b8dd6c-4098-433c-8720-084564088391',
 '6914161e-6b58-40fa-a682-714a87926c3d',
 '0c3ce65a-d7cf-4e1d-8e8b-b81f67ee24fc',
 '4bb0b99e-eee3-4ea4-acc4-051b9c340145']

In [112]:
# Pseudo-random ratings for the recommended set
# (1 = like, -1 = dislike; 0 multiplies the update step by zero).

feedback_dict = {
    recommended_videos[position]: rating
    for position, rating in enumerate((1, 0, -1, 1, 0, 1, -1, 0, 1, 1))
}

In [113]:
# video_pool: ids the update loop checks against (nothing in the visible cells fills it).
# archived_videos: ids the update loop has already processed.
video_pool, archived_videos = [], []

In [114]:
# Update the interest vector with the feedback on every recommended video

for video in recommended_videos:
  # Embedding of the first row whose video_id matches.
  video_vector = df_stat.loc[df_stat['video_id'] == video, 'combined_embedding'].iloc[0]
  updated_user_vector = update_interest_vector(user_vector, video_vector, feedback_dict[video])
  # Carry the result into the next iteration. Assigning only after the loop
  # made every step start from the original vector, so all feedback except
  # the last video's was discarded.
  user_vector = updated_user_vector
  if video not in video_pool:
    archived_videos.append(video)

archived_videos

['ab3e67cf-7bce-4282-8148-ea816f80d40a',
 '13cdb496-a252-4ced-bf7e-efb4a0275c4e',
 'a8e1980f-614b-4ddd-a08c-4134f0e6eb81',
 '5540694f-7499-4c6e-97d1-ba453bfa1d25',
 '1efd3f7e-67ad-4edd-bc38-91cda229f1d6',
 '104f448e-668b-4553-9557-65059249072d',
 '77b8dd6c-4098-433c-8720-084564088391',
 '6914161e-6b58-40fa-a682-714a87926c3d',
 '0c3ce65a-d7cf-4e1d-8e8b-b81f67ee24fc',
 '4bb0b99e-eee3-4ea4-acc4-051b9c340145']

In [None]:
# The 10 videos with the highest cosine similarity to the current user_vector.
closest_videos = find_closest_videos(user_vector, df_stat, n=10)