In [5]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn import preprocessing
# Let pandas show up to 100 characters per cell when rendering frames with long text columns.
pd.options.display.max_colwidth = 100

In [6]:
# Directory with the raw training data. Paths are built by plain string
# concatenation, so the trailing slash is required.
TRAIN_PATH = 'train_dataset_RUTUBE/'
# Directory for trained Doc2Vec models. The trailing slash matters: the save
# cell builds its target as MODELS_PATH + '<model name>', and without the slash
# the model would be written to 'doc2vec_modelsmodel_video_descr' rather than
# into the 'doc2vec_models/' directory the vectors are later loaded from.
MODELS_PATH = 'doc2vec_models/'

In [7]:
# Read the video metadata (Parquet) and the emotions table (CSV) from the
# training directory.
emotions = pd.read_csv(f'{TRAIN_PATH}emotions.csv')
data = pq.read_table(f'{TRAIN_PATH}videos.parquet').to_pandas()

In [78]:
# schema = pa.schema([
#     ('item_id', pa.string()),
#     ('video_title', pa.string()),
#     ('author_title', pa.string()),
#     ('tv_title', pa.string()),
#     ('season', pa.int16()),
#     ('video_description', pa.string()),
#     ('category_title', pa.string()),
#     ('publicated', pa.string()),
#     ('duration', pa.int32()),
#     ('channel_sub', pa.int32()),
#     ('tv_sub', pa.int32()),
#     ('CTR_10days_21_07', pa.float32()),
#     ('CTR_10days_01_08', pa.float32()),
#     ('CTR_10days_10_08', pa.float32()),
#     ('CTR_10days_21_08', pa.float32()),
#     ('__index_level_0__', pa.int64())
# ])
#
# source_table = pa.Table.from_pandas(data)
# source_table = source_table.cast(schema)
# pq.write_table(source_table, 'my_parquet_table.parquet')

In [8]:
# Strip the 'ctr.' prefix from the four CTR columns so they can be addressed
# by their short names.
ctr_columns = ['CTR_10days_21_07', 'CTR_10days_01_08', 'CTR_10days_10_08', 'CTR_10days_21_08']
data.rename(columns={f'ctr.{name}': name for name in ctr_columns}, inplace=True)

In [2]:
# Player-start events for the training period. Built from TRAIN_PATH so the
# dataset location is configured in exactly one place.
users = pq.read_table(f'{TRAIN_PATH}player_starts_train.parquet').to_pandas()

In [19]:
# CSV variant of the player-start events (the file name suggests a reduced
# sample). Built from TRAIN_PATH so the dataset location is configured in
# exactly one place.
small_users = pd.read_csv(f'{TRAIN_PATH}small_player_starts_train.csv')

In [191]:
import string

# Translation table that deletes every ASCII punctuation character
# (string.punctuation); non-ASCII punctuation is left in place.
translator = str.maketrans('', '', string.punctuation)


def _clean_text(value):
    """Remove ASCII punctuation, lowercase and trim `value`; non-strings pass through unchanged."""
    if isinstance(value, str):
        return value.translate(translator).lower().strip()
    return value


data['video_title'] = data['video_title'].apply(_clean_text)
# Assign the filled column back instead of calling fillna(..., inplace=True)
# on `data['video_description']`: the in-place form mutates a temporary
# selection, which is deprecated and has no effect under pandas copy-on-write.
data['video_description'] = data['video_description'].fillna("").apply(_clean_text)


In [193]:
def _tagged_corpus(texts):
    """Build a Doc2Vec corpus: one TaggedDocument per entry, tagged with its position.

    TaggedDocument expects a list of tokens. Passing the raw string makes
    gensim iterate over it character by character, so each text is split on
    whitespace here. Non-string entries (e.g. missing values) become empty
    documents so that tag i still corresponds to row i of `data`.
    """
    return [
        TaggedDocument(text.split() if isinstance(text, str) else [], [position])
        for position, text in enumerate(texts)
    ]


# Hyper-parameters shared by the three models.
# `window` is the maximum distance between the current and the predicted word
# inside one document; it was previously set to the number of documents in
# the corpus, which is unrelated to that meaning. 5 is the gensim default.
# Note: with workers > 1 training is not bit-for-bit reproducible even with a
# fixed seed.
DOC2VEC_PARAMS = dict(vector_size=100, window=5, epochs=10, seed=42)

video_descriptions = _tagged_corpus(data['video_description'].tolist())
author_titles = _tagged_corpus(data['author_title'].tolist())
video_titles = _tagged_corpus(data['video_title'].tolist())

model_video_descr = Doc2Vec(video_descriptions, workers=16, **DOC2VEC_PARAMS)

model_author_titles = Doc2Vec(author_titles, workers=8, **DOC2VEC_PARAMS)

model_video_titles = Doc2Vec(video_titles, workers=8, **DOC2VEC_PARAMS)

KeyboardInterrupt: 

In [None]:
# model_video_titles.save(MODELS_PATH+'model_video_titles')
# model_author_titles.save(MODELS_PATH+'model_author_titles')
# Persist the description model.
# NOTE(review): the target is built by plain concatenation, so MODELS_PATH
# must end with a path separator for the file to land inside that directory.
model_video_descr.save(f'{MODELS_PATH}model_video_descr')


In [10]:
# author_titles = np.load('doc2vec_models/model_author_titles.dv.vectors.npy', allow_pickle=True)
# Load the stored document-vector matrices (presumably written alongside the
# saved Doc2Vec models — confirm the files exist before running).
# NOTE(review): allow_pickle=True lets a tampered .npy file execute arbitrary
# code on load; drop it if these files are plain numeric arrays.
video_descrs = np.load('doc2vec_models/model_video_descr.dv.vectors.npy', allow_pickle=True)
video_titles = np.load('doc2vec_models/model_video_titles.dv.vectors.npy', allow_pickle=True)

In [11]:
# Min-max scale every row (one document vector) to [0, 1] independently;
# axis=1 scales per sample, which is what transposing before and after did.
video_titles_normalized = preprocessing.minmax_scale(video_titles, axis=1)
video_descrs_normalized = preprocessing.minmax_scale(video_descrs, axis=1)

In [12]:
# Place the title and description embeddings side by side (title columns first).
text_embs = np.hstack((video_titles_normalized, video_descrs_normalized))

In [13]:
# Column labels: 100 title dimensions followed by 100 description dimensions.
cols_titles = [f'vid_title_{i}' for i in range(100)]
cols_descrs_ = [f'vid_descr_{i}' for i in range(100)]
# Extends the list in place: from here on `cols_titles` holds all 200 labels
# (title and description), which later cells rely on.
cols_titles += cols_descrs_
df = pd.DataFrame(text_embs, columns=cols_titles)

In [14]:
# Attach ids by position, not by index label. `df` has a fresh RangeIndex,
# whereas `data` keeps whatever index came out of the Parquet file; assigning
# the Series directly would align on labels and could mis-assign or drop ids.
# Assumes the embedding rows are in the same order as the rows of `data`
# (a length mismatch raises instead of silently producing NaNs).
df['item_id'] = data['item_id'].to_numpy()
# Reorder so that item_id is the first column, followed by the embedding columns.
cols = ['item_id', *cols_titles]
df = df[cols]

In [15]:
# categories_dict = {cat: idx for idx, cat in enumerate(data['category_title'].unique())}
# data['category_title'] = data['category_title'].map(categories_dict)
# Replace each author title with an integer code assigned in order of first
# appearance. Note that `author_titles` is the title -> code mapping from here on.
unique_authors = data['author_title'].unique()
author_titles = dict(zip(unique_authors, range(len(unique_authors))))
data['author_title'] = data['author_title'].map(author_titles)

In [16]:
# One-hot encode the category and append it to the feature frame.
# pd.concat(axis=1) aligns on index labels; `df` has a RangeIndex while `data`
# keeps its own index, so the dummies are re-labelled with df's index to get a
# positional (row i to row i) join. A length mismatch raises instead of
# producing a union of indexes padded with NaNs.
category_dummies = pd.get_dummies(data['category_title'], prefix='cat').set_axis(df.index)
df = pd.concat([df, category_dummies], axis=1)

In [233]:
# videos_emotions_types = emotions.groupby(['item_id', 'type']).size().unstack(fill_value=0).reset_index()

In [17]:
# Join the engineered features with the video metadata; only item_ids present
# in both frames are kept.
final = pd.merge(df, data, on='item_id', how='inner')

In [97]:
# final.drop(['video_title', 'tv_title', 'season', 'video_description',
#        'channel_sub', 'tv_sub',
#        ], inplace=True, axis=1)

In [18]:
# Fill missing CTR values with the most frequent value of the same column in
# `data`. The result is assigned back to the column: calling
# fillna(..., inplace=True) on `final[col]` mutates a temporary selection,
# which is deprecated and has no effect under pandas copy-on-write.
for ctr_col in ('CTR_10days_21_07', 'CTR_10days_01_08', 'CTR_10days_10_08', 'CTR_10days_21_08'):
    final[ctr_col] = final[ctr_col].fillna(data[ctr_col].mode()[0])

In [19]:
# Parse the publication timestamps into pandas datetimes.
final['publicated'] = final['publicated'].pipe(pd.to_datetime)

In [20]:
# Break the publication timestamp into calendar features with the vectorized
# .dt accessor rather than seven row-by-row Python lambdas.
# Requires `publicated` to already be a datetime column.
published = final['publicated'].dt
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    final[f'upld_{part}'] = getattr(published, part)
# Monday == 0 ... Sunday == 6, same convention as Timestamp.weekday().
final['upld_dayofweek'] = published.weekday

In [1]:
# Remove every column of `df` whose name matches 'vid_title' (the title
# embedding dimensions) from the modelling frame.
final = final.drop(columns=list(df.filter(regex='vid_title').columns))

NameError: name 'final' is not defined

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
train, test = train_test_split(
    final,
    random_state=42,
    test_size=0.2,
)

In [3]:
# Display the training split.
train

NameError: name 'train' is not defined

In [23]:
import pynndescent

# Columns excluded from the feature matrix handed to the index.
non_feature_columns = [
    'item_id', 'video_description', 'tv_title', 'video_title',
    'CTR_10days_01_08', 'CTR_10days_21_07', 'CTR_10days_10_08', 'CTR_10days_21_08',
    'tv_sub', 'season', 'publicated', 'category_title',
]
# Approximate nearest-neighbour index over the training rows (cosine metric).
index = pynndescent.NNDescent(
    train.drop(columns=non_feature_columns),
    n_jobs=8,
    n_neighbors=5,
    metric='cosine',
)

In [276]:
# Call the index's prepare() before querying (presumably so the set-up cost
# is paid here rather than on the first query — see the pynndescent docs).
index.prepare()

  self._set_arrayXarray(i, j, x)


In [287]:
# Take the first ten test rows, strip the non-feature columns and ask the
# index for the 100 nearest training rows of each.
query_rows = test.head(10).drop(columns=[
    'item_id', 'video_description', 'tv_title', 'video_title',
    'CTR_10days_01_08', 'CTR_10days_21_07', 'CTR_10days_10_08', 'CTR_10days_21_08',
    'tv_sub', 'season', 'publicated', 'category_title',
])
neighbors = index.query(query_rows, k=100)

In [279]:
# item_ids of the first ten rows of the test split.
test.head(10)['item_id']

1025987     video_594977
468859     video_2000944
1638170     video_947196
423766     video_2283492
1597720    video_1537558
1698657      video_53898
67347      video_1350571
872393      video_468461
1564649    video_2155498
79079       video_687512
Name: item_id, dtype: object

In [294]:
# Print the title of every retrieved neighbour of the fifth query row.
# neighbors[0] is used as an array of positions into `train`.
for neighbor_pos in neighbors[0][4]:
    neighbor_id = train.iloc[neighbor_pos].item_id
    print(data.loc[data['item_id'] == neighbor_id, 'video_title'])


1683267    OKTOBER 2101 - Technomania 012
Name: video_title, dtype: object
749157    ТАХИОН ➤ Ratchet & Clank: Rift Apart #1
Name: video_title, dtype: object
794685    МОГИЛА ГЕНРИ ЭВЕРИ ◢ Uncharted 4 A Thiefs End #6
Name: video_title, dtype: object
1866885    буднитурагента
Name: video_title, dtype: object
2027168    Parklane, a Luxury Collection Resort & Spa
Name: video_title, dtype: object
1030059    Прогулка со смертью: начинается с подземелья мурлоков / Озвучка манги / Глава 1-10
Name: video_title, dtype: object
45303    Новогодний утренник в Детском Саду\nСанкт - Петербург 2023
Name: video_title, dtype: object
1561585    Инструментальная азиатская музыка. Тибетская музыка для здоровья. Гималайский храм. №55
Name: video_title, dtype: object
1063682    Место и сроки Выживания! Радужные флаги и _радужные_ люди. Он, Она и... Оно(!_) Ольга Викторовна.
Name: video_title, dtype: object
891764    ПОЧТМЕЙСТЕР 2 0 или ОЖИВЛЕНИЕ ЧЕРЕЗ ПОЧТОВЫЙ СОЮЗ.  часть 2
Name: video_title, dtype: object

In [295]:
# Look up the title of one specific video by its item_id.
data[data['item_id'] == 'video_1537558'].video_title

1597718    Micro Machines (NES Dendy 8bit) - Full Walkthrough Longplay no commentary - Микро машинки на денди
Name: video_title, dtype: object

In [293]:
# All videos whose title contains 'Казахстан' (the match is case-sensitive).
data[data['video_title'].str.contains('Казахстан')]

Unnamed: 0,item_id,video_title,author_title,tv_title,season,video_description,category_title,publicated,duration,channel_sub,tv_sub,CTR_10days_21_07,CTR_10days_01_08,CTR_10days_10_08,CTR_10days_21_08
4,video_105383,"Вебинар ""Особенности трудоустройства граждан Белоруссии, Казахстана, Киргизии и Армении""",4,,0,"10.08.2023 Вебинар ""Особенности трудоустройства граждан Белоруссии, Казахстана, Киргизии и Армен...",Бизнес и предпринимательство,2023-08-11 09:02:07+03:00,3834404,19,0,0.0,0.0,0.000000,0.0
977,video_193682,Казахстанский певец Димаш впервые выступил в Армении,700,,0,В Армению впервые приехал казахстанский певец Димаш. Концерт прошел в Большом зале спортивно-кон...,Искусство,2023-05-21 21:24:03+03:00,66240,2714,0,0.0,0.0,0.000000,0.0
3132,video_1079724,Приложение Музеи Казахстана,833,,0,"Представляю вашему вниманию нашу идею для стартапа ""Музеи Казахстана"". Прошу не судить строго, в...",Разное,2023-07-15 12:17:07+03:00,122114,0,0,0.0,0.0,0.000000,0.0
3752,video_1660306,"ZEEKR YOU 001,МАКСИМАЛКА НА ГАЗОВОЙ КРЫШЕ, Автозаказ из ОАЭ, Китая, Казахстана т 89853354575 ватсап",3036,,0,Пишите и звоните ставьте лайки и подпишитесь на Ютуб канал.\nВКонтакте:\nhttps://vk.com/avtozaka...,Авто-мото,2023-08-19 10:10:50+03:00,66106,0,0,0.0,0.0,0.000000,0.0
8504,video_830463,поселок ГЛУБОКОЕ. Восточный Казахстан. Осень 2022г. Съемка по заявкам зрителей.,5902,,0,"Мы с вами проедем по всем центральным улицам поселка, так же заедем в микрорайон Курчум. Посмотр...",Путешествия,2022-10-08 18:01:25+03:00,1264811,6,0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2315276,video_2263059,"Трейлер фильма ""Ореховое дерево"" (Казахстан)",4762,,0,,Разное,2023-07-07 14:01:59+03:00,110597,3,0,0.0,0.0,0.000000,0.0
2317049,video_572127,«Казахстан Темир Жолы» не будет повышать цены на билеты в 2016 году - Kazakh TV,2985,,0,,Разное,2023-07-29 16:44:01+03:00,163190,2,0,0.0,0.0,0.000000,0.0
2318179,video_484754,Восточный Казахстан г. Шемонаиха,1079,,0,,Разное,2023-07-22 15:15:10+03:00,78229,1,0,,,,
2319052,video_1884090,"[ES] 102.5 Радио Шалкар (Макат, Казахстан), dist 2081 km. Ant 6-el yagi. Прием 23.07.19 г.",6313,,0,,Разное,2023-07-29 13:43:39+03:00,20829,0,0,0.0,0.0,0.000000,0.0


In [297]:
import pickle

# Serialize the nearest-neighbour index to disk. The context manager makes
# sure the file is flushed and closed even if pickling fails; a bare open()
# passed straight to pickle.dump leaves the handle open.
with open('index_vid_titles_categs.pkl', 'wb') as index_file:
    pickle.dump(index, index_file)
# m = pickle.load(open('index_vid_titles.pkl', 'rb'))

In [None]:

def wid_long(videos: pd.DataFrame, user_id: int, users: pd.DataFrame, n_dims: int = 100) -> pd.DataFrame:
    """Return one user's watch history with title embeddings doubled for videos watched long enough.

    A video counts as "watched long enough" when
      * duration / 1000 is above 300 and watch_time / (duration / 1000) is above 0.25, or
      * duration / 1000 is at most 300 and watch_time is above 30.
    (Presumably `duration` is in milliseconds and `watch_time` in seconds — confirm.)

    Args:
        videos: Frame with `item_id`, `duration` and columns `v_title_0` .. `v_title_{n_dims-1}`.
        user_id: Value matched against `users["user_id"]`.
        users: Frame with `user_id`, `item_id` and `watch_time`.
        n_dims: Number of `v_title_*` columns to re-weight (default 100).

    Returns:
        The user's history joined with `videos`, without `user_id`,
        `watch_time` and `duration`. Embedding values of long-watched videos
        are multiplied by 2; all other rows keep their original values.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # NOTE(review): other cells of this notebook name the embedding columns
    # 'vid_title_<i>'; make sure `videos` really carries 'v_title_<i>'.
    hist = users[users["user_id"] == user_id]
    hist = pd.merge(hist, videos, on="item_id").drop(columns=["user_id"])

    duration_s = hist["duration"] / 1000
    # Long videos are judged by the watched share, short ones by absolute time.
    watched_enough = np.where(
        duration_s > 300,
        hist["watch_time"] / duration_s > 0.25,
        hist["watch_time"] > 30,
    )

    # Scale, do not overwrite: the former `value * 2 if cond else 1` parsed as
    # `(value * 2) if cond else 1` and replaced every other embedding with 1.
    # One vectorized multiply also replaces n_dims row-wise passes.
    title_cols = [f"v_title_{i}" for i in range(n_dims)]
    hist[title_cols] = hist[title_cols].mul(np.where(watched_enough, 2, 1), axis=0)
    return hist.drop(columns=["watch_time", "duration"])


def like(videos: pd.DataFrame, user_id: int, users: pd.DataFrame, emotions: pd.DataFrame,
         n_dims: int = 100) -> pd.DataFrame:
    """Return one user's reacted-to videos with title embeddings re-weighted by the reaction.

    Only videos for which the user has a row in `emotions` are kept (inner join).
    Embedding values are multiplied by 2 for "pos_emotions", by 0.5 for
    "neg_emotions" and left unchanged for any other reaction type.

    Args:
        videos: Frame with `item_id` and columns `v_title_0` .. `v_title_{n_dims-1}`.
        user_id: Value matched against `users["user_id"]` and `emotions["C2"]`.
        users: Frame with `user_id` and `item_id`.
        emotions: Frame with columns `C2` (compared with the user id), `C3`
            (joined against `item_id`) and `C4` (reaction type).
        n_dims: Number of `v_title_*` columns to re-weight (default 100).

    Returns:
        The joined history without `user_id`, `C3` and `C4`.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # NOTE(review): other cells of this notebook name the embedding columns
    # 'vid_title_<i>'; make sure `videos` really carries 'v_title_<i>'.
    user_emotions = emotions.loc[emotions["C2"] == user_id, ["C3", "C4"]]

    hist = users[users["user_id"] == user_id]
    hist = pd.merge(hist, videos, on="item_id").drop(columns=["user_id"])
    # `user_id` is already gone at this point; dropping it a second time
    # raised KeyError.
    hist = pd.merge(hist, user_emotions, left_on="item_id", right_on="C3")

    # Multiplicative weights. The former expression
    # `value * 2 if pos else (0.5 if neg else 1)` overwrote the embedding with
    # the constants 0.5 / 1 instead of scaling it.
    weights = np.select(
        [hist["C4"] == "pos_emotions", hist["C4"] == "neg_emotions"],
        [2.0, 0.5],
        default=1.0,
    )
    title_cols = [f"v_title_{i}" for i in range(n_dims)]
    hist[title_cols] = hist[title_cols].mul(weights, axis=0)
    return hist.drop(columns=["C4", "C3"])


def encode_categories(videos: pd.DataFrame) -> pd.DataFrame:
    """Replace `category_title` with one-hot columns prefixed with `cat_`.

    The input frame is not modified; the indicator columns are appended after
    the remaining original columns.
    """
    # Build the indicator (dummy) columns from the category.
    one_hot = pd.get_dummies(videos['category_title'], prefix='cat')

    # Drop the raw category, then attach the indicators to what is left.
    remaining = videos.drop(columns=["category_title"])
    return pd.concat([remaining, one_hot], axis=1)

In [201]:
# Display the videos frame in its current state.
data

Unnamed: 0,item_id,video_title,author_title,tv_title,season,video_description,category_title,publicated,duration,channel_sub,tv_sub,CTR_10days_21_07,CTR_10days_01_08,CTR_10days_10_08,CTR_10days_21_08
0,video_165654,msi pro mp241x недообзор решение проблемы с ме...,0,,0,в видео я обывательским взглядом расскажу про ...,0,2022-12-08 13:53:05+03:00,391382,0,0,,0.0,0.000000,
1,video_1173704,наложение пястно фаланговой повязки на кисть,1,,0,видео с канала уц академия безопасности abdpor...,1,2022-03-24 09:19:15+03:00,125922,26,0,,0.0,0.000000,0.00
2,video_23927,silverstonef1 sochi pro и neoline x cop 6000s ...,2,,0,silverstonef1 sochi pro и neoline x cop 6000s ...,2,2022-03-19 17:41:49+03:00,436570,2,0,,,0.000000,0.00
3,video_1003780,больница в brookhaven доктор пупкин спас жених...,3,,0,играем в роблокс roblox больница в brookhaven...,3,2021-02-20 11:50:53+03:00,719377,673,0,0.0,1.0,0.000000,0.00
4,video_105383,вебинар особенности трудоустройства граждан бе...,4,,0,10082023 вебинар особенности трудоустройства г...,4,2023-08-11 09:02:07+03:00,3834404,19,0,0.0,0.0,0.000000,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2320655,video_839160,любимцы гена в памперсе бегает по клинике,147463,Любимцы,1,,22,2017-09-11 22:00:13+03:00,23290,0,0,,0.0,0.333333,0.25
2320656,video_2134427,телеканал «спас» «главное» интервью с в степашина,34203,,0,,33,2022-03-28 11:19:33+03:00,1548792,57,0,0.0,0.0,0.000000,0.00
2320657,video_1453482,взятие 3 отметок на ис 7,155222,,0,,20,2022-09-10 20:05:17+03:00,520001,12,0,,,,
2320658,video_2225207,простой и вкусный рецепт пирога с грушами и сы...,1646,,0,,6,2023-07-06 12:16:29+03:00,8267,9,0,0.0,0.0,0.000000,0.00
