In [3]:
import os

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn import preprocessing
# Let pandas render up to 100 characters per cell so text columns stay readable.
pd.set_option('display.max_colwidth', 100)

In [4]:
# Directory with the training files. The value ends with '/', so file names
# can be appended to it by plain string concatenation.
TRAIN_PATH = "train_dataset_RUTUBE/"
# Directory for the trained Doc2Vec artefacts. Unlike TRAIN_PATH this value
# has NO trailing separator, so one must be supplied when building file paths.
MODELS_PATH = "doc2vec_models"

In [5]:
# Video table: read the parquet file with pyarrow and convert it to a
# pandas DataFrame.
data = (
    pq.read_table(f"{TRAIN_PATH}videos.parquet")
    .to_pandas()
)
# Emotions table, stored as a plain CSV in the same directory.
emotions = pd.read_csv(f"{TRAIN_PATH}emotions.csv")

In [2]:
# Load the player-starts table as a DataFrame. The path is built from
# TRAIN_PATH (instead of repeating the directory literal) so the data location
# is configured in exactly one place.
users = pq.read_table(TRAIN_PATH + 'player_starts_train.parquet').to_pandas()

In [19]:
# Load the reduced ("small") player-starts CSV. The path is built from
# TRAIN_PATH (instead of repeating the directory literal) so the data location
# is configured in exactly one place.
small_users = pd.read_csv(TRAIN_PATH + 'small_player_starts_train.csv')

In [6]:
import string

# Translation table that deletes every character in string.punctuation.
# That constant is ASCII-only, so typographic punctuation (e.g. « » or —)
# is left in the text.
translator = str.maketrans('', '', string.punctuation)


def _clean_text(value):
    """Remove ASCII punctuation from ``value``, lower-case it and trim whitespace.

    Non-string values (for example NaN for a missing title) are returned
    unchanged.
    """
    if not isinstance(value, str):
        return value
    return value.translate(translator).lower().strip()


# Titles: missing values are intentionally left as they are.
data['video_title'] = data['video_title'].apply(_clean_text)

# Descriptions: missing values become empty strings first. The filled column
# is assigned back rather than using ``fillna(..., inplace=True)`` on the
# column selection: that chained in-place form raises a FutureWarning in
# pandas 2.x and no longer updates ``data`` once Copy-on-Write is enabled.
data['video_description'] = data['video_description'].fillna("").apply(_clean_text)

In [193]:
# Hyper-parameters shared by both models, named once so they stay in sync.
DOC2VEC_VECTOR_SIZE = 100
# ``window`` is the maximum distance (in tokens) between the current and the
# predicted word. It was previously set to the number of documents, which is
# unrelated to token distance and makes every document its own full context
# (quadratic cost per document). 5 is gensim's default; raise it deliberately
# if a wider context is wanted.
DOC2VEC_WINDOW = 5
DOC2VEC_EPOCHS = 10
DOC2VEC_WORKERS = 8
# NOTE(review): per the gensim docs a seed alone does not make training
# deterministic when several worker threads are used — confirm if exact
# reproducibility is required.
DOC2VEC_SEED = 42


def _tagged_corpus(texts):
    """Turn raw texts into ``TaggedDocument`` objects tagged by row position.

    Each text is split on whitespace into word tokens. Passing the raw string
    would make gensim iterate over it character by character, i.e. train on
    single letters instead of words. Non-string entries (e.g. NaN for a missing
    title) become an empty token list so they still occupy their row's tag.
    """
    return [
        TaggedDocument(text.split() if isinstance(text, str) else [], [row])
        for row, text in enumerate(texts)
    ]


def _train_doc2vec(corpus):
    """Train a Doc2Vec model on ``corpus`` with the shared hyper-parameters."""
    return Doc2Vec(
        corpus,
        vector_size=DOC2VEC_VECTOR_SIZE,
        window=DOC2VEC_WINDOW,
        workers=DOC2VEC_WORKERS,
        epochs=DOC2VEC_EPOCHS,
        seed=DOC2VEC_SEED,
    )


# Tags are the positional row numbers of ``data``, so row i of each model's
# document-vector matrix corresponds to the i-th row of ``data``.
video_descriptions = _tagged_corpus(data['video_description'].tolist())
video_titles = _tagged_corpus(data['video_title'].tolist())

model_video_descr = _train_doc2vec(video_descriptions)
model_video_titles = _train_doc2vec(video_titles)

KeyboardInterrupt: 

In [None]:
# MODELS_PATH carries no trailing separator, so plain concatenation produced
# names like 'doc2vec_modelsmodel_video_titles' in the working directory —
# not the 'doc2vec_models/...' files that are loaded afterwards. Join the
# parts properly and make sure the target directory exists before saving.
os.makedirs(MODELS_PATH, exist_ok=True)
model_video_titles.save(os.path.join(MODELS_PATH, 'model_video_titles'))
model_video_descr.save(os.path.join(MODELS_PATH, 'model_video_descr_10epochs'))


In [4]:
# Load the document-vector matrices written next to the saved models.
# NOTE(review): these separate '.dv.vectors.npy' files are presumably produced
# by gensim's save() only when the arrays are large — confirm they exist, or
# load the models themselves and read their document vectors instead.
#
# The files are read without allow_pickle=True: plain numeric arrays do not
# need pickle support, and enabling it would let a tampered file run arbitrary
# code on load.
#
# Note that `video_titles` is rebound here from the tagged-document list used
# for training to a NumPy array.
video_titles = np.load(os.path.join(MODELS_PATH, 'model_video_titles.dv.vectors.npy'))
video_descrs = np.load(os.path.join(MODELS_PATH, 'model_video_descr_10epochs.dv.vectors.npy'))

In [5]:
# Min-max scale along axis 1, i.e. every row (one embedding) is rescaled on
# its own using that row's minimum and maximum. This is the same operation as
# scaling the transposed matrix column-wise and transposing the result back.
video_titles_normalized = preprocessing.minmax_scale(video_titles, axis=1)
video_descrs_normalized = preprocessing.minmax_scale(video_descrs, axis=1)

In [6]:
# Place the title embedding and the description embedding of each row side by
# side (concatenation along the last axis).
text_embs = np.concatenate(
    (video_titles_normalized, video_descrs_normalized),
    axis=-1,
)

In [7]:
# Take the number of columns from the embedding matrices themselves instead of
# a hard-coded 100, so the names always match the width of `text_embs` even if
# the Doc2Vec vector size changes.
n_title_dims = video_titles_normalized.shape[1]
n_descr_dims = video_descrs_normalized.shape[1]

cols_titles = [f'vid_title_{i}' for i in range(n_title_dims)]
cols_descrs_ = [f'vid_descr_{i}' for i in range(n_descr_dims)]
# From here on `cols_titles` holds the title names followed by the description
# names, i.e. the names of every column of `text_embs` in order.
cols_titles.extend(cols_descrs_)
df = pd.DataFrame(text_embs, columns=cols_titles)

In [8]:
# Attach the video identifier to every embedding row. The assignment aligns on
# the index labels of `df` and `data`.
# NOTE(review): this assumes both frames share the same index — confirm.
df['item_id'] = data['item_id']
# Reorder the columns so that item_id comes first, followed by the embeddings.
cols = ['item_id'] + cols_titles
df = df[cols]

In [9]:
# category_title is deliberately left as raw labels here (its integer encoding
# is disabled); only author_title is converted to integer codes.
# Codes follow the order in which each distinct author first appears.
unique_authors = data['author_title'].unique()
author_titles = dict(zip(unique_authors, range(len(unique_authors))))
data['author_title'] = data['author_title'].map(author_titles)

In [10]:
# One-hot encode the video category (columns prefixed with 'cat') and append
# the indicator columns to the embedding frame; the column-wise concat aligns
# the two frames on their index labels.
category_dummies = pd.get_dummies(data['category_title'], prefix='cat')
df = pd.concat([df, category_dummies], axis=1)

In [11]:
# Join the engineered features with the video table, keeping only the
# item_id values present in both frames.
final = pd.merge(df, data, on='item_id', how='inner')

In [15]:
# CTR columns whose gaps are filled with the most frequent value of the same
# column in `data`.
ctr_columns = [
    'ctr.CTR_10days_21_07',
    'ctr.CTR_10days_01_08',
    'ctr.CTR_10days_10_08',
    'ctr.CTR_10days_21_08',
]

for ctr_col in ctr_columns:
    # Assign the filled column back instead of calling
    # ``fillna(..., inplace=True)`` on the column selection: that chained
    # in-place form raises a FutureWarning in pandas 2.x and leaves `final`
    # untouched once Copy-on-Write is enabled.
    final[ctr_col] = final[ctr_col].fillna(data[ctr_col].mode()[0])

In [16]:
# Parse the publication timestamps into pandas datetimes so their calendar
# components can be extracted.
publicated_ts = pd.to_datetime(final['publicated'])
final['publicated'] = publicated_ts

In [17]:
# Split the publication timestamp into calendar/time components with the
# vectorised ``.dt`` accessor rather than one Python lambda call per row.
# Values are the same as before; ``dayofweek`` uses Monday=0, exactly like
# ``datetime.weekday()``. (Depending on the pandas version the integer columns
# may come out as int32 instead of int64.)
published = final['publicated'].dt
final['upld_year'] = published.year
final['upld_month'] = published.month
final['upld_day'] = published.day
final['upld_hour'] = published.hour
final['upld_minute'] = published.minute
final['upld_second'] = published.second
final['upld_dayofweek'] = published.dayofweek

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(final, random_state=42, test_size=0.2)
train, test = split_parts

In [21]:
import pynndescent

# Columns removed before building the index: the identifier, the text and
# date columns, the CTR columns and the raw category label.
non_feature_cols = [
    'item_id',
    'video_description',
    'tv_title',
    'video_title',
    'ctr.CTR_10days_01_08',
    'ctr.CTR_10days_21_07',
    'ctr.CTR_10days_10_08',
    'ctr.CTR_10days_21_08',
    'tv_sub',
    'season',
    'publicated',
    'category_title',
]

# Approximate nearest-neighbour index over the remaining training columns,
# using Jaccard distance and 5 neighbours per point.
index = pynndescent.NNDescent(
    train.drop(columns=non_feature_cols),
    metric='jaccard',
    n_neighbors=5,
    n_jobs=8,
)

In [22]:
# Prepare the index for querying ahead of time, before it is serialised.
# NOTE(review): presumably this builds the search structures up front so later
# queries do not pay that one-off cost — confirm against the pynndescent docs.
index.prepare()

  self._set_arrayXarray(i, j, x)


In [23]:
import pickle

# Serialise the prepared index to disk. Pickle files can execute code when
# loaded, so only unpickle this file from a trusted location.
with open('final_model_jaccard_2.pkl', mode='wb') as index_file:
    pickle.dump(index, index_file)

In [24]:
# Write the final feature table to CSV. The row index is written as well
# (pandas' default, spelled out here), so the file starts with an unnamed
# index column.
final.to_csv('final_df_2.csv', index=True)