# Imports

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

import glob
import shutil
import os
import requests
import shutil
import urllib

import matplotlib.pyplot as plt
from matplotlib import style
from matplotlib.pyplot import imread, imshow, subplots, show
style.use('seaborn')
%matplotlib inline
#graphs in svg look clearer
%config InlineBackend.figure_format = 'svg' 
import warnings
warnings.simplefilter('ignore')

import pylab as pl

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, ConcatDataset
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch.utils.data import DataLoader,Dataset
import torch.optim as optim

from os import listdir
from os.path import isfile, join

from tqdm.auto import tqdm
import os
from PIL import Image
import cv2

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.functional")

# Preprocessing

In [18]:
data_path = '/root/User/_DIPLOMA/data/data_tbl_cvr_oscr_kws_small.csv'

df = pd.read_csv(data_path, index_col=0)

In [20]:
df.shape

(8784, 1037)

In [21]:
df.columns[:30]

Index(['age_access_type', 'name', 'director', 'genre', 'average_rating',
       'type', 'country', 'release_type', 'release_year', 'duration', 'actor',
       'element_uid', 'ACTOR', 'COMPOSER', 'DESIGN', 'DIRECTOR', 'EDITOR',
       'OPERATOR', 'PRODUCER', 'WRITER', 'BUDGET', 'MARKETING', 'RUS', 'USA',
       'WORLD', 'element_id', 'rating', 'джек', 'расследовать', 'подруга'],
      dtype='object')

In [22]:
df.columns[-15:]

Index(['чета', 'зебра', 'сидень', 'беатрис', 'рыбка', 'has_oscar_actor',
       'has_oscar_composer', 'has_oscar_design', 'has_oscar_director',
       'has_oscar_editor', 'has_oscar_operator', 'has_oscar_producer',
       'has_oscar_writer', 'poster_url', 'poster_url_preview'],
      dtype='object')

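Drop the keyword columns (the per-word columns such as 'джек', 'расследовать', 'подруга'), keeping only the metadata, crew, box-office, Oscar-flag and poster columns.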

In [23]:
# Columns kept once the keyword columns are discarded. The crew roles appear
# twice: once as raw columns and once as lower-cased `has_oscar_<role>` flags.
_crew_roles = ['ACTOR', 'COMPOSER', 'DESIGN', 'DIRECTOR', 'EDITOR',
               'OPERATOR', 'PRODUCER', 'WRITER']

df_cols = (
    ['age_access_type', 'name', 'director', 'genre', 'average_rating',
     'type', 'country', 'release_type', 'release_year', 'duration', 'actor',
     'element_uid']
    + _crew_roles
    + ['BUDGET', 'MARKETING', 'RUS', 'USA', 'WORLD', 'element_id', 'rating']
    + [f'has_oscar_{role.lower()}' for role in _crew_roles]
    + ['poster_url', 'poster_url_preview']
)

In [24]:
# Take an explicit copy of the column selection: a later cell assigns a new
# column (df['img_name']) to this frame, and writing into a derived selection
# is the pattern pandas flags with SettingWithCopyWarning.
df = df[df_cols].copy()

In [25]:
len(df['poster_url_preview'].unique())

8699

In [26]:
# add names of imgs

def add_img_name(data):
  """Return the image file name, i.e. the last '/'-separated part of a URL.

  Args:
    data: poster URL as a string. Non-string values (for example a NaN or
      None standing in for a missing URL) are returned unchanged instead of
      raising AttributeError on `.split`.

  Returns:
    The text after the final '/' for strings; the input itself otherwise.
  """
  if not isinstance(data, str):
    return data
  return data.split('/')[-1]

df['img_name'] = df['poster_url_preview'].apply(add_img_name)

df.head().T[-5:]

Unnamed: 0,0,1,2,3,4
has_oscar_producer,0,0,0,0,0
has_oscar_writer,0,0,0,0,0
poster_url,https://kinopoiskapiunofficial.tech/images/posters/kp/6580.jpg,https://kinopoiskapiunofficial.tech/images/posters/kp/6460.jpg,https://kinopoiskapiunofficial.tech/images/posters/kp/1228112.jpg,https://kinopoiskapiunofficial.tech/images/posters/kp/623934.jpg,https://kinopoiskapiunofficial.tech/images/posters/kp/1172958.jpg
poster_url_preview,https://kinopoiskapiunofficial.tech/images/posters/kp_small/6580.jpg,https://kinopoiskapiunofficial.tech/images/posters/kp_small/6460.jpg,https://kinopoiskapiunofficial.tech/images/posters/kp_small/1228112.jpg,https://kinopoiskapiunofficial.tech/images/posters/kp_small/623934.jpg,https://kinopoiskapiunofficial.tech/images/posters/kp_small/1172958.jpg
img_name,6580.jpg,6460.jpg,1228112.jpg,623934.jpg,1172958.jpg


### Test/Train split

In [30]:
train_df = df[df['release_year'] != 2022]

In [31]:
train_df.shape

(8683, 38)

In [32]:
# Re-bind instead of dropping in place: train_df is a boolean-mask selection
# of df, and in-place mutation of such a selection is what pandas warns about
# (SettingWithCopyWarning). Re-binding also keeps the cell safe to re-run.
train_df = train_df.dropna(subset=['rating'])
train_df.shape

(8683, 38)

In [33]:
test_df = df[df['release_year'] == 2022]
test_df.shape

(101, 38)

In [34]:
# Re-bind instead of dropping in place: test_df is a boolean-mask selection
# of df, and in-place mutation of such a selection is what pandas warns about
# (SettingWithCopyWarning). Re-binding also keeps the cell safe to re-run.
test_df = test_df.dropna(subset=['rating'])
test_df.shape

(101, 38)

### Preproc

In [35]:
train = train_df.copy(deep=True)
test = test_df.copy(deep=True)

In [36]:
train = train[train['release_year'] >= 2000]

In [37]:
train.shape

(6672, 38)

In [38]:
# Columns removed from both splits before feature engineering.
_dropped_cols = ['director', 'average_rating', 'release_type', 'actor', 'element_id']

for _frame in (train, test):
    _frame.drop(columns=_dropped_cols, inplace=True)

In [39]:
import ast

columns = ['ACTOR', 'COMPOSER', 'DESIGN', 'DIRECTOR', 'EDITOR',
                      'OPERATOR', 'PRODUCER', 'WRITER', 'country', 'genre']


def _parse_list_cell(value):
    """Turn the string form of a Python literal into the literal itself.

    Args:
        value: a cell of one of the list-valued columns.

    Returns:
        `ast.literal_eval(value)` for strings, `np.nan` for floats (missing
        cells), and the value unchanged for anything else. Passing other
        objects through makes the cell safe to re-run: values already parsed
        into lists are no longer handed to `ast.literal_eval`, which would
        raise ValueError on them.
    """
    if isinstance(value, str):
        return ast.literal_eval(value)
    if isinstance(value, float):
        return np.nan
    return value


for col in columns:
    train[col] = train[col].apply(_parse_list_cell)
    test[col] = test[col].apply(_parse_list_cell)

In [40]:
# --- Feature groups handed to the preprocessing step ---

# List-valued columns that get expanded into numbered columns.
mltpl_cat_features = [
    'ACTOR', 'COMPOSER', 'DESIGN', 'DIRECTOR', 'EDITOR',
    'OPERATOR', 'PRODUCER', 'WRITER', 'country', 'genre',
]

# Number of leading entries kept for each list-valued column.
mltpl_cat_n_top = dict(
    ACTOR=10, country=1, DIRECTOR=1,
    EDITOR=1, OPERATOR=1, PRODUCER=5, WRITER=2,
    DESIGN=3, COMPOSER=1, genre=3,
)

# Numeric columns that are log1p-transformed during preprocessing.
skewed_num_features = ['duration', 'BUDGET', 'MARKETING', 'RUS', 'WORLD']

num_features = ['duration']
cat_features = ['age_access_type']

In [41]:
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.model_selection import train_test_split

class Preprocesser:
    """Expand list-valued categorical columns and log-transform skewed numerics.

    Args:
        mulpiple_cat_features: names of columns whose cells hold a list of
            values (or a float NaN when missing).
        skewed_num_features: names of numeric columns replaced by log1p(x).
        mltpl_cat_n_top: mapping column name -> number of leading list
            entries to keep for that column.
        text_features: optional names of text columns passed through
            `tokenizer`; defaults to none.
        tokenizer: optional callable applied to every cell of the text
            columns; the text step is skipped when it is not given.
    """

    def __init__(self,
                 mulpiple_cat_features: list,
                 skewed_num_features: list,
                 mltpl_cat_n_top: dict,
                 text_features: list = None,
                 tokenizer=None):
        self.mulpiple_cat_features = mulpiple_cat_features
        self.skewed_num_features = skewed_num_features
        self.mltpl_cat_n_top = mltpl_cat_n_top
        # Previously these two attributes were read but never set.
        self.text_features = list(text_features) if text_features else []
        self.tokenizer = tokenizer

    def __get_top_n_mltpl_cat_features(self, df_pr, col):
        """Return the `mltpl_cat_n_top[col]` most common values of a list column.

        Float cells (missing values) are skipped, since they cannot be
        iterated like a list.
        """
        all_cats = []
        for element_values in df_pr[col]:
            if isinstance(element_values, float):
                continue
            all_cats.extend(element_values)
        return Counter(all_cats).most_common(self.mltpl_cat_n_top[col])

    def __preprocess_mulpiple_cat_features(self, df_pr):
        """Replace each list column by `<col>_0 .. <col>_{n-1}` columns.

        Each new column holds the entry at that position of the row's list;
        positions past the end of the list are NaN. A missing (float) cell is
        treated as the single-entry list ['Na']. The original column is
        dropped. Works on `df_pr` in place and returns it.
        """
        for col in self.mulpiple_cat_features:
            feature_number = int(self.mltpl_cat_n_top[col])

            # Build the padded rows in one pass over the column instead of
            # writing cell by cell with .loc; sizes come from df_pr itself,
            # not from any frame outside this method.
            rows = []
            for names in df_pr[col]:
                if isinstance(names, float):
                    names = ['Na']
                head = list(names[:feature_number])
                head.extend([np.nan] * (feature_number - len(head)))
                rows.append(head)

            # Lists are assigned positionally, so the frame's index is irrelevant.
            for k in range(feature_number):
                df_pr[f'{col}_{k}'] = [row[k] for row in rows]
            df_pr.drop(col, axis=1, inplace=True)

        return df_pr

    def __preprocess_skewed_num_features(self, df_pr):
        """Replace every skewed numeric column by log1p of its values."""
        for col in self.skewed_num_features:
            df_pr[col] = np.log1p(df_pr[col].astype(float))
        return df_pr

    def __preprocess_text_features(self, df_pr):
        """Apply `self.tokenizer` to every cell of the configured text columns."""
        for col in self.text_features:
            df_pr[col] = df_pr[col].apply(self.tokenizer)
        return df_pr

    def preprocess(self,
                   df: pd.DataFrame):
        """Return a preprocessed copy of `df`; the input frame is not modified.

        Steps: list-column expansion, log1p of skewed numerics, and
        tokenization of text columns, the last only when both `text_features`
        and `tokenizer` were supplied.
        """
        df_pr = df.copy()
        df_pr = self.__preprocess_mulpiple_cat_features(df_pr)
        df_pr = self.__preprocess_skewed_num_features(df_pr)
        if self.text_features and self.tokenizer is not None:
            df_pr = self.__preprocess_text_features(df_pr)

        return df_pr
        

In [42]:
preprocesser = Preprocesser(mltpl_cat_features,
            skewed_num_features,
            mltpl_cat_n_top)

In [43]:
train_pr = preprocesser.preprocess(train)
test_pr = preprocesser.preprocess(test)

In [44]:
test_pr.head().T[-15:]

Unnamed: 0,4,13,46,461,1020
DESIGN_2,,Ричард Чэнь,Чарло Далли,,Джина Руис
DIRECTOR_0,Теодор Ти,Крис Бэйли,Дэниэл Грэм,Уте фон Мюнхов-Поль,Джаред Кон
EDITOR_0,Стефан Гарнье,Иван Биланкио,Люк Дулан,Na,Тревор Мирош
OPERATOR_0,Гийом Зиммер,Армен Мелконян,Бен Зирьяб,Na,Брэндон Ли Кокс
PRODUCER_0,Лоран Зэйтун,Джеймс Машелло,Дэвид Болл,Дирк Байнхольд,Амар Балагган
PRODUCER_1,Ян Зеноу,Том Ортенберг,Джон Дженкс,Имке Ферманн,Кори Лардж
PRODUCER_2,Франсуа-Ксавье Обаг,Мэтью Сидари,П.Дж. Паркер,Валентин Гройлих,Росс Мразек
PRODUCER_3,Валери д’Отей,Яр Ландау,Джо Симпсон,Себастьян Руншке,Бенжамин Раппапорт
PRODUCER_4,Жюльетт Фурнье,Адам Нэгл,Мэтт Хукингс,,Марк Данон
WRITER_0,Лоран Зэйтун,Роберт Бен Гарант,Мэтт Хукингс,Катя Грюбель,Джаред Кон


In [45]:
train_pr.columns

Index(['age_access_type', 'name', 'type', 'release_year', 'duration',
       'element_uid', 'BUDGET', 'MARKETING', 'RUS', 'USA', 'WORLD', 'rating',
       'has_oscar_actor', 'has_oscar_composer', 'has_oscar_design',
       'has_oscar_director', 'has_oscar_editor', 'has_oscar_operator',
       'has_oscar_producer', 'has_oscar_writer', 'poster_url',
       'poster_url_preview', 'img_name', 'ACTOR_0', 'ACTOR_1', 'ACTOR_2',
       'ACTOR_3', 'ACTOR_4', 'ACTOR_5', 'ACTOR_6', 'ACTOR_7', 'ACTOR_8',
       'ACTOR_9', 'COMPOSER_0', 'DESIGN_0', 'DESIGN_1', 'DESIGN_2',
       'DIRECTOR_0', 'EDITOR_0', 'OPERATOR_0', 'PRODUCER_0', 'PRODUCER_1',
       'PRODUCER_2', 'PRODUCER_3', 'PRODUCER_4', 'WRITER_0', 'WRITER_1',
       'country_0', 'genre_0', 'genre_1', 'genre_2'],
      dtype='object')

In [46]:
# Lower-case the crew prefix of every expanded column ('ACTOR_0' -> 'actor_0').
# The counts give the number of numbered columns present for each role.
_crew_slots = {'ACTOR': 10, 'COMPOSER': 1, 'DESIGN': 3, 'DIRECTOR': 1,
               'EDITOR': 1, 'OPERATOR': 1, 'PRODUCER': 5, 'WRITER': 2}

rename_d = {
    f'{_role}_{_slot}': f'{_role.lower()}_{_slot}'
    for _role, _count in _crew_slots.items()
    for _slot in range(_count)
}

for _frame in (train_pr, test_pr):
    _frame.rename(rename_d, axis=1, inplace=True)

In [47]:
# Lower-case the box-office / budget column names on both splits.
# 'USA' is not part of the mapping and keeps its upper-case name.
_money_renames = dict(BUDGET='budget', MARKETING='marketing', RUS='rus', WORLD='world')

for _frame in (train_pr, test_pr):
    _frame.rename(_money_renames, axis=1, inplace=True)

In [48]:
# Categorical feature names: two plain columns followed by the numbered
# columns of each expanded list feature (prefix, number of columns).
_cat_slots = [('actor', 10), ('country', 1), ('director', 1), ('editor', 1),
              ('operator', 1), ('producer', 5), ('writer', 2), ('design', 3),
              ('composer', 1), ('genre', 3)]

cat_features = ['age_access_type', 'type']
for _prefix, _count in _cat_slots:
    cat_features.extend(f'{_prefix}_{_slot}' for _slot in range(_count))

In [49]:
train_pr['rating'] = train_pr['rating'].astype(np.float64)

test_pr['rating'] = test_pr['rating'].astype(np.float64)

In [50]:
train_pr.drop(['release_year', 'element_uid', 'name'], axis=1, inplace=True)
test_pr.drop(['release_year', 'element_uid', 'name'], axis=1, inplace=True)

In [53]:
# File names located directly inside the train covers directory.
# Only the first (top-level) triple produced by os.walk is used, so files of
# nested sub-directories cannot overwrite the list. A missing or unreadable
# directory raises a clear error instead of leaving `fls` as None.
_train_walk = next(os.walk(r'/root/User/_DIPLOMA/data/train_covers/'), None)
if _train_walk is None:
    raise FileNotFoundError('Cannot list /root/User/_DIPLOMA/data/train_covers/')
fls = _train_walk[2]

len(fls)

7892

In [54]:
# Keep only the train rows whose poster file name is in `fls`;
# the shape is shown before (printed) and after (cell output).
print(train_pr.shape)
_has_train_cover = train_pr['img_name'].isin(fls)
train_pr = train_pr.loc[_has_train_cover]
train_pr.shape

(6672, 48)


(6394, 48)

In [55]:
# File names located directly inside the test covers directory.
# Only the first (top-level) triple produced by os.walk is used, so files of
# nested sub-directories cannot overwrite the list. A missing or unreadable
# directory raises a clear error instead of leaving `tst_fls` as None.
_test_walk = next(os.walk(r'/root/User/_DIPLOMA/data/test_covers/'), None)
if _test_walk is None:
    raise FileNotFoundError('Cannot list /root/User/_DIPLOMA/data/test_covers/')
tst_fls = _test_walk[2]

In [56]:
len(tst_fls)

97

In [57]:
# Keep only the test rows whose poster file name is in `tst_fls`;
# the shape is shown before (printed) and after (cell output).
print(test_pr.shape)
_has_test_cover = test_pr['img_name'].isin(tst_fls)
test_pr = test_pr.loc[_has_test_cover]
test_pr.shape

(101, 48)


(61, 48)

In [66]:
train_pr.drop(['poster_url', 'poster_url_preview'], axis=1, inplace=True)
test_pr.drop(['poster_url', 'poster_url_preview'], axis=1, inplace=True)

### Encoding + normalization

In [None]:
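# TODO: encoding and normalization are not implemented yet; open question: represent the categorical features with embeddings?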

**Save preproc dfs**

In [67]:
import pickle

# Write both preprocessed splits to disk, one pickle file per split.
_pickle_targets = (
    (train_pr, '/root/User/_DIPLOMA/data/preproc_dfs/train_pr_oscr.pkl'),
    (test_pr, '/root/User/_DIPLOMA/data/preproc_dfs/test_pr_oscr.pkl'),
)

for _frame, _path in _pickle_targets:
    with open(_path, 'wb') as f:
        pickle.dump(_frame, f)

In [68]:
test_pr.head().T

Unnamed: 0,4,13,46,1020,2190
age_access_type,6,6,16,16,16
type,MOVIE,MOVIE,MOVIE,MOVIE,MOVIE
duration,15.523889,15.5347,15.665539,15.566448,15.479437
budget,,,15.894952,,
marketing,,,,,
rus,14.405929,,,,
USA,,,,,
world,15.542443,,,12.039197,11.104145
rating,9.282869,8.9459,8.626496,5.504337,6.411529
has_oscar_actor,0,0,0,0,0


In [70]:
test_pr.columns

Index(['age_access_type', 'type', 'duration', 'budget', 'marketing', 'rus',
       'USA', 'world', 'rating', 'has_oscar_actor', 'has_oscar_composer',
       'has_oscar_design', 'has_oscar_director', 'has_oscar_editor',
       'has_oscar_operator', 'has_oscar_producer', 'has_oscar_writer',
       'img_name', 'actor_0', 'actor_1', 'actor_2', 'actor_3', 'actor_4',
       'actor_5', 'actor_6', 'actor_7', 'actor_8', 'actor_9', 'composer_0',
       'design_0', 'design_1', 'design_2', 'director_0', 'editor_0',
       'operator_0', 'producer_0', 'producer_1', 'producer_2', 'producer_3',
       'producer_4', 'writer_0', 'writer_1', 'country_0', 'genre_0', 'genre_1',
       'genre_2'],
      dtype='object')

# PyTorch

In [62]:
!pip install pytorch_lightning==1.5

Collecting pytorch_lightning==1.5
  Downloading pytorch_lightning-1.5.0-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tensorboard>=2.2.0
  Downloading tensorboard-2.12.0-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pyDeprecate==0.3.1
  Downloading pyDeprecate-0.3.1-py3-none-any.whl (10 kB)
Collecting google-auth-oauthlib<0.5,>=0.4.1
  Using cached google_auth_oauthlib-0.4.6-py2.py3-none-any.whl (18 kB)
Collecting protobuf>=3.19.6
  Downloading protobuf-4.22.1-cp37-abi3-manylinux2014_x86_64.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
Collecting grpcio>=1.48.2
  Downloading grpcio-1.51.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8

In [69]:
import pandas as pd
import numpy as np
from PIL import Image

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms

import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping


data_path = "./data/"

In [None]:
class ImageDataset(Dataset):
    """Dataset yielding (poster image, tabular features, rating) per row.

    Args:
        df: frame holding a 'rating' column, an 'img_name' column with the
            poster file name, and every column listed in TABULAR_COLUMNS.
        image_dir: directory containing the poster files.
    """

    # Columns, in order, that make up the tabular feature vector.
    TABULAR_COLUMNS = [
        'age_access_type', 'type', 'duration', 'budget', 'marketing', 'rus',
        'USA', 'world', 'has_oscar_actor', 'has_oscar_composer',
        'has_oscar_design', 'has_oscar_director', 'has_oscar_editor',
        'has_oscar_operator', 'has_oscar_producer', 'has_oscar_writer',
        'actor_0', 'actor_1', 'actor_2', 'actor_3', 'actor_4',
        'actor_5', 'actor_6', 'actor_7', 'actor_8', 'actor_9', 'composer_0',
        'design_0', 'design_1', 'design_2', 'director_0', 'editor_0',
        'operator_0', 'producer_0', 'producer_1', 'producer_2', 'producer_3',
        'producer_4', 'writer_0', 'writer_1', 'country_0', 'genre_0', 'genre_1',
        'genre_2',
    ]

    def __init__(self, df, image_dir):
        self.image_dir = image_dir
        self.tabular = df

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.tabular)

    def __getitem__(self, idx):
        """Return `(image, tabular, y)` for the row at position `idx`.

        `image` is the poster converted by `to_tensor`, `tabular` is a
        FloatTensor of the TABULAR_COLUMNS values, and `y` is the row's
        'rating' value as stored in the frame.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        row = self.tabular.iloc[idx]

        y = row["rating"]

        # The context manager closes the file handle after decoding.
        # convert("RGB") guarantees an H x W x 3 array for every image mode;
        # slicing `[..., :3]` on a 2-D grayscale/palette array would instead
        # cut the width down to three pixels.
        with Image.open(f"{self.image_dir}/{row['img_name']}") as img:
            image = np.array(img.convert("RGB"))

        image = transforms.functional.to_tensor(image)

        # NOTE(review): FloatTensor needs numeric values, so string-valued
        # columns must be encoded before the frame is passed in — confirm
        # that the caller does this.
        tabular = torch.FloatTensor(row[self.TABULAR_COLUMNS].tolist())

        return image, tabular, y