In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

import statsmodels.stats.api as sms
from matplotlib import style
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and removed in 3.8, so use whichever name this installation
# actually provides instead of failing on newer matplotlib versions.
style.use('seaborn-v0_8' if 'seaborn-v0_8' in style.available else 'seaborn')
%matplotlib inline
# Render inline figures as SVG; graphs in SVG look clearer in the notebook.
%config InlineBackend.figure_format = 'svg' 

In [2]:
import warnings
warnings.filterwarnings("ignore")

# Unhashing

In [3]:
# Maps opaque genre UUIDs to human-readable genre labels. Lookups elsewhere use
# `genre_mapping.get(genre, genre)`, so IDs missing from this table pass through
# unchanged.
#
# NOTE: the original literal listed '7b7c97f6-1adb-4b43-bfe8-9455812fac0b' twice
# ('Theater' first, then 'Театр'). Python keeps the last value for a repeated key,
# so 'Theater' was never used; only the effective entry is kept here, in the same
# insertion position, so the resulting dict is identical.
genre_mapping = {'0ef0aa70-f86c-4141-8054-8b39af97867d': 'Biography',
                '287a1485-7a88-4c2f-bc94-ca418b6c47a1': 'Cartoons',
                '66fad8c3-d84f-458d-a8bf-5b4f154969e0': 'Show',
                '7b7c97f6-1adb-4b43-bfe8-9455812fac0b': 'Театр',
                '9fa28b61-a257-4a3e-945b-a9ef76a146d6': 'Fantasy',
                'b0836a1d-635f-4d89-bcc5-25d10ba56642': 'Anime',
                'd7214feb-8c11-4aea-aabb-ac98a8d56fd5': 'History',
                'dc65dbc8-34ba-4df1-b32c-4f895e10bff8': 'Shorts',
                'eb001d27-5be3-4d42-9d88-90d593f2627d': 'War_movies',
                '364fdc2e-bdfe-40be-b2c5-d30f43ec432e': 'Crime',
                '1f22ccf1-288a-4e6e-b39a-7502799e7125': 'Лекции',
                '6d640e04-be3a-4c8c-852e-4e9b12449d5d': 'Концерты',
                '2f7908cc-e2fd-43cf-b626-ec1aef436160': 'Курсы'}

In [4]:
import pickle

# Load the parsed ratings frame from disk.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
with open('./NEW_OKKO_KINOP_parsed_ratings_df.pkl', mode='rb') as ratings_file:
    df = pickle.load(ratings_file, encoding='utf-8')

In [5]:
# Sanity check: (rows, columns) of the freshly loaded ratings frame.
df.shape

(10731, 27)

In [6]:
# Count missing values per column to see which fields are sparsely populated.
df.isna().sum()

age_access_type        0
name                   0
director               3
genre                  0
average_rating       459
type                   0
country                0
release_type           0
release_year           0
duration               0
actor                  0
element_uid            0
ACTOR                211
COMPOSER            1390
DESIGN              1488
DIRECTOR             152
EDITOR              1896
OPERATOR            1479
PRODUCER            1193
WRITER               339
BUDGET              7432
MARKETING          10478
RUS                 7101
USA                 7082
WORLD               5434
element_id             0
rating                 0
dtype: int64

In [7]:
# Flatten the per-title genre lists into one flat list (duplicates are kept).
all_genres = [genre for item_genres in df['genre'] for genre in item_genres]

In [8]:
# Collapse the flat genre list to the distinct labels that occur in the data.
all_unique_genres = set(all_genres)

In [9]:
# Number of distinct genre labels.
len(all_unique_genres)

33

In [10]:
# Inspect the raw labels: a mix of readable names and UUID-like identifiers.
all_unique_genres

{'0ef0aa70-f86c-4141-8054-8b39af97867d',
 '1f22ccf1-288a-4e6e-b39a-7502799e7125',
 '287a1485-7a88-4c2f-bc94-ca418b6c47a1',
 '364fdc2e-bdfe-40be-b2c5-d30f43ec432e',
 '3e6e08b4-2bb0-46d6-aee7-98780e394c86',
 '5743ecbe-a141-47d6-a7d7-e800f41cb6f5',
 '5c403894-146a-47a4-ae75-9f1956a30dbb',
 '66fad8c3-d84f-458d-a8bf-5b4f154969e0',
 '6d640e04-be3a-4c8c-852e-4e9b12449d5d',
 '7b7c97f6-1adb-4b43-bfe8-9455812fac0b',
 '9fa28b61-a257-4a3e-945b-a9ef76a146d6',
 'Action',
 'Adventure',
 'ArtHouse',
 'Comedy',
 'Detective',
 'Documentary',
 'Drama',
 'Family',
 'ForKids',
 'Horror',
 'Humor',
 'Melodrama',
 'Music',
 'Sci-Fi',
 'Sport',
 'Thriller',
 'Travel',
 'b0836a1d-635f-4d89-bcc5-25d10ba56642',
 'd7214feb-8c11-4aea-aabb-ac98a8d56fd5',
 'd7f6b51c-6ebe-4b9c-9aad-2c60792a7d9c',
 'dc65dbc8-34ba-4df1-b32c-4f895e10bff8',
 'eb001d27-5be3-4d42-9d88-90d593f2627d'}

In [11]:
# Translate each distinct genre through genre_mapping; labels without an entry
# are kept exactly as they are.
decoded_genres = list(
    map(lambda genre_id: genre_mapping.get(genre_id, genre_id), all_unique_genres)
)

In [12]:
# Review the decoded labels; any remaining UUID has no entry in genre_mapping.
decoded_genres

['Humor',
 'Melodrama',
 'Концерты',
 'Thriller',
 'Sci-Fi',
 'Biography',
 'd7f6b51c-6ebe-4b9c-9aad-2c60792a7d9c',
 'Cartoons',
 'Detective',
 '3e6e08b4-2bb0-46d6-aee7-98780e394c86',
 'Documentary',
 '5743ecbe-a141-47d6-a7d7-e800f41cb6f5',
 'Adventure',
 'Horror',
 '5c403894-146a-47a4-ae75-9f1956a30dbb',
 'Music',
 'Anime',
 'Лекции',
 'Action',
 'Show',
 'War_movies',
 'Shorts',
 'History',
 'ForKids',
 'Comedy',
 'Family',
 'ArtHouse',
 'Театр',
 'Sport',
 'Drama',
 'Travel',
 'Crime',
 'Fantasy']

# Preprocessing





In [13]:
# Transposed preview: each of the first five titles becomes a column, so the
# many fields read top to bottom.
df.head().T

Unnamed: 0,0,1,2,3,4
age_access_type,12,16,18,18,18
name,Звёздный путь 5: Последний рубеж,Звёздный путь 6: Неоткрытая страна,В поисках древнего артефакта,Прожарка Чарли Шина,Прожарка Уильяма Шэтнера
director,[cff3362f-ad91-498d-80b3-ea3ab8ec65cf],[bab6b7f4-4506-458f-9091-d567505674f2],[895f5774-964a-4c5a-ae22-d697d3e3e620],[1de22aff-430e-4af9-bf1d-159dbf8e9269],[1de22aff-430e-4af9-bf1d-159dbf8e9269]
genre,"[Sci-Fi, Action, Thriller, Adventure]","[Sci-Fi, Action, Thriller, Adventure]",[Horror],"[Comedy, Documentary]",[Comedy]
average_rating,6.21,7.2,4.0,6.99,6.0
type,MOVIE,MOVIE,MOVIE,MOVIE,MOVIE
country,[usa],[usa],[usa],[usa],[usa]
release_type,RELEASE,RELEASE,RELEASE,RELEASE,RELEASE
release_year,1989,1991,2019,2011,2006
duration,6420000.0,6780000.0,6000000.0,3780000.0,3780000.0


In [14]:
# NOTE(review): repeats the earlier df.shape check; df has not been reassigned
# in between, so this cell looks redundant.
df.shape

(10731, 27)

In [15]:
# NOTE(review): same transposed preview as two cells earlier; candidate for removal.
df.head().T

Unnamed: 0,0,1,2,3,4
age_access_type,12,16,18,18,18
name,Звёздный путь 5: Последний рубеж,Звёздный путь 6: Неоткрытая страна,В поисках древнего артефакта,Прожарка Чарли Шина,Прожарка Уильяма Шэтнера
director,[cff3362f-ad91-498d-80b3-ea3ab8ec65cf],[bab6b7f4-4506-458f-9091-d567505674f2],[895f5774-964a-4c5a-ae22-d697d3e3e620],[1de22aff-430e-4af9-bf1d-159dbf8e9269],[1de22aff-430e-4af9-bf1d-159dbf8e9269]
genre,"[Sci-Fi, Action, Thriller, Adventure]","[Sci-Fi, Action, Thriller, Adventure]",[Horror],"[Comedy, Documentary]",[Comedy]
average_rating,6.21,7.2,4.0,6.99,6.0
type,MOVIE,MOVIE,MOVIE,MOVIE,MOVIE
country,[usa],[usa],[usa],[usa],[usa]
release_type,RELEASE,RELEASE,RELEASE,RELEASE,RELEASE
release_year,1989,1991,2019,2011,2006
duration,6420000.0,6780000.0,6000000.0,3780000.0,3780000.0


## Add keywords

In [16]:
# Load the per-title keyword table; the first CSV column is used as the index.
kwds = pd.read_csv('keywords_df_for_each_content.csv', index_col=0)

In [17]:
# Transposed preview of the keyword table (keywords as rows, first five items as columns).
kwds.head().T

Unnamed: 0,e785baa6-f175-42b4-9e16-4319ac7991d5,4593737e-de9c-40df-97db-fb3cf85a08ef,11ba66db-e941-4c3a-8da6-d8900e56f8c7,3f30a2ef-53b7-40e3-954f-1bdfc38a6d17,ba6bec1a-3aa9-48c7-aa65-908c21627a12
джек,0,0,0,0,0
расследовать,0,0,0,0,0
подруга,0,0,0,0,0
ограбление,0,0,0,0,0
катя,0,0,0,0,0
...,...,...,...,...,...
таможенный,0,0,0,0,0
диабет,0,0,0,0,0
млрд,0,0,0,0,0
рф,0,0,0,0,0


In [18]:
# Drop the 'texts' column and turn the index into a regular 'element_id' column
# so the keyword table can be joined with the ratings frame.
kwds_df = (
    kwds
    .drop(columns='texts')
    .reset_index()
    .rename(columns={'index': 'element_id'})
)

In [19]:
# Confirm that 'element_id' is now a regular column alongside the keyword columns.
kwds_df.head().T

Unnamed: 0,0,1,2,3,4
element_id,e785baa6-f175-42b4-9e16-4319ac7991d5,4593737e-de9c-40df-97db-fb3cf85a08ef,11ba66db-e941-4c3a-8da6-d8900e56f8c7,3f30a2ef-53b7-40e3-954f-1bdfc38a6d17,ba6bec1a-3aa9-48c7-aa65-908c21627a12
джек,0,0,0,0,0
расследовать,0,0,0,0,0
подруга,0,0,0,0,0
ограбление,0,0,0,0,0
...,...,...,...,...,...
собственник,0,0,0,0,0
таможенный,0,0,0,0,0
диабет,0,0,0,0,0
млрд,0,0,0,0,0


In [20]:
# Inner join on element_id: only titles present in BOTH the ratings frame and
# the keyword table are kept.
df_merged = pd.merge(df, kwds_df, how='inner', on='element_id')

In [21]:
# Check how many rows survived the inner join and how wide the merged frame is.
df_merged.shape

(10489, 33859)

In [22]:
# Persist the merged ratings + keywords frame for later notebooks.
with open('Okko_ratings_all_content_FULL_kws.pkl', mode='wb') as merged_file:
    pickle.dump(df_merged, merged_file)

In [23]:
################

In [24]:
def get_certain_country(data: pd.DataFrame, country_name: str) -> pd.DataFrame:
    """Return the rows of ``data`` whose ``country`` entry contains ``country_name``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with a ``country`` column holding an iterable of country codes
        per row (e.g. ``['usa']``). ``data`` itself is never modified.
    country_name : str
        Country code to look for; compared with ``==`` against each element
        of the row's ``country`` iterable.

    Returns
    -------
    pd.DataFrame
        Deep copy of the matching rows, keeping the original index labels and
        column order.

    Raises
    ------
    KeyError
        If ``data`` has no ``country`` column.
    """
    def _lists_country(countries) -> bool:
        # Missing values (None / NaN) are not iterable; treat them as
        # "country not present" instead of crashing the whole filter.
        try:
            return any(country == country_name for country in countries)
        except TypeError:
            return False

    # Build one positional boolean mask instead of iterating rows and dropping
    # them one by one: the row-wise drop is quadratic on a wide frame and, being
    # label-based, would also remove matching rows that share an index label
    # with a non-matching row.
    keep = [_lists_country(countries) for countries in data['country']]
    mask = pd.Series(keep, dtype=bool).to_numpy()
    return data.loc[mask].copy(deep=True)
        

## Get Russian content

In [25]:
# Keep only the titles whose country list includes 'russia'.
russ_content = get_certain_country(data=df_merged, country_name='russia')

In [26]:
# Transposed preview of the first five rows of the Russian subset.
russ_content.head().T

Unnamed: 0,143,144,182,199,230
age_access_type,16,16,16,16,18
name,Айка,Медвежий поцелуй,Байконур,Девушка и смерть,Хрусталь
director,[8fa71c53-ca22-44f0-bd9f-af273eb2cf29],[3d64928e-b4a3-4c78-b176-77ef1045d5bc],[1f007e2b-31c6-4f3d-849c-95a1df635ab1],[2b043c71-af5a-47b1-9755-432047883465],[8468f32f-5484-46bc-9106-61149fb31ba4]
genre,[Drama],"[Drama, 9fa28b61-a257-4a3e-945b-a9ef76a146d6]","[Melodrama, Drama]",[Drama],"[Comedy, Drama]"
average_rating,7.24,7.43,6.09,6.8,7.05
...,...,...,...,...,...
собственник,0,0,0,0,0
таможенный,0,0,0,0,0
диабет,0,0,0,0,0
млрд,0,0,0,0,0


In [27]:
# Size of the Russian subset (rows, columns).
russ_content.shape

(1705, 33859)

In [28]:
# Number of titles per release year, ordered chronologically.
russ_content['release_year'].value_counts().sort_index() 

1953      1
1990      1
1992      7
1993      7
1994      3
1995      4
1996      1
1997      5
1998      4
1999      6
2000     14
2001     11
2002     13
2003     10
2004     33
2005     37
2006     51
2007     69
2008     68
2009     69
2010     62
2011     70
2012     95
2013     88
2014    109
2015    111
2016     91
2017     98
2018    127
2019    136
2020    116
2021    137
2022     51
Name: release_year, dtype: int64

In [29]:
import pickle

# Persist the Russian subset as its own pickle file.
with open('Okko_ratings_RUSS_FULL_kws.pkl', mode='wb') as russ_file:
    pickle.dump(russ_content, russ_file)

## Get foreign content

In [30]:
# Foreign content is the complement of the Russian subset: every merged title
# whose element_id does not appear in russ_content. `~` negates the boolean mask
# (instead of comparing it to False) and isin() accepts the Series directly.
foreign_content = df_merged[~df_merged['element_id'].isin(russ_content['element_id'])]

In [31]:
# Size of the foreign subset (rows, columns).
foreign_content.shape

(8784, 33859)

In [32]:
# Transposed preview of the first five rows of the foreign subset.
foreign_content.head().T

Unnamed: 0,0,1,2,3,4
age_access_type,12,16,18,18,6
name,Звёздный путь 5: Последний рубеж,Звёздный путь 6: Неоткрытая страна,В поисках древнего артефакта,Прожарка Чарли Шина,Пламенное сердце
director,[cff3362f-ad91-498d-80b3-ea3ab8ec65cf],[bab6b7f4-4506-458f-9091-d567505674f2],[895f5774-964a-4c5a-ae22-d697d3e3e620],[1de22aff-430e-4af9-bf1d-159dbf8e9269],"[40f78e6d-3c9f-434f-a505-c6ffdd045763, 332105c..."
genre,"[Sci-Fi, Action, Thriller, Adventure]","[Sci-Fi, Action, Thriller, Adventure]",[Horror],"[Comedy, Documentary]","[Comedy, Adventure, Family, 287a1485-7a88-4c2f..."
average_rating,6.21,7.2,4.0,6.99,6.4
...,...,...,...,...,...
собственник,0,0,0,0,0
таможенный,0,0,0,0,0
диабет,0,0,0,0,0
млрд,0,0,0,0,0


In [33]:
# Number of foreign titles per release year, ordered chronologically.
foreign_content['release_year'].value_counts().sort_index() 

1914      1
1915      1
1916      1
1919      1
1920      3
       ... 
2018    567
2019    696
2020    583
2021    417
2022    101
Name: release_year, Length: 104, dtype: int64

In [34]:
import pickle

# Persist the foreign subset as its own pickle file.
with open('Okko_ratings_FOREIGN_FULL_kws.pkl', mode='wb') as foreign_file:
    pickle.dump(foreign_content, foreign_file)