In [1]:
!pip install statsmodels



In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

import statsmodels.stats.api as sms
from matplotlib import style
style.use('seaborn')
%matplotlib inline
#graphs in svg look clearer
%config InlineBackend.figure_format = 'svg' 

  style.use('seaborn')


In [2]:
import warnings
warnings.filterwarnings("ignore")

# Functions

In [3]:
from sklearn import metrics

def mae(y_gt, Y_pr):
    return metrics.mean_absolute_error(y_gt, Y_pr)

def mse(y_gt, Y_pr):
    return metrics.mean_squared_error(y_gt, Y_pr)

def rmse(y_gt, Y_pr):
    my_mse = mse(y_gt, Y_pr)
    return np.sqrt(my_mse)

def medae(y_gt, Y_pr):
    return metrics.median_absolute_error(y_gt, Y_pr)

def R2(y_gt, Y_pr):
    return metrics.r2_score(y_gt, Y_pr)

def calc_metrics(true, pred):

    mse1 = mse(true, pred)

    rmse1 = rmse(true, pred)

    mae1 = mae(true, pred)

    medae1 = medae(true, pred)

    R21 = R2(true, pred)

    print('*** VAL **: ')
    print("MSE:   {}\nRMSE:  {}\nR2:    {}\nMAE:   {}\nMedAE: {}".format(mse1, rmse1, R21, mae1, medae1))
    print('-'*30)

In [4]:
def target_distr_linear(train_label, test_label, predicted_dv): 
    sns.kdeplot(train_label, label='train')
    sns.kdeplot(test_label, label='test')
    sns.kdeplot(predicted_dv, label='pred')
    plt.legend()
    plt.show()

    sns.scatterplot(test_label, predicted_dv, color='blueviolet')
    plt.title('Linear model')
    plt.xlabel('True Values')
    plt.ylabel('Predictions')
    plt.show()

In [5]:
from scipy.stats import norm

def draw_histogram(data):
    f,ax_hist = plt.subplots(1)
    sns.distplot(data, hist=True,  kde=True, rug=True, bins=10, fit=norm, ax=ax_hist);
    plt.xlabel("values")
    plt.title("Distribution") 

    mean=data.mean()

    ax_hist.axvline(mean, color='r', linestyle='--', label=f"Mean={mean:.3f}")
    ax_hist.legend()

    plt.show()

In [6]:
def draw_res_analys(errors, pred):
    sns.scatterplot(list(range(len(errors))),errors);
    plt.title("Distibution of errors")
    plt.ylabel('Error');
    plt.xlabel('Index');
    plt.show();

    sns.scatterplot(pred,errors);
    plt.title('Relationship of true value vs error (resudual analysis)')
    plt.ylabel('Error');
    plt.xlabel('True value');
    plt.show();

In [7]:
def get_feature_imp(model, X_train):
    imp = model.get_feature_importance(data=X_train)
    fig, axes = plt.subplots(1, 1, figsize=(12, 8))
    axes.barh(width=imp, y=model.feature_names_)
    axes.set_title('Finetuned Catboost feature importance')
    plt.show()

# Unhashing

In [8]:
genre_mapping = {'0ef0aa70-f86c-4141-8054-8b39af97867d': 'Biography',
                '287a1485-7a88-4c2f-bc94-ca418b6c47a1': 'Cartoons',
                '66fad8c3-d84f-458d-a8bf-5b4f154969e0': 'Show',
                '7b7c97f6-1adb-4b43-bfe8-9455812fac0b': 'Theater',
                '9fa28b61-a257-4a3e-945b-a9ef76a146d6': 'Fantasy',
                'b0836a1d-635f-4d89-bcc5-25d10ba56642': 'Anime',
                'd7214feb-8c11-4aea-aabb-ac98a8d56fd5': 'History',
                'dc65dbc8-34ba-4df1-b32c-4f895e10bff8': 'Shorts',
                'eb001d27-5be3-4d42-9d88-90d593f2627d': 'War_movies',
                '364fdc2e-bdfe-40be-b2c5-d30f43ec432e': 'Crime'}

In [9]:
types_mapping = {'1f22ccf1-288a-4e6e-b39a-7502799e7125': 'Лекции',
    '6d640e04-be3a-4c8c-852e-4e9b12449d5d': 'Концерты',
    '7b7c97f6-1adb-4b43-bfe8-9455812fac0b': 'Театр',
    '2f7908cc-e2fd-43cf-b626-ec1aef436160': 'Курсы' }

In [10]:
import pickle

with open('/home/jovyan/work/item_coldstart_dataset.pkl', 'rb') as f:
    df = pickle.load(f, encoding='utf-8')

In [11]:
!pwd

/home/jovyan/work/cold_start_models


In [12]:
df.shape

(11919, 14)

In [13]:
all_genres = []

for item_genres in df['genre']:
    all_genres.extend(item_genres)

In [14]:
all_unique_genres = set(all_genres)

In [15]:
len(all_unique_genres)

32

In [16]:
all_unique_genres

{'0ef0aa70-f86c-4141-8054-8b39af97867d',
 '287a1485-7a88-4c2f-bc94-ca418b6c47a1',
 '364fdc2e-bdfe-40be-b2c5-d30f43ec432e',
 '3e6e08b4-2bb0-46d6-aee7-98780e394c86',
 '5743ecbe-a141-47d6-a7d7-e800f41cb6f5',
 '5c403894-146a-47a4-ae75-9f1956a30dbb',
 '66fad8c3-d84f-458d-a8bf-5b4f154969e0',
 '6d640e04-be3a-4c8c-852e-4e9b12449d5d',
 '7b7c97f6-1adb-4b43-bfe8-9455812fac0b',
 '9fa28b61-a257-4a3e-945b-a9ef76a146d6',
 'Action',
 'Adventure',
 'ArtHouse',
 'Comedy',
 'Detective',
 'Documentary',
 'Drama',
 'Family',
 'ForKids',
 'Horror',
 'Humor',
 'Melodrama',
 'Music',
 'Sci-Fi',
 'Sport',
 'Thriller',
 'Travel',
 'b0836a1d-635f-4d89-bcc5-25d10ba56642',
 'd7214feb-8c11-4aea-aabb-ac98a8d56fd5',
 'd7f6b51c-6ebe-4b9c-9aad-2c60792a7d9c',
 'dc65dbc8-34ba-4df1-b32c-4f895e10bff8',
 'eb001d27-5be3-4d42-9d88-90d593f2627d'}

In [17]:
decoded_genres = [genre_mapping.get(genre, genre) for genre in all_unique_genres]

In [18]:
decoded_genres

['Melodrama',
 'Sci-Fi',
 'ArtHouse',
 '6d640e04-be3a-4c8c-852e-4e9b12449d5d',
 'Adventure',
 'Horror',
 '5c403894-146a-47a4-ae75-9f1956a30dbb',
 'Thriller',
 'Detective',
 'Crime',
 'Theater',
 'Show',
 '5743ecbe-a141-47d6-a7d7-e800f41cb6f5',
 'Cartoons',
 'History',
 '3e6e08b4-2bb0-46d6-aee7-98780e394c86',
 'Action',
 'Fantasy',
 'Biography',
 'Anime',
 'Shorts',
 'd7f6b51c-6ebe-4b9c-9aad-2c60792a7d9c',
 'Comedy',
 'Humor',
 'Music',
 'Family',
 'Documentary',
 'Sport',
 'War_movies',
 'Travel',
 'ForKids',
 'Drama']

In [19]:
all_genres = []

for col in ['actor', 'country', 'genre', 'director']:
    print(f'======{col}======')
    all_values = []
    for i, item_values in enumerate(df[col]):
        try:
            all_values.extend(item_values)
        except:
            print(i, item_values)
    all_values = set(all_values)
    print('Len:', len(all_values))
    # print(all_values)

Len: 34028
Len: 108
Len: 32
2408 nan
2446 nan
Len: 8191


In [20]:
df.drop([2408, 2446], axis=0, inplace=True)

# Preprocessing

In [21]:
prep_df = df.copy(deep=True)

In [22]:
prep_df.shape

(11917, 14)

In [23]:
def add_topN_feature(df, col, n: int):
    items = []

    for i in df[col]:
        items.extend(i)

    topN = [x[0] for x in Counter(items).most_common()][:n]
    df[f'top{n}{col}'] = df[col].apply(lambda x: [int(item in x) for item in topN])
    df.drop(col, axis=1, inplace=True)
    return topN, f'top{n}{col}'

In [24]:
def ohe_topN_features(df, topN, cat):
    for i in range(len(topN)):
        df[f'{cat}_{topN[i]}'] = [item[i] for item in df[cat]]
    df.drop(cat, axis=1, inplace=True)
    return df

In [25]:
top10genres, name = add_topN_feature(prep_df, 'genre', 10)
prep_df = ohe_topN_features(prep_df, top10genres, name)

In [26]:
top10actors, name = add_topN_feature(prep_df, 'actor', 10)
prep_df = ohe_topN_features(prep_df, top10actors, name)

In [27]:
top3directors, name = add_topN_feature(prep_df, 'director', 3)
prep_df = ohe_topN_features(prep_df, top3directors, name)

In [28]:
top5countries, name = add_topN_feature(prep_df, 'country', 5)
prep_df = ohe_topN_features(prep_df, top3directors, name)

In [29]:
def get_unique_values(feature):
    unique_values = []
    for element in feature.dropna():
        unique_values.extend(element)
    return set(unique_values)

def preprocess_cat_features(df_pr, col):
    unique_values = get_unique_values(df_pr[col])
    cats_df = pd.DataFrame(np.full((len(df_pr), len(unique_values)),
                                    0),
                           columns=list(unique_values))

    for i, element in enumerate(df_pr[col]):
        if isinstance(element, float):
            cats_df.loc[i, :] = -1
            continue
        for value in element:
            cats_df.loc[i, value] = 1
    df_pr.drop(col, axis=1, inplace=True)
    df_pr[cats_df.columns] = cats_df
    
    return df_pr
prep_df = preprocess_cat_features(prep_df, 'availability')
prep_df.drop('FVOD', axis=1, inplace=True)

In [30]:
prep_df.head().T

Unnamed: 0,0,1,2,3,4
age_access_type,12,16,18,18,18
average_rating,6.21,7.2,4.0,6.99,6.0
duration,6420000,6780000,6000000,3780000,3780000
type,MOVIE,MOVIE,MOVIE,MOVIE,MOVIE
name,Звёздный путь 5: Последний рубеж,Звёздный путь 6: Неоткрытая страна,В поисках древнего артефакта,Прожарка Чарли Шина,Прожарка Уильяма Шэтнера
release_year,1989,1991,2019,2011,2006
target,0.000001,0.000002,0.000004,0.000354,0.000064
subscription_only,False,False,False,True,True
uid,e785baa6-f175-42b4-9e16-4319ac7991d5,4593737e-de9c-40df-97db-fb3cf85a08ef,11ba66db-e941-4c3a-8da6-d8900e56f8c7,3f30a2ef-53b7-40e3-954f-1bdfc38a6d17,cdfa700f-122d-41e5-b8dc-9c6813bab6d2
top10genre_Drama,0,0,0,0,0


In [31]:
prep_df.columns

Index(['age_access_type', 'average_rating', 'duration', 'type', 'name',
       'release_year', 'target', 'subscription_only', 'uid',
       'top10genre_Drama', 'top10genre_Comedy', 'top10genre_Thriller',
       'top10genre_Action', 'top10genre_Melodrama', 'top10genre_Adventure',
       'top10genre_287a1485-7a88-4c2f-bc94-ca418b6c47a1',
       'top10genre_364fdc2e-bdfe-40be-b2c5-d30f43ec432e', 'top10genre_Family',
       'top10genre_Sci-Fi', 'top10actor_d08bba89-e937-40f3-b2a7-7ea26de4c246',
       'top10actor_fd401c6b-3e00-4f7f-aea7-5512df6b91e0',
       'top10actor_5463b4bc-e332-415c-aca8-11c0f217a9eb',
       'top10actor_5f3f6dc9-15fd-464f-91bc-b98c1e467f04',
       'top10actor_bebdaaad-69e1-49c4-ab58-89fc9acafaad',
       'top10actor_9703b641-2566-4e58-aebb-8d140805c0f3',
       'top10actor_fbdae856-85b9-4d4a-96dd-cc383571a488',
       'top10actor_dfd861c4-d2de-4bb6-bc6e-bae3f92a18a9',
       'top10actor_488a47a6-b3e1-40b3-acca-9f605202a040',
       'top10actor_3e94064c-14ef-4f11-9a

In [32]:
skewed_num_features = ['duration'] # can try 'average_rating'

In [33]:
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

class Preprocesser:
    def __init__(self, 
#                 mulpiple_cat_features: list,
                skewed_num_features: list,
               # text_features: list,
               # release_year_feature: str,
#                 mltpl_cat_n_top: dict
                ):
#         self.mulpiple_cat_features = mulpiple_cat_features
        self.skewed_num_features = skewed_num_features
        #self.text_features = text_features
#         self.mltpl_cat_n_top = mltpl_cat_n_top
       # self.release_year_feature = release_year_feature
        # self.tokenizer = Tokenizer(nlp.vocab)
        
    def __get_top_n_mltpl_cat_features(self, df_pr, col):
        all_cats = []
        for element_values in df_pr[col]:
            all_cats.extend(element_values)
        return Counter(all_cats).most_common(self.mltpl_cat_n_top[col])
            
    def __preprocess_mulpiple_cat_features(self, df_pr):
        for col in self.mulpiple_cat_features:
            feature_number = self.mltpl_cat_n_top[col]
            df_pr[[f'{col}_{i}' for i in range(feature_number)]] = pd.DataFrame(np.full((len(df), feature_number), np.nan))
            for i in df_pr.index:
                names = df_pr.loc[i, col]
                for k, name in enumerate(names[:int(feature_number)]):
                    df_pr.loc[i, f'{col}_{k}'] = name
            df_pr.drop(col, axis=1, inplace=True)

        return df_pr
    
    def __preprocess_skewed_num_features(self, df_pr):
        for col in self.skewed_num_features:
            df_pr[col] = df_pr[col].apply(lambda x: np.log1p(x))
        return df_pr
    
    def __preprocess_text_features(self, df_pr):
        for col in self.text_features:
            df_pr[col] = df_pr[col].apply(lambda x: self.tokenizer(x))
        return df_pr
            
    def preprocess(self,
                   df: pd.DataFrame):
        df_pr = df.copy()
#         df_pr = self.__preprocess_mulpiple_cat_features(df_pr)
        df_pr = self.__preprocess_skewed_num_features(df_pr)
        # df_pr = self.__preprocess_text_features(df_pr)
        
        return df_pr
        

In [34]:
preprocesser = Preprocesser(
#     mltpl_cat_features,
            skewed_num_features,
            #text_features,
#             mltpl_cat_n_top
                           )

In [35]:
df_modern = prep_df[prep_df['release_year'] >= 2000]
df_short = df_modern[df_modern['target'] < 0.02]
df_short.reset_index(drop=True, inplace=True)
prep_df = preprocesser.preprocess(df_short)

#prep_df.drop('FVOD', axis=1, inplace=True)
prep_df['release_year'] = prep_df['release_year'].astype(str)
prep_df['average_rating'] = prep_df['average_rating'].astype(float)
prep_df['age_access_type'] = prep_df['age_access_type'].astype(int)
prep_df['subscription_only'] = prep_df['subscription_only'].apply(int).astype(str)

In [37]:
prep_df.isnull().sum()

age_access_type                                        0
average_rating                                       525
duration                                               0
type                                                   0
name                                                   0
release_year                                           0
target                                                 0
subscription_only                                      0
uid                                                    0
top10genre_Drama                                       0
top10genre_Comedy                                      0
top10genre_Thriller                                    0
top10genre_Action                                      0
top10genre_Melodrama                                   0
top10genre_Adventure                                   0
top10genre_287a1485-7a88-4c2f-bc94-ca418b6c47a1        0
top10genre_364fdc2e-bdfe-40be-b2c5-d30f43ec432e        0
top10genre_Family              

In [39]:
prep_df['subscription_only'] = prep_df['subscription_only'].astype(int)

In [40]:
prep_df.columns

Index(['age_access_type', 'average_rating', 'duration', 'type', 'name',
       'release_year', 'target', 'subscription_only', 'uid',
       'top10genre_Drama', 'top10genre_Comedy', 'top10genre_Thriller',
       'top10genre_Action', 'top10genre_Melodrama', 'top10genre_Adventure',
       'top10genre_287a1485-7a88-4c2f-bc94-ca418b6c47a1',
       'top10genre_364fdc2e-bdfe-40be-b2c5-d30f43ec432e', 'top10genre_Family',
       'top10genre_Sci-Fi', 'top10actor_d08bba89-e937-40f3-b2a7-7ea26de4c246',
       'top10actor_fd401c6b-3e00-4f7f-aea7-5512df6b91e0',
       'top10actor_5463b4bc-e332-415c-aca8-11c0f217a9eb',
       'top10actor_5f3f6dc9-15fd-464f-91bc-b98c1e467f04',
       'top10actor_bebdaaad-69e1-49c4-ab58-89fc9acafaad',
       'top10actor_9703b641-2566-4e58-aebb-8d140805c0f3',
       'top10actor_fbdae856-85b9-4d4a-96dd-cc383571a488',
       'top10actor_dfd861c4-d2de-4bb6-bc6e-bae3f92a18a9',
       'top10actor_488a47a6-b3e1-40b3-acca-9f605202a040',
       'top10actor_3e94064c-14ef-4f11-9a

In [42]:
cat_features = ['age_access_type', 'type',
               'subscription_only']

num_features = ['average_rating', 'duration']

In [43]:
prep_df[cat_features] = prep_df[cat_features].replace(np.nan, 'Na')

In [44]:
prep_df.isnull().sum()

age_access_type                                        0
average_rating                                       525
duration                                               0
type                                                   0
name                                                   0
release_year                                           0
target                                                 0
subscription_only                                      0
uid                                                    0
top10genre_Drama                                       0
top10genre_Comedy                                      0
top10genre_Thriller                                    0
top10genre_Action                                      0
top10genre_Melodrama                                   0
top10genre_Adventure                                   0
top10genre_287a1485-7a88-4c2f-bc94-ca418b6c47a1        0
top10genre_364fdc2e-bdfe-40be-b2c5-d30f43ec432e        0
top10genre_Family              

In [46]:
prep_df[prep_df['RENT'].isnull()]

Unnamed: 0,age_access_type,average_rating,duration,type,name,release_year,target,subscription_only,uid,top10genre_Drama,...,top3director_751aebbe-2d4d-4421-bace-d44b8e10c8eb,top3director_b5dd6805-5c41-467d-a417-84a7e463a5f6,top3director_f7ded2b9-7c98-47ab-bfc4-57fb60bf946c,top5country_751aebbe-2d4d-4421-bace-d44b8e10c8eb,top5country_b5dd6805-5c41-467d-a417-84a7e463a5f6,top5country_f7ded2b9-7c98-47ab-bfc4-57fb60bf946c,RENT,AVOD,DTO,SUBSCRIPTION
9481,16,,16.358686,SERIAL,Тиндер-80,2020,0.001266,1,249b3db3-8dbd-4a18-bd4b-5d0cc08878c4,0,...,0,0,0,0,0,0,,,,
9482,0,6.3,0.0,SERIAL,Царевны,2018,0.000786,0,ca50fd7c-41c5-4208-9f72-06d78fc602b5,0,...,0,0,0,0,1,0,,,,


In [48]:
prep_df.drop([9481, 9482], axis=0, inplace=True)

In [49]:
prep_df.columns

Index(['age_access_type', 'average_rating', 'duration', 'type', 'name',
       'release_year', 'target', 'subscription_only', 'uid',
       'top10genre_Drama', 'top10genre_Comedy', 'top10genre_Thriller',
       'top10genre_Action', 'top10genre_Melodrama', 'top10genre_Adventure',
       'top10genre_287a1485-7a88-4c2f-bc94-ca418b6c47a1',
       'top10genre_364fdc2e-bdfe-40be-b2c5-d30f43ec432e', 'top10genre_Family',
       'top10genre_Sci-Fi', 'top10actor_d08bba89-e937-40f3-b2a7-7ea26de4c246',
       'top10actor_fd401c6b-3e00-4f7f-aea7-5512df6b91e0',
       'top10actor_5463b4bc-e332-415c-aca8-11c0f217a9eb',
       'top10actor_5f3f6dc9-15fd-464f-91bc-b98c1e467f04',
       'top10actor_bebdaaad-69e1-49c4-ab58-89fc9acafaad',
       'top10actor_9703b641-2566-4e58-aebb-8d140805c0f3',
       'top10actor_fbdae856-85b9-4d4a-96dd-cc383571a488',
       'top10actor_dfd861c4-d2de-4bb6-bc6e-bae3f92a18a9',
       'top10actor_488a47a6-b3e1-40b3-acca-9f605202a040',
       'top10actor_3e94064c-14ef-4f11-9a

# KNN Imputer

In [54]:
prep_df.head(1).T

Unnamed: 0,0
age_access_type,18
average_rating,4.0
duration,15.60727
type,MOVIE
name,В поисках древнего артефакта
release_year,2019
target,0.000004
subscription_only,0
uid,11ba66db-e941-4c3a-8da6-d8900e56f8c7
top10genre_Drama,0


In [51]:
float_features = ['age_access_type', 'average_rating', 'duration',
       'release_year', 'subscription_only',
       'top10genre_Drama', 'top10genre_Comedy', 'top10genre_Thriller',
       'top10genre_Action', 'top10genre_Melodrama', 'top10genre_Adventure',
       'top10genre_287a1485-7a88-4c2f-bc94-ca418b6c47a1',
       'top10genre_364fdc2e-bdfe-40be-b2c5-d30f43ec432e', 'top10genre_Family',
       'top10genre_Sci-Fi', 'top10actor_d08bba89-e937-40f3-b2a7-7ea26de4c246',
       'top10actor_fd401c6b-3e00-4f7f-aea7-5512df6b91e0',
       'top10actor_5463b4bc-e332-415c-aca8-11c0f217a9eb',
       'top10actor_5f3f6dc9-15fd-464f-91bc-b98c1e467f04',
       'top10actor_bebdaaad-69e1-49c4-ab58-89fc9acafaad',
       'top10actor_9703b641-2566-4e58-aebb-8d140805c0f3',
       'top10actor_fbdae856-85b9-4d4a-96dd-cc383571a488',
       'top10actor_dfd861c4-d2de-4bb6-bc6e-bae3f92a18a9',
       'top10actor_488a47a6-b3e1-40b3-acca-9f605202a040',
       'top10actor_3e94064c-14ef-4f11-9ada-50db5b806557',
       'top3director_751aebbe-2d4d-4421-bace-d44b8e10c8eb',
       'top3director_b5dd6805-5c41-467d-a417-84a7e463a5f6',
       'top3director_f7ded2b9-7c98-47ab-bfc4-57fb60bf946c',
       'top5country_751aebbe-2d4d-4421-bace-d44b8e10c8eb',
       'top5country_b5dd6805-5c41-467d-a417-84a7e463a5f6',
       'top5country_f7ded2b9-7c98-47ab-bfc4-57fb60bf946c', 'RENT', 'DTO',
       'SUBSCRIPTION', 'AVOD']

In [55]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)
prep_df[float_features] = imputer.fit_transform(prep_df[float_features])

In [56]:
prep_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9481 entries, 0 to 9480
Data columns (total 39 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   age_access_type                                    9481 non-null   float64
 1   average_rating                                     9481 non-null   float64
 2   duration                                           9481 non-null   float64
 3   type                                               9481 non-null   object 
 4   name                                               9481 non-null   object 
 5   release_year                                       9481 non-null   float64
 6   target                                             9481 non-null   float64
 7   subscription_only                                  9481 non-null   float64
 8   uid                                                9481 non-null   object 
 9   top10gen

In [57]:
prep_df.columns

Index(['age_access_type', 'average_rating', 'duration', 'type', 'name',
       'release_year', 'target', 'subscription_only', 'uid',
       'top10genre_Drama', 'top10genre_Comedy', 'top10genre_Thriller',
       'top10genre_Action', 'top10genre_Melodrama', 'top10genre_Adventure',
       'top10genre_287a1485-7a88-4c2f-bc94-ca418b6c47a1',
       'top10genre_364fdc2e-bdfe-40be-b2c5-d30f43ec432e', 'top10genre_Family',
       'top10genre_Sci-Fi', 'top10actor_d08bba89-e937-40f3-b2a7-7ea26de4c246',
       'top10actor_fd401c6b-3e00-4f7f-aea7-5512df6b91e0',
       'top10actor_5463b4bc-e332-415c-aca8-11c0f217a9eb',
       'top10actor_5f3f6dc9-15fd-464f-91bc-b98c1e467f04',
       'top10actor_bebdaaad-69e1-49c4-ab58-89fc9acafaad',
       'top10actor_9703b641-2566-4e58-aebb-8d140805c0f3',
       'top10actor_fbdae856-85b9-4d4a-96dd-cc383571a488',
       'top10actor_dfd861c4-d2de-4bb6-bc6e-bae3f92a18a9',
       'top10actor_488a47a6-b3e1-40b3-acca-9f605202a040',
       'top10actor_3e94064c-14ef-4f11-9a

In [58]:
str_features = ['type',
               'release_year', 'subscription_only',
               'top10genre_Drama', 'top10genre_Comedy', 'top10genre_Thriller',
               'top10genre_Action', 'top10genre_Melodrama', 'top10genre_Adventure',
               'top10genre_287a1485-7a88-4c2f-bc94-ca418b6c47a1',
               'top10genre_364fdc2e-bdfe-40be-b2c5-d30f43ec432e', 'top10genre_Family',
               'top10genre_Sci-Fi', 'top10actor_d08bba89-e937-40f3-b2a7-7ea26de4c246',
               'top10actor_fd401c6b-3e00-4f7f-aea7-5512df6b91e0',
               'top10actor_5463b4bc-e332-415c-aca8-11c0f217a9eb',
               'top10actor_5f3f6dc9-15fd-464f-91bc-b98c1e467f04',
               'top10actor_bebdaaad-69e1-49c4-ab58-89fc9acafaad',
               'top10actor_9703b641-2566-4e58-aebb-8d140805c0f3',
               'top10actor_fbdae856-85b9-4d4a-96dd-cc383571a488',
               'top10actor_dfd861c4-d2de-4bb6-bc6e-bae3f92a18a9',
               'top10actor_488a47a6-b3e1-40b3-acca-9f605202a040',
               'top10actor_3e94064c-14ef-4f11-9ada-50db5b806557',
               'top3director_751aebbe-2d4d-4421-bace-d44b8e10c8eb',
               'top3director_b5dd6805-5c41-467d-a417-84a7e463a5f6',
               'top3director_f7ded2b9-7c98-47ab-bfc4-57fb60bf946c',
               'top5country_751aebbe-2d4d-4421-bace-d44b8e10c8eb',
               'top5country_b5dd6805-5c41-467d-a417-84a7e463a5f6',
               'top5country_f7ded2b9-7c98-47ab-bfc4-57fb60bf946c', 'RENT', 'DTO',
               'SUBSCRIPTION', 'AVOD']

In [59]:
prep_df[str_features] = prep_df[str_features].astype(str)

In [60]:
prep_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9481 entries, 0 to 9480
Data columns (total 39 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   age_access_type                                    9481 non-null   float64
 1   average_rating                                     9481 non-null   float64
 2   duration                                           9481 non-null   float64
 3   type                                               9481 non-null   object 
 4   name                                               9481 non-null   object 
 5   release_year                                       9481 non-null   object 
 6   target                                             9481 non-null   float64
 7   subscription_only                                  9481 non-null   object 
 8   uid                                                9481 non-null   object 
 9   top10gen

In [62]:
with open('dataset_3.pkl', 'wb') as f:
    pickle.dump(prep_df, f)

In [63]:
with open('dataset_3.pkl', 'rb') as f:
    df_pr = pickle.load(f)

In [64]:
df_pr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9481 entries, 0 to 9480
Data columns (total 39 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   age_access_type                                    9481 non-null   float64
 1   average_rating                                     9481 non-null   float64
 2   duration                                           9481 non-null   float64
 3   type                                               9481 non-null   object 
 4   name                                               9481 non-null   object 
 5   release_year                                       9481 non-null   object 
 6   target                                             9481 non-null   float64
 7   subscription_only                                  9481 non-null   object 
 8   uid                                                9481 non-null   object 
 9   top10gen