In [1]:
!pip install transliterate
!pip install spacy

[0m

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle
from collections import Counter

import statsmodels.stats.api as sms
from matplotlib import style
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and 3.8
# removed the old names; pick whichever name this installation provides.
style.use('seaborn-v0_8' if 'seaborn-v0_8' in style.available else 'seaborn')
%matplotlib inline
#graphs in svg look clearer
%config InlineBackend.figure_format = 'svg' 

In [3]:
import warnings
warnings.filterwarnings("ignore")

# Load award datasets

### dataset 1

In [4]:
oscar_awards_1027_2019 = pd.read_csv('the_oscar_award.csv')

In [5]:
oscar_awards_1027_2019.shape

(10395, 7)

In [6]:
oscar_awards_1027_2019.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False


In [7]:
oscar_awards_1027_2019['winner'].value_counts()

False    8038
True     2357
Name: winner, dtype: int64

In [8]:
oscar_awards_1027_2019['category'].unique()

array(['ACTOR', 'ACTRESS', 'ART DIRECTION', 'CINEMATOGRAPHY',
       'DIRECTING (Comedy Picture)', 'DIRECTING (Dramatic Picture)',
       'ENGINEERING EFFECTS', 'OUTSTANDING PICTURE',
       'UNIQUE AND ARTISTIC PICTURE', 'WRITING (Adaptation)',
       'WRITING (Original Story)', 'WRITING (Title Writing)',
       'SPECIAL AWARD', 'DIRECTING', 'WRITING', 'OUTSTANDING PRODUCTION',
       'SOUND RECORDING', 'SHORT SUBJECT (Cartoon)',
       'SHORT SUBJECT (Comedy)', 'SHORT SUBJECT (Novelty)',
       'ASSISTANT DIRECTOR', 'FILM EDITING', 'MUSIC (Scoring)',
       'MUSIC (Song)', 'DANCE DIRECTION', 'WRITING (Screenplay)',
       'ACTOR IN A SUPPORTING ROLE', 'ACTRESS IN A SUPPORTING ROLE',
       'SHORT SUBJECT (Color)', 'SHORT SUBJECT (One-reel)',
       'SHORT SUBJECT (Two-reel)', 'IRVING G. THALBERG MEMORIAL AWARD',
       'MUSIC (Original Score)', 'CINEMATOGRAPHY (Black-and-White)',
       'CINEMATOGRAPHY (Color)', 'SPECIAL EFFECTS',
       'ART DIRECTION (Black-and-White)', 'ART DIRECT

## Transliterate English names into Russian (Cyrillic)

In [24]:
from transliterate import translit

In [25]:
test = oscar_awards_1027_2019['name'][0]
test

'Richard Barthelmess'

In [26]:
print(translit(test, "ru"))

Ричард Бартхелмесс


In [27]:
def eng2rus_translater(data):
    """Return ``data`` transliterated with ``translit`` using the "ru" language pack."""
    target_language = "ru"
    transliterated = translit(data, target_language)
    return transliterated

In [28]:
oscar_awards_1027_2019['rus_name'] = oscar_awards_1027_2019['name'].apply(eng2rus_translater)

In [29]:
oscar_awards_1027_2019.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,rus_name
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False,Ричард Бартхелмесс
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True,Емил Яннингс
2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False,Лоуисе Дрессер
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True,Янет Гаынор
4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False,Глориа Сwансон


In [30]:
oscar_has_1027_2019 = oscar_awards_1027_2019[oscar_awards_1027_2019['winner'] == True]

In [31]:
oscar_has_1027_2019.shape

(2357, 8)

### Preprocess names

In [32]:
import re
from pymystem3 import Mystem
import spacy


class NamesPreprocesser:
    """Turn person names into compact keys suitable for exact-match comparison.

    ``transform`` lower-cases each name, strips every character that is not a
    Latin/Cyrillic letter, a digit or a space, lemmatises the result and
    finally removes all spaces.

    Parameters
    ----------
    lemmatizer : object, optional
        Any object exposing ``lemmatize(text)`` that returns a list of string
        tokens. When omitted, a new ``Mystem()`` instance is created.
    """

    def __init__(self, lemmatizer=None):
        # Spell the Latin range as `A-Za-z`: a single `A-z` span would also
        # cover the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), so that
        # punctuation would not be removed. `Ё`/`ё` sit outside the contiguous
        # `А-я` block and therefore have to be listed explicitly.
        self.remove_pnkt_pattern = re.compile(r'[^A-Za-zА-Яа-яЁё0-9 ]')
        self.remove_spaces_pattern = re.compile(r' ')
        self.lemmatizer = Mystem() if lemmatizer is None else lemmatizer

    def __remove_punctuation(self, text):
        """Drop every character that is not a letter, a digit or a space."""
        return self.remove_pnkt_pattern.sub('', text)

    def __remove_spaces(self, text):
        """Drop all space characters."""
        return self.remove_spaces_pattern.sub('', text)

    def __lemmatize(self, text):
        """Delegate to the lemmatizer; the result is a list of tokens."""
        return self.lemmatizer.lemmatize(text)

    def __join_text(self, text):
        """Concatenate lemmatizer tokens, dropping one trailing newline if present."""
        joined = ''.join(text)
        # Only strip the final character when it really is the newline token the
        # lemmatizer appended; otherwise a genuine letter would be cut off.
        return joined[:-1] if joined.endswith('\n') else joined

    def transform(self, texts: pd.Series):
        """Normalise every string in ``texts`` and return a new Series.

        Parameters
        ----------
        texts : pd.Series
            Series of ``str`` values; non-string entries (e.g. NaN) make
            ``str.lower`` raise ``TypeError``.

        Returns
        -------
        pd.Series
            Series with the same index, one normalised key per input string.
        """
        # lower
        texts_pr = texts.apply(str.lower)
        # remove punctuation
        texts_pr = texts_pr.apply(self.__remove_punctuation)
        # lemmatize (each value becomes a list of tokens)
        texts_pr = texts_pr.apply(self.__lemmatize)
        # join tokens back into a single string
        texts_pr = texts_pr.apply(self.__join_text)
        # remove spaces
        texts_pr = texts_pr.apply(self.__remove_spaces)

        return texts_pr


In [33]:
names_preprocessor = NamesPreprocesser()
prepr_names_oscars = names_preprocessor.transform(oscar_has_1027_2019['rus_name'])

In [34]:
prepr_names_oscars

1                                             емиляннингс
3                                             янетагаынор
6                                   wиллиамцамеронмензиес
9                                             чарлесрошер
10                                             карлструсс
                               ...                       
10390    сцреенплаыбыбонгйоонхоханйинwонсторабыбонгйоонхо
10391                                         гееныйдавис
10392                                           давидлынч
10393                                          wесстудить
10394                                       линаwертмллер
Name: rus_name, Length: 2357, dtype: object

In [35]:
# Stored as a set: the collection is only used for `in` membership checks when
# matching names, where a set lookup is O(1) instead of a scan over every winner.
# Order and duplicate entries carry no information for that purpose.
oscar_owners = set(prepr_names_oscars.values)

## Load foreign content

In [36]:
# Load the pickled dataset from the working directory.
# NOTE(security): pickle.load can execute arbitrary code while deserialising,
# so this file must come from a trusted source.
# `df` is presumably a pandas DataFrame (it is indexed by column name and used
# with .apply in later cells) — confirm against the producer of the pickle.
with open('./Okko_ratings_FOREIGN_FULL_kws.pkl', 'rb') as f:
    df = pickle.load(f, encoding='utf-8')

# Take the first N people per role (actors, directors, …)

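If an Oscar winner is among the first 5 actors, add the flag `actor_oscar` (+1, 2, …).
If an Oscar winner is among the first 3 directors, add the flag `director_oscar` (+1, 2, …).

**Note:** the code below uses different cut-offs (`top_n = 10` for actors, `top_n = 1` for directors) and stores the number of matches in `has_oscar_<role>` columns.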

In [37]:
def preproc_names(names, top_n, preprocessor=None):
    """Normalise the first ``top_n`` names of one credits cell.

    Parameters
    ----------
    names : sequence of str, or float
        Names belonging to one row. A float (how a missing cell shows up) is
        passed through as NaN.
    top_n : int
        Maximum number of leading names to keep; shorter inputs are used whole.
    preprocessor : object, optional
        Anything exposing ``transform(pd.Series) -> pd.Series``. Defaults to
        the notebook-level ``names_preprocessor``.

    Returns
    -------
    list of str, or NaN
        Normalised names in their original order, or NaN for a missing cell.
    """
    if isinstance(names, float):
        return np.nan

    if preprocessor is None:
        preprocessor = names_preprocessor

    # Slicing already clamps to the sequence length, so no explicit
    # `len(names) < top_n` adjustment is required.
    head = list(names[:top_n])
    if not head:
        return []

    # Build one Series for the whole row instead of a single-element Series
    # per name; the preprocessor works element-wise, so results are the same.
    return preprocessor.transform(pd.Series(head, dtype=object)).tolist()

In [38]:
# Source column -> number of leading names kept for that role.
top_n_by_role = {
    'ACTOR': 10,
    'COMPOSER': 1,
    'DESIGN': 3,
    'DIRECTOR': 1,
    'EDITOR': 1,
    'OPERATOR': 1,
    'PRODUCER': 5,
    'WRITER': 2,
}

# Creates one `preproc_<role>_names` column per role, in the order listed above.
for role, limit in top_n_by_role.items():
    df[f'preproc_{role.lower()}_names'] = df[role].apply(preproc_names, top_n=limit)

## Match dataset with oscars

In [39]:
def find_oscar_owner(names, oscar_owns: list):
    """Count how many entries of ``names`` occur in ``oscar_owns``.

    A float ``names`` (how a missing cell shows up) counts as zero matches.
    Repeated names are counted once per occurrence.
    """
    # Guard clause: nothing to iterate over for a missing cell.
    if isinstance(names, float):
        return 0

    return sum(1 for candidate in names if candidate in oscar_owns)

In [40]:
cols_people_oscar = ['actor', 'composer', 'design', 
                     'director', 'editor', 'operator',
                     'producer', 'writer']

In [41]:
# For every role: count Oscar-winner matches per row, report the distribution,
# then drop the intermediate column of normalised names.
for col in cols_people_oscar:
    source_col = f'preproc_{col}_names'
    flag_col = f'has_oscar_{col}'

    df[flag_col] = df[source_col].apply(find_oscar_owner, oscar_owns=oscar_owners)

    print(f"AMOUNT {col}:\n {df[flag_col].value_counts()}\n")
    df.drop(source_col, axis=1, inplace=True)

AMOUNT actor:
 0    8588
1     193
2       3
Name: has_oscar_actor, dtype: int64

AMOUNT composer:
 0    8732
1      52
Name: has_oscar_composer, dtype: int64

AMOUNT design:
 0    8776
1       8
Name: has_oscar_design, dtype: int64

AMOUNT director:
 0    8744
1      40
Name: has_oscar_director, dtype: int64

AMOUNT editor:
 0    8784
Name: has_oscar_editor, dtype: int64

AMOUNT operator:
 0    8739
1      45
Name: has_oscar_operator, dtype: int64

AMOUNT producer:
 0    8757
1      26
3       1
Name: has_oscar_producer, dtype: int64

AMOUNT writer:
 0    8750
1      34
Name: has_oscar_writer, dtype: int64



In [42]:
import pickle

with open('dataset_FOREIGN_kws_oscr_PCA.pkl', 'wb') as f:
    pickle.dump(df, f)