In [1]:
import os
import pandas as pd
import numpy as np
from functools import reduce
import sys
import csv
from time import strptime
import json
from datetime import datetime as dt
from datetime import date
from statistics import mean
import math
import pickle
import time

In [2]:
# Notebook display configuration: show every column and render floats with
# thousands separators and 15 decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:20,.15f}'.format)

# Optional settings, currently disabled:
# csv.field_size_limit(sys.maxsize)
# np.set_printoptions(threshold=sys.maxsize)
# pd.set_option('display.max_rows', 10000)
# pd.set_option('display.max_colwidth', -1)

In [3]:
def calc_smooth_mean(df, by, on, m):
    """Return a smoothed per-group mean of ``on``, aligned to the rows of ``df``.

    Each group's mean is blended with the overall mean of ``on``, with the
    overall mean weighted as ``m`` extra observations:
    ``(count * group_mean + m * overall_mean) / (count + m)``.

    Args:
        df: source DataFrame.
        by: name of the column to group on.
        on: name of the numeric column to average.
        m: weight given to the overall mean.

    Returns:
        A Series obtained by mapping every value of ``df[by]`` to its group's
        smoothed mean.
    """
    overall_mean = df[on].mean()

    per_group = df.groupby(by)[on]
    group_sizes = per_group.count()
    group_means = per_group.mean()

    smoothed = (group_sizes * group_means + m * overall_mean) / (group_sizes + m)
    return df[by].map(smoothed)

In [5]:
# Load the source table and keep every row whose META__id is not 6.
df_file = (
    pd.read_csv('savedcsv/17_homepage.csv')
    .loc[lambda frame: ~frame['META__id'].eq(6)]
)
df_file.to_csv('datasets/df_file.csv', index=False)
# Reload from disk so the in-memory frame matches the saved file (fresh index).
df_file = pd.read_csv('datasets/df_file.csv')

In [26]:
# df_raw = reduce_mem_usage(pd.read_csv('savedcsv/17_homepage.csv'))
df_raw = pd.read_csv('savedcsv/17_homepage.csv')

# Carry the two profitability columns over under META__ names (appended as the
# last columns), then remove the originals together with the language column.
df_raw = df_raw.assign(
    META__profitability=df_raw['profitability'],
    META__year_avg_profitability=df_raw['year_avg_profitability'],
).drop(columns=['profitability', 'year_avg_profitability', 'PROCESS__original_language'])

# Remove every non-META column whose name marks it as a derived history feature.
drop_cols = [
    name for name in df_raw.columns
    if 'META' not in name
    and any(marker in name for marker in ('avg', 'experience', 'movies_before'))
]
df_raw = df_raw.drop(columns=drop_cols)

# Discard the row(s) with META__id 6, persist, and reload from disk.
df_raw = df_raw.loc[~df_raw['META__id'].eq(6)]
df_raw.to_csv('datasets/df_raw.csv', index=False)
df_raw = pd.read_csv('datasets/df_raw.csv')

In [4]:
df_raw = pd.read_csv('datasets/df_raw.csv')

# Column renames that are currently disabled, kept here for reference:
# for i in range(1, 9):
#     df_raw.rename(columns={f'META__cast_{i}_name': f'META__cast_{i}'}, inplace=True)
# df_raw.rename(columns={
#     'META__crew__editing__editor__1': 'META__crew__editing__editor',
#     'META__crew__writing__screenplay__1': 'META__crew__writing__screenplay',
#     'META__crew__production__executive_producer__1': 'META__crew__production__executive_producer',
#     'META__crew__directing__director__1': 'META__crew__directing__director'
#     }, inplace=True)
# df_raw.rename(columns={
#     'META__crew__production__producer__1': 'META__crew__production__producer_1',
#     'META__crew__production__producer__2': 'META__crew__production__producer_2'
#     }, inplace=True)
# df_raw.to_csv('datasets/df_raw.csv', index=False)

# Encode year/month/day as one YYYYMMDD number, then parse its string form into
# a datetime column.
date_key = df_raw.META__year * 10000 + df_raw.META__month * 100 + df_raw.META__day
df_raw['META__date'] = pd.to_datetime(date_key.apply(str), format='%Y%m%d')

In [5]:
def get_group_avg_revenue(group):
    """Return the mean of ``META__revenue`` rounded with ``np.rint``; NaN for an empty group."""
    if group.empty:
        return np.nan
    return np.rint(group['META__revenue'].mean())


def get_group_avg_profit(group):
    """Return the mean of ``META__revenue - budget`` rounded with ``np.rint``; NaN for an empty group."""
    if group.empty:
        return np.nan
    profit = group['META__revenue'] - group['budget']
    return np.rint(profit.mean())


class CalculateInfo:
    """Compute history-based aggregate features for one movie row.

    Each ``set_info_*`` method looks up the entities named on the movie's row
    (cast, crew, production companies, collection, year), aggregates the
    matching rows of a reference frame, and writes the results into columns of
    ``df`` on that movie's row. Missing result columns are created on demand.
    Every ``set_info_*`` method returns ``self`` so the calls can be chained.
    """

    def __init__(self, df, movie, movie_index, df_ref):
        """Bind the target frame, the movie row and the reference frame.

        Args:
            df: DataFrame that receives the computed columns (modified in place).
            movie: the row being processed; ``META__date`` and ``META__year``
                are read from it, plus the entity columns used by each method.
            movie_index: integer *position* of the row inside ``df``; it is
                used with ``iat``/``iloc``, not as an index label.
            df_ref: reference DataFrame the aggregates are computed from.
        """
        self.df_ref = df_ref
        self.df = df
        self.movie = movie
        self.movie_index = movie_index
        # History for this movie: reference rows dated strictly before it.
        self.movies_before = df_ref[df_ref.META__date < movie.META__date]

    def _get_col_index(self, col):
        """Return the position of ``col`` in ``self.df``, adding it as an all-NaN column if absent."""
        if col not in self.df:
            self.df[col] = np.nan
        return self.df.columns.get_loc(col)

    def _set_column(self, column, value):
        """Write ``value`` into ``column`` on this movie's row (positional write)."""
        self.df.iat[self.movie_index, self._get_col_index(column)] = value

    def _set_avg_profit(self, column, group):
        """Store ``get_group_avg_profit(group)`` in ``{column}_avg_profit``."""
        self._set_column(f'{column}_avg_profit', get_group_avg_profit(group))

    def _set_avg_revenue(self, column, group):
        """Store ``get_group_avg_revenue(group)`` in ``{column}_avg_revenue``."""
        self._set_column(f'{column}_avg_revenue', get_group_avg_revenue(group))

    def _set_movies_before(self, column, group):
        """Store the number of rows in ``group`` in ``{column}_movies_before``."""
        self._set_column(f'{column}_movies_before', group.shape[0])

    def _set_experience(self, column, group):
        """Store in ``{column}_experience`` the movie's year minus the earliest
        ``META__year`` in ``group``; 0 when the group is empty."""
        if group.empty:
            experience = 0
        else:
            experience = self.movie.META__year - group['META__year'].min()
        self._set_column(f'{column}_experience', experience)

    def _get_groups(self, column, iteration, df_ref):
        """Group the source rows by the entity found in ``column``.

        Args:
            column: base column name. With more than one element in
                ``iteration`` the numbered columns ``{column}_{i}`` are stacked
                into a single ``column`` so an entity is matched regardless of
                which numbered slot it occupied.
            iteration: the slot numbers (a ``range``).
            df_ref: truthy to aggregate over the whole reference frame, falsy
                to aggregate over ``self.movies_before`` only.

        Returns:
            A pandas GroupBy keyed on ``column`` holding ``budget``,
            ``META__revenue`` and ``META__year``.
        """
        def_columns = ['budget', 'META__revenue', 'META__year']
        source = self.df_ref if df_ref else self.movies_before

        if len(iteration) > 1:
            stacked = pd.concat([
                source[[f'{column}_{i}'] + def_columns].rename(columns={f'{column}_{i}': column})
                for i in iteration
            ])
        else:
            # dict.fromkeys de-duplicates (column may itself be a default
            # column, e.g. META__year) while keeping a stable order.
            stacked = source[list(dict.fromkeys([column] + def_columns))]
        return stacked.groupby(column)

    def _apply_set_functions(self, column, group, *argv):
        """Call every setter in ``argv`` with ``(column, group)``."""
        for func in argv:
            func(column, group)

    def _set_info(self, column_to_iterate, new_column, iteration, df_ref, *argv):
        """Run the given setters for one entity column or a numbered family of columns.

        With a single-element ``iteration`` the unnumbered column
        ``column_to_iterate`` is read and results are stored under
        ``new_column``. Otherwise each ``{column_to_iterate}_{i}`` is read and
        results are stored under ``{new_column}_{i}``. An entity with no
        matching group is passed to the setters as an empty DataFrame.

        Returns:
            ``self``.
        """
        groups = self._get_groups(column_to_iterate, iteration, df_ref)
        known = groups.groups

        def history_of(value):
            return groups.get_group(value) if value in known else pd.DataFrame()

        if len(iteration) == 1:
            value = self.movie.get(column_to_iterate)
            self._apply_set_functions(new_column, history_of(value), *argv)
            # Stop here: falling through to the numbered loop would look up a
            # non-existent '<column>_0' and create junk '<new_column>_0_*' columns.
            return self

        for i in iteration:
            value = self.movie.get(f'{column_to_iterate}_{i}')
            self._apply_set_functions(f'{new_column}_{i}', history_of(value), *argv)
        return self

    def set_info_year(self):
        """Average profit/revenue for the movie's ``META__year``, taken over the whole reference frame."""
        return self._set_info('META__year', 'year', range(1), True, self._set_avg_profit, self._set_avg_revenue)

    def set_info_production_company(self):
        """Average profit/revenue of earlier movies for production companies 1-3."""
        return self._set_info('META__production_company', 'production_company', range(1,4), False, self._set_avg_profit, self._set_avg_revenue)

    def set_info_cast(self):
        """Average profit/revenue, experience and earlier-movie count for cast members 1-8."""
        return self._set_info('META__cast', 'cast', range(1,9), False, self._set_avg_profit, self._set_avg_revenue, self._set_experience, self._set_movies_before)

    def set_info_collections(self):
        """Average profit/revenue of earlier movies in the same ``META__collection_name``."""
        return self._set_info('META__collection_name', 'collection', range(1), False, self._set_avg_profit, self._set_avg_revenue)

    def set_info_crew(self):
        """Average profit/revenue and experience for every ``META__crew`` column.

        The numbered producer columns are handled once as a pooled pair
        (slots 1 and 2); every other crew column is handled on its own.
        """
        crew_columns = [column for column in self.df.columns if 'META__crew' in column]
        producer_passed = False
        for column in crew_columns:
            if 'production__producer' in column:
                if producer_passed:
                    # The pair was already processed from the first producer
                    # column; re-running this column alone would overwrite the
                    # pooled statistics with single-column ones.
                    continue
                # Strip the trailing '_<n>' to get the family name, and the
                # leading 'META__' for the output name.
                column_to_iterate, new_column, iteration = column[:-2], column[6:-2], range(1,3)
                producer_passed = True
            else:
                column_to_iterate, new_column, iteration = column, column[6:], range(1)
            self._set_info(column_to_iterate, new_column, iteration, False, self._set_avg_profit, self._set_avg_revenue, self._set_experience)
        return self

    def set_info_cast_avg(self):
        """Average each per-cast feature over the eight cast slots, ignoring missing values.

        Reads ``cast_{i}_{key}`` for ``i`` in 1..8 and writes ``cast_avg_{key}``
        for each of avg_revenue, avg_profit, experience and movies_before.
        The result is NaN when all eight values are missing.
        """
        key_columns = ['avg_revenue', 'avg_profit', 'experience', 'movies_before']
        movie = self.df.iloc[self.movie_index]
        for key_column in key_columns:
            values = [movie[f'cast_{i}_{key_column}'] for i in range(1,9)]
            present = [value for value in values if not pd.isna(value)]
            # np.mean([]) is NaN too, but emits a RuntimeWarning; be explicit.
            self._set_column(f'cast_avg_{key_column}', np.mean(present) if present else np.nan)
        return self
        return self

In [9]:
start = time.time()

# Trial run on the first ten rows; history is still looked up in the full df_raw.
dff = df_raw[:10].copy()
# CalculateInfo writes with iat/iloc, so it needs the row's integer position.
# Pass the enumerate() position instead of the iterrows() index label, which
# only coincides with the position while the index is 0..n-1.
for row_position, (_, row) in enumerate(dff.iterrows()):
    CalculateInfo(dff, row, row_position, df_ref=df_raw) \
        .set_info_cast() \
        .set_info_collections() \
        .set_info_crew() \
        .set_info_production_company() \
        .set_info_year() \
        .set_info_cast_avg()

print(time.time() - start)

4.335583209991455


In [8]:
start = time.time()

# NOTE(review): set_all_info is not defined in the cells visible here; it has to
# exist in the kernel already for this cell to run.
d = set_all_info(df_raw.copy())

print(time.time() - start)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
126.9867513179779


In [154]:
# Load a saved feature table to compare against the recomputed frame `d`.
# NOTE(review): presumably an earlier export of the same features - confirm its origin.
df_compare = pd.read_csv('datasets/dataset_all.csv')

In [4]:
# Show which pandas version this kernel is running.
import pandas as pd
pd.__version__

'1.0.5'

In [166]:
# (rows, columns) of the frame currently bound to `d`.
# NOTE(review): the recorded output has 10 rows, so it was produced from a different
# `d` than the full-frame assignment shown earlier (execution counts are out of order).
d.shape

(10, 293)

In [10]:
# Interactive diagnostic: lists every module available in this environment (long output).
help('modules')


Please wait a moment while I gather a list of all available modules...

Update LANGUAGE_CODES (inside config/base.py) if a new translation has been added to Spyder
Cython              cgi                 lief                scipy
IPython             cgitb               linecache           scripts
OpenSSL             chardet             llvmlite            seaborn
PIL                 chunk               locale              secrets
PyQt5               click               locket              secretstorage
__future__          cloudpickle         logging             select
_abc                clyent              lxml                selectors
_ast                cmath               lzma                send2trash
_asyncio            cmd                 mailbox             setuptools
_bisect             code                mailcap             shell_exec
_blake2             codecs              markupsafe          shelve
_bootlocale         codeop              marshal             shlex
_bz2    

In [167]:
# Names of the computed history-feature columns in `d` (gender columns excluded).
coll = [
    col for col in d.columns
    if col.startswith(('cast', 'crew', 'production_company', 'collection', 'year'))
    and 'gender' not in col
]

In [168]:
# Display a four-row slice of `d` for visual inspection.
d[5:9]

Unnamed: 0,budget,META__id,runtime,spoken_languages,META__year,META__month,META__day,weekend,META__collection_name,META__cast_1,META__cast_2,META__cast_3,META__cast_4,META__cast_5,META__cast_6,META__cast_7,META__cast_8,META__crew__sound__music_editor,META__crew__sound__original_music_composer,META__crew__sound__sound_designer,META__crew__sound__sound_effects_editor,META__crew__sound__sound_re_recording_mixer,META__crew__sound__supervising_sound_editor,META__crew__directing__director,META__crew__directing__script_supervisor,META__crew__production__casting,META__crew__production__executive_producer,META__crew__production__producer_1,META__crew__production__producer_2,META__crew__editing__editor,META__crew__costume__costume_designer,META__crew__costume__costume_supervisor,META__crew__costume__makeup_artist,META__crew__crew__stunt_coordinator,META__crew__writing__screenplay,META__crew__art__art_direction,META__crew__art__production_design,META__crew__art__property_master,META__crew__art__set_decoration,META__crew__visualeffects__visual_effects_supervisor,META__crew__camera__director_of_photography,META__crew__camera__steadicam_operator,META__crew__camera__still_photographer,genre__war,genre__western,genre__mystery,genre__music,genre__crime,genre__romance,genre__action,genre__adventure,genre__thriller,genre__animation,genre__family,genre__drama,genre__comedy,genre__documentary,genre__history,genre__fantasy,genre__horror,META__production_company_1,META__production_company_2,META__production_company_3,country__es,country__jp,country__us,country__ca,country__de,country__cn,country__in,country__fr,country__ru,country__it,country__au,country__gb,day_sin,day_cos,month_sin,month_cos,competition,rating__g,rating__pg,rating__pg-13,rating__r,rating__nc-17,META__original_title,META__title,META__keywords,META__plot,country__other,META__imdb_id,tag__murder,tag__violence,tag__flashback,tag__romantic,tag__cult,tag__revenge,tag__psychedelic,tag__comedy,tag__suspenseful,tag__good_versus_
evil,tag__humor,tag__satire,tag__entertaining,tag__neo_noir,tag__action,tag__sadist,tag__insanity,tag__tragedy,tag__fantasy,tag__paranormal,tag__boring,tag__mystery,tag__horror,tag__melodrama,tag__cruelty,tag__gothic,tag__dramatic,tag__dark,tag__atmospheric,tag__storytelling,tag__sci_fi,tag__psychological,tag__historical,tag__absurd,tag__prank,tag__sentimental,tag__philosophical,tag__bleak,tag__alternate_reality,tag__depressing,tag__plot_twist,tag__realism,tag__cute,tag__stupid,tag__home_movie,tag__thought_provoking,tag__inspiring,tag__other,cast_1_gender,cast_2_gender,cast_3_gender,cast_4_gender,cast_5_gender,cast_6_gender,cast_7_gender,cast_8_gender,META__cast_1_male,META__cast_1_female,META__cast_2_male,META__cast_2_female,META__cast_3_male,META__cast_3_female,META__cast_4_male,META__cast_4_female,META__cast_5_male,META__cast_5_female,META__cast_6_male,META__cast_6_female,META__cast_7_male,META__cast_7_female,META__cast_8_male,META__cast_8_female,META__revenue,homepage_exists,homepage_repeats,META__profitability,META__year_avg_profitability,META__date,cast_1_avg_profit,cast_1_avg_revenue,cast_1_movies_before,cast_1_experience,cast_2_avg_profit,cast_2_avg_revenue,cast_2_movies_before,cast_2_experience,cast_3_avg_profit,cast_3_avg_revenue,cast_3_movies_before,cast_3_experience,cast_4_avg_profit,cast_4_avg_revenue,cast_4_movies_before,cast_4_experience,cast_5_avg_profit,cast_5_avg_revenue,cast_5_movies_before,cast_5_experience,cast_6_avg_profit,cast_6_avg_revenue,cast_6_movies_before,cast_6_experience,cast_7_avg_profit,cast_7_avg_revenue,cast_7_movies_before,cast_7_experience,cast_8_avg_profit,cast_8_avg_revenue,cast_8_movies_before,cast_8_experience,production_company_1_avg_profit,production_company_1_avg_revenue,production_company_2_avg_profit,production_company_2_avg_revenue,production_company_3_avg_profit,production_company_3_avg_revenue,collection_avg_profit,collection_avg_revenue,crew__production__producer_1_avg_profit,crew__production__producer_1_avg_revenue,
crew__production__producer_1_movies_before,crew__production__producer_2_avg_profit,crew__production__producer_2_avg_revenue,crew__production__producer_2_movies_before,crew__sound__music_editor_avg_profit,crew__sound__music_editor_avg_revenue,crew__sound__music_editor_movies_before,crew__sound__original_music_composer_avg_profit,crew__sound__original_music_composer_avg_revenue,crew__sound__original_music_composer_movies_before,crew__sound__sound_designer_avg_profit,crew__sound__sound_designer_avg_revenue,crew__sound__sound_designer_movies_before,crew__sound__sound_effects_editor_avg_profit,crew__sound__sound_effects_editor_avg_revenue,crew__sound__sound_effects_editor_movies_before,crew__sound__sound_re_recording_mixer_avg_profit,crew__sound__sound_re_recording_mixer_avg_revenue,crew__sound__sound_re_recording_mixer_movies_before,crew__sound__supervising_sound_editor_avg_profit,crew__sound__supervising_sound_editor_avg_revenue,crew__sound__supervising_sound_editor_movies_before,crew__directing__director_avg_profit,crew__directing__director_avg_revenue,crew__directing__director_movies_before,crew__directing__script_supervisor_avg_profit,crew__directing__script_supervisor_avg_revenue,crew__directing__script_supervisor_movies_before,crew__production__casting_avg_profit,crew__production__casting_avg_revenue,crew__production__casting_movies_before,crew__production__executive_producer_avg_profit,crew__production__executive_producer_avg_revenue,crew__production__executive_producer_movies_before,crew__editing__editor_avg_profit,crew__editing__editor_avg_revenue,crew__editing__editor_movies_before,crew__costume__costume_designer_avg_profit,crew__costume__costume_designer_avg_revenue,crew__costume__costume_designer_movies_before,crew__costume__costume_supervisor_avg_profit,crew__costume__costume_supervisor_avg_revenue,crew__costume__costume_supervisor_movies_before,crew__costume__makeup_artist_avg_profit,crew__costume__makeup_artist_avg_revenue,crew__costume__makeup_artist_mov
ies_before,crew__crew__stunt_coordinator_avg_profit,crew__crew__stunt_coordinator_avg_revenue,crew__crew__stunt_coordinator_movies_before,crew__writing__screenplay_avg_profit,crew__writing__screenplay_avg_revenue,crew__writing__screenplay_movies_before,crew__art__art_direction_avg_profit,crew__art__art_direction_avg_revenue,crew__art__art_direction_movies_before,crew__art__production_design_avg_profit,crew__art__production_design_avg_revenue,crew__art__production_design_movies_before,crew__art__property_master_avg_profit,crew__art__property_master_avg_revenue,crew__art__property_master_movies_before,crew__art__set_decoration_avg_profit,crew__art__set_decoration_avg_revenue,crew__art__set_decoration_movies_before,crew__visualeffects__visual_effects_supervisor_avg_profit,crew__visualeffects__visual_effects_supervisor_avg_revenue,crew__visualeffects__visual_effects_supervisor_movies_before,crew__camera__director_of_photography_avg_profit,crew__camera__director_of_photography_avg_revenue,crew__camera__director_of_photography_movies_before,crew__camera__steadicam_operator_avg_profit,crew__camera__steadicam_operator_avg_revenue,crew__camera__steadicam_operator_movies_before,crew__camera__still_photographer_avg_profit,crew__camera__still_photographer_avg_revenue,crew__camera__still_photographer_movies_before,year_avg_profit,year_avg_revenue,cast_avg_avg_revenue,cast_avg_avg_profit,cast_avg_experience,cast_avg_movies_before
5,839727,15,119,1,1941,4,30,0,,orson welles,joseph cotten,dorothy comingore,ray collins,george coulouris,agnes moorehead,paul stewart,ruth warrick,,bernard herrmann,,,clem portman,john aalberg,orson welles,,rufus le maire,george schaefer,orson welles,,robert wise,edward stevenson,,layne britton,,orson welles,van nest polglase,,charles sayers,darrell silvera,,gregg toland,,phil stern,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,rko radio pictures,mercury productions,,0,0,1,0,0,0,0,0,0,0,0,0,-0.207911690817759,0.978147600733806,0.989821441880933,-0.142314838273285,0.0,0,1,0,0,0,Citizen Kane,Citizen Kane,journalist newspaper florida capitalist banker...,"Charles Foster Kane , an enormously wealthy me...",False,tt0033467,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1,0,1,0,0,1,1,0,1,0,0,1,1,0,0,1,23217674,0,0,26.649074044302488,4.834435714362336,1941-04-30,,,0,0,,,0,0,,,0,0,,,0,0,,,0,0,,,0,0,,,0,0,,,0,0,2041900.0,2872445.0,,,,,,,,,0,,,0,,,0,,,0,,,0,,,0,2593000.0,3202000.0,1,,,0,,,0,,,0,,,0,,,0,5077900.0,6877900.0,1,892000.0,2807000.0,1,,,0,,,0,,,0,,,0,1413738.0,2374112.0,8,,,0,,,0,2984950.0,4842450.0,2,,,0,,,0,,,0,,,0,6927690.0,7868508.0,,,0.0,0.0
6,12800000,16,141,1,2000,5,17,0,,bjrk,catherine deneuve,david morse,peter stormare,joel grey,cara seymour,vladica kostic,jean-marc barr,,bjrk,kristian eidnes andersen,,,,lars von trier,,avy kaufman,peter aalbk jensen,vibeke windelv,finn gjerdrum,franois gdigier,manon rasmussen,,cdric grard,,,peter grant,karl jlusson,jesper lorents,,,robby mller,,david koskas,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,lantia cinema & audiovisivi,fine line features,zentropa entertainments,0,0,1,0,1,0,0,1,0,1,0,1,-0.207911690817759,-0.978147600733806,0.755749574354258,-0.654860733945285,2.15,0,0,0,1,0,Dancer in the Dark,Dancer in the Dark,individual dancing robbery factory worker secr...,The film is set in Washington state in 1964 an...,True,tt0168629,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0,1,0,1,1,0,1,0,1,0,0,1,0,0,1,0,40031879,0,0,2.127490546875,-1.162769204436023,2000-05-17,,,0,0,-2486102.0,1813898.0,2,31,63455518.0,105682791.0,11,10,34076891.0,53743558.0,3,4,,,0,0,,,0,0,,,0,0,,,0,0,,,878739.0,7103151.0,-2492765.0,7235.0,,,-2492765.0,7235.0,1,,,0,,,0,,,0,,,0,,,0,,,0,,,0,-2492765.0,7235.0,1,,,0,42300225.0,66800225.0,14,-2492765.0,7235.0,1,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,14676795.0,17697790.0,7,,,0,,,0,40044968.0,74682780.0,53746749.0,31682102.33333333,5.625,2.0
7,90000000,18,126,3,1997,5,2,0,,bruce willis,gary oldman,ian holm,milla jovovich,chris tucker,luke perry,brion james,tom lister jr.,,ric serra,mark a. mangini,julia evershade,ron bartlett,,luc besson,jean bourne,lucinda syson,,patrice ledoux,,sylvie landra,jean paul gaultier,janet tebrooke,amanda knight,marc boyle,luc besson,ira gilford,dan weil,barry wilkinson,maggie gray,mark stetson,thierry arbogast,john ward,jack english,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1,0,columbia pictures,gaumont,sony pictures,0,0,0,0,0,0,0,1,0,0,0,0,0.207911690817759,0.978147600733806,0.755749574354258,-0.654860733945285,1.366666666666667,0,0,1,0,0,The Fifth Element,The Fifth Element,new york city clone taxi cyborg egypt future s...,"In 1914, at the outbreak of World War I, extra...",False,tt0119116,1,0,0,1,1,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1,0,1,0,1,0,0,1,1,0,1,0,1,0,1,0,263920180,0,0,1.932446444444444,-0.169600900518265,1997-05-02,83959284.0,122759284.0,15,10,40171047.0,62291303.0,8,11,34815533.0,48172675.0,7,21,-8192146.0,2807854.0,1,6,24715918.0,28215918.0,1,2,9624456.0,16624456.0,1,5,19268503.0,42635725.0,9,15,10621424.0,14888091.0,3,10,27823833.0,46696953.0,13617.0,18721357.0,19383263.0,35870406.0,,,29284974.0,45284974.0,1,,,0,,,0,79582741.0,100717417.0,4,276101666.0,366101666.0,1,,,0,64548284.0,95148284.0,2,,,0,8045643.0,16891878.0,3,34514260.0,44014260.0,2,,,0,,,0,29284974.0,45284974.0,1,-16261389.0,1738611.0,1,,,0,80134932.0,133634932.0,2,109773545.0,159773545.0,1,29284974.0,45284974.0,1,,,0,12873136.0,25142487.0,2,305000000.0,333000000.0,1,-36562432.0,20437568.0,2,,,0,1312525.0,16947201.0,4,185925792.0,218425792.0,2,,,0,55279438.0,91347406.0,42299413.25,26873002.375,10.0,5.625
8,92620000,19,153,1,1927,1,10,0,,gustav frhlich,brigitte helm,alfred abel,rudolf klein-rogge,theodor loos,fritz rasp,erwin biswanger,heinrich george,frank strobel,gottfried huppertz,,,,,fritz lang,,,,erich pommer,,fritz lang,aenne willkomm,,,,fritz lang,karl vollbrecht,,,,,karl freund,,horst von harbou,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,universum film (ufa),,,0,0,0,0,1,0,0,0,0,0,0,0,0.951056516295154,-0.309016994374947,0.0,1.0,0.0,0,1,0,0,0,Metropolis,Metropolis,man vs machine based on novel or book undergro...,"Set in the year 2026, Metropolis takes place i...",False,tt0017136,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,650422,0,0,-141.39985732339926,-18.88439515829816,1927-01-10,,,0,0,,,0,0,,,0,0,,,0,0,,,0,0,,,0,0,,,0,0,,,0,0,,,,,,,,,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,,,0,-45712462.0,839090.0,,,0.0,0.0


In [169]:
# Names of the history-feature columns in `df_compare` (gender columns excluded).
coll2 = [
    col for col in df_compare.columns
    if col.startswith(('cast', 'crew', 'production_company', 'collection', 'year'))
    and 'gender' not in col
]

In [170]:
# Display the same four-row slice of `df_compare` for visual inspection.
df_compare[5:9]

Unnamed: 0,budget,META__id,runtime,spoken_languages,META__year,META__month,META__day,weekend,META__collection_name,META__cast_1_name,META__cast_2_name,META__cast_3_name,META__cast_4_name,META__cast_5_name,META__cast_6_name,META__cast_7_name,META__cast_8_name,META__crew__sound__music_editor,META__crew__sound__original_music_composer,META__crew__sound__sound_designer,META__crew__sound__sound_effects_editor,META__crew__sound__sound_re_recording_mixer,META__crew__sound__supervising_sound_editor,META__crew__directing__director__1,META__crew__directing__script_supervisor,META__crew__production__casting,META__crew__production__executive_producer__1,META__crew__production__producer__1,META__crew__production__producer__2,META__crew__editing__editor__1,META__crew__costume__costume_designer,META__crew__costume__costume_supervisor,META__crew__costume__makeup_artist,META__crew__crew__stunt_coordinator,META__crew__writing__screenplay__1,META__crew__art__art_direction,META__crew__art__production_design,META__crew__art__property_master,META__crew__art__set_decoration,META__crew__visualeffects__visual_effects_supervisor,META__crew__camera__director_of_photography,META__crew__camera__steadicam_operator,META__crew__camera__still_photographer,genre__war,genre__western,genre__mystery,genre__music,genre__crime,genre__romance,genre__action,genre__adventure,genre__thriller,genre__animation,genre__family,genre__drama,genre__comedy,genre__documentary,genre__history,genre__fantasy,genre__horror,META__production_company_1,META__production_company_2,META__production_company_3,country__es,country__jp,country__us,country__ca,country__de,country__cn,country__in,country__fr,country__ru,country__it,country__au,country__gb,day_sin,day_cos,month_sin,month_cos,competition,rating__g,rating__pg,rating__pg-13,rating__r,rating__nc-17,META__original_title,META__title,META__keywords,META__plot,country__other,META__imdb_id,tag__murder,tag__violence,tag__flashback,tag__romantic,tag__cult,tag__revenge,tag__psyc
hedelic,tag__comedy,tag__suspenseful,tag__good_versus_evil,tag__humor,tag__satire,tag__entertaining,tag__neo_noir,tag__action,tag__sadist,tag__insanity,tag__tragedy,tag__fantasy,tag__paranormal,tag__boring,tag__mystery,tag__horror,tag__melodrama,tag__cruelty,tag__gothic,tag__dramatic,tag__dark,tag__atmospheric,tag__storytelling,tag__sci_fi,tag__psychological,tag__historical,tag__absurd,tag__prank,tag__sentimental,tag__philosophical,tag__bleak,tag__alternate_reality,tag__depressing,tag__plot_twist,tag__realism,tag__cute,tag__stupid,tag__home_movie,tag__thought_provoking,tag__inspiring,tag__other,cast_1_gender,cast_2_gender,cast_3_gender,cast_4_gender,cast_5_gender,cast_6_gender,cast_7_gender,cast_8_gender,META__cast_1_male,META__cast_1_female,META__cast_2_male,META__cast_2_female,META__cast_3_male,META__cast_3_female,META__cast_4_male,META__cast_4_female,META__cast_5_male,META__cast_5_female,META__cast_6_male,META__cast_6_female,META__cast_7_male,META__cast_7_female,META__cast_8_male,META__cast_8_female,META__revenue,homepage_exists,homepage_repeats,META__profitability,META__year_avg_profitability,production_company_1_avg_profit,production_company_1_avg_revenue,production_company_2_avg_profit,production_company_2_avg_revenue,production_company_3_avg_profit,production_company_3_avg_revenue,cast_1_avg_profit,cast_1_experience,cast_1_avg_revenue,cast_1_movies_before,cast_2_avg_profit,cast_2_experience,cast_2_avg_revenue,cast_2_movies_before,cast_3_avg_profit,cast_3_experience,cast_3_avg_revenue,cast_3_movies_before,cast_4_avg_profit,cast_4_experience,cast_4_avg_revenue,cast_4_movies_before,cast_5_avg_profit,cast_5_experience,cast_5_avg_revenue,cast_5_movies_before,cast_6_avg_profit,cast_6_experience,cast_6_avg_revenue,cast_6_movies_before,cast_7_avg_profit,cast_7_experience,cast_7_avg_revenue,cast_7_movies_before,cast_8_avg_profit,cast_8_experience,cast_8_avg_revenue,cast_8_movies_before,crew__sound__music_editor_avg_profit,crew__sound__music_editor_avg_revenue,crew__so
und__music_editor_movies_before,crew__sound__original_music_composer_avg_profit,crew__sound__original_music_composer_avg_revenue,crew__sound__original_music_composer_movies_before,crew__sound__sound_designer_avg_profit,crew__sound__sound_designer_avg_revenue,crew__sound__sound_designer_movies_before,crew__sound__sound_effects_editor_avg_profit,crew__sound__sound_effects_editor_avg_revenue,crew__sound__sound_effects_editor_movies_before,crew__sound__sound_re_recording_mixer_avg_profit,crew__sound__sound_re_recording_mixer_avg_revenue,crew__sound__sound_re_recording_mixer_movies_before,crew__sound__supervising_sound_editor_avg_profit,crew__sound__supervising_sound_editor_avg_revenue,crew__sound__supervising_sound_editor_movies_before,crew__directing__director__1_avg_profit,crew__directing__director__1_avg_revenue,crew__directing__director__1_movies_before,crew__directing__script_supervisor_avg_profit,crew__directing__script_supervisor_avg_revenue,crew__directing__script_supervisor_movies_before,crew__production__casting_avg_profit,crew__production__casting_avg_revenue,crew__production__casting_movies_before,crew__production__executive_producer__1_avg_profit,crew__production__executive_producer__1_avg_revenue,crew__production__executive_producer__1_movies_before,crew__production__producer__1_avg_profit,crew__production__producer__1_avg_revenue,crew__production__producer__1_movies_before,crew__production__producer__2_avg_profit,crew__production__producer__2_avg_revenue,crew__production__producer__2_movies_before,crew__editing__editor__1_avg_profit,crew__editing__editor__1_avg_revenue,crew__editing__editor__1_movies_before,crew__costume__costume_designer_avg_profit,crew__costume__costume_designer_avg_revenue,crew__costume__costume_designer_movies_before,crew__costume__costume_supervisor_avg_profit,crew__costume__costume_supervisor_avg_revenue,crew__costume__costume_supervisor_movies_before,crew__costume__makeup_artist_avg_profit,crew__costume__makeup_artist_avg_revenue,c
rew__costume__makeup_artist_movies_before,crew__crew__stunt_coordinator_avg_profit,crew__crew__stunt_coordinator_avg_revenue,crew__crew__stunt_coordinator_movies_before,crew__writing__screenplay__1_avg_profit,crew__writing__screenplay__1_avg_revenue,crew__writing__screenplay__1_movies_before,crew__art__art_direction_avg_profit,crew__art__art_direction_avg_revenue,crew__art__art_direction_movies_before,crew__art__production_design_avg_profit,crew__art__production_design_avg_revenue,crew__art__production_design_movies_before,crew__art__property_master_avg_profit,crew__art__property_master_avg_revenue,crew__art__property_master_movies_before,crew__art__set_decoration_avg_profit,crew__art__set_decoration_avg_revenue,crew__art__set_decoration_movies_before,crew__visualeffects__visual_effects_supervisor_avg_profit,crew__visualeffects__visual_effects_supervisor_avg_revenue,crew__visualeffects__visual_effects_supervisor_movies_before,crew__camera__director_of_photography_avg_profit,crew__camera__director_of_photography_avg_revenue,crew__camera__director_of_photography_movies_before,crew__camera__steadicam_operator_avg_profit,crew__camera__steadicam_operator_avg_revenue,crew__camera__steadicam_operator_movies_before,crew__camera__still_photographer_avg_profit,crew__camera__still_photographer_avg_revenue,crew__camera__still_photographer_movies_before,collection_avg_profit,collection_avg_revenue,cast_avg_revenue,cast_avg_profit,cast_avg_experience,cast_avg_movies_before,year_avg_revenue
5,839727,15,119,1,1941,4,30,0,,orson welles,joseph cotten,dorothy comingore,ray collins,george coulouris,agnes moorehead,paul stewart,ruth warrick,,bernard herrmann,,,clem portman,john aalberg,orson welles,,rufus le maire,george schaefer,orson welles,,robert wise,edward stevenson,,layne britton,,orson welles,van nest polglase,,charles sayers,darrell silvera,,gregg toland,,phil stern,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,rko radio pictures,mercury productions,,0,0,1,0,0,0,0,0,0,0,0,0,-0.207911690817759,0.978147600733806,0.989821441880933,-0.142314838273285,0.0,0,1,0,0,0,Citizen Kane,Citizen Kane,journalist newspaper florida capitalist banker...,"Charles Foster Kane , an enormously wealthy me...",False,tt0033467,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1,0,1,0,0,1,1,0,1,0,0,1,1,0,0,1,23217674,0,0,26.649074044302488,4.834435714362336,2041900.0,2872445.4545454546,,,,,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,,,,,0.0,,,,,,,2593000.0,3202000.0,1.0,,,0.0,,,0.0,,,,,,0.0,,,0.0,,,0.0,,,,5077900.0,6877900.0,1.0,892000.0,2807000.0,1.0,,,,,,0.0,,,,,,0.0,1413737.5,2374112.5,8.0,,,,,,0.0,2984950.0,4842450.0,2.0,,,,,,0.0,,,,,,0.0,,,,,0.0,0.0,37038641.948785536
6,12800000,16,141,1,2000,5,17,0,,bjrk,catherine deneuve,david morse,peter stormare,joel grey,cara seymour,vladica kostic,jean-marc barr,,bjrk,kristian eidnes andersen,,,,lars von trier,,avy kaufman,peter aalbk jensen,vibeke windelv,finn gjerdrum,franois gdigier,manon rasmussen,,cdric grard,,,peter grant,karl jlusson,jesper lorents,,,robby mller,,david koskas,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,lantia cinema & audiovisivi,fine line features,zentropa entertainments,0,0,1,0,1,0,0,1,0,1,0,1,-0.207911690817759,-0.978147600733806,0.755749574354258,-0.654860733945285,2.15,0,0,0,1,0,Dancer in the Dark,Dancer in the Dark,individual dancing robbery factory worker secr...,The film is set in Washington state in 1964 an...,True,tt0168629,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0,1,0,1,1,0,1,0,1,0,0,1,0,0,1,0,40031879,0,0,2.127490546875,-1.162769204436023,,,878739.3888888888,7103150.555555556,-2492765.0,7235.0,,0.0,,0.0,-2486102.5,31.0,1813897.5,2.0,63455517.90909091,10.0,105682790.63636364,11.0,34076891.33333333,4.0,53743558.0,3.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,,,,,0.0,,,0.0,,,,,,,,,,-2492765.0,7235.0,1.0,,,,42300225.28571428,66800225.28571428,14.0,-2492765.0,7235.0,1.0,-2492765.0,7235.0,1.0,,,0.0,,,0.0,,,0.0,,,,,,0.0,,,,,,,,,0.0,,,0.0,,,0.0,,,,,,,14676794.857142854,17697789.714285713,7.0,,,,,,0.0,,,53746748.71212121,31682102.24747475,5.625,2.0,74772500.5358732
7,90000000,18,126,3,1997,5,2,0,,bruce willis,gary oldman,ian holm,milla jovovich,chris tucker,luke perry,brion james,tom lister jr.,,ric serra,mark a. mangini,julia evershade,ron bartlett,,luc besson,jean bourne,lucinda syson,,patrice ledoux,,sylvie landra,jean paul gaultier,janet tebrooke,amanda knight,marc boyle,luc besson,ira gilford,dan weil,barry wilkinson,maggie gray,mark stetson,thierry arbogast,john ward,jack english,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1,0,columbia pictures,gaumont,sony pictures,0,0,0,0,0,0,0,1,0,0,0,0,0.207911690817759,0.978147600733806,0.755749574354258,-0.654860733945285,1.366666666666667,0,0,1,0,0,The Fifth Element,The Fifth Element,new york city clone taxi cyborg egypt future s...,"In 1914, at the outbreak of World War I, extra...",False,tt0119116,1,0,0,1,1,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1,0,1,0,1,0,0,1,1,0,1,0,1,0,1,0,263920180,0,0,1.932446444444444,-0.169600900518265,27823832.951219518,46696952.75609756,13616.8,18721357.4,19383262.71428572,35870405.57142857,83959283.66666667,10.0,122759283.66666669,15.0,40171046.75,11.0,62291303.125,8.0,34815532.57142857,21.0,48172675.42857143,7.0,-8192146.0,6.0,2807854.0,1.0,24715918.0,2.0,28215918.0,1.0,9624456.0,5.0,16624456.0,1.0,19268502.777777776,15.0,42635725.0,9.0,10621424.333333332,10.0,14888091.0,3.0,,,,79582741.0,100717416.75,4.0,276101666.0,366101666.0,1.0,,,0.0,64548284.0,95148284.0,2.0,,,,8045643.333333331,16891877.666666668,3.0,34514260.0,44014260.0,2.0,,,0.0,,,,29284974.0,45284974.0,1.0,,,,29284974.0,45284974.0,1.0,-16261389.0,1738611.0,1.0,,,0.0,80134931.5,133634931.5,2.0,109773545.0,159773545.0,1.0,29284974.0,45284974.0,1.0,,,0.0,12873135.5,25142487.0,2.0,305000000.0,333000000.0,1.0,-36562432.0,20437568.0,2.0,,,0.0,1312525.0,16947200.75,4.0,185925792.0,218425792.0,2.0,,,0.0,,,42299413.27752976,26873002.262400795,10.0,5.625,90873089.71398184
8,92620000,19,153,1,1927,1,10,0,,gustav frhlich,brigitte helm,alfred abel,rudolf klein-rogge,theodor loos,fritz rasp,erwin biswanger,heinrich george,frank strobel,gottfried huppertz,,,,,fritz lang,,,,erich pommer,,fritz lang,aenne willkomm,,,,fritz lang,karl vollbrecht,,,,,karl freund,,horst von harbou,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,universum film (ufa),,,0,0,0,0,1,0,0,0,0,0,0,0,0.951056516295154,-0.309016994374947,0.0,1.0,0.0,0,1,0,0,0,Metropolis,Metropolis,man vs machine based on novel or book undergro...,"Set in the year 2026, Metropolis takes place i...",False,tt0017136,1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,650422,0,0,-141.39985732339926,-18.88439515829816,,,,,,,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,,0.0,,,0.0,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,0.0,,,,,,0.0,,,0.0,,,,,,,,,,,,0.0,,,0.0,,,,,,,,,,,,,,,0.0,,,,,,0.0,,,,,0.0,0.0,55866046.19791806


In [7]:
# 5th and 95th percentile of the META__revenue column of df_raw.
df_raw['META__revenue'].quantile([0.05, 0.95])

0.050000000000000       111,229.200000000011642
0.950000000000000   348,514,752.599999785423279
Name: META__revenue, dtype: float64

In [8]:
# Count the rows of df_raw that pass the combined filter: profitability strictly
# between its own 1st and 99th percentile, year within 2000-2018 (inclusive)
# and budget of at least 250000. Printed as an absolute count and as a
# fraction of all rows.
num = len(df_raw.loc[
    lambda d: d.META__profitability.gt(d.META__profitability.quantile(0.01))
    & d.META__profitability.lt(d.META__profitability.quantile(0.99))
    & d.META__year.between(2000, 2018)
    & d.budget.ge(250000)
])

print(num)
print(num / len(df_raw))

4600
0.6139063125583878


In [9]:
# Count the rows of df_raw that pass the combined filter: META__revenue above
# the 1st percentile of META__profitability, profitability below its 99th
# percentile, year within 2000-2018 (inclusive) and budget of at least 250000.
# NOTE(review): the lower bound compares revenue against a *profitability*
# quantile, which looks like a copy-paste slip (META__revenue.quantile(0.01)
# may have been intended) -- confirm before relying on this number.
num = len(df_raw.loc[
    lambda d: d.META__revenue.gt(d.META__profitability.quantile(0.01))
    & d.META__profitability.lt(d.META__profitability.quantile(0.99))
    & d.META__year.between(2000, 2018)
    & d.budget.ge(250000)
])

print(num)
print(num / len(df_raw))

4660
0.6219137862004538


In [28]:
def get_real_columns(name_prefix, name_suffix, num_columns):
    """Build the numbered column names ``<prefix>1<suffix>`` .. ``<prefix>N<suffix>``.

    Args:
        name_prefix: Text placed before the running number.
        name_suffix: Text placed after the running number.
        num_columns: How many names to generate; numbering starts at 1.

    Returns:
        List of ``num_columns`` names (empty when ``num_columns`` is 0).
    """
    names = []
    for number in range(1, num_columns + 1):
        names.append('{}{}{}'.format(name_prefix, number, name_suffix))
    return names

def get_date_from_movie(movie):
    """Build a ``datetime.date`` from a movie record.

    Args:
        movie: Any object indexable by 'META__year', 'META__month' and
            'META__day' (for example a DataFrame row or a dict).

    Returns:
        ``date(year, month, day)`` built from those three fields.
    """
    year, month, day = (movie[part] for part in ('META__year', 'META__month', 'META__day'))
    return date(year, month, day)


def get_info_df_movies_per_value_range_columns(df, columns):
    """Group the rows of ``df`` by the distinct values found in ``columns``.

    Every non-missing value appearing in any of ``columns`` becomes a key of
    the result. A row is listed once per occurrence, so a row holding the same
    value in two of the columns appears twice in that value's frame.

    Args:
        df: Source DataFrame.
        columns: Names of the columns whose values are collected.

    Returns:
        Dict mapping each value to ``{'movies': DataFrame}``, where the frame
        holds the matching rows of ``df`` in row order, with their original
        index labels.
    """
    # Collect row *positions* first and slice once per value at the end.
    # This avoids DataFrame.append (removed in pandas 2.0), avoids rebuilding a
    # frame for every matching row, and stays correct when the index of ``df``
    # is not 0..n-1 (iloc expects positions, not index labels).
    positions_by_value = {}
    selected = df[list(columns)]
    for position, row_values in enumerate(selected.itertuples(index=False, name=None)):
        for value in row_values:
            if not pd.isna(value):
                positions_by_value.setdefault(value, []).append(position)

    return {
        value: {'movies': df.iloc[positions]}
        for value, positions in positions_by_value.items()
    }

# def get_info_df_movies_per_value_range_columns(df, columns):
#     def rows_with_value(value):
#         return df[df.apply(lambda row: value in [row[column] for column in columns], axis=1)]

#     unique_values = list(set([movie[column] for i, movie in df.iterrows() for column in columns]))
#     return {value: rows_with_value(value) for value in unique_values}

def calculate_info_per_previous_movies_per_value(df, 
                                                 info_dict, 
                                                 calc_budget=False, 
                                                 calc_movies_before=False, 
                                                 calc_experience=False, 
                                                 calc_revenue=False,
                                                ):
    """Add per-date history statistics to every entry of ``info_dict``, in place.

    Each entry of ``info_dict`` is a dict holding a 'movies' DataFrame. For
    every movie date in that frame, statistics are computed over the entry's
    movies dated strictly earlier and stored in sub-dicts keyed by
    ``str(date)``:

    * 'avg_profit'    -- mean of ``META__revenue - budget`` (always created)
    * 'avg_budget'    -- mean of ``budget``                 (``calc_budget``)
    * 'experience'    -- movie year minus the entry's earliest year
                         (``calc_experience``)
    * 'avg_revenue'   -- mean of ``META__revenue``          (``calc_revenue``)
    * 'movies_before' -- number of earlier movies, 0 when there are none
                         (``calc_movies_before``)

    A date without any earlier movie gets no 'avg_*' key at all.

    Args:
        df: Not used; kept so existing callers keep working.
        info_dict: Mapping value -> ``{'movies': DataFrame}``; mutated in place.
        calc_budget: Also record 'avg_budget'.
        calc_movies_before: Also record 'movies_before'.
        calc_experience: Also record 'experience'.
        calc_revenue: Also record 'avg_revenue'.

    Returns:
        None.
    """
    for value in info_dict.values():
        # Keep this creation order: get_columns_from_info_dict reads the key
        # order of an entry to name and order the generated columns.
        value['avg_profit'] = {}
        if calc_budget:
            value['avg_budget'] = {}
        if calc_experience:
            value['experience'] = {}
        if calc_revenue:
            value['avg_revenue'] = {}
        if calc_movies_before:
            value['movies_before'] = {}

        movies = value['movies']
        years = movies['META__year']
        months = movies['META__month']
        days = movies['META__day']
        # Loop-invariant: earliest year among this entry's movies.
        first_year = years.min()

        for movie_index, movie in movies.iterrows():
            current_date = get_date_from_movie(movie)
            str_date = str(current_date)

            if calc_experience and str_date not in value['experience']:
                value['experience'][str_date] = movie['META__year'] - first_year

            if str_date in value['avg_profit']:
                continue  # statistics for this date were already computed

            # "Strictly before current_date", expressed as a lexicographic
            # (year, month, day) comparison on whole columns instead of
            # building one date object per row.
            same_year = years == current_date.year
            is_before = (
                (years < current_date.year)
                | (same_year & (months < current_date.month))
                | (same_year & (months == current_date.month) & (days < current_date.day))
            )
            movies_before = movies[is_before]

            if movies_before.empty:
                if calc_movies_before:
                    value['movies_before'][str_date] = 0
                continue

            value['avg_profit'][str_date] = (
                movies_before['META__revenue'] - movies_before['budget']
            ).mean()
            if calc_movies_before:
                value['movies_before'][str_date] = movies_before.shape[0]
            if calc_revenue:
                value['avg_revenue'][str_date] = movies_before['META__revenue'].mean()
            if calc_budget:
                value['avg_budget'][str_date] = movies_before['budget'].mean()
                    

def get_columns_from_info_dict(df, info_dict, real_columns, new_column_prefix='', nested_info=False):
    """Turn the per-value statistics in ``info_dict`` into per-row column data.

    For every column in ``real_columns`` and every statistic name found in the
    first entry of ``info_dict`` (all keys except 'movies'), a list with one
    element per row of ``df`` is produced: the statistic stored for the row's
    value under the row's date string, or NaN when the value is missing or no
    statistic exists for that date.

    New columns are named ``<prefix>_<stat>`` when a single real column is
    given and ``<prefix>_<n>_<stat>`` (n starting at 1) otherwise.

    Args:
        df: DataFrame whose rows are looked up.
        info_dict: Mapping value -> dict of statistic sub-dicts keyed by date string.
        real_columns: Columns of ``df`` holding the values to look up.
        new_column_prefix: Prefix for the generated column names.
        nested_info: Not used; kept so existing callers keep working.

    Returns:
        Dict mapping new column name -> list of values aligned with ``df`` rows.
        An empty dict is returned when ``info_dict`` is empty.
    """
    # With an empty info_dict (e.g. every value in real_columns is missing)
    # there is no first entry to read the statistic names from. Report "no new
    # columns" instead of failing with IndexError.
    if not info_dict:
        return {}

    first_entry = next(iter(info_dict.values()))
    info_names = [key for key in first_entry if key != 'movies']
    print(info_names)

    new_columns_data = {}
    single_column = len(real_columns) == 1
    for index, real_column in enumerate(real_columns):
        print(real_column)
        new_column_names = [
            f'{new_column_prefix}_{key}' if single_column else f'{new_column_prefix}_{index+1}_{key}'
            for key in info_names
        ]
        print(new_column_names)

        for new_column_name in new_column_names:
            new_columns_data[new_column_name] = []

        for movie_index, movie in df.iterrows():
            value = movie[real_column]
            str_date = str(get_date_from_movie(movie))
            if pd.isna(value):
                for new_column_name in new_column_names:
                    new_columns_data[new_column_name].append(np.nan)
            else:
                # new_column_names and info_names are parallel lists.
                for new_column_name, info_name in zip(new_column_names, info_names):
                    year_info = info_dict[value][info_name].get(str_date, np.nan)
                    new_columns_data[new_column_name].append(year_info)
    return new_columns_data

In [30]:
def add_features(df, 
                 columns,
                 new_column_prefix,
                 calc_movies_before=False, 
                 calc_budget=False, 
                 calc_experience=False,
                 calc_revenue=False):
    """Compute history features for ``columns`` and store them on ``df``.

    Runs three stages, printing a label and the CPU time of each:
    grouping rows per value, computing the per-date statistics, and expanding
    them into per-row column data. The resulting columns are assigned onto
    ``df`` itself, which is also returned.

    Args:
        df: DataFrame to enrich (modified in place).
        columns: Columns whose values identify the entities to aggregate over.
        new_column_prefix: Prefix for the generated column names.
        calc_movies_before, calc_budget, calc_experience, calc_revenue:
            Forwarded to calculate_info_per_previous_movies_per_value.

    Returns:
        The same ``df`` object with the new columns added.
    """
    def timed(label, stage):
        # Print the stage label, run it, then print its CPU time.
        print(label)
        started = time.process_time()
        outcome = stage()
        print(f'time: {time.process_time() - started}')
        return outcome

    data_info = timed(
        'obtaine movies',
        lambda: get_info_df_movies_per_value_range_columns(df, columns),
    )

    # Shallow copy: the per-value dicts inside are shared with data_info.
    data_info_copy = data_info.copy()
    timed(
        'calculate info',
        lambda: calculate_info_per_previous_movies_per_value(
            df,
            data_info_copy,
            calc_movies_before=calc_movies_before,
            calc_budget=calc_budget,
            calc_experience=calc_experience,
            calc_revenue=calc_revenue,
        ),
    )

    feature_columns = timed(
        'get columns',
        lambda: get_columns_from_info_dict(
            df, data_info_copy, columns, new_column_prefix=new_column_prefix
        ),
    )
    for name, values in feature_columns.items():
        df[name] = values
    return df


def add_companies_features(df):
    """Add history features for META__production_company_1..3 (with revenue averages)."""
    company_columns = get_real_columns('META__production_company_', '', 3)
    return add_features(
        df,
        company_columns,
        'production_company',
        calc_revenue=True,
    )


def add_cast_features(df):
    """Add history features for META__cast_1_name..META__cast_8_name.

    Requests the movie count, revenue average and experience statistics in
    addition to the always-computed profit average.
    """
    cast_columns = get_real_columns('META__cast_', '_name', 8)
    return add_features(
        df,
        cast_columns,
        'cast',
        calc_movies_before=True,
        calc_revenue=True,
        calc_experience=True,
    )


def add_crew_features(df):
    """Add history features for every column whose name contains 'META__crew'.

    Each crew column is processed on its own; the generated columns use the
    column name with 'META__' removed as prefix.
    """
    # Snapshot the matching names first, since add_features adds columns to df.
    crew_columns = [name for name in df.columns if 'META__crew' in name]
    for crew_column in crew_columns:
        prefix = crew_column.replace('META__', '')
        df = add_features(df, [crew_column], prefix, calc_movies_before=True, calc_revenue=True)
    return df


def add_collection_features(df):
    """Add history features for META__collection_name (with revenue averages)."""
    collection_columns = ['META__collection_name']
    return add_features(df, collection_columns, 'collection', calc_revenue=True)


def get_avg_cast_info(df):
    """Add row-wise averages of the per-cast-member feature columns to ``df``.

    For each of 'revenue', 'profit', 'experience' and 'movies_before', the
    columns whose name contains both 'cast_' and that word (excluding
    'cast_avg' columns) are averaged per row, ignoring missing values. The
    result is stored in ``cast_avg_<word>``; rows with no usable value get NaN.

    Args:
        df: DataFrame to enrich (modified in place).

    Returns:
        The same ``df`` object with the four ``cast_avg_*`` columns added.
    """
    for stat in ['revenue', 'profit', 'experience', 'movies_before']:
        member_columns = [
            name for name in df.columns
            if 'cast_' in name and stat in name and 'cast_avg' not in name
        ]
        row_averages = []
        for _, row in df.iterrows():
            present = [row[name] for name in member_columns if not pd.isna(row[name])]
            row_averages.append(mean(present) if present else np.nan)
        df[f'cast_avg_{stat}'] = row_averages
    return df


def prepare_df(df_original):
    """Return a feature-enriched copy of ``df_original``.

    Works on a copy, applies the company, cast, crew and collection feature
    steps followed by the cast averages, then adds 'year_avg_revenue' (smoothed
    mean of META__revenue per META__year with m=5). Prints the total CPU time.

    Args:
        df_original: Input DataFrame; left unmodified.

    Returns:
        The enriched copy.
    """
    started = time.process_time()
    df = df_original.copy()
    enrichment_steps = (
        add_companies_features,
        add_cast_features,
        add_crew_features,
        add_collection_features,
        get_avg_cast_info,
    )
    for enrich in enrichment_steps:
        df = enrich(df)
    df['year_avg_revenue'] = calc_smooth_mean(df, by='META__year', on='META__revenue', m=5)
    print(f'overall time: {time.process_time() - started}')
    return df
    

In [33]:
def _build_raw_datasets(frame):
    """Return the named subsets of ``frame`` as a dict of independent copies.

    'all' is a plain copy; every other entry keeps the rows matching a boolean
    mask over country flags, year, profitability, budget or revenue.
    NOTE(review): the numeric thresholds are hard-coded; judging by the key
    names they presumably come from earlier percentile computations -- confirm.
    """
    year = frame.META__year
    profitability = frame.META__profitability
    revenue = frame.META__revenue
    budget = frame.budget

    # Masks reused by several subsets (between() is inclusive on both ends).
    years_2000_2018 = year.between(2000, 2018)
    profitability_1pct = profitability.between(-138, 65)
    budget_1pct = budget >= 8875
    budget_5pct = budget >= 250000

    masks = {
        'us': frame.country__us == 1,
        'gb': frame.country__gb == 1,
        'years2000-2018': years_2000_2018,
        'years1970-1999': year.between(1970, 1999),
        'profitability_positive': profitability > 0,
        'profitability_negative': profitability < 0,
        'budget_start_1percent': budget_1pct,
        'budget_start_5percent': budget_5pct,
        'profitability_1percentile': profitability_1pct,
        'profitability_5percentile': profitability.between(-14.9, 13.9),
        'profitability_10percentile': profitability.between(-4.42, 7.4),
        'profitability_1percentile_years2000-2018':
            profitability_1pct & years_2000_2018,
        'profitability_1percentile_years2000-2018_budget_start_1percent':
            profitability_1pct & years_2000_2018 & budget_1pct,
        'profitability_1percentile_years2000-2018_budget_start_5percent':
            profitability_1pct & years_2000_2018 & budget_5pct,
        'revenue_1percentile': revenue.between(10000, 854060072),
        'revenue_5percentile': revenue.between(111229, 348514752),
    }

    datasets = {'all': frame.copy()}
    for name, mask in masks.items():
        datasets[name] = frame[mask].copy()
    return datasets


raw_datasets = _build_raw_datasets(df_raw)

In [34]:
# Row count of every dataset, in dictionary order ...
for d in raw_datasets:
    print(len(raw_datasets[d]))
# ... followed by each count as a percentage of the rows in df_raw.
for d in raw_datasets:
    print(len(raw_datasets[d]) / len(df_raw) * 100)

7493
5695
893
4903
2042
5107
2335
7418
7129
7342
6744
5994
4814
4754
4599
7354
6743
100.0
76.00427065260911
11.917789937274788
65.43440544508208
27.252101961831045
68.1569464833845
31.162418257039903
98.99906579474175
95.14213265714668
97.98478580008008
90.00400373682103
79.99466168423862
64.24663018817563
63.445882823969036
61.37728546643534
98.1449352729214
89.99065794741759


In [35]:
# For every named subset: write it to CSV, read it back, run prepare_df on the
# re-read frame and write the prepared result next to the raw file.
for dataset in raw_datasets:
    print('------------------------------------------------------')
    print(dataset)
    print('-------------------------------------------------------')
    filename = f'datasets/dataset_{dataset}.csv'
    raw_filename = f'datasets/dataset_{dataset}_raw.csv'
    raw_datasets[dataset].to_csv(raw_filename, index=False)
    # The frame is rebuilt from the CSV (written without its index) rather than
    # passed on directly.
    # NOTE(review): presumably this resets the index of the filtered subset so
    # positional and label-based row access agree inside prepare_df -- confirm.
    raw_dataset = pd.read_csv(raw_filename)
    prepared = prepare_df(raw_dataset)
    prepared.to_csv(filename, index=False)

------------------------------------------------------
all
-------------------------------------------------------
obtaine movies
time: 32.76154373700001
calculate info
time: 51.697076400000014
get columns
['avg_profit', 'avg_revenue']
META__production_company_1
['production_company_1_avg_profit', 'production_company_1_avg_revenue']
META__production_company_2
['production_company_2_avg_profit', 'production_company_2_avg_revenue']
META__production_company_3
['production_company_3_avg_profit', 'production_company_3_avg_revenue']
time: 2.0308234389999598
obtaine movies
time: 106.78814797400003
calculate info
time: 80.79361381900003
get columns
['avg_profit', 'experience', 'avg_revenue', 'movies_before']
META__cast_1_name
['cast_1_avg_profit', 'cast_1_experience', 'cast_1_avg_revenue', 'cast_1_movies_before']
META__cast_2_name
['cast_2_avg_profit', 'cast_2_experience', 'cast_2_avg_revenue', 'cast_2_movies_before']
META__cast_3_name
['cast_3_avg_profit', 'cast_3_experience', 'cast_3_avg_rev

time: 0.6774820320001709
obtaine movies
time: 3.147450641999967
calculate info
time: 2.16235450399995
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__art__property_master
['crew__art__property_master_avg_profit', 'crew__art__property_master_avg_revenue', 'crew__art__property_master_movies_before']
time: 0.6782849279998118
obtaine movies
time: 7.0461755700000595
calculate info
time: 4.830708471999969
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__art__set_decoration
['crew__art__set_decoration_avg_profit', 'crew__art__set_decoration_avg_revenue', 'crew__art__set_decoration_movies_before']
time: 0.6725467409999055
obtaine movies
time: 2.8668473679999806
calculate info
time: 2.0953345089999402
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__visualeffects__visual_effects_supervisor
['crew__visualeffects__visual_effects_supervisor_avg_profit', 'crew__visualeffects__visual_effects_supervisor_avg_revenue', 'crew__visualeffe

time: 0.5004339019999406
obtaine movies
time: 9.560884298000019
calculate info
time: 5.448489851000204
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__costume__costume_designer
['crew__costume__costume_designer_avg_profit', 'crew__costume__costume_designer_avg_revenue', 'crew__costume__costume_designer_movies_before']
time: 0.5096633269999984
obtaine movies
time: 2.8845739339999454
calculate info
time: 1.721160736999991
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__costume__costume_supervisor
['crew__costume__costume_supervisor_avg_profit', 'crew__costume__costume_supervisor_avg_revenue', 'crew__costume__costume_supervisor_movies_before']
time: 0.5010722970000643
obtaine movies
time: 3.724021386000004
calculate info
time: 2.4718669090000276
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__costume__makeup_artist
['crew__costume__makeup_artist_avg_profit', 'crew__costume__makeup_artist_avg_revenue', 'crew__costume__mak

time: 1.2139902750000147
calculate info
time: 1.0431114590001016
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__directing__director__1
['crew__directing__director__1_avg_profit', 'crew__directing__director__1_avg_revenue', 'crew__directing__director__1_movies_before']
time: 0.07676021699990088
obtaine movies
time: 0.6961276120000548
calculate info
time: 0.4761828619998596
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__directing__script_supervisor
['crew__directing__script_supervisor_avg_profit', 'crew__directing__script_supervisor_avg_revenue', 'crew__directing__script_supervisor_movies_before']
time: 0.08008954000001722
obtaine movies
time: 1.5291948560000037
calculate info
time: 0.9028894409998429
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__production__casting
['crew__production__casting_avg_profit', 'crew__production__casting_avg_revenue', 'crew__production__casting_movies_before']
time: 0.07753322700000354
o

META__cast_5_name
['cast_5_avg_profit', 'cast_5_experience', 'cast_5_avg_revenue', 'cast_5_movies_before']
META__cast_6_name
['cast_6_avg_profit', 'cast_6_experience', 'cast_6_avg_revenue', 'cast_6_movies_before']
META__cast_7_name
['cast_7_avg_profit', 'cast_7_experience', 'cast_7_avg_revenue', 'cast_7_movies_before']
META__cast_8_name
['cast_8_avg_profit', 'cast_8_experience', 'cast_8_avg_revenue', 'cast_8_movies_before']
time: 3.4139326910001273
obtaine movies
time: 2.8075697539998146
calculate info
time: 1.7302352849999352
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__sound__music_editor
['crew__sound__music_editor_avg_profit', 'crew__sound__music_editor_avg_revenue', 'crew__sound__music_editor_movies_before']
time: 0.42719095000006746
obtaine movies
time: 6.14535689000013
calculate info
time: 4.628648590000012
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__sound__original_music_composer
['crew__sound__original_music_composer_avg_pro

time: 0.4482502589999058
obtaine movies
time: 7.869185187999847
calculate info
time: 5.446342755999922
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__camera__director_of_photography
['crew__camera__director_of_photography_avg_profit', 'crew__camera__director_of_photography_avg_revenue', 'crew__camera__director_of_photography_movies_before']
time: 0.46529749899991657
obtaine movies
time: 2.0097277059999215
calculate info
time: 1.2261116180002318
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__camera__steadicam_operator
['crew__camera__steadicam_operator_avg_profit', 'crew__camera__steadicam_operator_avg_revenue', 'crew__camera__steadicam_operator_movies_before']
time: 0.4464492950000931
obtaine movies
time: 3.531519505000233
calculate info
time: 2.308758011000009
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__camera__still_photographer
['crew__camera__still_photographer_avg_profit', 'crew__camera__still_photographer_

time: 1.1128194739999344
calculate info
time: 0.5825986489999195
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__crew__stunt_coordinator
['crew__crew__stunt_coordinator_avg_profit', 'crew__crew__stunt_coordinator_avg_revenue', 'crew__crew__stunt_coordinator_movies_before']
time: 0.18025418300021556
obtaine movies
time: 2.557151489000262
calculate info
time: 1.806729502000053
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__writing__screenplay__1
['crew__writing__screenplay__1_avg_profit', 'crew__writing__screenplay__1_avg_revenue', 'crew__writing__screenplay__1_movies_before']
time: 0.19041255999991336
obtaine movies
time: 2.0048520459999963
calculate info
time: 1.3341923110001517
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__art__art_direction
['crew__art__art_direction_avg_profit', 'crew__art__art_direction_avg_revenue', 'crew__art__art_direction_movies_before']
time: 0.1826217530001486
obtaine movies
time: 3.42868

time: 0.48824727100009113
obtaine movies
time: 4.7274761219996435
calculate info
time: 4.328388934000031
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__production__executive_producer__1
['crew__production__executive_producer__1_avg_profit', 'crew__production__executive_producer__1_avg_revenue', 'crew__production__executive_producer__1_movies_before']
time: 0.5206055540002126
obtaine movies
time: 8.896504395000193
calculate info
time: 7.149076228000013
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__production__producer__1
['crew__production__producer__1_avg_profit', 'crew__production__producer__1_avg_revenue', 'crew__production__producer__1_movies_before']
time: 0.496609762999924
obtaine movies
time: 2.36489691900033
calculate info
time: 2.4230386730000646
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__production__producer__2
['crew__production__producer__2_avg_profit', 'crew__production__producer__2_avg_revenue', '

time: 0.20446014699973603
obtaine movies
time: 0.5550610300001608
calculate info
time: 0.3909557609999865
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__sound__sound_designer
['crew__sound__sound_designer_avg_profit', 'crew__sound__sound_designer_avg_revenue', 'crew__sound__sound_designer_movies_before']
time: 0.20363807700005054
obtaine movies
time: 0.4851073750000978
calculate info
time: 0.42065293900031975
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__sound__sound_effects_editor
['crew__sound__sound_effects_editor_avg_profit', 'crew__sound__sound_effects_editor_avg_revenue', 'crew__sound__sound_effects_editor_movies_before']
time: 0.20845094099968264
obtaine movies
time: 1.0655528180000147
calculate info
time: 0.6495851139998194
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__sound__sound_re_recording_mixer
['crew__sound__sound_re_recording_mixer_avg_profit', 'crew__sound__sound_re_recording_mixer_avg_revenue', 

time: 0.2546311500000229
obtaine movies
time: 0.23109235000038097
calculate info
time: 0.1072271279999768
get columns
['avg_profit', 'avg_revenue']
META__collection_name
['collection_avg_profit', 'collection_avg_revenue']
time: 0.280299690999982
overall time: 153.57499712499975
------------------------------------------------------
budget_start_1percent
-------------------------------------------------------
obtaine movies
time: 34.97370658
calculate info
time: 56.02719218399989
get columns
['avg_profit', 'avg_revenue']
META__production_company_1
['production_company_1_avg_profit', 'production_company_1_avg_revenue']
META__production_company_2
['production_company_2_avg_profit', 'production_company_2_avg_revenue']
META__production_company_3
['production_company_3_avg_profit', 'production_company_3_avg_revenue']
time: 2.325042250000024
obtaine movies
time: 119.48113457199997
calculate info
time: 83.56611134000013
get columns
['avg_profit', 'experience', 'avg_revenue', 'movies_before']
M

time: 0.7186285000002499
obtaine movies
time: 9.939138705999994
calculate info
time: 6.966343662999861
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__art__production_design
['crew__art__production_design_avg_profit', 'crew__art__production_design_avg_revenue', 'crew__art__production_design_movies_before']
time: 0.7279780449998725
obtaine movies
time: 3.2995587410000553
calculate info
time: 2.224464619999708
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__art__property_master
['crew__art__property_master_avg_profit', 'crew__art__property_master_avg_revenue', 'crew__art__property_master_movies_before']
time: 0.694418663000306
obtaine movies
time: 7.24382433899973
calculate info
time: 4.931159555000249
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__art__set_decoration
['crew__art__set_decoration_avg_profit', 'crew__art__set_decoration_avg_revenue', 'crew__art__set_decoration_movies_before']
time: 0.6808084500003133
obt

time: 0.6226980520000325
obtaine movies
time: 11.935108361999937
calculate info
time: 8.359756413000014
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__editing__editor__1
['crew__editing__editor__1_avg_profit', 'crew__editing__editor__1_avg_revenue', 'crew__editing__editor__1_movies_before']
time: 0.643493258000035
obtaine movies
time: 9.049647236000055
calculate info
time: 6.342315496000083
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__costume__costume_designer
['crew__costume__costume_designer_avg_profit', 'crew__costume__costume_designer_avg_revenue', 'crew__costume__costume_designer_movies_before']
time: 0.6317821860002368
obtaine movies
time: 2.634005696999793
calculate info
time: 1.7870872419998705
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__costume__costume_supervisor
['crew__costume__costume_supervisor_avg_profit', 'crew__costume__costume_supervisor_avg_revenue', 'crew__costume__costume_supervisor_movies

time: 0.6549243950003074
obtaine movies
time: 4.289156863999779
calculate info
time: 2.6846169000000373
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__sound__supervising_sound_editor
['crew__sound__supervising_sound_editor_avg_profit', 'crew__sound__supervising_sound_editor_avg_revenue', 'crew__sound__supervising_sound_editor_movies_before']
time: 0.676545356999668
obtaine movies
time: 12.2271052469996
calculate info
time: 9.359373063000021
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__directing__director__1
['crew__directing__director__1_avg_profit', 'crew__directing__director__1_avg_revenue', 'crew__directing__director__1_movies_before']
time: 0.6648802070003512
obtaine movies
time: 5.198761270000432
calculate info
time: 3.3748415290001503
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__directing__script_supervisor
['crew__directing__script_supervisor_avg_profit', 'crew__directing__script_supervisor_avg_revenue',

META__cast_2_name
['cast_2_avg_profit', 'cast_2_experience', 'cast_2_avg_revenue', 'cast_2_movies_before']
META__cast_3_name
['cast_3_avg_profit', 'cast_3_experience', 'cast_3_avg_revenue', 'cast_3_movies_before']
META__cast_4_name
['cast_4_avg_profit', 'cast_4_experience', 'cast_4_avg_revenue', 'cast_4_movies_before']
META__cast_5_name
['cast_5_avg_profit', 'cast_5_experience', 'cast_5_avg_revenue', 'cast_5_movies_before']
META__cast_6_name
['cast_6_avg_profit', 'cast_6_experience', 'cast_6_avg_revenue', 'cast_6_movies_before']
META__cast_7_name
['cast_7_avg_profit', 'cast_7_experience', 'cast_7_avg_revenue', 'cast_7_movies_before']
META__cast_8_name
['cast_8_avg_profit', 'cast_8_experience', 'cast_8_avg_revenue', 'cast_8_movies_before']
time: 4.88384267299989
obtaine movies
time: 3.7644772399999056
calculate info
time: 2.2402809479999632
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__sound__music_editor
['crew__sound__music_editor_avg_profit', 'crew__sound__mu

time: 0.6174307530000078
obtaine movies
time: 2.708086405999893
calculate info
time: 1.9905956909997258
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__visualeffects__visual_effects_supervisor
['crew__visualeffects__visual_effects_supervisor_avg_profit', 'crew__visualeffects__visual_effects_supervisor_avg_revenue', 'crew__visualeffects__visual_effects_supervisor_movies_before']
time: 0.5983487969997441
obtaine movies
time: 11.285699336999642
calculate info
time: 7.997172546000002
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__camera__director_of_photography
['crew__camera__director_of_photography_avg_profit', 'crew__camera__director_of_photography_avg_revenue', 'crew__camera__director_of_photography_movies_before']
time: 0.6058952589992259
obtaine movies
time: 2.402734555999814
calculate info
time: 1.4590347039993503
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__camera__steadicam_operator
['crew__camera__steadicam_

time: 1.5665531560007366
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__costume__costume_supervisor
['crew__costume__costume_supervisor_avg_profit', 'crew__costume__costume_supervisor_avg_revenue', 'crew__costume__costume_supervisor_movies_before']
time: 0.5355925839994597
obtaine movies
time: 2.6770235730000422
calculate info
time: 2.2379842029995416
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__costume__makeup_artist
['crew__costume__makeup_artist_avg_profit', 'crew__costume__makeup_artist_avg_revenue', 'crew__costume__makeup_artist_movies_before']
time: 0.5352267199996277
obtaine movies
time: 2.9340338100000736
calculate info
time: 1.9690780189994257
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__crew__stunt_coordinator
['crew__crew__stunt_coordinator_avg_profit', 'crew__crew__stunt_coordinator_avg_revenue', 'crew__crew__stunt_coordinator_movies_before']
time: 0.5394730990001335
obtaine movies
time: 5.127882892

time: 0.42424578800000745
obtaine movies
time: 3.494952580000245
calculate info
time: 2.364675585000441
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__directing__script_supervisor
['crew__directing__script_supervisor_avg_profit', 'crew__directing__script_supervisor_avg_revenue', 'crew__directing__script_supervisor_movies_before']
time: 0.4221559970001181
obtaine movies
time: 6.618364509999992
calculate info
time: 5.594956699999784
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__production__casting
['crew__production__casting_avg_profit', 'crew__production__casting_avg_revenue', 'crew__production__casting_movies_before']
time: 0.4222412159997475
obtaine movies
time: 3.9408069790006266
calculate info
time: 3.7466235099991536
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__production__executive_producer__1
['crew__production__executive_producer__1_avg_profit', 'crew__production__executive_producer__1_avg_revenue', 'crew

META__cast_8_name
['cast_8_avg_profit', 'cast_8_experience', 'cast_8_avg_revenue', 'cast_8_movies_before']
time: 3.1905782440007897
obtaine movies
time: 2.711724394000157
calculate info
time: 1.6742312000005768
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__sound__music_editor
['crew__sound__music_editor_avg_profit', 'crew__sound__music_editor_avg_revenue', 'crew__sound__music_editor_movies_before']
time: 0.4148681110000325
obtaine movies
time: 6.05908152900065
calculate info
time: 4.595228652000515
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__sound__original_music_composer
['crew__sound__original_music_composer_avg_profit', 'crew__sound__original_music_composer_avg_revenue', 'crew__sound__original_music_composer_movies_before']
time: 0.4157394550002209
obtaine movies
time: 2.145218697999553
calculate info
time: 1.4467854229997101
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__sound__sound_designer
['crew__sound_

time: 0.43334584399963205
obtaine movies
time: 1.9223838469997645
calculate info
time: 1.2152322090005327
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__camera__steadicam_operator
['crew__camera__steadicam_operator_avg_profit', 'crew__camera__steadicam_operator_avg_revenue', 'crew__camera__steadicam_operator_movies_before']
time: 0.4218123969994849
obtaine movies
time: 3.27607886000078
calculate info
time: 2.15976480000063
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__camera__still_photographer
['crew__camera__still_photographer_avg_profit', 'crew__camera__still_photographer_avg_revenue', 'crew__camera__still_photographer_movies_before']
time: 0.43207372300003044
obtaine movies
time: 1.3738515389995882
calculate info
time: 0.8865554410003824
get columns
['avg_profit', 'avg_revenue']
META__collection_name
['collection_avg_profit', 'collection_avg_revenue']
time: 0.4268673309998121
overall time: 355.1935167820002
--------------------------

time: 0.4002911829993536
obtaine movies
time: 3.277320881000378
calculate info
time: 3.1994917310003075
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__writing__screenplay__1
['crew__writing__screenplay__1_avg_profit', 'crew__writing__screenplay__1_avg_revenue', 'crew__writing__screenplay__1_movies_before']
time: 0.4037151000002268
obtaine movies
time: 3.6731306559995573
calculate info
time: 3.093828003999988
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__art__art_direction
['crew__art__art_direction_avg_profit', 'crew__art__art_direction_avg_revenue', 'crew__art__art_direction_movies_before']
time: 0.41124124599991774
obtaine movies
time: 6.430156292999527
calculate info
time: 4.39658052499999
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__art__production_design
['crew__art__production_design_avg_profit', 'crew__art__production_design_avg_revenue', 'crew__art__production_design_movies_before']
time: 0.4142729140003

time: 0.6639088739993895
obtaine movies
time: 10.943256979999205
calculate info
time: 8.877879609000047
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__production__producer__1
['crew__production__producer__1_avg_profit', 'crew__production__producer__1_avg_revenue', 'crew__production__producer__1_movies_before']
time: 0.6877513409999665
obtaine movies
time: 3.132661492999432
calculate info
time: 3.278778345000319
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__production__producer__2
['crew__production__producer__2_avg_profit', 'crew__production__producer__2_avg_revenue', 'crew__production__producer__2_movies_before']
time: 0.6781304080004702
obtaine movies
time: 12.488896402000137
calculate info
time: 8.564988083999197
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__editing__editor__1
['crew__editing__editor__1_avg_profit', 'crew__editing__editor__1_avg_revenue', 'crew__editing__editor__1_movies_before']
time: 0.65935

time: 0.5906714040002043
obtaine movies
time: 2.4609885469999426
calculate info
time: 1.617129852999824
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__sound__sound_effects_editor
['crew__sound__sound_effects_editor_avg_profit', 'crew__sound__sound_effects_editor_avg_revenue', 'crew__sound__sound_effects_editor_movies_before']
time: 0.5857584079994922
obtaine movies
time: 3.8308347459997094
calculate info
time: 2.4788722690000213
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__sound__sound_re_recording_mixer
['crew__sound__sound_re_recording_mixer_avg_profit', 'crew__sound__sound_re_recording_mixer_avg_revenue', 'crew__sound__sound_re_recording_mixer_movies_before']
time: 0.5898780450006598
obtaine movies
time: 3.3765117419998205
calculate info
time: 2.152590736000093
get columns
['avg_profit', 'avg_revenue', 'movies_before']
META__crew__sound__supervising_sound_editor
['crew__sound__supervising_sound_editor_avg_profit', 'crew__sound__super