In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Read in the Data

Data is being read in from [learn-co-students](https://github.com/CliffordBridges/Movie-Performance-Analysis/tree/master/data)

We'll check the head of each dataframe we create to make sure the data is comprehensible to the human eye.
***

In [2]:
tn_movie_budgets = pd.read_csv('data/tn.movie_budgets.csv.gz')

In [3]:
tn_movie_budgets.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


***
## Summary of Step 1

### First Round of Questions About Data

1. Can we find out how these movies were released? as in on Netflix/Hulu/Box Office/Amazon Prime/YouTube?
2. How dirty are any of these data sets?
3. Do we know that grosses are USD?
4. And do dollars account for inflation?
5. How much rounding is going on in these grosses?
***

# Step 2: Let's try to clean some of this data

Clean the ```tn_movie_budgets``` by searching for any unreasonable values.
***

In [4]:
tn_movie_budgets.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [5]:
tn_movie_budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
id                   5782 non-null int64
release_date         5782 non-null object
movie                5782 non-null object
production_budget    5782 non-null object
domestic_gross       5782 non-null object
worldwide_gross      5782 non-null object
dtypes: int64(1), object(5)
memory usage: 271.1+ KB


In [6]:
tn_movie_budgets.shape

(5782, 6)

***
### Are any rows duplicated?
***

In [7]:
tn_movie_budgets.duplicated().sum()

0

No entire rows are duplicated, but maybe there are weird values in a "duplicated" row which make it hard to identify. 
We'll check for duplicated names explicitly.

In [8]:
tn_movie_budgets.duplicated('movie').sum()

84

In [9]:
# keep=False flags every row whose title occurs more than once, first occurrence included
tn_movie_budgets['repeated_name'] = tn_movie_budgets['movie'].duplicated(keep=False)

There appears to be no actually duplicated data; all of the rows with repeated names appear to be remakes of an original. Let's change the name of that column from ```repeated_name``` to ```remade```.

In [10]:
# Overwrites the flag set in the previous cell: duplicated() defaults to keep='first',
# so only the second and later rows sharing a title (in file order) become True.
# NOTE(review): the head() output shows rows are not in release order — confirm this is
# the intended meaning before the column is renamed to `remade`.
tn_movie_budgets['repeated_name'] = tn_movie_budgets.movie.duplicated()

In [11]:
# rebind to the renamed frame instead of mutating in place
tn_movie_budgets = tn_movie_budgets.rename(columns={'repeated_name': 'remade'})

No, there aren't duplicated rows in a negative sense. 
Just remade movies. 
We're okay with that!

***
### Are there weird values?
***

While checking for duplicates, we noticed that there were 0s in the ```domestic_gross``` and ```worldwide_gross``` columns.
Let's do the following:
- [ ] first, convert all money columns to ints from objects, 
- [ ] second, make a new column called ```international_gross``` which is ```worldwide_gross```$-$```domestic_gross```, and
- [ ] finally, decide whether or not to drop rows with too many \$0s.
***

In [12]:
tn_movie_budgets.dtypes

id                    int64
release_date         object
movie                object
production_budget    object
domestic_gross       object
worldwide_gross      object
remade                 bool
dtype: object

In [13]:
# make sure the three money columns hold plain strings before the text clean-up below
tn_movie_budgets = tn_movie_budgets.astype(
    dict.fromkeys(['production_budget', 'domestic_gross', 'worldwide_gross'], 'str')
)

In [14]:
def get_rid_of_dollar_sign(amount):
    """Return ``amount`` without its leading dollar sign.

    A string that does not start with '$' is returned unchanged after a
    notice is printed, so unexpected values surface without stopping the run.
    """
    if not amount.startswith('$'):
        print("Crap, one of my values didn't start with a dollar sign")
        return amount
    return amount[1:]

In [15]:
# strip the leading '$' from every value in each of the three money columns
for title in ('production_budget', 'domestic_gross', 'worldwide_gross'):
    tn_movie_budgets[title] = tn_movie_budgets[title].apply(get_rid_of_dollar_sign)

In [16]:
# swap the thousands separators for underscores, e.g. '425,000,000' -> '425_000_000';
# the int64 cast a few cells below parses that form (see its info() output)
for title in ('production_budget', 'domestic_gross', 'worldwide_gross'):
    tn_movie_budgets[title] = tn_movie_budgets[title].str.replace(',', '_', regex=False)

In [17]:
tn_movie_budgets.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,remade
0,1,"Dec 18, 2009",Avatar,425_000_000,760_507_625,2_776_345_279,False
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410_600_000,241_063_875,1_045_663_875,False
2,3,"Jun 7, 2019",Dark Phoenix,350_000_000,42_762_350,149_762_350,False
3,4,"May 1, 2015",Avengers: Age of Ultron,330_600_000,459_005_868,1_403_013_963,False
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317_000_000,620_181_382,1_316_721_747,False


In [18]:
# cast the cleaned money strings to 64-bit integers
tn_movie_budgets = tn_movie_budgets.astype(
    dict.fromkeys(['production_budget', 'domestic_gross', 'worldwide_gross'], 'int64')
)

In [19]:
tn_movie_budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 7 columns):
id                   5782 non-null int64
release_date         5782 non-null object
movie                5782 non-null object
production_budget    5782 non-null int64
domestic_gross       5782 non-null int64
worldwide_gross      5782 non-null int64
remade               5782 non-null bool
dtypes: bool(1), int64(4), object(2)
memory usage: 276.8+ KB


***
- [x] first, convert all money columns to ints from objects, 
- [ ] second, make a new column called ```international_gross``` which is ```worldwide_gross```$-$```domestic_gross```, and
- [ ] finally, decide whether or not to drop rows with too many \$0s.
***

While we're at it, we might as well make all the columns the appropriate data types...

In [20]:
# titles are text: cast that single column explicitly
tn_movie_budgets['movie'] = tn_movie_budgets['movie'].astype('str')

In [21]:
# parse the 'Dec 18, 2009'-style strings into datetime64 values
tn_movie_budgets['release_date'] = pd.to_datetime(tn_movie_budgets['release_date'])

In [22]:
tn_movie_budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 7 columns):
id                   5782 non-null int64
release_date         5782 non-null datetime64[ns]
movie                5782 non-null object
production_budget    5782 non-null int64
domestic_gross       5782 non-null int64
worldwide_gross      5782 non-null int64
remade               5782 non-null bool
dtypes: bool(1), datetime64[ns](1), int64(4), object(1)
memory usage: 276.8+ KB


...Okay, back to the business at hand.

In [23]:
# international gross = worldwide gross minus domestic gross
tn_movie_budgets['international_gross'] = tn_movie_budgets.worldwide_gross.sub(tn_movie_budgets.domestic_gross)

- [x] first, convert all money columns to ints from objects, 
- [x] second, make a new column called ```international_gross``` which is ```worldwide_gross```$-$```domestic_gross```, and
- [ ] finally, decide whether or not to drop rows with too many \$0s.

In [24]:
tn_movie_budgets.loc[tn_movie_budgets.international_gross == 0].shape

(1619, 8)

***
That's a lot of movies with no international gross!

Now I'm concerned about movies with no worldwide gross or no domestic gross. 
Let's see how many of those there are.
***

In [25]:
tn_movie_budgets.loc[(tn_movie_budgets.domestic_gross) == 0].shape

(548, 8)

In [26]:
tn_movie_budgets.loc[(tn_movie_budgets.domestic_gross) == 0].head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,remade,international_gross
194,95,2020-12-31,Moonfall,150000000,0,0,False,0
479,80,2017-12-13,Bright,90000000,0,0,False,0
480,81,2019-12-31,Army of the Dead,90000000,0,0,False,0
535,36,2020-02-21,Call of the Wild,82000000,0,0,False,0
617,18,2012-12-31,AstÃ©rix et ObÃ©lix: Au service de Sa MajestÃ©,77600000,0,60680125,False,60680125


In [27]:
# rows where both the worldwide and the domestic gross are zero
tn_movie_budgets.loc[(tn_movie_budgets.worldwide_gross == 0) & (tn_movie_budgets.domestic_gross == 0)].shape

(367, 8)

In [28]:
# first rows where both the worldwide and the domestic gross are zero
tn_movie_budgets.loc[(tn_movie_budgets.worldwide_gross == 0) & (tn_movie_budgets.domestic_gross == 0)].head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,remade,international_gross
194,95,2020-12-31,Moonfall,150000000,0,0,False,0
479,80,2017-12-13,Bright,90000000,0,0,False,0
480,81,2019-12-31,Army of the Dead,90000000,0,0,False,0
535,36,2020-02-21,Call of the Wild,82000000,0,0,False,0
670,71,2019-08-30,PLAYMOBIL,75000000,0,0,False,0


In [29]:
# rows with no domestic gross but a positive worldwide gross
tn_movie_budgets.loc[(tn_movie_budgets.domestic_gross == 0) & (tn_movie_budgets.worldwide_gross > 0)].shape

(181, 8)

In [30]:
# first rows with no domestic gross but a positive worldwide gross
tn_movie_budgets.loc[(tn_movie_budgets.domestic_gross == 0) & (tn_movie_budgets.worldwide_gross > 0)].head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,remade,international_gross
617,18,2012-12-31,AstÃ©rix et ObÃ©lix: Au service de Sa MajestÃ©,77600000,0,60680125,False,60680125
619,20,2019-01-22,Renegades,77500000,0,1521672,False,1521672
820,21,2018-10-26,Air Strike,65000000,0,516279,False,516279
1325,26,2012-12-31,Foodfight!,45000000,0,73706,False,73706
1367,68,2006-12-31,Les BronzÃ©s 3: amis pour la vie,42000000,0,83833602,False,83833602


In [31]:
tn_movie_budgets.loc[(tn_movie_budgets.worldwide_gross) == 0].shape

(367, 8)

In [32]:
tn_movie_budgets.loc[(tn_movie_budgets.worldwide_gross) == 0].head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,remade,international_gross
194,95,2020-12-31,Moonfall,150000000,0,0,False,0
479,80,2017-12-13,Bright,90000000,0,0,False,0
480,81,2019-12-31,Army of the Dead,90000000,0,0,False,0
535,36,2020-02-21,Call of the Wild,82000000,0,0,False,0
670,71,2019-08-30,PLAYMOBIL,75000000,0,0,False,0


There are 548 movies with 0 dollars in ```domestic_gross```.
Of those movies, 181 have ```international_gross``` (maybe were only released internationally), and the other 367 have no ```domestic_gross``` or ```international_gross```. 
We think these 367 movies were released online only, which means they are of particular interest to our analysis!
***
So we are deciding to keep all of our data:
- [x] first, convert all money columns to ints from objects, 
- [x] second, make a new column called ```international_gross``` which is ```worldwide_gross```$-$```domestic_gross```, and
- [x] finally, decide whether or not to drop rows with too many \$0s.


We are also deciding to compare the set of data we think are online releases to the data we think are not. 
Let's create a new column to mark their differences, then continue cleaning the data by isolating the movies released between 2010 and 2018.

In [33]:
# True where the worldwide gross is exactly zero
tn_movie_budgets['online_release'] = tn_movie_budgets['worldwide_gross'].eq(0)

***
### Get rid of movies not released between 2010 and 2018
This is just a requirement of the project.
***

In [34]:
# release_date is datetime64 at this point, so the vectorized .dt accessor extracts the
# year for the whole column at once instead of calling a Python lambda per row
tn_movie_budgets['release_year'] = tn_movie_budgets['release_date'].dt.year

In [35]:
# keep releases from 2010 through 2018, both years included
recent_tn_movie_budgets = tn_movie_budgets.loc[tn_movie_budgets['release_year'].between(2010, 2018)]

In [36]:
recent_tn_movie_budgets.online_release.value_counts()

False    1873
True      251
Name: online_release, dtype: int64

***
## Step 2 Summary:

We think we can identify online releases, but found no garbage data or rows to drop. 
There are still 251 online releases and 1873 box office releases; enough to perform some analysis.
***

### Clean imdb_name_basics_df

In [None]:
imdb_name_basics = pd.read_csv('data/imdb.name.basics.csv.gz')
imdb_name_basics.head()

Split values in 'known_for_titles' column 

In [None]:
# one column per comma-separated title id
knownfor_expand = imdb_name_basics['known_for_titles'].str.split(pat=",", expand=True)

Expand Titles

In [None]:
# expand titles: copy the first six split columns back as known_for_titles_1 .. known_for_titles_6
for position in range(6):
    imdb_name_basics['known_for_titles_' + str(position + 1)] = knownfor_expand[position]

Change dataframe format from wide to long so that the key column changes from 'primary_name' to 'tconst'    

In [None]:
# drop unnecessary columns; the drop returns a new frame, so imdb_name_basics keeps them
imdb_names_small = imdb_name_basics.drop(columns=['birth_year', 'death_year', 'known_for_titles'])

In [None]:
# melt dataframe to change the shape: one row per person per known-for title slot
imdb_names_melt = imdb_names_small.melt(
    id_vars=['primary_name', 'nconst', 'primary_profession'],
    value_vars=['known_for_titles_' + str(slot) for slot in range(1, 7)],
)

In [None]:
# drop the melt bookkeeping column, discard rows with any missing value,
# renumber the rows from zero and name the melted values column
imdb_names_melt = (
    imdb_names_melt
    .drop(columns='variable')
    .dropna()
    .reset_index(drop=True)
    .rename(columns={'value': 'movie_titles'})
)

In [None]:
# the melted values are title ids, so call the column tconst and index by it
imdb_names_melt = (
    imdb_names_melt
    .rename(columns={'movie_titles': 'tconst'})
    .set_index('tconst')
)
imdb_names_melt.head()

In [None]:
# export file
# Step 3 reloads this exact path with index_col='tconst', so the export has to run
# for the notebook to work from a fresh kernel; the tconst index is written as the first column
imdb_names_melt.to_csv('data/imdb_names_update')

### Clean imdb_title_basics_df

In [None]:
imdb_title_basics = pd.read_csv('data/imdb.title.basics.csv.gz')
# restrict movies between 2010-2018 (both bounds inclusive; the previous `< 2019` test had
# no lower bound). .copy() makes the filtered frame independent so the genre columns
# assigned a few cells below do not raise SettingWithCopyWarning.
imdb_title_basics = imdb_title_basics.loc[imdb_title_basics['start_year'].between(2010, 2018)].copy()
imdb_title_basics.head(7)

In [None]:
# one column per comma-separated genre
imdb_gen_expand = imdb_title_basics['genres'].str.split(pat=",", expand=True)

In [None]:
# copy the first three split genre columns back as genres_1 .. genres_3
for slot in range(3):
    imdb_title_basics['genres_' + str(slot + 1)] = imdb_gen_expand[slot]
#imdb_title_basics.drop('genres', axis=1, inplace=True)

In [None]:
# index the titles table by its tconst id
imdb_title_basics = imdb_title_basics.set_index('tconst')

In [None]:
# Step 3 reloads this exact path with index_col='tconst', so the export has to run
# for the notebook to work from a fresh kernel
imdb_title_basics.to_csv('data/imdb_titles')

### Clean imdb_title_principals_df & imdb_title_ratings_df

No cleaning required for these two datasets

# Step 3: Exploratory Data Analysis
***

In [None]:
# Import Clean Datasets
# The first two paths match the to_csv export lines in the Step 2 cleaning cells; both
# tables are indexed by tconst on load.
# NOTE(review): no visible cell writes 'data/tn_movies_bud_update' — confirm how that file
# is produced before relying on a fresh Restart & Run All.
imdb_names = pd.read_csv('data/imdb_names_update', index_col='tconst')
imdb_titles = pd.read_csv('data/imdb_titles', index_col='tconst')
imdb_tn_budgets = pd.read_csv('data/tn_movies_bud_update')
# the principals and ratings tables are read straight from the raw gzip files
imdb_title_principals_df = pd.read_csv('data/imdb.title.principals.csv.gz')
imdb_ratings = pd.read_csv('data/imdb.title.ratings.csv.gz')

In [None]:
# Drop the 'Unnamed: 0' column from the tn_movie_budget dataframe
imdb_tn_budgets = imdb_tn_budgets.drop(columns='Unnamed: 0')

In [None]:
# index the principals table by tconst so it can be joined on that key
imdb_title_principals_df = imdb_title_principals_df.set_index('tconst')

#### Join imdb_titles and imdb_title_principals dataframes

In [None]:
# inner join on the shared tconst index: only titles present in both tables survive
imdb_titles_names = imdb_titles.join(other=imdb_title_principals_df, how='inner')
imdb_titles_names.head()

In [None]:
# move tconst from the index back into an ordinary column
imdb_titles_names = imdb_titles_names.reset_index()

In [None]:
# keep only the rows whose category is 'director'
imdb_title_name_directors = imdb_titles_names.loc[imdb_titles_names['category'].eq('director')]

In [None]:
# check hed of new directors only dataset
imdb_title_name_directors.head()

#### Join imdb_titles_names dataframe with imdb_ratings

In [None]:
# index both the ratings table and the directors-only table by tconst for the join
imdb_ratings = imdb_ratings.set_index('tconst')
imdb_title_name_directors = imdb_title_name_directors.set_index('tconst')

In [None]:
# inner join on the tconst index: director rows that also have a ratings row
imdb_title_ratings = imdb_title_name_directors.join(other=imdb_ratings, how='inner')

### Data Visualization

#### Analysis on movie genres

In [None]:
# check the number of genres in the dataset
imdb_title_ratings.genres_1.value_counts()

In [None]:
# create directors only dataset
# NOTE(review): this repeats the directors-only filter from the earlier join section and
# rebinds the name to a frame rebuilt from imdb_titles_names (tconst is a column there,
# not the index); no visible cell below reads it — confirm whether this cell is still needed.
imdb_title_name_directors = imdb_titles_names[imdb_titles_names['category']=='director']

In [None]:
# move tconst from the index back into an ordinary column
imdb_title_ratings = imdb_title_ratings.reset_index()

In [None]:
# look at the median IMDB ratings value for all genres
# numeric_only=True limits the aggregation to numeric columns; without it recent pandas
# raises TypeError on text columns such as nconst and category instead of dropping them
imdb_title_ratings.groupby('genres_1').median(numeric_only=True).sort_values('averagerating')

In [None]:
# scatter of average IMDB rating against number of votes, one panel per primary genre
sns.set_context('notebook')
g2 = sns.relplot(data=imdb_title_ratings,
                 x='numvotes',
                 y='averagerating',
                 kind='scatter',
                 hue='genres_1',
                 col='genres_1',
                 col_wrap=4,
                 height=3,
                 legend=None)

axes2 = g2.axes.flatten()

# axes2[0].set_title("Internal")
# axes2[1].set_title("Internal")
# axes2[6].set_title("Internal")

# label both axes on every panel
for ax2 in axes2:
    ax2.set_ylabel("Average Movie Rating (IMDB)")
    ax2.set_xlabel("Number of Votes")


# g2.savefig('/Users/markishab/Downloads/fig_genres_pop.png')

In [None]:
# Choose non-trivial categories to perform analysis on.
# 'Family' stays in the set: the previous exclusion list spelled it 'Famiily', which
# presumably matched no rows, and the box plot below lists 'Family' among its genres.
excluded_genres = ['Documentary', 'Mystery', 'Romance', 'History', 'Music', 'Musical']
# .copy() gives an independent frame so the `color` column assigned in the next cell
# does not trigger SettingWithCopyWarning
imdb_genre_set = imdb_title_ratings.loc[~imdb_title_ratings['genres_1'].isin(excluded_genres)].copy()

In [None]:
# flag column: the string '1' when the primary genre is one of the three listed, else '0'
imdb_genre_set['color'] = np.where(
    imdb_genre_set['genres_1'].isin(['Biography', 'Animation', 'Adventure']),
    '1',
    '0',
)

In [None]:
# Box plot of IMDB ratings per primary genre; the `color` flag column drives the hue,
# so flagged and unflagged genres are drawn in the two palette colours.
color_palette = ['#737373', '#FFB900']
plt.figure(figsize=(15,10))
sns.set_context('poster')
sns.boxplot(x='averagerating',
            y='genres_1',
            order=['Biography', 'Animation', 'Adventure', 'Comedy', 'Crime',
                   'Drama', 'Action', 'Fantasy', 'Family', 'Horror'],
            data=imdb_genre_set,
            hue='color',
            palette=color_palette)
plt.title("AVERAGE MOVIE RATINGS BY GENRE", loc='center', y=1.08, fontweight="bold")
plt.xlabel('Average Movie Rating (IMDB)')
plt.ylabel('Movie Genres')
# The hue legend only repeats the '0'/'1' flag, so remove it. A single call is enough:
# the former plt.legend(loc=False) passed an undocumented `loc` value and built a
# legend that was replaced straight away.
plt.legend().remove()
sns.despine()
# plt.savefig('/Users/markishab/Downloads/fig_genres.png')

#### Analysis on directors

In [None]:
imdb_title_ratings[imdb_title_ratings['category']=='director']

In [None]:
# find the average movie rating by director. This is one of the summary tables
# numeric_only=True limits the mean to numeric columns; without it recent pandas raises
# TypeError on text columns such as genres_1 instead of silently dropping them
avg_ratings_director = (
    imdb_title_ratings
    .groupby(['nconst', 'category'])
    .mean(numeric_only=True)
    .sort_values(by='averagerating', ascending=False)
    .drop(['start_year', 'runtime_minutes', 'numvotes'], axis=1)
)
avg_ratings_director.head()

In [None]:
# flatten the (nconst, category) index, then index by nconst alone
avg_ratings_director = avg_ratings_director.reset_index().set_index('nconst')

In [None]:
# create a new dataframe for number of movies
# Name the counts column explicitly: pandas >= 2.0 calls the value_counts() result
# 'count' rather than 'nconst', so a later rename keyed on 'nconst' would leave the
# frame without the 'num_movies' column that the prolific-director filter reads.
number_movies = imdb_title_ratings.nconst.value_counts().to_frame(name='num_movies')

In [None]:
# rename the counts column from 'nconst' to 'num_movies'
number_movies = number_movies.rename(columns={'nconst': 'num_movies'})

In [None]:
# join avg_ratings_director and number_movies; how='right' keeps every row of number_movies
director_scatter = avg_ratings_director.join(other=number_movies, how='right')

In [None]:
# move the index back into a column and take a look at the new dataframe
director_scatter = director_scatter.reset_index()
director_scatter.head()

In [None]:
# directors with more than four rated movies, best average rating first;
# reset_index() keeps the old row labels in an 'index' column
director_prolific_df = (
    director_scatter
    .loc[director_scatter['num_movies'] > 4]
    .sort_values('averagerating', ascending=False)
    .reset_index()
)

In [None]:
# drop the leftover 'index' column
director_prolific_df = director_prolific_df.drop(columns='index')

In [None]:
# restrict to the first 20 rows, i.e. the 20 best-rated prolific directors
director_prolific_df2 = director_prolific_df.iloc[:20]

In [None]:
# movies by eleven hand-picked director ids, ordered by director id
prolific_director_ids = [
    'nm3167849', 'nm2223858', 'nm0517665', 'nm1126245', 'nm1103151', 'nm0926911',
    'nm2050348', 'nm2833230', 'nm1006692', 'nm1355731', 'nm2611428',
]
director_prolific_movies = (
    imdb_title_ratings[imdb_title_ratings['nconst'].isin(prolific_director_ids)]
    .sort_values('nconst')
)

In [None]:
# per-director maximum of every column (so the highest averagerating for each director),
# with nconst moved back into a column
director_rating_max = director_prolific_movies.groupby('nconst').max().reset_index()
director_rating_max.head()

In [None]:
# per-director minimum of every column (so the lowest averagerating for each director),
# with nconst moved back into a column
director_rating_min = director_prolific_movies.groupby('nconst').min().reset_index()
director_rating_min.head()

In [None]:
## Source: https://python-graph-gallery.com/184-lollipop-plot-with-2-groups/
# Lollipop plot: one horizontal line per director running from the lowest to the
# highest averagerating, with a marker at each end.
sns.set_context('poster')
plt.figure(figsize=(15,10))
# y positions 1..N, one per row of director_rating_max
my_range=range(1,len(director_rating_max.index)+1)

plt.hlines(y=my_range, xmin=director_rating_min['averagerating'], 
           xmax=director_rating_max['averagerating'], color='grey', alpha=0.4)
plt.scatter(director_rating_min['averagerating'], my_range, color='#FFB900', alpha=1, label='Lowest IMDB Rating')
plt.scatter(director_rating_max['averagerating'], my_range, color='#737373', alpha=1 , label='Highest IMDB Rating')
plt.legend()
sns.despine()

# Add title and axis names
plt.yticks(my_range, director_rating_max['nconst'])
plt.title("Movie rating range by director between 2010 and 2018", loc='left', y=1.08, fontweight="bold")
plt.xlabel('IMDB Movie Rating Range')
plt.ylabel('Directors')
# labels=['Yasmine Asha', 'Nick Rosen', 'Dawn Gifford Engle', 'Steve Ravic', 
#  'Peter Mortimer', 'Josh Lowell', 'Branko Istvancic', 'Carlos Nader', 
#  'Dick Carruthers', 'Nick Wickham', 'Tom Logan']

# plt.set_yticklabels(labels=labels)

# Save next to the notebook: the previous absolute path pointed into one user's home
# directory and fails on any machine where that directory does not exist.
plt.savefig('fig_directors_prolific.png')