In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from itertools import combinations
from collections import Counter


In [2]:
data = pd.read_csv('data.csv')
data.head(5)

Unnamed: 0,imdb_id,popularity,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year
0,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,The park is open.,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/2015,5562,6.5,2015
1,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,What a Lovely Day.,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/2015,6185,7.1,2015
2,tt2908446,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,One Choice Can Destroy You,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/2015,2480,6.3,2015
3,tt2488496,11.173104,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,Every generation has a story.,Thirty years after defeating the Galactic Empi...,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,12/15/2015,5292,7.5,2015
4,tt2820852,9.335014,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,Vengeance Hits Home,Deckard Shaw seeks revenge against Dominic Tor...,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,4/1/2015,2947,7.3,2015


In [3]:
len(data)

1890

In [4]:
data.columns # запрашиваем перечень колонок в компактном виде для удобства работы

Index(['imdb_id', 'popularity', 'budget', 'revenue', 'original_title', 'cast',
       'director', 'tagline', 'overview', 'runtime', 'genres',
       'production_companies', 'release_date', 'vote_count', 'vote_average',
       'release_year'],
      dtype='object')

In [5]:
# Вопрос 1. Максимальный бюджет
data[data['budget']==data.budget.max()] 

Unnamed: 0,imdb_id,popularity,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year
491,tt1032751,0.25054,425000000,11087569,The Warrior's Way,Kate Bosworth|Jang Dong-gun|Geoffrey Rush|Dann...,Sngmoo Lee,Assassin. Hero. Legend.,An Asian assassin (Dong-gun Jang) is forced to...,100,Adventure|Fantasy|Action|Western|Thriller,Boram Entertainment Inc.,12/2/2010,74,6.4,2010


In [6]:
# Вопрос 2. Самый продолжительный фильм
data[data['runtime'] == data.runtime.max()] 

Unnamed: 0,imdb_id,popularity,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year
1158,tt0279111,0.469518,56000000,12923936,Gods and Generals,Stephen Lang|Jeff Daniels|Robert Duvall|Kevin ...,Ronald F. Maxwell,The nations heart was touched by...,The film centers mostly around the personal an...,214,Drama|History|War,Turner Pictures|Antietam Filmworks,2/21/2003,23,5.8,2003


In [7]:
# Вопрос 3. Самый короткий фильм
data[data['runtime'] == data.runtime.min()] 

Unnamed: 0,imdb_id,popularity,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year
769,tt1449283,1.425344,30000000,14460000,Winnie the Pooh,Jim Cummings|Travis Oates|Jim Cummings|Bud Luc...,Stephen Anderson|Don Hall,Oh Pooh.,"During an ordinary day in Hundred Acre Wood, W...",63,Animation|Family,Walt Disney Pictures|Walt Disney Animation Stu...,4/13/2011,174,6.8,2011


In [8]:
# Вопрос 4. Средняя продолжительность фильма в датасете
data.runtime.mean() 

109.65343915343915

In [9]:
# Вопрос 5. Медианная продолжительность фильма в датасете
data.runtime.median() 

106.5

In [10]:
data['profit'] = data['revenue'] - data ['budget'] # Добавление столбца profit - прибыль от фильмов
data.head()

Unnamed: 0,imdb_id,popularity,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,profit
0,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,The park is open.,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/2015,5562,6.5,2015,1363528810
1,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,What a Lovely Day.,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/2015,6185,7.1,2015,228436354
2,tt2908446,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,One Choice Can Destroy You,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/2015,2480,6.3,2015,185238201
3,tt2488496,11.173104,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,Every generation has a story.,Thirty years after defeating the Galactic Empi...,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,12/15/2015,5292,7.5,2015,1868178225
4,tt2820852,9.335014,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,Vengeance Hits Home,Deckard Shaw seeks revenge against Dominic Tor...,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,4/1/2015,2947,7.3,2015,1316249360


In [11]:
# Вопрос 6. Самый прибыльный фильм
data[data['profit'] == data.profit.max()]  

Unnamed: 0,imdb_id,popularity,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,profit
239,tt0499549,9.432768,237000000,2781505847,Avatar,Sam Worthington|Zoe Saldana|Sigourney Weaver|S...,James Cameron,Enter the World of Pandora.,"In the 22nd century, a paraplegic Marine is di...",162,Action|Adventure|Fantasy|Science Fiction,Ingenious Film Partners|Twentieth Century Fox ...,12/10/2009,8458,7.1,2009,2544505847


In [12]:
# Вопрос 7. Самый убыточный фильм
data[data['profit'] == data.profit.min()] 

Unnamed: 0,imdb_id,popularity,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,profit
491,tt1032751,0.25054,425000000,11087569,The Warrior's Way,Kate Bosworth|Jang Dong-gun|Geoffrey Rush|Dann...,Sngmoo Lee,Assassin. Hero. Legend.,An Asian assassin (Dong-gun Jang) is forced to...,100,Adventure|Fantasy|Action|Western|Thriller,Boram Entertainment Inc.,12/2/2010,74,6.4,2010,-413912431


In [13]:
# Вопрос 8. Количество прибыльных фильмов в датасете
data[data['profit']>0].profit.count() 

1478

In [14]:
# Вопрос 9 по версии платформы. Самый кассовый фильм

data_2008 = data[data['release_year'] == 2008] # Создание датафрейма с фильмами 2008
data_2008[data_2008['revenue'] == data_2008.revenue.max()] 


Unnamed: 0,imdb_id,popularity,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,profit
600,tt0468569,8.466668,185000000,1001921825,The Dark Knight,Christian Bale|Michael Caine|Heath Ledger|Aaro...,Christopher Nolan,Why So Serious?,Batman raises the stakes in his war on crime. ...,152,Drama|Action|Crime|Thriller,DC Comics|Legendary Pictures|Warner Bros.|Syncopy,7/16/2008,8432,8.1,2008,816921825


In [15]:
# Вопрос 10. Самый убыточный фильм за 2012-2014

# Создание датафрейма с фильмами в 2012-2014
data_1214 = data[(data['release_year'] > 2011)&(data['release_year'] < 2014)] 

data_1214[data_1214['profit'] == data_1214.profit.min()]   

Unnamed: 0,imdb_id,popularity,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,profit
1246,tt1210819,1.21451,255000000,89289910,The Lone Ranger,Johnny Depp|Armie Hammer|William Fichtner|Hele...,Gore Verbinski,Never Take Off the Mask,The Texas Rangers chase down a gang of outlaws...,149,Action|Adventure|Western,Walt Disney Pictures|Jerry Bruckheimer Films|I...,7/3/2013,1607,6.0,2013,-165710090


In [16]:
# Вопрос 11. Фильмов какого жанра больше всего
pd.DataFrame(data.genres.str.split('|').tolist()).stack().value_counts() 

Drama              782
Comedy             683
Thriller           597
Action             583
Adventure          416
Crime              315
Romance            308
Family             260
Science Fiction    248
Fantasy            223
Horror             176
Mystery            168
Animation          139
Music               64
History             62
War                 58
Western             20
Documentary          8
Foreign              2
dtype: int64

In [17]:
# Вопрос 12. Какого жанра прибыльных фильмов больше всего
pd.DataFrame(data[data['profit'] > 0].genres.str.split('|').tolist()).stack().value_counts()


Drama              560
Comedy             551
Thriller           446
Action             444
Adventure          337
Romance            242
Crime              231
Family             226
Science Fiction    195
Fantasy            188
Horror             150
Animation          120
Mystery            119
Music               47
History             46
War                 41
Western             12
Documentary          7
dtype: int64

In [18]:
# Вопрос 13. Кто из режиссёров снял больше всего фильмов
pd.DataFrame(data.director.str.split('|').tolist()).stack().value_counts() 

Steven Soderbergh    13
Clint Eastwood       12
Ridley Scott         12
Robert Rodriguez     11
Shawn Levy           10
                     ..
Trey Parker           1
Gary McKendry         1
James Gray            1
Michael Cuesta        1
Mennan Yapo           1
Length: 998, dtype: int64

In [19]:
# Вопрос 14. Кто из режиссёров снял больше всего прибыльных фильмов
pd.DataFrame(data[data['profit'] > 0].director.str.split('|').tolist()).stack().value_counts()


Ridley Scott             12
Clint Eastwood           10
Steven Soderbergh        10
Steven Spielberg         10
Shawn Levy                9
                         ..
Daniel Espinosa           1
Matt Bettinelli-Olpin     1
Fede Alvarez              1
Leigh Whannell            1
Mennan Yapo               1
Length: 814, dtype: int64

In [20]:
'''Решение вопроса 15 через объединение таблиц
Создание датафрейма с режиссёрами, указанными по одному, и соответствующими им индексами основного датафрейма'''
data_dir = pd.DataFrame(data.director.str.split('|').tolist()).stack().reset_index()
data_dir.drop(['level_1'], axis='columns', inplace=True)
data_dir.columns = ['level_0', 'dir_solo']
data_dir.head()

Unnamed: 0,level_0,dir_solo
0,0,Colin Trevorrow
1,1,George Miller
2,2,Robert Schwentke
3,3,J.J. Abrams
4,4,James Wan


In [21]:
# Присвоение индексам основного датафрейма имени и объединение по индексам
data.index.name = 'level_0'
data_merge = data_dir.merge(data, on='level_0', how='left')
data_merge.head()

Unnamed: 0,level_0,dir_solo,imdb_id,popularity,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,profit
0,0,Colin Trevorrow,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,The park is open.,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/2015,5562,6.5,2015,1363528810
1,1,George Miller,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,What a Lovely Day.,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/2015,6185,7.1,2015,228436354
2,2,Robert Schwentke,tt2908446,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,One Choice Can Destroy You,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/2015,2480,6.3,2015,185238201
3,3,J.J. Abrams,tt2488496,11.173104,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,Every generation has a story.,Thirty years after defeating the Galactic Empi...,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,12/15/2015,5292,7.5,2015,1868178225
4,4,James Wan,tt2820852,9.335014,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,Vengeance Hits Home,Deckard Shaw seeks revenge against Dominic Tor...,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,4/1/2015,2947,7.3,2015,1316249360


In [22]:
# Ответ на вопрос №15. Режиссёр, который принёс больше всего прибыли
data_group = data_merge.groupby(['dir_solo'])['profit'].sum().sort_values(ascending=False)
data_group.head()

dir_solo
Peter Jackson        5202593685
David Yates          3379295625
Christopher Nolan    3162548502
J.J. Abrams          2839169916
Michael Bay          2760938960
Name: profit, dtype: int64

In [23]:
#Вопрос 16. Какой актёр принёс больше всего прибыли
cnt = Counter()
for index, row in data.iterrows():
    names = row.cast.split(sep='|')
    for name in names:
        cnt[name]+=row.profit   
display(cnt.most_common(5))

[('Emma Watson', 6666245597),
 ('Daniel Radcliffe', 6514990281),
 ('Rupert Grint', 6408638290),
 ('Ian McKellen', 6087375777),
 ('Robert Downey Jr.', 5316030161)]

In [25]:
# Вопрос 17. Какой актёр принёс меньше всего прибыли в 2012 году
cnt1 = Counter()
for index, row in data[data['release_year'] == 2012].iterrows():
    names = row.cast.split(sep='|')
    for name in names:
        cnt1[name]+=row.profit   
display(cnt1.most_common())

[('Chris Hemsworth', 1542450773),
 ('Denis Leary', 1319460639),
 ('Robert Downey Jr.', 1299557910),
 ('Chris Evans', 1299557910),
 ('Mark Ruffalo', 1299557910),
 ('Scarlett Johansson', 1299557910),
 ('Anne Hathaway', 1211851057),
 ('Sacha Baron Cohen', 1097110577),
 ('Ralph Fiennes', 1059561013),
 ('Seann William Scott', 967234366),
 ('Kristen Stewart', 919384318),
 ('Daniel Craig', 908561013),
 ('Judi Dench', 908561013),
 ('Javier Bardem', 908561013),
 ('Naomie Harris', 908561013),
 ('Josh Hutcherson', 872710235),
 ('Tom Hardy', 858717867),
 ('Christian Bale', 831041287),
 ('Michael Caine', 831041287),
 ('Gary Oldman', 831041287),
 ('Jennifer Lawrence', 800949406),
 ('Ray Romano', 782244782),
 ('Chris Wedge', 782244782),
 ('John Leguizamo', 779138503),
 ('Ian McKellen', 767003568),
 ('Martin Freeman', 767003568),
 ('Richard Armitage', 767003568),
 ('Andy Serkis', 767003568),
 ('Cate Blanchett', 767003568),
 ('Emma Thompson', 753009983),
 ('Taylor Lautner', 709000000),
 ('Peter Facinel

In [26]:
# Вопрос 18. Какой актёр снялся в большем количестве высокобюджетных фильмов
pd.DataFrame(data[data['budget'] > data.budget.mean()]
             .cast.str.split('|').tolist()).stack().value_counts()

Matt Damon           18
Adam Sandler         17
Angelina Jolie       16
Tom Cruise           15
Samuel L. Jackson    15
                     ..
Harland Williams      1
Bob Peterson          1
Matt Jones            1
Mark Consuelos        1
Paul Anderson         1
Length: 1508, dtype: int64

In [27]:
# Вопрос 19. В фильмах какого жанра чаще всего снимался Nicolas Cage
pd.DataFrame(data[data.cast.str.contains('Nicolas Cage')].genres
             .str.split('|').tolist()).stack().value_counts()


Action             17
Thriller           15
Drama              12
Crime              10
Fantasy             8
Adventure           7
Comedy              6
Science Fiction     4
Mystery             3
Animation           3
Family              3
History             2
Horror              1
War                 1
Romance             1
dtype: int64

In [28]:
#Вопрос 20. Какая студия сняла больше всего фильмов
pd.DataFrame(data.production_companies.str.split('|').tolist()).stack().value_counts()


Universal Pictures                        173
Warner Bros.                              168
Paramount Pictures                        122
Columbia Pictures                         117
Twentieth Century Fox Film Corporation    109
                                         ... 
Matinee Pictures                            1
AR-TL                                       1
Schroeder Hoffman Productions               1
Nightfall Productions                       1
Xingu Films                                 1
Length: 1772, dtype: int64

In [29]:
# Вопрос 21. Какая студия сняла больше всего фильмов в 2015
pd.DataFrame(data[data['release_year'] == 2015].production_companies
             .str.split('|').tolist()).stack().value_counts()


Warner Bros.                              12
Universal Pictures                        10
Twentieth Century Fox Film Corporation     8
Columbia Pictures                          7
Paramount Pictures                         7
                                          ..
Duperele Films                             1
M6 Films                                   1
Moonlighting Films                         1
Bazelevs Production                        1
Blinding Edge Pictures                     1
Length: 246, dtype: int64

In [30]:
# Вопрос 22. Какая студия заработала больше всего в жанре комедий
cnt2 = Counter()
for index, row in data[data.genres.str.contains('Comedy')].iterrows():
    prods = row.production_companies.split(sep='|')
    for prod in prods:
        cnt2[prod]+=row.profit
display(cnt2.most_common(5))


[('Universal Pictures', 8961545581),
 ('Walt Disney Pictures', 7669710326),
 ('Twentieth Century Fox Film Corporation', 5686960294),
 ('Columbia Pictures', 5646343696),
 ('DreamWorks Animation', 4789049764)]

In [31]:
# Вопрос 23. Какая студия заработала больше всего в 2012
cnt3 = Counter()
for index, row in data[data['release_year'] == 2012].iterrows():
    names = row.production_companies.split(sep='|')
    for name in names:
        cnt3[name]+=row.profit   
display(cnt3.most_common(5))


[('Columbia Pictures', 2501406608),
 ('Universal Pictures', 1981011579),
 ('Marvel Studios', 1299557910),
 ('Warner Bros.', 1258020056),
 ('Relativity Media', 1032593938)]

In [32]:
# Вопрос 24. Самый убыточный фильм от Paramount Pictures
data[data.profit == data[data.production_companies.str.contains('Paramount Pictures')].profit.min()]


Unnamed: 0_level_0,imdb_id,popularity,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,profit
level_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
926,tt0267626,0.72233,100000000,35168966,K-19: The Widowmaker,Harrison Ford|Liam Neeson|Peter Sarsgaard|Joss...,Kathryn Bigelow,Fate has found its hero.,When Russia's first nuclear submarine malfunct...,138,Thriller|Drama|History,Paramount Pictures|Intermedia Films|National G...,7/19/2002,146,6.0,2002,-64831034


In [33]:
# Вопрос 25. Самый прибыльный год
display(pd.DataFrame(data.groupby(['release_year'])['profit'].sum())
        .sort_values('profit', ascending=False))


Unnamed: 0_level_0,profit
release_year,Unnamed: 1_level_1
2015,18668572378
2014,16397812953
2012,16077001687
2013,15243179791
2011,14730241341
2009,13423744372
2010,13117292530
2008,11663881990
2007,11565911801
2004,9634180720


In [34]:
# Вопрос 26. Самый прибыльный год для Warner Broth
x = pd.DataFrame(data[data.production_companies.str.contains('Warner Bros')]
                 .groupby(['release_year'])['profit'].sum()).sort_values('profit', ascending=False)
display(x)


Unnamed: 0_level_0,profit
release_year,Unnamed: 1_level_1
2014,2295464519
2007,2201675217
2008,2134595031
2010,1974712985
2011,1871393682
2003,1855493377
2009,1822454136
2013,1636453400
2004,1631933725
2005,1551980298


In [35]:
# Вопрос 27. В каком месяце выпущено больше фильмов
data['month'] = data.release_date.apply(lambda x: x.split('/')[0]) #добавление столбца с месяцами
data['month'].value_counts()


9     227
12    191
10    186
8     161
3     156
4     149
6     147
11    146
7     142
5     140
2     135
1     110
Name: month, dtype: int64

In [36]:
# Вопрос 28. Сколько фильмов вышло суммарно летом
len(data.query('month in ["6", "7", "8"]'))

450

In [37]:
# Вопрос 29. Какой режиссёр выпустил больше фильмов зимой
data_winter = data.query('month in ["1", "2", "12"]') # Датафрейм с зимними фильмами
pd.DataFrame(data_winter.director.str.split('|').tolist()).stack().value_counts()

Peter Jackson        7
Clint Eastwood       6
Steven Soderbergh    6
Martin Scorsese      4
Shawn Levy           4
                    ..
Brian Robbins        1
Jesse Dylan          1
MÃ¥ns MÃ¥rlind       1
Guy Ferland          1
Wayne Kramer         1
Length: 359, dtype: int64

In [38]:
# Вопрос 30. Какой месяц по годам самый прибыльный
# Создание анализ сводной таблицы
pivot = pd.DataFrame(data.pivot_table(values=['profit'],
index=['month'],
columns=['release_year'],
aggfunc='sum',
margins=True))
pivot.drop(['All'], inplace=True)
pivot.idxmax(axis=0).value_counts()

6     8
5     4
12    4
7     1
dtype: int64

In [39]:
# Вопрос 31. У какой студии в среднем самый длинный фильм по количеству символов
data_len = pd.DataFrame(data['original_title']).original_title.str.len().reset_index() #Новый датафрейм
data.index.name = 'level_0' # Присвоение имени столбцу индексов основного датафрейма
data_len.columns = ['level_0', 'title_len']
data_len = data_len.merge(data['original_title'], on='level_0') # Добавление столбца с наименованиями фильмов
data_prod = pd.DataFrame(data['production_companies'].str.split('|').tolist()).stack().reset_index()
data_prod.columns = ['level_0', 'level_1', 'production_companies'] # Датафрейм со студиями
data_prod.drop(['level_1'], axis='columns', inplace=True)
data_len = data_len.merge(data_prod, on='level_0', how='right') #Объединение двух новых датафреймов 
data_len.groupby(['production_companies']).title_len.mean().sort_values(ascending=False)


production_companies
Four By Two Productions       83.0
Jim Henson Company, The       59.0
Dos Corazones                 47.0
Polsky Films                  46.0
Museum Canada Productions     46.0
                              ... 
Everest Entertainment          3.0
Berlanti Productions           3.0
XM2 Productions                2.0
Ixtlan Productions             2.0
Global Entertainment Group     2.0
Name: title_len, Length: 1772, dtype: float64

In [40]:
# Вопрос 32. У какой студии в среднем самый длинный фильм по количеству слов
words = []
for i in data_len['original_title'].str.split().tolist():
    words.append(len(i))
data_len['words'] = words
data_len.groupby(['production_companies'])['words'].mean().sort_values(ascending=False)

production_companies
Four By Two Productions                  12.0
Jim Henson Company, The                  10.0
Polsky Films                              9.0
The Saul Zaentz Company                   9.0
Dos Corazones                             9.0
                                         ... 
Many Rivers Productions                   1.0
Maple Shade Films                         1.0
Blue Productions                          1.0
Blue Sea Productions                      1.0
Great Wight Productions/ Osford Films     1.0
Name: words, Length: 1772, dtype: float64

In [41]:
# Вопрос 33. Сколько слов без учёта регистра в наименованиях
words = set()
for i in data['original_title'].to_list():
    for j in i.lower().split():
        words.add(j)
print(len(words))

2461


In [42]:
# Вопрос 34. Фильмы в 1% лучших по рейтингу
ref = np.percentile(data['vote_average'], q=[0,99,100])
data[data['vote_average']>ref[1]]

Unnamed: 0_level_0,imdb_id,popularity,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,profit,month
level_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
9,tt2096673,6.326804,175000000,853708609,Inside Out,Amy Poehler|Phyllis Smith|Richard Kind|Bill Ha...,Pete Docter,Meet the little voices inside your head.,"Growing up can be a bumpy road, and it's no ex...",94,Comedy|Animation|Family,Walt Disney Pictures|Pixar Animation Studios|W...,6/9/2015,3935,8.0,2015,678708609,6
34,tt3170832,3.557846,6000000,35401758,Room,Brie Larson|Jacob Tremblay|Joan Allen|Sean Bri...,Lenny Abrahamson,Love knows no boundaries,Jack is a young boy of 5 years old who has liv...,117,Drama|Thriller,Element Pictures|No Trace Camping|A24|Duperele...,10/16/2015,1520,8.0,2015,29401758,10
118,tt0816692,24.949134,165000000,621752480,Interstellar,Matthew McConaughey|Jessica Chastain|Anne Hath...,Christopher Nolan,Mankind was born on Earth. It was never meant ...,Interstellar chronicles the adventures of a gr...,169,Adventure|Drama|Science Fiction,Paramount Pictures|Legendary Pictures|Warner B...,11/5/2014,6498,8.0,2014,456752480,11
119,tt2015381,14.311205,170000000,773312399,Guardians of the Galaxy,Chris Pratt|Zoe Saldana|Dave Bautista|Vin Dies...,James Gunn,All heroes start somewhere.,"Light years from Earth, 26 years after being a...",121,Action|Science Fiction|Adventure,Marvel Studios|Moving Picture Company (MPC)|Bu...,7/30/2014,5612,7.9,2014,603312399,7
125,tt2084970,8.110711,14000000,233555708,The Imitation Game,Benedict Cumberbatch|Keira Knightley|Matthew G...,Morten Tyldum,The true enigma was the man who cracked the code.,Based on the real life story of legendary cryp...,113,History|Drama|Thriller|War,Black Bear Pictures|Bristol Automotive,11/14/2014,3478,8.0,2014,219555708,11
128,tt2267998,6.438727,61000000,369330363,Gone Girl,Ben Affleck|Rosamund Pike|Carrie Coon|Neil Pat...,David Fincher,You don't know what you've got 'til it's...,With his wife's disappearance having become th...,145,Mystery|Thriller|Drama,Twentieth Century Fox Film Corporation|Regency...,10/1/2014,3720,7.9,2014,308330363,10
138,tt2278388,4.93082,30000000,174600318,The Grand Budapest Hotel,Ralph Fiennes|Tony Revolori|F. Murray Abraham|...,Wes Anderson,A perfect holiday without leaving home.,The Grand Budapest Hotel tells of a legendary ...,99,Comedy|Drama,Fox Searchlight Pictures|Scott Rudin Productio...,2/26/2014,2802,7.9,2014,144600318,2
370,tt1375666,9.363643,160000000,825500000,Inception,Leonardo DiCaprio|Joseph Gordon-Levitt|Ellen P...,Christopher Nolan,Your mind is the scene of the crime.,"Cobb, a skilled thief who commits corporate es...",148,Action|Thriller|Science Fiction|Mystery|Adventure,Legendary Pictures|Warner Bros.|Syncopy,7/14/2010,9767,7.9,2010,665500000,7
600,tt0468569,8.466668,185000000,1001921825,The Dark Knight,Christian Bale|Michael Caine|Heath Ledger|Aaro...,Christopher Nolan,Why So Serious?,Batman raises the stakes in his war on crime. ...,152,Drama|Action|Crime|Thriller,DC Comics|Legendary Pictures|Warner Bros.|Syncopy,7/16/2008,8432,8.1,2008,816921825,7
873,tt0253474,2.364204,35000000,120072577,The Pianist,Adrien Brody|Thomas Kretschmann|Frank Finlay|M...,Roman Polanski,Music was his passion. Survival was his master...,The Pianist is a film adapted from the biograp...,150,Drama|War,Bac Films|Canal+Polska|Heritage Films|Studio B...,9/24/2002,938,7.9,2002,85072577,9


In [44]:
# Вопрос 35. Какие актёры чаще всего снимаются вместе
cnt4 = Counter()
for index, row in data.iterrows():
    casting = row.cast.split('|')
    pairs = list(combinations(casting,2))
    for pair in pairs:
        cnt4[pair]+=1   
display(cnt4.most_common(5))


[(('Daniel Radcliffe', 'Rupert Grint'), 8),
 (('Daniel Radcliffe', 'Emma Watson'), 8),
 (('Rupert Grint', 'Emma Watson'), 7),
 (('Ben Stiller', 'Owen Wilson'), 6),
 (('Johnny Depp', 'Helena Bonham Carter'), 6)]

In [45]:
#Вопрос 36 по версии платформы. У какого режиссёра самый высокий процент фильмов со сборами выше бюджета
data_dir = pd.DataFrame(pd.DataFrame(data.director.str.split('|').tolist()).stack().value_counts())
data_dir.columns = ['film']
data_dir_plus = pd.DataFrame(pd.DataFrame(data[data['profit'] > 0].director.str.split('|').tolist()).stack().value_counts())
data_dir_plus.columns = ['profitable']
data_dir_plus = data_dir_plus.merge(data_dir, left_index=True, right_index=True, how='left')
data_dir_plus['percent'] = data_dir_plus['profitable'] / data_dir_plus['film']*100
display(data_dir_plus['percent'].nlargest(15))

Ridley Scott         100.0
Steven Spielberg     100.0
Tim Burton           100.0
Brett Ratner         100.0
Peter Jackson        100.0
Michael Bay          100.0
Antoine Fuqua        100.0
Christopher Nolan    100.0
Zack Snyder          100.0
Adam McKay           100.0
Joel Coen            100.0
Louis Leterrier      100.0
Sam Raimi            100.0
David Fincher        100.0
Andy Fickman         100.0
Name: percent, dtype: float64