# Импорт библиотек и датасета

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

In [3]:
data = pd.read_csv('movie_bd_v5.csv')

# Предобработка датасета

In [4]:
answers = {} # создадим словарь для ответов

data['profit'] = data.revenue - data.budget # колонка - прибыль
data['release_date'] = pd.to_datetime(data['release_date']) # для работы с датами
data['release_month'] = data.release_date.dt.month # колонка - месяц выхода
data['length_title'] = data['original_title'].apply(lambda x: len(x)) # колонка - длина названия
data['word_count_overview'] = data['overview'].str.findall(r"(\w+'*\w*)").str.len() # колонка - кол-во слов в описании
data['word_overview'] = data['overview'].str.findall(r"(\w+'*\w*)") # колонка - слова в названии
# тут другие ваши предобработки колонок например:

#the time given in the dataset is in string format.
#So we need to change this in datetime format
# ...overview

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1889 entries, 0 to 1888
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   imdb_id               1889 non-null   object        
 1   budget                1889 non-null   int64         
 2   revenue               1889 non-null   int64         
 3   original_title        1889 non-null   object        
 4   cast                  1889 non-null   object        
 5   director              1889 non-null   object        
 6   tagline               1889 non-null   object        
 7   overview              1889 non-null   object        
 8   runtime               1889 non-null   int64         
 9   genres                1889 non-null   object        
 10  production_companies  1889 non-null   object        
 11  release_date          1889 non-null   datetime64[ns]
 12  vote_average          1889 non-null   float64       
 13  release_year      

In [144]:
data.head()

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,length_title,word_count_overview,word_overview
0,tt0369610,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,The park is open.,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,2015-06-09,6.5,2015,1363528810,6,14,27,"[Twenty, two, years, after, the, events, of, J..."
1,tt1392190,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,What a Lovely Day.,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,2015-05-13,7.1,2015,228436354,5,18,110,"[An, apocalyptic, story, set, in, the, furthes..."
2,tt2908446,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,One Choice Can Destroy You,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,2015-03-18,6.3,2015,185238201,3,9,22,"[Beatrice, Prior, must, confront, her, inner, ..."
3,tt2488496,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,Every generation has a story.,Thirty years after defeating the Galactic Empi...,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,2015-12-15,7.5,2015,1868178225,12,28,26,"[Thirty, years, after, defeating, the, Galacti..."
4,tt2820852,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,Vengeance Hits Home,Deckard Shaw seeks revenge against Dominic Tor...,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,2015-04-01,7.3,2015,1316249360,4,9,14,"[Deckard, Shaw, seeks, revenge, against, Domin..."


# Определение функций

In [105]:
def gen_df(col_1, col_2, series, func):
    '''генерация дата фрейма с применинем функции'''
    df = pd.concat([pd.DataFrame([item], columns = [col_1]) for item in series],
               ignore_index = True)
    df[col_2] = df[col_1].apply(func)
    return df.sort_values(col_2, ascending = False)


def genres_count(genre):
    '''Подсчет количества фильмов соответствующих жанров
    '''   
    count = len(data[data['genres'].str.contains(genre, na = False)])
    return count

def genres_count_profit(genre):
    df = data[data['profit'] > 0]
    count = len(df[df['genres'].str.contains(genre, na = False)])
    return count

def genres_count_2(df, genre):
    count = len(df[df['genres'].str.contains(genre, na = False)])
    return print(count, genre)


def count_director_genre(df, director, genre):
    '''Подсчет кол-ва фильмов который снял режисер по жанру
    '''
    count = len(df[(df.director.str.contains(director, na = False)) &
              (df.genres.str.contains(genre, na = False))])
    return print(count, director)

def count_director_action(director):
    ''' см выше'''
    count = len(data[(data.director.str.contains(director, na = False)) &
              (data.genres.str.contains('Action', na = False))])
    return count


def actor_revenue_year(df, actor, year):
    '''Вычисление максимальных доходов по актеру и году
    '''
    revenue = df[(df.cast.str.contains(actor, na = False)) &\
               (df.release_year == year)].revenue.sum()
    return print(revenue/10**6, actor)

def actor_revenue_2012(actor):
    '''см выше'''
    revenue = data[(data.cast.str.contains(actor, na = False)) &\
               (data.release_year == 2012)].revenue.sum()
    return revenue


def actor_high_budget(df, actor):
    '''Вычисление количества высокобюджетных фильмов в которых снялся актер
    '''
    count = len(df[(df.cast.str.contains(actor, na = False)) &\
               (df.budget > df.budget.mean())])
    return print(count, actor)

def genre_Cage(df, genre):
    '''подсчет жанров Кейджа
    '''
    count = len(df[(df.cast.str.contains('Nicolas Cage', na = False)) &\
                  (df.genres.str.contains(genre, na = False))])
    return print(count, genre)

def winter_director(df, director):
    '''подсчет количества фильмов за зиму в зависимости от режисера
    '''
    count = len(df[(df.release_month.isin([10, 11, 12])) &\
                  (df.director == director)])
    return print(count, director)

def mean_length_titles(df, studio):
    '''Определение средней длины символов в названиях по студиям
    '''
    length = df[df['production_companies'].str.contains(studio, na = False)]\
    .length_title.mean()
    return print(round(length, 2), studio)

def mean_overview(df,studio):
    ''' Определение среднеого описания по студиям
    '''
    mean = df[df['production_companies'].str.contains(studio, na = False)]\
    ['word_count_overview'].mean()
    return print(round(mean, 2), studio)

def best_movie_percent(df, movie, percent):
    '''Входит ли фильм в процент лучших
    '''
    df_in = df[df['vote_average'] >= df['vote_average'].quantile(1 - percent/100)]
    mark = len(df_in[df_in['original_title'].str.contains(movie, na = False)])
    if mark == 1:
        return True
    return False

def couple_actors(df, couple):
    '''Подсчет фильмов пары актеров
    '''
    actors = couple.split(' & ')
    count = len(df[(df['cast'].str.contains(actors[0], na = False)) &
                   (df['cast'].str.contains(actors[1]))])
    return print(count, couple)

# Ответы на вопросы

## 1. У какого фильма из списка самый большой бюджет?

In [6]:
answers['1'] = 'Pirates of the Caribbean: On Stranger Tides (tt1298650)'

In [7]:
data[data.budget == data.budget.max()]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,length_title,word_count_overview,word_overview
723,tt1298650,380000000,1021683000,Pirates of the Caribbean: On Stranger Tides,Johnny Depp|PenÃ©lope Cruz|Geoffrey Rush|Ian M...,Rob Marshall,Live Forever Or Die Trying.,Captain Jack Sparrow crosses paths with a woma...,136,Adventure|Action|Fantasy,Walt Disney Pictures|Jerry Bruckheimer Films|M...,2011-05-11,6.3,2011,641683000,5,43,74,"[Captain, Jack, Sparrow, crosses, paths, with,..."


## 2. Какой из фильмов самый длительный (в минутах)?

In [8]:
answers['2'] = 'Gods and Generals (tt0279111)'

In [9]:
data[data.runtime == data.runtime.max()]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,length_title,word_count_overview,word_overview
1157,tt0279111,56000000,12923936,Gods and Generals,Stephen Lang|Jeff Daniels|Robert Duvall|Kevin ...,Ronald F. Maxwell,The nations heart was touched by...,The film centers mostly around the personal an...,214,Drama|History|War,Turner Pictures|Antietam Filmworks,2003-02-21,5.8,2003,-43076064,2,17,48,"[The, film, centers, mostly, around, the, pers..."


## 3. Какой из фильмов самый короткий (в минутах)?





In [10]:
answers['3'] = 'Winnie the Pooh (tt1449283)'

In [11]:
data[data.runtime == data.runtime.min()]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,length_title,word_count_overview,word_overview
768,tt1449283,30000000,14460000,Winnie the Pooh,Jim Cummings|Travis Oates|Jim Cummings|Bud Luc...,Stephen Anderson|Don Hall,Oh Pooh.,"During an ordinary day in Hundred Acre Wood, W...",63,Animation|Family,Walt Disney Pictures|Walt Disney Animation Stu...,2011-04-13,6.8,2011,-15540000,4,15,52,"[During, an, ordinary, day, in, Hundred, Acre,..."


## 4. Какова средняя длительность фильмов?


In [12]:
answers['4'] = '110'

In [13]:
round(data.runtime.mean())

110

## 5. Каково медианное значение длительности фильмов? 

In [14]:
answers['5'] = '107'

In [15]:
round(data.runtime.median())

107

## 6. Какой самый прибыльный фильм?

In [17]:
answers['6'] = 'Avatar (tt0499549)'

In [16]:
data[data.profit == data.profit.max()]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,length_title,word_count_overview,word_overview
239,tt0499549,237000000,2781505847,Avatar,Sam Worthington|Zoe Saldana|Sigourney Weaver|S...,James Cameron,Enter the World of Pandora.,"In the 22nd century, a paraplegic Marine is di...",162,Action|Adventure|Fantasy|Science Fiction,Ingenious Film Partners|Twentieth Century Fox ...,2009-12-10,7.1,2009,2544505847,12,6,28,"[In, the, 22nd, century, a, paraplegic, Marine..."


## 7. Какой фильм самый убыточный? 

In [20]:
answers['7'] = 'The Lone Ranger (tt1210819)'

In [21]:
data[data.profit == data.profit.min()]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,length_title,word_count_overview,word_overview
1245,tt1210819,255000000,89289910,The Lone Ranger,Johnny Depp|Armie Hammer|William Fichtner|Hele...,Gore Verbinski,Never Take Off the Mask,The Texas Rangers chase down a gang of outlaws...,149,Action|Adventure|Western,Walt Disney Pictures|Jerry Bruckheimer Films|I...,2013-07-03,6.0,2013,-165710090,7,15,68,"[The, Texas, Rangers, chase, down, a, gang, of..."


## 8. У скольких фильмов из датасета объем сборов оказался выше бюджета?

In [22]:
answers['8'] = '1478'

In [23]:
len(data[data.profit > 0])

1478

## 9. Какой фильм оказался самым кассовым в 2008 году?

In [19]:
answers['9'] = 'The Dark Knight (tt0468569)'

In [18]:
data[data.release_year == 2008]\
.sort_values('revenue', ascending = False).head(1)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,length_title,word_count_overview,word_overview
599,tt0468569,185000000,1001921825,The Dark Knight,Christian Bale|Michael Caine|Heath Ledger|Aaro...,Christopher Nolan,Why So Serious?,Batman raises the stakes in his war on crime. ...,152,Drama|Action|Crime|Thriller,DC Comics|Legendary Pictures|Warner Bros.|Syncopy,2008-07-16,8.1,2008,816921825,7,15,67,"[Batman, raises, the, stakes, in, his, war, on..."


## 10. Самый убыточный фильм за период с 2012 по 2014 г. (включительно)?


In [20]:
answers['10'] = 'The Lone Ranger (tt1210819)'

In [21]:
data.query('2012 <= release_year <= 2014').sort_values('profit').head(1)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,length_title,word_count_overview,word_overview
1245,tt1210819,255000000,89289910,The Lone Ranger,Johnny Depp|Armie Hammer|William Fichtner|Hele...,Gore Verbinski,Never Take Off the Mask,The Texas Rangers chase down a gang of outlaws...,149,Action|Adventure|Western,Walt Disney Pictures|Jerry Bruckheimer Films|I...,2013-07-03,6.0,2013,-165710090,7,15,68,"[The, Texas, Rangers, chase, down, a, gang, of..."


## 11. Какого жанра фильмов больше всего?

In [22]:
answers['11'] = 'Drama'

In [6]:
genres = 'Drama, Comedy, Action, Thriller, Adventure'.split(', ')
gen_df('genre', 'count', genres, genres_count)

Unnamed: 0,genre,count
0,Drama,782
1,Comedy,683
3,Thriller,596
2,Action,582
4,Adventure,415


In [31]:
def gen_df2(col_1, col_2, series, func, arg_1='na'):
    '''генерация дата фрейма с применинем функции'''
    df = pd.concat([pd.DataFrame([item], columns = [col_1]) for item in series],
               ignore_index = True)
    if arg_1 == 'na':
        df[col_2] = df[col_1].apply(func)
    else:
        df[col_2] = df[col_1].apply(func, args = (arg_1,))
    return df.sort_values(col_2, ascending = False)

def genres_count2(genre, all_profit='all'): # all - весь датасет, profit - окупившиеся 
    '''Подсчет количества фильмов соответствующих жанров
    '''   
    if all_profit == 'all':
        df = data
    else:
        df = data[data['profit'] > 0]
    count = len(df[df['genres'].str.contains(genre, na = False)])
    return count

In [32]:
gen_df2('genre', 'count', genres, genres_count2, 'all')

Unnamed: 0,genre,count
0,Drama,782
1,Comedy,683
3,Thriller,596
2,Action,582
4,Adventure,415


## 12. Фильмы какого жанра чаще всего становятся прибыльными? 

In [14]:
answers['12'] = 'Drama'

In [33]:
genres = 'Drama, Comedy, Action, Thriller, Adventure'.split(', ')
gen_df2('genre', 'count', genres, genres_count2, 'profit')

Unnamed: 0,genre,count
0,Drama,560
1,Comedy,551
3,Thriller,446
2,Action,444
4,Adventure,337


In [None]:
genres = 'Drama, Comedy, Action, Thriller, Adventure'.split(', ')
gen_df('genre', 'count', genres)

Вариант 2

In [78]:
data_profit = data[data.profit > 0]
for genre in genres:
    genres_count_2(data_profit, genre)

560 Drama
551 Comedy
444 Action
446 Thriller
337 Adventure


## 13. У какого режиссера самые большие суммарные кассовые сборы?

In [72]:
answers['13'] = 'Peter Jackson'

In [73]:
directors = ["Steven Spielberg",
            "Christopher Nolan",
            "David Yates",
            "James Cameron",
            "Peter Jackson"
            ]
data[data['director'].isin(directors)].groupby('director').agg('sum')\
.sort_values('revenue', ascending = False)

Unnamed: 0_level_0,budget,revenue,runtime,vote_average,release_year,profit,release_month,length_title,word_count_overview
director,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Peter Jackson,1288000000,6490593685,1355,58.1,16059,5202593685,95,265,367
Christopher Nolan,1005000000,4167548502,1135,61.1,16057,3162548502,63,97,387
David Yates,775000000,4154295625,567,29.6,8037,3379295625,30,167,178
Steven Spielberg,937000000,3386700791,1356,66.9,20064,2449700791,85,195,632
James Cameron,250000000,2798546718,254,13.8,4012,2548546718,16,25,90


## 14. Какой режисер снял больше всего фильмов в стиле Action?

In [57]:
answers['14'] = 'Robert Rodriguez'

In [21]:
directors = ["Ridley Scott",
             "Guy Ritchie",
             "Robert Rodriguez",
             "Quentin Tarantino",
             "Tony Scott"
            ]

for director in directors:
    count_director_genre(data, director, 'Action')

6 Ridley Scott
4 Guy Ritchie
9 Robert Rodriguez
5 Quentin Tarantino
5 Tony Scott


In [47]:
def count_director_genre2(director, genre):
    '''Подсчет кол-ва фильмов который снял режисер по жанру
    '''
    count = len(data[(data.director.str.contains(director, na = False)) &
              (data.genres.str.contains(genre, na = False))])
    return count

In [48]:
gen_df2('director', 'Action', directors, count_director_genre2, 'Action')

Unnamed: 0,director,Action
2,Robert Rodriguez,9
0,Ridley Scott,6
3,Quentin Tarantino,5
4,Tony Scott,5
1,Guy Ritchie,4


Вариант 2

In [22]:
gen_df('director', 'Action', directors, count_director_action)

Unnamed: 0,director,Action
2,Robert Rodriguez,9
0,Ridley Scott,6
3,Quentin Tarantino,5
4,Tony Scott,5
1,Guy Ritchie,4


## 15. Фильмы с каким актером принесли самые высокие кассовые сборы в 2012 году? 

In [37]:
answers['15'] = 'Chris Hemsworth'

In [53]:
actors = 'Nicolas Cage, Tom Hardy, Chris Hemsworth, Jim Sturgess, Emma Stone'\
.split(', ')

for actor in actors:
    actor_revenue_year(data, actor, 2012)

2.106557 Nicolas Cage
1134.717867 Tom Hardy
2027.450773 Chris Hemsworth
138.589343 Jim Sturgess
752.215857 Emma Stone


In [50]:
def actor_revenue_year2(actor, year):
    '''Вычисление максимальных доходов по актеру и году
    '''
    revenue = data[(data.cast.str.contains(actor, na = False)) &\
               (data.release_year == year)].revenue.sum()
    return revenue

In [54]:
gen_df2('actor', 'revenue', actors, actor_revenue_year2, 2012)

Unnamed: 0,actor,revenue
2,Chris Hemsworth,2027450773
1,Tom Hardy,1134717867
4,Emma Stone,752215857
3,Jim Sturgess,138589343
0,Nicolas Cage,2106557


Вариант 2

In [110]:
gen_df('actor', 'revenue', actors, actor_revenue_2012)

Unnamed: 0,actor,revenue
2,Chris Hemsworth,2027450773
1,Tom Hardy,1134717867
4,Emma Stone,752215857
3,Jim Sturgess,138589343
0,Nicolas Cage,2106557


## 16. Какой актер снялся в большем количестве высокобюджетных фильмов?

In [39]:
answers['16'] = 'Matt Damon'

In [59]:
actors = 'Tom Cruise, Mark Wahlberg, Matt Damon, Angelina Jolie, Adam Sandler'\
.split(', ')

for actor in actors:
    actor_high_budget(data, actor)

15 Tom Cruise
14 Mark Wahlberg
18 Matt Damon
16 Angelina Jolie
17 Adam Sandler


In [60]:
def actor_high_budget2(actor):
    '''Вычисление количества высокобюджетных фильмов в которых снялся актер
    '''
    count = len(data[(data.cast.str.contains(actor, na = False)) &\
               (data.budget > data.budget.mean())])
    return count

In [61]:
gen_df2('actor', 'count', actors, actor_high_budget2, 'na')

Unnamed: 0,actor,count
2,Matt Damon,18
4,Adam Sandler,17
3,Angelina Jolie,16
0,Tom Cruise,15
1,Mark Wahlberg,14


## 17. В фильмах какого жанра больше всего снимался Nicolas Cage? 

In [63]:
answers['17'] = 'Action'

In [62]:
genres = 'Drama, Action, Thriller, Adventure, Crime'.split(', ')

for genre in genres:
    genre_Cage(data, genre)

12 Drama
17 Action
15 Thriller
7 Adventure
10 Crime


In [64]:
def genre_actor(genre, actor):
    '''подсчет жанров актера
    '''
    count = len(data[(data.cast.str.contains(actor, na = False)) &\
                  (data.genres.str.contains(genre, na = False))])
    return count

In [65]:
gen_df2('genre', 'count_Cage', genres, genre_actor, 'Nicolas Cage')

Unnamed: 0,genre,count_Cage
1,Action,17
2,Thriller,15
0,Drama,12
4,Crime,10
3,Adventure,7


## 18. Самый убыточный фильм от Paramount Pictures

In [43]:
answers['18'] = 'K-19: The Widowmaker (tt0267626)'

In [44]:
data[data.production_companies.str.contains("Paramount Pictures", na = False)]\
.sort_values('profit').head(1)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit,release_month,length_title,word_count_overview,word_overview
925,tt0267626,100000000,35168966,K-19: The Widowmaker,Harrison Ford|Liam Neeson|Peter Sarsgaard|Joss...,Kathryn Bigelow,Fate has found its hero.,When Russia's first nuclear submarine malfunct...,138,Thriller|Drama|History,Paramount Pictures|Intermedia Films|National G...,2002-07-19,6.0,2002,-64831034,7,20,23,"[When, Russia's, first, nuclear, submarine, ma..."


## 19. Какой год стал самым успешным по суммарным кассовым сборам?

In [45]:
answers['19'] = '2015'

In [69]:
years = [2002, 2008, 2012, 2014, 2015]
data[data['release_year'].isin(years)].groupby('release_year').agg('sum')\
.sort_values('revenue', ascending = False)

Unnamed: 0_level_0,budget,revenue,runtime,vote_average,profit,release_month,length_title,word_count_overview
release_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015,6780630004,25449202382,13355,755.2,18668572378,842,1601,5616
2014,7008050000,23405862953,13444,774.5,16397812953,854,1864,6454
2012,7002000000,23079001687,12145,682.1,16077001687,724,1684,6037
2008,6588900000,18252781990,12979,731.1,11663881990,847,1862,6562
2002,5134000000,14136361487,11528,653.1,9002361487,751,1535,6106


## 20. Какой самый прибыльный год для студии Warner Bros?

In [47]:
answers['20'] = '2014'

In [70]:
years = [2008, 2010, 2012, 2014, 2015]
data[(data['release_year'].isin(years)) &\
     (data.production_companies.str.contains('Warner Bros', na = False))]\
.groupby('release_year').agg('sum').sort_values('profit', ascending = False)

Unnamed: 0_level_0,budget,revenue,runtime,vote_average,profit,release_month,length_title,word_count_overview
release_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2014,947600000,3243064519,1576,88.9,2295464519,89,176,681
2008,916000000,3050595031,1366,72.0,2134595031,81,149,520
2010,1034000000,3008712985,1421,79.6,1974712985,92,201,886
2012,611500000,1869520056,579,30.0,1258020056,33,70,279
2015,820800003,1691168351,1358,75.9,870368348,82,137,621


## 21. В каком месяце за все годы суммарно вышло больше всего фильмов?

In [49]:
answers['21'] = 'september'

In [75]:
months = [1, 5, 6, 9, 12]
data[data.release_month.isin(months)].groupby(by = 'release_month')\
['imdb_id'].count()

release_month
1     110
5     140
6     147
9     227
12    190
Name: imdb_id, dtype: int64

## 22. Сколько суммарно вышло фильмов летом? (за июнь, июль, август)

In [77]:
answers['22'] = '450'

In [76]:
season = [6, 7, 8]
len(data[data.release_month.isin(season)])

450

## 23. Для какого режиссера зима – самое продуктивное время года? 

In [53]:
answers['23'] = 'Clint Eastwood'

In [79]:
directors = ["Steven Soderbergh",
             "Christopher Nolan",
             "Clint Eastwood",
             "Ridley Scott",
             "Peter Jackson"
            ]

for director in directors:
    winter_director(data, director)

5 Steven Soderbergh
3 Christopher Nolan
9 Clint Eastwood
5 Ridley Scott
8 Peter Jackson


In [80]:
def director_season(director, season):
    '''подсчет количества фильмов за сезон в зависимости от режисера
    '''
    count = len(data[(data.release_month.isin(season)) &\
                  (data.director == director)])
    return count

In [82]:
winter = [10, 11, 12]
gen_df2('director', 'winter', directors, director_season, winter)

Unnamed: 0,director,winter
2,Clint Eastwood,9
4,Peter Jackson,8
0,Steven Soderbergh,5
3,Ridley Scott,5
1,Christopher Nolan,3


## 24. Какая студия дает самые длинные названия своим фильмам по количеству символов?

In [55]:
answers['24'] = 'Four By Two Productions'

In [83]:
studios = ["Universal",
           "Warner Bros",
           "Jim Henson Company, The",
           "Paramount Pictures",
           "Four By Two Productions"]

for studio in studios:
    mean_length_titles(data, studio)

14.76 Universal
15.25 Warner Bros
59.0 Jim Henson Company, The
17.03 Paramount Pictures
83.0 Four By Two Productions


In [84]:
def mean_length_titles2(studio):
    '''Определение средней длины символов в названиях по студиям
    '''
    length = data[data['production_companies'].str.contains(studio, na = False)]\
    .length_title.mean()
    return round(length, 2)

In [85]:
gen_df2('company', 'lenght_title', studios, mean_length_titles2, 'na')

Unnamed: 0,company,lenght_title
4,Four By Two Productions,83.0
2,"Jim Henson Company, The",59.0
3,Paramount Pictures,17.03
1,Warner Bros,15.25
0,Universal,14.76


## 25. Описание фильмов какой студии в среднем самые длинные по количеству слов?

In [57]:
answers['25'] = 'Midnight Picture Show'

In [86]:
studios = ["Universal Pictures",
          "Warner Bros",
           "Midnight Picture Show",
           "Paramount Pictures",
           "Total Entertainment"
          ]

for studio in studios:
    mean_overview(data, studio)

54.7 Universal Pictures
54.99 Warner Bros
175.0 Midnight Picture Show
56.07 Paramount Pictures
87.0 Total Entertainment


In [87]:
def mean_overview2(studio):
    ''' Определение среднеого описания по студиям
    '''
    mean = data[data['production_companies'].str.contains(studio, na = False)]\
    ['word_count_overview'].mean()
    return round(mean, 2)

In [88]:
gen_df2('company', 'mean_overview', studios, mean_overview2, 'na')

Unnamed: 0,company,mean_overview
2,Midnight Picture Show,175.0
4,Total Entertainment,87.0
3,Paramount Pictures,56.07
1,Warner Bros,54.99
0,Universal Pictures,54.7


## 26. Какие фильмы входят в 1 процент лучших по рейтингу? 

In [59]:
answers['26'] = 'Inside Out, The Dark Knight, 12 Years a Slave'

In [106]:
movie_list = ["Inside Out, The Dark Knight, 12 Years a Slave",
             "BloodRayne, The Adventures of Rocky & Bullwinkle",
             "Batman Begins, The Lord of the Rings: The Return of the King, Upside Down",
             "300, Lucky Number Slevin, Kill Bill: Vol. 1",
             "Upside Down, Inside Out, Iron Man"
             ]

for movies in movie_list:
    scroll = movies.split(', ')
    for movie in scroll:
        print(best_movie_percent(data, movie, 1) ,movie)
    print('\n')

True Inside Out
True The Dark Knight
True 12 Years a Slave


False BloodRayne
False The Adventures of Rocky & Bullwinkle


False Batman Begins
True The Lord of the Rings: The Return of the King
False Upside Down


False 300
False Lucky Number Slevin
False Kill Bill: Vol. 1


False Upside Down
True Inside Out
False Iron Man




In [120]:
def best_movie_percent2(movies, percent):
    '''Входит ли фильм в процент лучших
    '''
    mark = True
    df_in = data[data['vote_average'] >= data['vote_average']\
                 .quantile(1 - percent/100)]
    scroll = movies.split(', ')
    for pic in scroll:
        marker = len(df_in[df_in['original_title']\
                           .str.contains(pic, na = False)])
        if marker == 1:
            marker = True
        else:
            marker = False
        mark = mark and marker
    return mark

In [122]:
gen_df2('movie_list', 'in?', movie_list, best_movie_percent2, 1)

Unnamed: 0,movie_list,in?
0,"Inside Out, The Dark Knight, 12 Years a Slave",True
1,"BloodRayne, The Adventures of Rocky & Bullwinkle",False
2,"Batman Begins, The Lord of the Rings: The Retu...",False
3,"300, Lucky Number Slevin, Kill Bill: Vol. 1",False
4,"Upside Down, Inside Out, Iron Man",False


## 27. Какие актеры чаще всего снимаются в одном фильме вместе?


In [61]:
answers['27'] = 'Daniel Radcliffe & Rupert Grint'

In [93]:
two_actors = ["Johnny Depp & Helena Bonham Carter",
         "Ben Stiller & Owen Wilson",
         "Vin Diesel & Paul Walker",
         "Adam Sandler & Kevin James",
         "Daniel Radcliffe & Rupert Grint"
         ]

for two in two_actors:
    couple_actors(data, two)

6 Johnny Depp & Helena Bonham Carter
6 Ben Stiller & Owen Wilson
5 Vin Diesel & Paul Walker
5 Adam Sandler & Kevin James
8 Daniel Radcliffe & Rupert Grint


In [91]:
def couple_actors2(couple):
    '''Подсчет фильмов пары актеров
    '''
    actors = couple.split(' & ')
    count = len(data[(data['cast'].str.contains(actors[0], na = False)) &
                   (data['cast'].str.contains(actors[1], na = False))])
    return count

In [94]:
gen_df2('couple_actors', 'count', two_actors, couple_actors2, 'na')

Unnamed: 0,couple_actors,count
4,Daniel Radcliffe & Rupert Grint,8
0,Johnny Depp & Helena Bonham Carter,6
1,Ben Stiller & Owen Wilson,6
2,Vin Diesel & Paul Walker,5
3,Adam Sandler & Kevin James,5


# Submission

In [63]:
# в конце можно посмотреть свои ответы к каждому вопросу
answers

{'1': 'Pirates of the Caribbean: On Stranger Tides (tt1298650)',
 '2': 'Gods and Generals (tt0279111)',
 '3': 'Winnie the Pooh (tt1449283)',
 '4': '110',
 '5': 'Avatar (tt0499549)',
 '6': '107',
 '7': 'The Lone Ranger (tt1210819)',
 '8': '1478',
 '9': 'The Dark Knight (tt0468569)',
 '10': 'The Lone Ranger (tt1210819)',
 '11': 'Drama',
 '12': 'Drama',
 '13': 'Peter Jackson',
 '14': 'Robert Rodriguez',
 '15': 'Chris Hemsworth',
 '16': 'Matt Damon',
 '17': 'Action',
 '18': 'K-19: The Widowmaker (tt0267626)',
 '19': '2015',
 '20': '2014',
 '21': 'Сентябрь',
 '22': '450',
 '23': 'Clint Eastwood',
 '24': 'Four By Two Productions',
 '25': 'Midnight Picture Show',
 '26': 'Inside Out, The Dark Knight, 12 Years a Slave',
 '27': 'Daniel Radcliffe & Rupert Grint'}

In [64]:
# и убедиться что ни чего не пропустил)
len(answers)

27