In [5]:
# imports
import pandas as pd

#### Exercise 1

In [6]:
# Read the IMDB top-1000 CSV and discard the two columns this notebook
# does not use: Poster_Link and Overview.
df = (
    pd.read_csv('../data/imdb_top_1000.csv')
    .drop(columns=['Poster_Link', 'Overview'])
)

In [7]:
# Oldest and newest movies in the list.
# Parse the release year numerically: any entry that is not a number is
# coerced to NaN, which min()/max() and the equality masks below ignore.
# Comparing numbers (instead of the raw strings) also avoids depending on
# the lexicographic ordering of year strings.
release_years = pd.to_numeric(df['Released_Year'], errors='coerce')

# Rows whose year parsed successfully (name kept for any later use).
valid_years = df[release_years.notna()]

# int() so the years print as e.g. 1920 even when NaNs made the column float.
min_year = int(release_years.min())
max_year = int(release_years.max())

oldest_movie = df.loc[release_years == min_year, 'Series_Title'].values
newest_movies = df.loc[release_years == max_year, 'Series_Title'].values

print(f'The oldest movie released in {min_year}:\n\t{oldest_movie[0]}')
print(f'The newest movies released in {max_year}:')
for m in newest_movies:
    print('\t' + m)

The oldest movie released in 1920:
	Das Cabinet des Dr. Caligari
The newest movies released in 2020:
	Hamilton
	Soorarai Pottru
	Soul
	Dil Bechara
	The Trial of the Chicago 7
	Druk


In [8]:
# Ten best-rated movies: order every row by rating (highest first),
# keep just the title and rating columns, and take the first ten.
top10 = (
    df.sort_values(by='IMDB_Rating', ascending=False)
    .loc[:, ['Series_Title', 'IMDB_Rating']]
    .head(10)
)

print('Top 10 Highest Rated Movies\n')
print('Rank\tRating\tTitle')
# enumerate from 1 so the counter is the displayed rank directly.
for rank, (title, rating) in enumerate(top10.values, start=1):
    print(rank, '\t', rating, '\t', title)

Top 10 Highest Rated Movies

Rank	Rating	Title
1 	 9.3 	 The Shawshank Redemption
2 	 9.2 	 The Godfather
3 	 9.0 	 The Dark Knight
4 	 9.0 	 The Godfather: Part II
5 	 9.0 	 12 Angry Men
6 	 8.9 	 The Lord of the Rings: The Return of the King
7 	 8.9 	 Pulp Fiction
8 	 8.9 	 Schindler's List
9 	 8.8 	 The Lord of the Rings: The Fellowship of the Ring
10 	 8.8 	 Forrest Gump


In [9]:
# Best-rated movie for each genre.
# The Genre column holds a ', '-separated list; split it and explode so
# that every (movie, genre) pair gets its own row.
genre_ratings = (
    df[['Series_Title', 'Genre', 'IMDB_Rating']]
    .assign(Genre=lambda frame: frame['Genre'].str.split(', '))
    .explode('Genre')
    .reset_index(drop=True)
)

print('Top Rated Movie By Genre\n')
for genre, movies in genre_ratings.groupby('Genre'):
    best = movies.sort_values(by='IMDB_Rating', ascending=False).iloc[0]
    # "<genre>:" padded to 11 characters so the titles line up.
    label = f'{genre}:'.ljust(11)
    print(f"{label}{best['Series_Title']}")

Top Rated Movie By Genre

Action:    The Dark Knight
Adventure: The Lord of the Rings: The Return of the King
Animation: Sen to Chihiro no kamikakushi
Biography: Schindler's List
Comedy:    Gisaengchung
Crime:     The Godfather
Drama:     The Shawshank Redemption
Family:    Sen to Chihiro no kamikakushi
Fantasy:   Star Wars: Episode V - The Empire Strikes Back
Film-Noir: Sunset Blvd.
History:   Schindler's List
Horror:    Psycho
Music:     Whiplash
Musical:   Anand
Mystery:   Se7en
Romance:   Forrest Gump
Sci-Fi:    Inception
Sport:     Bacheha-Ye aseman
Thriller:  Gisaengchung
War:       Saving Private Ryan
Western:   Il buono, il brutto, il cattivo


In [10]:
# Director with the most movies: value_counts() orders labels by
# descending frequency, so the first index label is the most frequent.
director_counts = df['Director'].value_counts()
top_director = director_counts.index[0]
print(f'{top_director} has the most movies in this list')

Alfred Hitchcock has the most movies in this list


In [11]:
# Star in the most movies.
# Tally appearances across all four billing columns. fill_value=0 lets a
# name that is absent from one side of the addition still be summed.
# Only the final totals need ordering, so sort once after the loop
# rather than after every addition.
star_columns = ['Star1', 'Star2', 'Star3', 'Star4']
stars = df[star_columns[0]].value_counts()
for column in star_columns[1:]:
    stars = stars.add(df[column].value_counts(), fill_value=0)
stars = stars.sort_values(ascending=False)
print(f'Star in the most movies: {stars.index[0]}')

Star in the most movies: Robert De Niro


In [12]:
# Highest-grossing movie for each genre.
# Rows with any missing value are dropped, Gross has its commas removed
# and is cast to int, and the ', '-separated Genre list is exploded so
# every (movie, genre) pair gets its own row.
genre_gross = (
    df[['Series_Title', 'Genre', 'Gross']]
    .dropna()
    .assign(
        Gross=lambda frame: frame['Gross'].str.replace(',', '').astype(int),
        Genre=lambda frame: frame['Genre'].str.split(', '),
    )
    .explode('Genre')
    .reset_index(drop=True)
)

print('Top Grossing Movie By Genre\n')
for genre, movies in genre_gross.groupby('Genre'):
    top_earner = movies.sort_values(by='Gross', ascending=False).iloc[0]
    # "<genre>:" padded to 11 characters so the titles line up.
    label = f'{genre}:'.ljust(11)
    print(f"{label}{top_earner['Series_Title']}")

Top Grossing Movie By Genre

Action:    Star Wars: Episode VII - The Force Awakens
Adventure: Star Wars: Episode VII - The Force Awakens
Animation: Incredibles 2
Biography: The Blind Side
Comedy:    Toy Story 4
Crime:     The Dark Knight
Drama:     Avengers: Endgame
Family:    E.T. the Extra-Terrestrial
Fantasy:   Avatar
Film-Noir: Notorious
History:   Gone with the Wind
Horror:    The Exorcist
Music:     Bohemian Rhapsody
Musical:   Fiddler on the Roof
Mystery:   The Sixth Sense
Romance:   Titanic
Sci-Fi:    Star Wars: Episode VII - The Force Awakens
Sport:     The Blind Side
Thriller:  Joker
War:       Saving Private Ryan
Western:   Dances with Wolves


In [13]:
# Lowest-grossing movie for each director.
# Rows with any missing value are dropped, then Gross has its commas
# removed and is cast to int so it sorts numerically.
director_gross = (
    df[['Series_Title', 'Director', 'Gross']]
    .dropna()
    .assign(Gross=lambda frame: frame['Gross'].str.replace(',', '').astype(int))
)

print('Lowest Grossing Movie by Director\n')
for director, movies in director_gross.groupby('Director'):
    lowest = movies.sort_values(by='Gross', ascending=True).iloc[0]
    # "<director>:" padded to 26 characters so the titles line up.
    label = f'{director}:'.ljust(26)
    print(f"{label}{lowest['Series_Title']}")

Lowest Grossing Movie by Director

Aamir Khan:               Taare Zameen Par
Abdellatif Kechiche:      La vie d'Adèle
Abhishek Kapoor:          Kai po che!
Adam McKay:               The Big Short
Aditya Dhar:              Uri: The Surgical Strike
Akira Kurosawa:           Ikiru
Alain Resnais:            Hiroshima mon amour
Alan J. Pakula:           All the President's Men
Alan Parker:              Pink Floyd: The Wall
Alejandro Amenábar:       Abre los ojos
Alejandro G. Iñárritu:    Amores perros
Alejandro Jodorowsky:     La montaña sagrada
Alex Garland:             Ex Machina
Alex Proyas:              Dark City
Alexander Payne:          Nebraska
Alfonso Cuarón:           A Little Princess
Alfonso Gomez-Rejon:      Me and Earl and the Dying Girl
Alfred Hitchcock:         Dial M for Murder
Anders Thomas Jensen:     Adams æbler
Andrei Tarkovsky:         Andrei Rublev
Andrew Adamson:           Shrek
Andrew Davis:             The Fugitive
Andrew Lau:               Mou gaan dou
Andrew Nicc

In [14]:
# Write the current frame to a parquet file next to the source CSV,
# leaving the row index out of the file.
parquet_path = '../data/imdb_top_1000.parquet'
df.to_parquet(parquet_path, index=False)

In [21]:
# Total gross revenue across all movies that report one:
# drop missing entries, remove the commas, cast to int, then sum.
gross_values = df['Gross'].dropna().str.replace(',', '').astype(int)
total_gross = gross_values.sum()
print(f"Total Gross Revenue: ${total_gross:,}")

Total Gross Revenue: $56,536,877,976
