In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

# Load the IMDB movie dataset; the 'Rank' column becomes the row index.
movie_df = pd.read_csv(
    './data/IMDB-Movie.csv',
    sep=',',
    index_col='Rank',
)

# Preview the first three rows.
movie_df.head(3)

Unnamed: 0_level_0,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0


In [4]:
# 1. How many different directors appear in the dataset?
# Count the distinct non-null values of the 'Director' column.
len(movie_df['Director'].dropna().unique())

644

In [None]:
# 2. What is the title of the shortest movie?
# Look up the row with the minimum runtime directly instead of grouping by
# title first: grouping is unnecessary work and would merge distinct movies
# that happen to share a title.
# idxmin() returns the index label of the first minimum, so if several movies
# tie for the shortest runtime, the first such row in the frame is reported.
movie_df.loc[movie_df['Runtime (Minutes)'].idxmin(), 'Title']

In [5]:
# 3. Who is the director of the longest movie?
# Find the row with the maximum runtime and read its 'Director' value; no
# per-director grouping is needed to answer the question.
# idxmax() returns the index label of the first maximum, so if several movies
# tie for the longest runtime, the first such row in the frame is reported.
movie_df.loc[movie_df['Runtime (Minutes)'].idxmax(), 'Director']

'Robert Rodriguez'

In [None]:
# 4. How many movies were directed by Ridley Scott?
# Summing the boolean mask counts the matching rows; int() keeps the result a
# plain Python integer.
int((movie_df['Director'] == 'Ridley Scott').sum())

In [9]:
# 5. Which are the highest- and lowest-rated movies?
# Highest rated: titles of every movie whose rating equals the maximum rating.
movie_df.loc[movie_df['Rating'].eq(movie_df['Rating'].max()), 'Title']

Rank
55    The Dark Knight
Name: Title, dtype: object

In [7]:
# Lowest rated: titles of every movie whose rating equals the minimum rating.
# Left as the cell's last expression (rather than wrapped in print) so it is
# displayed the same way as the highest-rated result.
movie_df.loc[movie_df['Rating'] == movie_df['Rating'].min(), 'Title']

Rank
830    Disaster Movie
Name: Title, dtype: object


In [11]:
# Duplicate handling demo: build a frame that contains every row twice, remove
# the repeats with drop_duplicates(), and confirm the result via .shape.

# Stack movie_df on top of itself; ignore_index=True gives the combined frame a
# fresh 0..n-1 integer index.
temp_df = pd.concat([movie_df, movie_df], ignore_index=True)

# Only the last expression of a cell is echoed, so the intermediate results are
# shown explicitly with display() (a builtin in IPython/Jupyter).
display(temp_df.shape)
display(temp_df.head())
# Presumably the first rows of the second copy (assumes movie_df has 1000 rows).
display(temp_df.iloc[1000:1005])

# Remove the duplicated rows; the first occurrence of each row is kept.
temp_df = temp_df.drop_duplicates()
temp_df.shape

(1000, 11)

In [None]:
# Dataset cleanup
# Datasets often have verbose column names with symbols, mixed case, spaces and
# typos. Cleaning the names up makes selecting data by column name easier.

# Only the last expression of a cell is echoed, so the intermediate results are
# shown explicitly with display() (a builtin in IPython/Jupyter).
display(movie_df.columns)  # column names before the cleanup

# Simplify the two names that contain spaces and parentheses.
# rename() skips keys that are not present (errors='ignore' is the default), so
# running this cell again after the columns were already renamed is harmless.
movie_df = movie_df.rename(columns={
    'Runtime (Minutes)': 'Runtime_minutes',
    'Revenue (Millions)': 'Revenue_millions',
})
display(movie_df.columns)

# Simplify even further: lower-case every column name (also safe to repeat).
movie_df.columns = movie_df.columns.str.lower()

# Select by the cleaned-up names. This is done after the renaming so the cell
# stays re-runnable: selecting 'Title'/'Rating' here would raise KeyError on a
# second run, once the columns are lower-case.
display(movie_df[['title', 'rating']].head())

# NOTE: from here on movie_df uses the lower-case names, so cells that refer to
# 'Title', 'Director', 'Rating', ... must be run before this one.
movie_df.columns


In [13]:
# Handling missing values
# Relies on the lower-case column names (e.g. 'title') set by the cleanup step.

# Count the nulls in each column.
new_df = movie_df.copy()
# Only the last expression of a cell is echoed, so the intermediate results are
# shown explicitly with display() (a builtin in IPython/Jupyter).
display(new_df.isnull().sum())

# Drop every row that contains at least one null. dropna() returns a new frame,
# so movie_df itself is left untouched and no inplace=True is needed.
drop_df = movie_df.dropna()
display(drop_df.shape)

# Re-index new_df: move the current index back into a regular column, then use
# the movie title as the index.
# NOTE(review): the motivation given originally was that the Rank index is no
# longer suitable after rows were dropped, which describes drop_df, yet the
# re-indexing is applied to new_df. Kept on new_df to preserve behaviour --
# confirm which frame was intended.
new_df = new_df.reset_index().set_index('title')

new_df.head()

Title                   0
Genre                   0
Description             0
Director                0
Actors                  0
Year                    0
Runtime (Minutes)       0
Rating                  0
Votes                   0
Revenue (Millions)    128
Metascore              64
dtype: int64