In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

Data Pre-Processing

1. Load the CSV into a DataFrame

In [3]:
df = pd.read_csv("imdb_top_movies.csv")

In [4]:
df.head()

Unnamed: 0,Title,Year,Rating,Genre,Director(s),Box Office Revenue,Lead Actors
0,1. The Shawshank Redemption,1994,9.3 (3M),"Epic, Period Drama, Prison Drama, Drama","The Ink Spots, Jack Lawrence, Stephen King, Ti...","Gross worldwide$29,332,133","Bob Gunton, Morgan Freeman, Tim Robbins"
1,2. The Godfather,1972,9.2 (2.1M),"Epic, Gangster, Tragedy, Crime, Drama","Mario Puzo, Marlon Brando, Peter Clemenza, Al ...","Gross worldwide$250,342,198","Al Pacino, Marlon Brando, James Caan"
2,3. The Dark Knight,2008,9.0 (3M),"Action Epic, Epic, Superhero, Tragedy, Action,...","Aaron Eckhart, Michael Caine, The Joker, Jonat...","Gross worldwide$1,009,057,329","Aaron Eckhart, Christian Bale, Heath Ledger"
3,4. The Godfather Part II,1974,9.0 (1.4M),"Epic, Gangster, Tragedy, Crime, Drama","Mario Puzo, Francesco Pennino, Al Pacino, Robe...","Gross worldwide$47,964,222","Al Pacino, Robert De Niro, Robert Duvall"
4,5. 12 Angry Men,1957,9.0 (917K),"Legal Drama, Psychological Drama, Crime, Drama","Henry Fonda, Jack Warden, Lee J. Cobb, Juror #...","Gross worldwide$2,945","Henry Fonda, Martin Balsam, Lee J. Cobb"


2. Data Cleaning and Type Conversion

In [5]:
type(df)

pandas.core.frame.DataFrame

In [6]:
df.shape

(250, 7)

In [7]:
#Null Value check

df.isna().sum()

Title                 0
Year                  0
Rating                0
Genre                 0
Director(s)           0
Box Office Revenue    0
Lead Actors           0
dtype: int64

In [8]:
#Dropping Duplicates
df= df.drop_duplicates()
df

Unnamed: 0,Title,Year,Rating,Genre,Director(s),Box Office Revenue,Lead Actors
0,1. The Shawshank Redemption,1994,9.3 (3M),"Epic, Period Drama, Prison Drama, Drama","The Ink Spots, Jack Lawrence, Stephen King, Ti...","Gross worldwide$29,332,133","Bob Gunton, Morgan Freeman, Tim Robbins"
1,2. The Godfather,1972,9.2 (2.1M),"Epic, Gangster, Tragedy, Crime, Drama","Mario Puzo, Marlon Brando, Peter Clemenza, Al ...","Gross worldwide$250,342,198","Al Pacino, Marlon Brando, James Caan"
2,3. The Dark Knight,2008,9.0 (3M),"Action Epic, Epic, Superhero, Tragedy, Action,...","Aaron Eckhart, Michael Caine, The Joker, Jonat...","Gross worldwide$1,009,057,329","Aaron Eckhart, Christian Bale, Heath Ledger"
3,4. The Godfather Part II,1974,9.0 (1.4M),"Epic, Gangster, Tragedy, Crime, Drama","Mario Puzo, Francesco Pennino, Al Pacino, Robe...","Gross worldwide$47,964,222","Al Pacino, Robert De Niro, Robert Duvall"
4,5. 12 Angry Men,1957,9.0 (917K),"Legal Drama, Psychological Drama, Crime, Drama","Henry Fonda, Jack Warden, Lee J. Cobb, Juror #...","Gross worldwide$2,945","Henry Fonda, Martin Balsam, Lee J. Cobb"
...,...,...,...,...,...,...,...
245,246. A Silent Voice: The Movie,2016,8.1 (117K),"Anime, Coming-of-Age, Psychological Drama, Shō...","Saori Hayami, Naoko Yamada, Lexi Marman, Shoko...","Gross worldwide$30,819,442","Miyu Irino, Saori Hayami, Aoi Yûki"
246,247. The Help,2011,8.1 (510K),"Period Drama, Drama","Hilly Holbrook, Emma Stone, Jerry Leiber, Tate...","Gross worldwide$221,802,186","Emma Stone, Octavia Spencer, Viola Davis"
247,248. Amores Perros,2000,8.0 (261K),"Tragedy, Drama, Thriller","Emilio Echevarría, Guillermo Arriaga, Alejandr...","Gross worldwide$20,908,467","Goya Toledo, Gael García Bernal, Emilio Echeva..."
248,249. Rebecca,1940,8.1 (153K),"Dark Romance, Psychological Drama, Psychologic...","Laurence Olivier, Mrs. Danvers, The Second Mrs...","Gross worldwide$113,328","Laurence Olivier, George Sanders, Joan Fontaine"


In [9]:
df.dtypes

Title                 object
Year                   int64
Rating                object
Genre                 object
Director(s)           object
Box Office Revenue    object
Lead Actors           object
dtype: object

In [10]:
# Convert "Year" to numeric, coercing invalid entries to NaN
df["Year"] = pd.to_numeric(df["Year"], errors="coerce")

# Convert "Rating" to numeric, coercing invalid entries to NaN
df["Rating"] = pd.to_numeric(df["Rating"], errors="coerce")

In [11]:
# Convert "Box Office Revenue" to numeric
# Assume the format might be like "$123,456,789" or "Unknown".
def parse_box_office(value):
    if isinstance(value, str):
        value = value.strip()
        if value.lower() == "unknown":
            return np.nan
        # Remove $, commas, etc.
        value = value.replace("$", "").replace(",", "")
        # Try converting to float
        try:
            return float(value)
        except ValueError:
            return np.nan
    return np.nan

df["Box Office Revenue"] = df["Box Office Revenue"].apply(parse_box_office)

3. Descriptive Statistics

In [15]:
# Basic descriptive stats for Year, Rating, and Box Office
print("\n=== Descriptive Statistics ===")
df.describe()


=== Descriptive Statistics ===


Unnamed: 0,Year,Rating,Box Office Revenue
count,250.0,0.0,0.0
mean,1988.42,,
std,25.589351,,
min,1921.0,,
25%,1972.25,,
50%,1995.0,,
75%,2009.0,,
max,2024.0,,


In [16]:
print("\nGenre Distribution (Top 10):")
print(df["Genre"].value_counts().head(10))


Genre Distribution (Top 10):
Genre
Psychological Drama, Tragedy, Drama                                                                                                                      4
Drama                                                                                                                                                    3
Drama, Romance                                                                                                                                           2
Adventure Epic, Desert Adventure, Globetrotting Adventure, Quest, Action, Adventure                                                                      2
Psychological Thriller, Suspense Mystery, Drama, Mystery, Thriller                                                                                       2
Action Epic, Adventure Epic, Dark Fantasy, Epic, Fantasy Epic, Globetrotting Adventure, Quest, Sci-Fi Epic, Space Sci-Fi, Sword & Sorcery                2
Drama, Family                     

In [17]:
print("\nDirector(s) Distribution (Top 10):")
print(df["Director(s)"].value_counts().head(10))


Director(s) Distribution (Top 10):
Director(s)
The Ink Spots, Jack Lawrence, Stephen King, Tim Robbins, Hank Williams, Andy Dufresne, Bob Gunton, Frank Darabont, Morgan Freeman                                                      1
Carl Foreman, David Lean, Alec Guinness, Jack Hawkins, Michael Wilson, William Holden, Colonel Nicholson, Malcolm Arnold, Kenneth Alford, Geoffrey Horne, Pierre Boulle, John Scott    1
Samuel Barber, John Hurt, Mel Brooks, Christopher De Vore, London Symphony Orchestra, André Previn, Anthony Hopkins, Eric Bergren, Anne Bancroft, David Lynch, John Merrick            1
Tony Wendice, Cary Grant, Grace Kelly, Robert Cummings, Mark Halliday, Margot Mary Wendice, Charles Dorat, Ray Milland, Frederick Knott, Alfred Hitchcock                              1
Murilo Hauser, Fernanda Montenegro, Martha, Tim Maia, Marcelo Rubens Paiva, Eunice Paiva, Fernanda Torres, Heitor Lorega, Selton Mello, Walter Salles, Léo Maia                        1
Hattie McDaniel, Margaret M

In [18]:
print("\nLead Actors (Top 10):")
print(df["Lead Actors"].value_counts().head(10))


Lead Actors (Top 10):
Lead Actors
Carrie Fisher, Harrison Ford, Mark Hamill           3
Elijah Wood, Viggo Mortensen, Ian McKellen          2
Bob Gunton, Morgan Freeman, Tim Robbins             1
Natalie Portman, Hugo Weaving, Rupert Graves        1
Jonny Lee Miller, Ewen Bremner, Ewan McGregor       1
Jack Hawkins, William Holden, Alec Guinness         1
Woody Harrelson, Frances McDormand, Sam Rockwell    1
Pedro Pascal, Lupita Nyong'o, Kit Connor            1
Amy Poehler, Bill Hader, Lewis Black                1
Jason Flemyng, Nick Moran, Dexter Fletcher          1
Name: count, dtype: int64
