In [1]:
import pandas as pd

In [2]:
import matplotlib.pyplot as plt

# Data Cleaning

In [3]:
# Load the IMDb top-1000 movies dataset from a CSV file in the working directory.
df = pd.read_csv("imdb_top_1000.csv")
df.head()  # preview the first five rows

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [4]:
# Show the Poster_Link values for the first five rows.
df.head()["Poster_Link"]

0    https://m.media-amazon.com/images/M/MV5BMDFkYT...
1    https://m.media-amazon.com/images/M/MV5BM2MyNj...
2    https://m.media-amazon.com/images/M/MV5BMTMxNT...
3    https://m.media-amazon.com/images/M/MV5BMWMwMG...
4    https://m.media-amazon.com/images/M/MV5BMWU4N2...
Name: Poster_Link, dtype: object

In [5]:
# Drop the poster URL column, which this notebook treats as unneeded.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell idempotent: re-running it no longer raises KeyError once the
# column is already gone.
df = df.drop(columns=["Poster_Link"], errors="ignore")
df.head(3)

Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444


## Handling Missing Values

In [6]:
# Count the null values in each column to see which columns need handling.
df.isnull().sum()

Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64

In [7]:
# Number of rows that contain at least one null value
# (sum of the per-row "any null" flags).
int(df.isnull().any(axis=1).sum())

286

In [8]:
# List the distinct Certificate values (NaN included) present in the data.
df["Certificate"].unique()

array(['A', 'UA', 'U', 'PG-13', 'R', nan, 'PG', 'G', 'Passed', 'TV-14',
       '16', 'TV-MA', 'Unrated', 'GP', 'Approved', 'TV-PG', 'U/A'],
      dtype=object)

In [9]:
# Label movies without a certificate as "Unrated", a value that already
# exists in this column.
df["Certificate"] = df["Certificate"].fillna("Unrated")

In [10]:
# Re-list the distinct Certificate values to confirm NaN no longer appears.
df["Certificate"].unique()

array(['A', 'UA', 'U', 'PG-13', 'R', 'Unrated', 'PG', 'G', 'Passed',
       'TV-14', '16', 'TV-MA', 'GP', 'Approved', 'TV-PG', 'U/A'],
      dtype=object)

In [11]:
# Use -1 as the sentinel for missing values in both columns.
# NOTE(review): Gross is still an object-dtype column, so after this fill it
# holds a mix of its original values and the integer -1 -- confirm that the
# later type-conversion steps expect that.
df.fillna({"Meta_score": -1, "Gross": -1}, inplace=True)

In [12]:
# Confirm that no null values remain after the fills above.
df.isnull().sum()

Series_Title     0
Released_Year    0
Certificate      0
Runtime          0
Genre            0
IMDB_Rating      0
Overview         0
Meta_score       0
Director         0
Star1            0
Star2            0
Star3            0
Star4            0
No_of_Votes      0
Gross            0
dtype: int64

## Assigning Proper Data Types

In [13]:
# Inspect each column's dtype to spot columns stored as generic objects.
df.dtypes

Series_Title      object
Released_Year     object
Certificate       object
Runtime           object
Genre             object
IMDB_Rating      float64
Overview          object
Meta_score       float64
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes        int64
Gross             object
dtype: object

In [14]:
# Look at the raw Runtime values; they are strings such as "142 min".
df["Runtime"]

0      142 min
1      175 min
2      152 min
3      202 min
4       96 min
        ...   
995    115 min
996    201 min
997    118 min
998     97 min
999     86 min
Name: Runtime, Length: 1000, dtype: object

In [15]:
import re

# Parse the "Runtime" column (e.g. "142 min") into plain minute counts (e.g. 142).
def extract_minutes(df, column_name):
    """Extract the minute count from runtime strings such as "142 min".

    Args:
        df: DataFrame that contains the column to parse. It is not modified.
        column_name: Name of the column holding the runtime values.

    Returns:
        The result of applying the parser to ``df[column_name]``. For each
        string value the parser yields the leading number as an ``int`` when
        the text looks like "<digits> min" (leading whitespace is ignored),
        and ``None`` otherwise. Non-string values are passed through unchanged.
    """

    def extract_number(minute_string):
        # Non-string cells (e.g. NaN or already-numeric values) are returned as is.
        if not isinstance(minute_string, str):
            return minute_string
        # re.match only matches at the start of the string, so strip stray
        # leading whitespace first; otherwise " 142 min" would silently
        # become None.
        match = re.match(r'(\d+)\s*min', minute_string.lstrip())
        # No match -> None, so unparseable text is reported as missing.
        return int(match.group(1)) if match else None

    return df[column_name].apply(extract_number)

# Store the parsed minute counts in a new column; the original "Runtime"
# text column is left in place.
df["Runtime (min)"] = extract_minutes(df, "Runtime")

In [16]:
# Show the first row to confirm the new "Runtime (min)" column was added.
df.head(1)

Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross,Runtime (min)
0,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469,142


In [17]:
# Compare the original text with the parsed minutes side by side.
df[["Runtime", "Runtime (min)"]]

Unnamed: 0,Runtime,Runtime (min)
0,142 min,142
1,175 min,175
2,152 min,152
3,202 min,202
4,96 min,96
...,...,...
995,115 min,115
996,201 min,201
997,118 min,118
998,97 min,97
