In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("imdb_top_1000.csv") # DataFrame of the top 1000 movies
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [3]:
# Peek at the first few poster URLs before deciding whether the column is useful.
df["Poster_Link"].head()

0    https://m.media-amazon.com/images/M/MV5BMDFkYT...
1    https://m.media-amazon.com/images/M/MV5BM2MyNj...
2    https://m.media-amazon.com/images/M/MV5BMTMxNT...
3    https://m.media-amazon.com/images/M/MV5BMWMwMG...
4    https://m.media-amazon.com/images/M/MV5BMWU4N2...
Name: Poster_Link, dtype: object

In [4]:
# The poster URLs are not needed, so the column is removed from the frame.
df = df.drop(columns=["Poster_Link"])
df.head(3)

Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444


# Handling Missing Values

In [5]:
df.isnull().sum() # Determining which columns have null values, and how many.

Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64

In [6]:
# The number of rows that contain at least one null value
int(df.isnull().any(axis=1).sum())

286

In [7]:
df["Certificate"].unique()

array(['A', 'UA', 'U', 'PG-13', 'R', nan, 'PG', 'G', 'Passed', 'TV-14',
       '16', 'TV-MA', 'Unrated', 'GP', 'Approved', 'TV-PG', 'U/A'],
      dtype=object)

In [8]:
# Titles with no certificate are folded into the "Unrated" label already present in the column.
df["Certificate"] = df["Certificate"].fillna("Unrated")

In [9]:
df["Certificate"].unique()

array(['A', 'UA', 'U', 'PG-13', 'R', 'Unrated', 'PG', 'G', 'Passed',
       'TV-14', '16', 'TV-MA', 'GP', 'Approved', 'TV-PG', 'U/A'],
      dtype=object)

In [10]:
# -1 represents missing values.
# Note: "Gross" is still an object (string) column at this point, so the
# integer -1 ends up mixed in with comma-formatted strings.
df = df.fillna({"Meta_score": -1, "Gross": -1})

In [11]:
df.isnull().sum()

Series_Title     0
Released_Year    0
Certificate      0
Runtime          0
Genre            0
IMDB_Rating      0
Overview         0
Meta_score       0
Director         0
Star1            0
Star2            0
Star3            0
Star4            0
No_of_Votes      0
Gross            0
dtype: int64

# Assigning Proper Data Types

In [12]:
df.dtypes

Series_Title      object
Released_Year     object
Certificate       object
Runtime           object
Genre             object
IMDB_Rating      float64
Overview          object
Meta_score       float64
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes        int64
Gross             object
dtype: object

In [13]:
df["Runtime"]

0      142 min
1      175 min
2      152 min
3      202 min
4       96 min
        ...   
995    115 min
996    201 min
997    118 min
998     97 min
999     86 min
Name: Runtime, Length: 1000, dtype: object

In [14]:
import re

# Function to format the data in the "Runtime" column so that it only shows the numbers in the cells 
def extract_minutes(df, column_name):
    """Parse a "<digits> min" text column into whole minutes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to parse.
    column_name : str
        Name of the column whose cells look like ``"142 min"``.

    Returns
    -------
    pandas.Series
        One entry per row of ``df[column_name]``:

        * strings starting with ``<digits> min`` (optional leading whitespace,
          optional whitespace before ``min``) become the integer number of minutes;
        * strings that do not match become ``None``;
        * non-string cells are returned unchanged.
    """
    # Compiled once per call instead of once per cell; the leading ``\s*``
    # lets padded values such as " 142 min" parse instead of turning into None.
    minutes_pattern = re.compile(r'\s*(\d+)\s*min')

    def extract_number(minute_string):
        # Non-string cells (numbers, NaN, ...) are passed through untouched.
        if not isinstance(minute_string, str):
            return minute_string
        match = minutes_pattern.match(minute_string)
        # A string that does not look like "<digits> min" yields None.
        return int(match.group(1)) if match else None

    return df[column_name].apply(extract_number)

df["Runtime (min)"] = extract_minutes(df, "Runtime")

In [15]:
df.head(1)

Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross,Runtime (min)
0,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469,142


In [16]:
df[["Runtime", "Runtime (min)"]]

Unnamed: 0,Runtime,Runtime (min)
0,142 min,142
1,175 min,175
2,152 min,152
3,202 min,202
4,96 min,96
...,...,...
995,115 min,115
996,201 min,201
997,118 min,118
998,97 min,97


In [17]:
# Position currently occupied by the raw "Runtime" column
old_column_index = df.columns.get_loc("Runtime")

# Keep the parsed "Runtime (min)" values in a variable so they survive the drop below
new_column_series = df["Runtime (min)"]

# Remove both the raw column and the parsed column (which sits at the end of the frame)
df = df.drop(columns=["Runtime", "Runtime (min)"])

# Put "Runtime (min)" back in the slot that "Runtime" used to occupy
df.insert(loc=old_column_index, column="Runtime (min)", value=new_column_series)

In [18]:
df.head(1)

Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime (min),Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,The Shawshank Redemption,1994,A,142,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469


In [19]:
df["Runtime (min)"].dtypes

dtype('int64')

In [20]:
len(df["Genre"].unique())

202

In [21]:
df.dtypes

Series_Title      object
Released_Year     object
Certificate       object
Runtime (min)      int64
Genre             object
IMDB_Rating      float64
Overview          object
Meta_score       float64
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes        int64
Gross             object
dtype: object

In [22]:
df["Released_Year"].unique()

array(['1994', '1972', '2008', '1974', '1957', '2003', '1993', '2010',
       '1999', '2001', '1966', '2002', '1990', '1980', '1975', '2020',
       '2019', '2014', '1998', '1997', '1995', '1991', '1977', '1962',
       '1954', '1946', '2011', '2006', '2000', '1988', '1985', '1968',
       '1960', '1942', '1936', '1931', '2018', '2017', '2016', '2012',
       '2009', '2007', '1984', '1981', '1979', '1971', '1963', '1964',
       '1950', '1940', '2013', '2005', '2004', '1992', '1987', '1986',
       '1983', '1976', '1973', '1965', '1959', '1958', '1952', '1948',
       '1944', '1941', '1927', '1921', '2015', '1996', '1989', '1978',
       '1961', '1955', '1953', '1925', '1924', '1982', '1967', '1951',
       '1949', '1939', '1937', '1934', '1928', '1926', '1920', '1970',
       '1969', '1956', '1947', '1945', '1930', '1938', '1935', '1933',
       '1932', '1922', '1943', 'PG'], dtype=object)

In [23]:
df.loc[df["Released_Year"] == "PG"]

Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime (min),Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
966,Apollo 13,PG,U,140,"Adventure, Drama, History",7.6,NASA must devise a strategy to return Apollo 1...,77.0,Ron Howard,Tom Hanks,Bill Paxton,Kevin Bacon,Gary Sinise,269197,173837933


In [24]:
# Remove the malformed row whose "Released_Year" holds the string "PG"
# instead of a year, then renumber the index.
# Filtering on the value rather than on the hard-coded row label 966 keeps the
# cell safe to re-run: once the index has been reset, dropping label 966 a
# second time would silently remove a different, valid row.
df = df[df["Released_Year"] != "PG"].reset_index(drop=True)

In [25]:
len(df.loc[df["Released_Year"] == "PG"])

0

In [26]:
df.tail(1)

Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime (min),Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
998,The 39 Steps,1935,Unrated,86,"Crime, Mystery, Thriller",7.6,A man in London tries to help a counter-espion...,93.0,Alfred Hitchcock,Robert Donat,Madeleine Carroll,Lucie Mannheim,Godfrey Tearle,51853,-1


In [27]:
# With the non-numeric "PG" entry gone, every year string can be cast to an integer.
df = df.astype({"Released_Year": int})

In [28]:
df.dtypes

Series_Title      object
Released_Year      int64
Certificate       object
Runtime (min)      int64
Genre             object
IMDB_Rating      float64
Overview          object
Meta_score       float64
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes        int64
Gross             object
dtype: object

In [29]:
print(df["Gross"].head(5))
print(df["Gross"].tail(5))

0     28,341,469
1    134,966,411
2    534,858,444
3     57,300,000
4      4,360,000
Name: Gross, dtype: object
994            -1
995            -1
996    30,500,000
997            -1
998            -1
Name: Gross, dtype: object


In [30]:
# Strip the thousands separators and parse the result as numbers.
# The .str accessor yields NaN for elements that are not strings, so the
# integer -1 placeholders inserted earlier come out of this step as NaN.
df["Gross"] = pd.to_numeric(df["Gross"].str.replace(",", "", regex=False))

In [31]:
df["Gross"].head()

0     28341469.0
1    134966411.0
2    534858444.0
3     57300000.0
4      4360000.0
Name: Gross, dtype: float64

In [32]:
# Checking for decimal (non-integer) values
df.loc[df["Gross"] % 1 != 0, "Gross"].unique()

array([nan])

In [33]:
# Compare how many entries are NaN with how many carry the -1 placeholder.
print("Number of NaN values:", df["Gross"].isna().sum())
print("Number of -1 values:", (df["Gross"] == -1).sum())

Number of NaN values: 169
Number of -1 values: 0


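The "-1" values were converted back into "NaN" values. The placeholder had been inserted as the integer -1 into a column of strings, and `.str.replace` returns NaN for every element that is not a string, so the placeholders were lost before `pd.to_numeric` ran.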

In [34]:
# Restore the -1 placeholder for missing values, then store the column as integers.
df["Gross"] = df["Gross"].fillna(-1).astype(int)

In [35]:
df.dtypes

Series_Title      object
Released_Year      int64
Certificate       object
Runtime (min)      int64
Genre             object
IMDB_Rating      float64
Overview          object
Meta_score       float64
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes        int64
Gross              int64
dtype: object

In [36]:
df.head(3)

Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime (min),Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,The Shawshank Redemption,1994,A,142,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,The Godfather,1972,A,175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,The Dark Knight,2008,UA,152,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444


In [39]:
# "Overview" is unneeded
df = df.drop(columns=["Overview"])
df.head(3)

Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime (min),Genre,IMDB_Rating,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,The Shawshank Redemption,1994,A,142,Drama,9.3,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,The Godfather,1972,A,175,"Crime, Drama",9.2,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,The Dark Knight,2008,UA,152,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444


In [41]:
df.isnull().sum()

Series_Title     0
Released_Year    0
Certificate      0
Runtime (min)    0
Genre            0
IMDB_Rating      0
Meta_score       0
Director         0
Star1            0
Star2            0
Star3            0
Star4            0
No_of_Votes      0
Gross            0
dtype: int64

In [42]:
# Write the cleaned frame to "cleaned_data.csv" in the working directory.
# NOTE(review): with pandas' default `index=True` the row index is written as an
# extra, unnamed first column. Pass `index=False` if readers of this file do
# not rely on that column; confirm with downstream consumers before changing.
df.to_csv("cleaned_data.csv")