In [100]:
import pandas as pd
import numpy as np

In [102]:
df = pd.read_csv("messy_IMDB_dataset.csv", encoding="ISO-8859-1", sep=None, engine="python")

In [104]:
df.head()

Unnamed: 0,IMBD title ID,Original titlÊ,Release year,Genrë¨,Duration,Country,Content Rating,Director,Unnamed: 8,Income,Votes,Score
0,tt0111161,The Shawshank Redemption,1995-02-10,Drama,142.0,USA,R,Frank Darabont,,$ 28815245,2.278.845,9.3
1,tt0068646,The Godfather,09 21 1972,"Crime, Drama",175.0,USA,R,Francis Ford Coppola,,$ 246120974,1.572.674,9.2
2,tt0468569,The Dark Knight,23 -07-2008,"Action, Crime, Drama",152.0,US,PG-13,Christopher Nolan,,$ 1005455211,2.241.615,9.
3,tt0071562,The Godfather: Part II,1975-09-25,"Crime, Drama",220.0,USA,R,Francis Ford Coppola,,"$ 4o8,035,783",1.098.714,9.0
4,tt0110912,Pulp Fiction,1994-10-28,"Crime, Drama",,USA,R,Quentin Tarantino,,$ 222831817,1.780.147,"8,9f"


# Steps for cleaning

- Rename Columns
- Drop Unnecessary Columns
- Handle Missing Values
- Convert Data Types
- Standardised Formatting
- Remove or Correct Invalid Data

In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   IMBD title ID   100 non-null    object 
 1   Original titlÊ  100 non-null    object 
 2   Release year    100 non-null    object 
 3   Genrë¨          100 non-null    object 
 4   Duration        99 non-null     object 
 5   Country         100 non-null    object 
 6   Content Rating  77 non-null     object 
 7   Director        100 non-null    object 
 8   Unnamed: 8      0 non-null      float64
 9   Income          100 non-null    object 
 10   Votes          100 non-null    object 
 11  Score           100 non-null    object 
dtypes: float64(1), object(11)
memory usage: 9.6+ KB


In [111]:
df.describe(include="all")

Unnamed: 0,IMBD title ID,Original titlÊ,Release year,Genrë¨,Duration,Country,Content Rating,Director,Unnamed: 8,Income,Votes,Score
count,100,100,100,100,99.0,100,77,100,0.0,100,100,100.0
unique,100,100,99,59,71.0,18,7,64,,100,100,28.0
top,tt0111161,The Shawshank Redemption,2000-05-19,Drama,119.0,USA,R,Christopher Nolan,,$ 28815245,2.278.845,8.6
freq,1,1,2,9,3.0,62,45,6,,1,1,11.0
mean,,,,,,,,,,,,
std,,,,,,,,,,,,
min,,,,,,,,,,,,
25%,,,,,,,,,,,,
50%,,,,,,,,,,,,
75%,,,,,,,,,,,,


# Renaming Columns

In [115]:
df.columns

Index(['IMBD title ID', 'Original titlÊ', 'Release year', 'Genrë¨', 'Duration',
       'Country', 'Content Rating', 'Director', 'Unnamed: 8', 'Income',
       ' Votes ', 'Score'],
      dtype='object')

In [117]:
# Removing white spaces
df.columns = df.columns.str.strip()

In [119]:
df.columns

Index(['IMBD title ID', 'Original titlÊ', 'Release year', 'Genrë¨', 'Duration',
       'Country', 'Content Rating', 'Director', 'Unnamed: 8', 'Income',
       'Votes', 'Score'],
      dtype='object')

In [121]:


df.rename(columns={"IMBD title ID": "IMDB_title_ID",
                  "Original titlÊ": "Original_title",
                  "Release year": "Release_year",
                  "Genrë¨": "Genre",
                  "Content Rating": "Content_Rating",
                   "Votes": "Votes",
                   "Income": "Income",
                   "Score": "Score",
                   "Director": "Director",
                   "Duration": "Duration",
                   "Country": "Country"
                  }, inplace=True)

In [123]:
df.columns

Index(['IMDB_title_ID', 'Original_title', 'Release_year', 'Genre', 'Duration',
       'Country', 'Content_Rating', 'Director', 'Unnamed: 8', 'Income',
       'Votes', 'Score'],
      dtype='object')

In [125]:
df.head(3)

Unnamed: 0,IMDB_title_ID,Original_title,Release_year,Genre,Duration,Country,Content_Rating,Director,Unnamed: 8,Income,Votes,Score
0,tt0111161,The Shawshank Redemption,1995-02-10,Drama,142,USA,R,Frank Darabont,,$ 28815245,2.278.845,9.3
1,tt0068646,The Godfather,09 21 1972,"Crime, Drama",175,USA,R,Francis Ford Coppola,,$ 246120974,1.572.674,9.2
2,tt0468569,The Dark Knight,23 -07-2008,"Action, Crime, Drama",152,US,PG-13,Christopher Nolan,,$ 1005455211,2.241.615,9.0


In [127]:
# Drop the Irrelevant column

df.drop("Unnamed: 8", axis=1, inplace=True)

# Handling Missing Values

In [130]:
# Handling Missing Values

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   IMDB_title_ID   100 non-null    object
 1   Original_title  100 non-null    object
 2   Release_year    100 non-null    object
 3   Genre           100 non-null    object
 4   Duration        99 non-null     object
 5   Country         100 non-null    object
 6   Content_Rating  77 non-null     object
 7   Director        100 non-null    object
 8   Income          100 non-null    object
 9   Votes           100 non-null    object
 10  Score           100 non-null    object
dtypes: object(11)
memory usage: 8.8+ KB


In [132]:
# To know the exact column with the missing value and the movies name


df[df["Duration"].isnull()]

Unnamed: 0,IMDB_title_ID,Original_title,Release_year,Genre,Duration,Country,Content_Rating,Director,Income,Votes,Score
13,,,,,,,,,,,
14,tt0133093,The Matrix,1999-05-07,"Action, Sci-Fi",,USA,R,"Lana Wachowski, Lilly Wachowski",$ 465718588,1.632.315,++8.7


In [134]:
# i had to do a google search to get the movie duration
df.loc[14, "Duration"] = 136

In [136]:
df.loc[14]

IMDB_title_ID                           tt0133093
Original_title                         The Matrix
Release_year                           1999-05-07
Genre                              Action, Sci-Fi
Duration                                      136
Country                                       USA
Content_Rating                                  R
Director          Lana Wachowski, Lilly Wachowski
Income                                $ 465718588
Votes                                   1.632.315
Score                                       ++8.7
Name: 14, dtype: object

In [138]:
df[df["Content_Rating"].isnull()].head()

Unnamed: 0,IMDB_title_ID,Original_title,Release_year,Genre,Duration,Country,Content_Rating,Director,Income,Votes,Score
13,,,,,,,,,,,
27,tt0118799,La vita B9 bella,1997-12-20,"Comedy, Drama, Romance",116.0,Italy1,,Roberto Benigni,$ 230098753,605.648,8.6
28,tt6751668,Gisaengchung,2019-11-07,"Comedy, Drama, Thriller",132.0,South Korea,,Bong Joon Ho,$ 257604912,470.931,8.6
36,tt0110413,LÃ©on,1995-04-07,"Action, Crime, Drama",110.0,France,,Luc Besson,$ 19552639,1.007.598,8.5
40,tt7286456,Joker,2019-10-03,"Crime, Drama, Thriller",122.0,USA,,Todd Phillips,$ 1074251311,855.097,8.4


In [140]:
# There are Numerous missing values in the Content_Rating columns
df["Content_Rating"].fillna("Not Rated", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Content_Rating"].fillna("Not Rated", inplace=True)


In [142]:
# Drop Row with Critical missing data

df.dropna(subset=["IMDB_title_ID", "Original_title", "Release_year"], inplace=True)

In [144]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 100
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   IMDB_title_ID   100 non-null    object
 1   Original_title  100 non-null    object
 2   Release_year    100 non-null    object
 3   Genre           100 non-null    object
 4   Duration        100 non-null    object
 5   Country         100 non-null    object
 6   Content_Rating  100 non-null    object
 7   Director        100 non-null    object
 8   Income          100 non-null    object
 9   Votes           100 non-null    object
 10  Score           100 non-null    object
dtypes: object(11)
memory usage: 9.4+ KB


In [146]:
df.head(3)

Unnamed: 0,IMDB_title_ID,Original_title,Release_year,Genre,Duration,Country,Content_Rating,Director,Income,Votes,Score
0,tt0111161,The Shawshank Redemption,1995-02-10,Drama,142,USA,R,Frank Darabont,$ 28815245,2.278.845,9.3
1,tt0068646,The Godfather,09 21 1972,"Crime, Drama",175,USA,R,Francis Ford Coppola,$ 246120974,1.572.674,9.2
2,tt0468569,The Dark Knight,23 -07-2008,"Action, Crime, Drama",152,US,PG-13,Christopher Nolan,$ 1005455211,2.241.615,9.0


# Converting data types

In [150]:
df.head()

Unnamed: 0,IMDB_title_ID,Original_title,Release_year,Genre,Duration,Country,Content_Rating,Director,Income,Votes,Score
0,tt0111161,The Shawshank Redemption,1995-02-10,Drama,142.0,USA,R,Frank Darabont,$ 28815245,2.278.845,9.3
1,tt0068646,The Godfather,09 21 1972,"Crime, Drama",175.0,USA,R,Francis Ford Coppola,$ 246120974,1.572.674,9.2
2,tt0468569,The Dark Knight,23 -07-2008,"Action, Crime, Drama",152.0,US,PG-13,Christopher Nolan,$ 1005455211,2.241.615,9.
3,tt0071562,The Godfather: Part II,1975-09-25,"Crime, Drama",220.0,USA,R,Francis Ford Coppola,"$ 4o8,035,783",1.098.714,9.0
4,tt0110912,Pulp Fiction,1994-10-28,"Crime, Drama",,USA,R,Quentin Tarantino,$ 222831817,1.780.147,"8,9f"


In [152]:
# Handling Inconsistent Dates

df.loc[[1, 2, 5, 9, 12, 15, 18, 45, 70, 83, 84], ["Release_year"]]

Unnamed: 0,Release_year
1,09 21 1972
2,23 -07-2008
5,22 Feb 04
9,10-29-99
12,23rd December of 1966
15,01/16-03
18,18/11/1976
45,21-11-46
70,"The 6th of marzo, year 1951"
83,1984-02-34


In [154]:
# standardize the format of some inconsistent date


df.loc[1, "Release_year"] = "1972-09-21"
df.loc[2, "Release_year"] = "2008-07-23"
df.loc[5, "Release_year"] = "2004-02-22"
df.loc[9, "Release_year"] = "1999-10-29"
df.loc[12, "Release_year"] = "1966-12-23"
df.loc[15, "Release_year"] = "2003-01-16"
df.loc[18, "Release_year"] = "1976-11-18"
df.loc[45, "Release_year"] = "1946-11-21"
df.loc[70, "Release_year"] = "1951-03-06"
df.loc[83, "Release_year"] = "1984-12-09"
df.loc[84, "Release_year"] = "1976-02-08"


In [156]:
df.head()

Unnamed: 0,IMDB_title_ID,Original_title,Release_year,Genre,Duration,Country,Content_Rating,Director,Income,Votes,Score
0,tt0111161,The Shawshank Redemption,1995-02-10,Drama,142.0,USA,R,Frank Darabont,$ 28815245,2.278.845,9.3
1,tt0068646,The Godfather,1972-09-21,"Crime, Drama",175.0,USA,R,Francis Ford Coppola,$ 246120974,1.572.674,9.2
2,tt0468569,The Dark Knight,2008-07-23,"Action, Crime, Drama",152.0,US,PG-13,Christopher Nolan,$ 1005455211,2.241.615,9.
3,tt0071562,The Godfather: Part II,1975-09-25,"Crime, Drama",220.0,USA,R,Francis Ford Coppola,"$ 4o8,035,783",1.098.714,9.0
4,tt0110912,Pulp Fiction,1994-10-28,"Crime, Drama",,USA,R,Quentin Tarantino,$ 222831817,1.780.147,"8,9f"


In [160]:
df['Income'] = df["Income"].replace({"o": "0", r"\$": "", ",":""}, regex=True)

df["Votes"] = df["Votes"].replace({"\,": "", "\.": ""}, regex=True)

  df["Votes"] = df["Votes"].replace({"\,": "", "\.": ""}, regex=True)
  df["Votes"] = df["Votes"].replace({"\,": "", "\.": ""}, regex=True)


In [162]:
df.head()

Unnamed: 0,IMDB_title_ID,Original_title,Release_year,Genre,Duration,Country,Content_Rating,Director,Income,Votes,Score
0,tt0111161,The Shawshank Redemption,1995-02-10,Drama,142.0,USA,R,Frank Darabont,28815245,2278845,9.3
1,tt0068646,The Godfather,1972-09-21,"Crime, Drama",175.0,USA,R,Francis Ford Coppola,246120974,1572674,9.2
2,tt0468569,The Dark Knight,2008-07-23,"Action, Crime, Drama",152.0,US,PG-13,Christopher Nolan,1005455211,2241615,9.
3,tt0071562,The Godfather: Part II,1975-09-25,"Crime, Drama",220.0,USA,R,Francis Ford Coppola,408035783,1098714,9.0
4,tt0110912,Pulp Fiction,1994-10-28,"Crime, Drama",,USA,R,Quentin Tarantino,222831817,1780147,"8,9f"


In [164]:
df["Release_year"] = pd.to_datetime(df["Release_year"], errors="coerce")

df["Income"] = pd.to_numeric(df["Income"], errors="coerce")

df["Votes"] = pd.to_numeric(df["Votes"], errors="coerce")

In [167]:
df.head()

Unnamed: 0,IMDB_title_ID,Original_title,Release_year,Genre,Duration,Country,Content_Rating,Director,Income,Votes,Score
0,tt0111161,The Shawshank Redemption,1995-02-10,Drama,142.0,USA,R,Frank Darabont,28815245,2278845,9.3
1,tt0068646,The Godfather,1972-09-21,"Crime, Drama",175.0,USA,R,Francis Ford Coppola,246120974,1572674,9.2
2,tt0468569,The Dark Knight,2008-07-23,"Action, Crime, Drama",152.0,US,PG-13,Christopher Nolan,1005455211,2241615,9.
3,tt0071562,The Godfather: Part II,1975-09-25,"Crime, Drama",220.0,USA,R,Francis Ford Coppola,408035783,1098714,9.0
4,tt0110912,Pulp Fiction,1994-10-28,"Crime, Drama",,USA,R,Quentin Tarantino,222831817,1780147,"8,9f"


In [169]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 100
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   IMDB_title_ID   100 non-null    object        
 1   Original_title  100 non-null    object        
 2   Release_year    100 non-null    datetime64[ns]
 3   Genre           100 non-null    object        
 4   Duration        100 non-null    object        
 5   Country         100 non-null    object        
 6   Content_Rating  100 non-null    object        
 7   Director        100 non-null    object        
 8   Income          100 non-null    int64         
 9   Votes           100 non-null    int64         
 10  Score           100 non-null    object        
dtypes: datetime64[ns](1), int64(2), object(8)
memory usage: 13.4+ KB


In [173]:
df["Score"] = df["Score"].replace({"[^0-9.]": "", "\.\.": ".", "\.$": ""}, regex=True)

  df["Score"] = df["Score"].replace({"[^0-9.]": "", "\.\.": ".", "\.$": ""}, regex=True)
  df["Score"] = df["Score"].replace({"[^0-9.]": "", "\.\.": ".", "\.$": ""}, regex=True)


In [175]:
df.head(2)

Unnamed: 0,IMDB_title_ID,Original_title,Release_year,Genre,Duration,Country,Content_Rating,Director,Income,Votes,Score
0,tt0111161,The Shawshank Redemption,1995-02-10,Drama,142,USA,R,Frank Darabont,28815245,2278845,9.3
1,tt0068646,The Godfather,1972-09-21,"Crime, Drama",175,USA,R,Francis Ford Coppola,246120974,1572674,9.2


In [177]:
# Convert to Numeric
df["Score"] = pd.to_numeric(df["Score"], errors="coerce")

In [179]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 100
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   IMDB_title_ID   100 non-null    object        
 1   Original_title  100 non-null    object        
 2   Release_year    100 non-null    datetime64[ns]
 3   Genre           100 non-null    object        
 4   Duration        100 non-null    object        
 5   Country         100 non-null    object        
 6   Content_Rating  100 non-null    object        
 7   Director        100 non-null    object        
 8   Income          100 non-null    int64         
 9   Votes           100 non-null    int64         
 10  Score           100 non-null    float64       
dtypes: datetime64[ns](1), float64(1), int64(2), object(7)
memory usage: 13.4+ KB


In [181]:
# Normalise the column
df["Score"] = df["Score"].apply(lambda x: x/10 if x> 10 else x)

In [183]:
df.head()

Unnamed: 0,IMDB_title_ID,Original_title,Release_year,Genre,Duration,Country,Content_Rating,Director,Income,Votes,Score
0,tt0111161,The Shawshank Redemption,1995-02-10,Drama,142.0,USA,R,Frank Darabont,28815245,2278845,9.3
1,tt0068646,The Godfather,1972-09-21,"Crime, Drama",175.0,USA,R,Francis Ford Coppola,246120974,1572674,9.2
2,tt0468569,The Dark Knight,2008-07-23,"Action, Crime, Drama",152.0,US,PG-13,Christopher Nolan,1005455211,2241615,9.0
3,tt0071562,The Godfather: Part II,1975-09-25,"Crime, Drama",220.0,USA,R,Francis Ford Coppola,408035783,1098714,9.0
4,tt0110912,Pulp Fiction,1994-10-28,"Crime, Drama",,USA,R,Quentin Tarantino,222831817,1780147,8.9


# Final Quality Check

In [192]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 100
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   IMDB_title_ID   100 non-null    object        
 1   Original_title  100 non-null    object        
 2   Release_year    100 non-null    datetime64[ns]
 3   Genre           100 non-null    object        
 4   Duration        100 non-null    object        
 5   Country         100 non-null    object        
 6   Content_Rating  100 non-null    object        
 7   Director        100 non-null    object        
 8   Income          100 non-null    int64         
 9   Votes           100 non-null    int64         
 10  Score           100 non-null    float64       
dtypes: datetime64[ns](1), float64(1), int64(2), object(7)
memory usage: 13.4+ KB


In [194]:
df.head()

Unnamed: 0,IMDB_title_ID,Original_title,Release_year,Genre,Duration,Country,Content_Rating,Director,Income,Votes,Score
0,tt0111161,The Shawshank Redemption,1995-02-10,Drama,142.0,USA,R,Frank Darabont,28815245,2278845,9.3
1,tt0068646,The Godfather,1972-09-21,"Crime, Drama",175.0,USA,R,Francis Ford Coppola,246120974,1572674,9.2
2,tt0468569,The Dark Knight,2008-07-23,"Action, Crime, Drama",152.0,US,PG-13,Christopher Nolan,1005455211,2241615,9.0
3,tt0071562,The Godfather: Part II,1975-09-25,"Crime, Drama",220.0,USA,R,Francis Ford Coppola,408035783,1098714,9.0
4,tt0110912,Pulp Fiction,1994-10-28,"Crime, Drama",,USA,R,Quentin Tarantino,222831817,1780147,8.9


# Saving the cleaned csv

In [197]:
df.to_csv("IMDB_dataset_cleaned.csv", index=False)