In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Read and merge imdb and the reviewer data

In [112]:
df = pd.read_csv('movie_details5000.csv')
df['Title'] = df['Title'].str[:-7]
df.head()
df.to_csv('the_reviewer.csv')

In [113]:
df1 = pd.read_csv('the_reviewer.csv')
df2 = pd.read_csv('imdb.csv')
df = pd.merge(df1, df2, on='Title')
drop_columns = ['Rating_Count', 'Keywords', 'Release', 'Movie_ID', 'Filming_Location', 'Worldwide Box Office','Unnamed: 0']
df.drop(drop_columns, axis=1, inplace=True)
df.head(5)

Unnamed: 0,Title,Theater counts,Opening Weekend,Budget,Cast,Crew,Studios,Genre,Languages,Countries,Release_Data,Runtime,Gross_worldwide,Rating,ListOfCertificate
0,Avatar,3452,"$77,025,481","$237,000,000","Sam Worthington,Zoe Saldana,Sigourney Weaver,M...",James Cameron,"Twentieth Century Fox,Dune Entertainment,Light...","Action,Adventure,Fantasy,Sci-Fi","English,Spanish",United States,2009-12-18,2 hours 42 minutes,"$2,847,246,203",7.8,PG-13
1,Avengers: Endgame,4662,"$357,115,007","$356,000,000","Robert Downey Jr.,Chris Evans,Mark Ruffalo,Chr...","Stan Lee,Joe Russo,Christopher Markus,Stephen ...","Marvel Studios,Walt Disney Pictures","Action,Adventure,Drama,Sci-Fi","English,Japanese,Xhosa,German",United States,2019-04-26,3 hours 1 minute,"$2,797,501,328",8.4,PG-13
2,Titanic,2674,"$28,638,131","$1,805,000","Clifton Webb,Barbara Stanwyck,Robert Wagner,Au...","Richard L. Breen,Jean Negulesco,Walter Reisch,...",Twentieth Century Fox,"Drama,History,Romance","English,Basque,French,Spanish",United States,1953-07-13,1 hour 38 minutes,,7.0,
3,Titanic,2674,"$28,638,131","$200,000,000","Leonardo DiCaprio,Kate Winslet,Billy Zane,Kath...",James Cameron,"Twentieth Century Fox,Paramount Pictures,Light...","Drama,Romance","English,Swedish,Italian,French","United States,Mexico,Australia",1997-12-18,3 hours 14 minutes,"$2,201,647,264",7.8,PG-13
4,Avengers: Infinity War,4474,"$257,698,183","$321,000,000","Robert Downey Jr.,Chris Hemsworth,Mark Ruffalo...","Stan Lee,Joe Russo,Christopher Markus,Stephen ...","Marvel Studios,Jason Roberts Productions,South...","Action,Adventure,Sci-Fi",English,United States,2018-04-25,2 hours 29 minutes,"$2,048,359,754",8.4,PG-13


### Drop all records which Gross_worldwide is null

In [114]:
df.dropna(subset=['Gross_worldwide'], inplace=True)
df.reset_index(drop=True, inplace=True)

### Parse some attribute

In [115]:
def parse_currency(before_parsed):
    if '$' in before_parsed:
        after_parsed = int(before_parsed.strip('$').replace(',', ""))
    else:
        after_parsed = int(int(before_parsed.strip('€').replace(',', "")) * 1.14)
    return after_parsed

In [116]:
def parse_runtime(time):
    time = str(time)
    time = time.replace(" ", "")
    if "hour" in time:
        hours = 0
        if "hours" in time:
            hours = int(time[0]) * 60
            after_eliminate_hour = time.replace("hours", "")
        else:
            hours = 60
            after_eliminate_hour = time.replace("hour", "")
        minutes = 0
        if "minute" in after_eliminate_hour:
            if "minutes" in after_eliminate_hour:
                minutes = int(after_eliminate_hour.replace("minutes", "")[1:])
            else:
                minutes = 1
        return int(hours) + int(minutes)
    else:
        return int(time.replace("minutes", ""))

In [117]:
def parse_multi_value(field_value):
    array_value = field_value.split(',')
    if '' in array_value:
        return []
    return array_value

### Fill null value

In [118]:
df.isnull().sum()

Title                  0
Theater counts       787
Opening Weekend      240
Budget               308
Cast                   1
Crew                   1
Studios                2
Genre                249
Languages              2
Countries              1
Release_Data           0
Runtime                0
Gross_worldwide        0
Rating                 0
ListOfCertificate     92
dtype: int64

In [119]:
df.head(2)

Unnamed: 0,Title,Theater counts,Opening Weekend,Budget,Cast,Crew,Studios,Genre,Languages,Countries,Release_Data,Runtime,Gross_worldwide,Rating,ListOfCertificate
0,Avatar,3452,"$77,025,481","$237,000,000","Sam Worthington,Zoe Saldana,Sigourney Weaver,M...",James Cameron,"Twentieth Century Fox,Dune Entertainment,Light...","Action,Adventure,Fantasy,Sci-Fi","English,Spanish",United States,2009-12-18,2 hours 42 minutes,"$2,847,246,203",7.8,PG-13
1,Avengers: Endgame,4662,"$357,115,007","$356,000,000","Robert Downey Jr.,Chris Evans,Mark Ruffalo,Chr...","Stan Lee,Joe Russo,Christopher Markus,Stephen ...","Marvel Studios,Walt Disney Pictures","Action,Adventure,Drama,Sci-Fi","English,Japanese,Xhosa,German",United States,2019-04-26,3 hours 1 minute,"$2,797,501,328",8.4,PG-13


In [120]:
df['Theater counts'].fillna('0', inplace=True)
df['Opening Weekend'].fillna('$0', inplace=True)
df['Budget'].fillna('$0', inplace=True)
df['ListOfCertificate'].fillna('G', inplace=True)
df.fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Theater counts'].fillna('0', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Opening Weekend'].fillna('$0', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

### Parsing attribute

In [121]:
df['Gross_worldwide'] = df['Gross_worldwide'].apply(parse_currency)
df['Opening Weekend'] = df['Opening Weekend'].apply(parse_currency)
df['Budget'] = df['Budget'].apply(parse_currency)

In [122]:
df['Theater counts'] = df['Theater counts'].str.replace(',', '')
df['Theater counts'] = pd.to_numeric(df['Theater counts'], errors='coerce')
df['Runtime'] = df['Runtime'].apply(parse_runtime)

In [123]:
cols = ['Cast', 'Genre', 'Studios', 'ListOfCertificate', 'Languages', 'Countries', 'Crew']
for col in cols:
    df[col] = df[col].apply(parse_multi_value)

In [124]:
df['Release_Data'] = pd.to_datetime(df['Release_Data'], format='%Y-%m-%d')
df['Release_Year'] = df['Release_Data'].apply(lambda x: x.year)
df['Release_Month'] = df['Release_Data'].apply(lambda x: x.month)
df['Release_Day'] = df['Release_Data'].apply(lambda x: x.day)

### Replace value 0 to mean

In [125]:
mean_bugdet = int(df['Budget'][df['Budget'] != 0].mean())
print(mean_bugdet)
mean_opening = int(df['Opening Weekend'][df['Opening Weekend'] != 0].mean())
print(mean_opening)
mean_theater = int(df['Theater counts'][df['Theater counts'] != 0].mean())
print(mean_theater)
df['Budget'] = df['Budget'].replace(0, mean_bugdet)
df['Opening Weekend'] = df['Opening Weekend'].replace(0, mean_opening)
df['Theater counts'] = df['Theater counts'].replace(0, mean_theater)


47309015
18260007
2381


In [126]:
cerfs = []
for data in df['ListOfCertificate']:
    for cerf in data:
        if cerf not in cerfs:
            cerfs.append(cerf)
print(cerfs)

['PG-13', 'PG', 'G', 'R', 'M', 'X', 'NC-17', 'GP', 'M/PG']


In [127]:
casts = []
for data in df['Cast']:
    for cast in data:
        if cast not in casts:
            casts.append(cast)
print(len(casts))

30967


In [128]:
crews = []
for data in df['Crew']:
    for crew in data:
        if crew not in crews:
            crews.append(crew)
print(len(crews))

4819


In [129]:
languages = []
for data in df['Languages']:
    for language in data:
        if language not in languages:
            languages.append(language)
print(len(languages))

147


In [130]:
studios = []
for data in df['Studios']:
    for studio in data:
        if studio not in studios:
            studios.append(studio)
print(len(studios))

2550


In [131]:
genres = []
for data in df['Genre']:
    for genre in data:
        if genre not in genres:
            genres.append(genre)
print(len(genres))

23


In [132]:
countries = []
for data in df['Countries']:
    for country in data:
        if country not in countries:
            countries.append(country)
print(len(countries))

74


In [138]:
df.head()

Unnamed: 0,Title,Theater counts,Opening Weekend,Budget,Cast,Crew,Studios,Genre,Languages,Countries,Release_Data,Runtime,Gross_worldwide,Rating,ListOfCertificate,Release_Year,Release_Month,Release_Day
1,Avengers: Endgame,4662,357115007,356000000,"[Robert Downey Jr., Chris Evans, Mark Ruffalo,...","[Stan Lee, Joe Russo, Christopher Markus, Step...","[Marvel Studios, Walt Disney Pictures]","[Action, Adventure, Drama, Sci-Fi]","[English, Japanese, Xhosa, German]",[United States],2019-04-26,181,2797501328,8.4,[PG-13],2019,4,26
2,Titanic,2674,28638131,200000000,"[Leonardo DiCaprio, Kate Winslet, Billy Zane, ...",[James Cameron],"[Twentieth Century Fox, Paramount Pictures, Li...","[Drama, Romance]","[English, Swedish, Italian, French]","[United States, Mexico, Australia]",1997-12-18,194,2201647264,7.8,[PG-13],1997,12,18
3,Avengers: Infinity War,4474,257698183,321000000,"[Robert Downey Jr., Chris Hemsworth, Mark Ruff...","[Stan Lee, Joe Russo, Christopher Markus, Step...","[Marvel Studios, Jason Roberts Productions, So...","[Action, Adventure, Sci-Fi]",[English],[United States],2018-04-25,149,2048359754,8.4,[PG-13],2018,4,25
4,Jurassic World,4274,208806270,150000000,"[Chris Pratt, Bryce Dallas Howard, Ty Simpkins...","[Colin Trevorrow, Amanda Silver, Rick Jaffa]","[Universal Pictures, Amblin Entertainment, Leg...","[Action, Adventure, Sci-Fi]",[English],[United States],2015-06-12,124,1670516444,7.0,[PG-13],2015,6,12
5,The Lion King,4725,191770759,260000000,"[Donald Glover, Beyoncé, Seth Rogen, Chiwetel ...","[Jeff Nathanson, Jon Favreau, Irene Mecchi, Jo...","[Walt Disney Pictures, Fairview Entertainment,...","[Animation, Adventure, Drama, Family, Musical]","[English, Xhosa, Zulu, French, Spanish, Hindi]","[United States, United Kingdom, South Africa]",2019-07-19,118,1662899439,6.8,[PG],2019,7,19


In [134]:
df.to_csv('cleaning_data.csv')