[Reference](https://medium.com/@siglimumuni/data-cleaning-and-transformation-in-python-with-marvel-movie-dataset-22682374fbd3)

# Loading the dataset

In [1]:
#import relevant libraries
import pandas as pd
import re

#load dataset
df = pd.read_csv("https://raw.githubusercontent.com/siglimumuni/Datasets/master/marvel_movies_scraped.csv")

#show all columns
pd.set_option('display.max_columns',30)

#check the first 5 rows
df.head()

Unnamed: 0,Title,Directed by,Written by,Based on,Produced by,Starring,Cinematography,Edited by,Music by,Production company,Distributed by,Release date,Running time,Country,Language,Budget,Box office,Production companies,Screenplay by,Story by,Release dates,Countries,Languages
0,Howard the Duck,Willard Huyck,Willard Huyck Gloria Katz,Howard the Duck by Steve Gerber,Gloria Katz,\n Lea Thompson \n Jeffrey Jones \n Tim Robbin...,Richard H. Kline,Michael Chandler Sidney Wolinsky,\n John Barry \n Sylvester Levay \n Songs: \n ...,Lucasfilm,Universal Pictures,"\n August1,1986 ( 1986-08-01 ) \n",111 minutes,United States,English,$30-37 million,$38 million,,,,,,
1,Blade,Stephen Norrington,David S. Goyer,Blade by Marv Wolfman Gene Colan,\n Peter Frankfurt \n Wesley Snipes \n Robert ...,\n Wesley Snipes \n Stephen Dorff \n Kris Kris...,Theo van de Sande,Paul Rubell,Mark Isham,,New Line Cinema,"\n August21,1998 ( 1998-08-21 ) \n",120 minutes,United States,English,$45million,$131.2million,\n Marvel Enterprises \n Amen Ra Films \n Imag...,,,,,
2,X-Men,Bryan Singer,,X-Men by Stan Lee Jack Kirby,\n Lauren Shuler Donner \n Ralph Winter \n,\n Patrick Stewart \n Hugh Jackman \n Ian McKe...,Newton Thomas Sigel,\n Steven Rosenblum \n Kevin Stitt \n John Wri...,Michael Kamen,,20th Century Fox,,104 minutes,United States,English,$75 million,$296.3 million,\n Marvel Entertainment Group \n The Donners' ...,David Hayter,\n Tom DeSanto \n Bryan Singer \n,"\n July12,2000 ( 2000-07-12 ) ( Ellis Island ...",,
3,Blade II,Guillermo del Toro,David S. Goyer,Blade by Marv Wolfman Gene Colan,\n Peter Frankfurt \n Wesley Snipes \n Patrick...,\n Wesley Snipes \n Kris Kristofferson \n Ron ...,Gabriel Beristain,Peter Amundson,Marco Beltrami,,New Line Cinema,"\n March22,2002 ( 2002-03-22 ) \n",117 minutes,United States,English,$54 million,$155 million,\n Marvel Enterprises \n Amen Ra Films \n Imag...,,,,,
4,Spider-Man,Sam Raimi,,Spider-Man by Stan Lee Steve Ditko,\n Laura Ziskin \n Ian Bryce \n,\n Tobey Maguire \n Willem Dafoe \n Kirsten Du...,Don Burgess,\n Bob Murawski \n Arthur Coburn \n,Danny Elfman,,Sony Pictures Releasing,,121 minutes,United States,English,$139million,$825million,\n Columbia Pictures \n Marvel Enterprises \n ...,David Koepp,,"\n April29,2002 ( 2002-04-29 ) ( Mann Village...",,


# Merge data in duplicated columns

In [2]:
#map affected columns as key-value pairs
map = {
        'Production companies':'Production company',
        'Release dates':'Release date',
        'Languages':'Language',
        'Countries':'Country'
      }

#iterate through each pair and perform operation
for key,value in map.items():
  df[key].fillna(df[value], inplace=True)
  #delete unwanted column
  del df[value]

#print all columns
print(df.columns)

Index(['Title', 'Directed by', 'Written by', 'Based on', 'Produced by',
       'Starring', 'Cinematography', 'Edited by', 'Music by', 'Distributed by',
       'Running time', 'Budget', 'Box office', 'Production companies',
       'Screenplay by', 'Story by', 'Release dates', 'Countries', 'Languages'],
      dtype='object')


In [3]:
#check for nulls
print(df[map.keys()].isnull().sum())

#print first 10 rows of merged columns
df[map.keys()].head(10)

Production companies    0
Release dates           0
Languages               0
Countries               0
dtype: int64


Unnamed: 0,Production companies,Release dates,Languages,Countries
0,Lucasfilm,"\n August1,1986 ( 1986-08-01 ) \n",English,United States
1,\n Marvel Enterprises \n Amen Ra Films \n Imag...,"\n August21,1998 ( 1998-08-21 ) \n",English,United States
2,\n Marvel Entertainment Group \n The Donners' ...,"\n July12,2000 ( 2000-07-12 ) ( Ellis Island ...",English,United States
3,\n Marvel Enterprises \n Amen Ra Films \n Imag...,"\n March22,2002 ( 2002-03-22 ) \n",English,United States
4,\n Columbia Pictures \n Marvel Enterprises \n ...,"\n April29,2002 ( 2002-04-29 ) ( Mann Village...",English,United States
5,\n New Regency Enterprises \n Marvel Enterpris...,"\n February14,2003 ( 2003-02-14 ) \n",English,United States
6,\n Marvel Enterprises \n The Donners' Company ...,"\n April24,2003 ( 2003-04-24 ) (London, UK) \...",English,United States
7,\n Universal Pictures \n Marvel Studios \n Val...,"\n June20,2003 ( 2003-06-20 ) \n",English,United States
8,\n Lions Gate Films \n Marvel Entertainment \n,"\n April16,2004 ( 2004-04-16 ) \n",English,United States
9,\n Columbia Pictures \n Marvel Enterprises \n ...,"\n June25,2004 ( 2004-06-25 ) ( Mann Village ...",English,United States


In [4]:
#split one value in the starring column
df['Starring'].loc[0].split('\n')

['', ' Lea Thompson ', ' Jeffrey Jones ', ' Tim Robbins ', '']

In [5]:
#split value, remove empty strings, and whitespaces
to_list = [item.strip() for item in df['Starring'].loc[0].split('\n') if item !='']
to_list

['Lea Thompson', 'Jeffrey Jones', 'Tim Robbins']

In [6]:
#loop though every affected column and convert values to lists
for column in ['Starring','Edited by','Written by','Produced by','Music by','Production companies','Release dates','Story by']:
  df[column] = df[column].apply(lambda x: x if pd.isnull(x) else [item.strip() for item in x.split('\n') if item != ''])

df.head()

Unnamed: 0,Title,Directed by,Written by,Based on,Produced by,Starring,Cinematography,Edited by,Music by,Distributed by,Running time,Budget,Box office,Production companies,Screenplay by,Story by,Release dates,Countries,Languages
0,Howard the Duck,Willard Huyck,[Willard Huyck Gloria Katz],Howard the Duck by Steve Gerber,[Gloria Katz],"[Lea Thompson, Jeffrey Jones, Tim Robbins]",Richard H. Kline,[Michael Chandler Sidney Wolinsky],"[John Barry, Sylvester Levay, Songs:, Thomas D...",Universal Pictures,111 minutes,$30-37 million,$38 million,[Lucasfilm],,,"[August1,1986 ( 1986-08-01 )]",United States,English
1,Blade,Stephen Norrington,[David S. Goyer],Blade by Marv Wolfman Gene Colan,"[Peter Frankfurt, Wesley Snipes, Robert Engelman]","[Wesley Snipes, Stephen Dorff, Kris Kristoffer...",Theo van de Sande,[Paul Rubell],[Mark Isham],New Line Cinema,120 minutes,$45million,$131.2million,"[Marvel Enterprises, Amen Ra Films, Imaginary ...",,,"[August21,1998 ( 1998-08-21 )]",United States,English
2,X-Men,Bryan Singer,,X-Men by Stan Lee Jack Kirby,"[Lauren Shuler Donner, Ralph Winter]","[Patrick Stewart, Hugh Jackman, Ian McKellen, ...",Newton Thomas Sigel,"[Steven Rosenblum, Kevin Stitt, John Wright]",[Michael Kamen],20th Century Fox,104 minutes,$75 million,$296.3 million,"[Marvel Entertainment Group, The Donners' Comp...",David Hayter,"[Tom DeSanto, Bryan Singer]","[July12,2000 ( 2000-07-12 ) ( Ellis Island ),...",United States,English
3,Blade II,Guillermo del Toro,[David S. Goyer],Blade by Marv Wolfman Gene Colan,"[Peter Frankfurt, Wesley Snipes, Patrick Palmer]","[Wesley Snipes, Kris Kristofferson, Ron Perlma...",Gabriel Beristain,[Peter Amundson],[Marco Beltrami],New Line Cinema,117 minutes,$54 million,$155 million,"[Marvel Enterprises, Amen Ra Films, Imaginary ...",,,"[March22,2002 ( 2002-03-22 )]",United States,English
4,Spider-Man,Sam Raimi,,Spider-Man by Stan Lee Steve Ditko,"[Laura Ziskin, Ian Bryce]","[Tobey Maguire, Willem Dafoe, Kirsten Dunst, J...",Don Burgess,"[Bob Murawski, Arthur Coburn]",[Danny Elfman],Sony Pictures Releasing,121 minutes,$139million,$825million,"[Columbia Pictures, Marvel Enterprises, Laura ...",David Koepp,,"[April29,2002 ( 2002-04-29 ) ( Mann Village T...",United States,English


In [7]:
#check release dates for one row
df['Release dates'].loc[2]

['July12,2000 ( 2000-07-12 )  ( Ellis Island )',
 'July14,2000 ( 2000-07-14 )  (United States)']

In [8]:
#extract date
df['Release dates'].loc[2][0].split('(')[1].replace(')','').strip()

'2000-07-12'

In [9]:
#apply lambda function to extract single date from release dates column
df['Release dates'] = df['Release dates'].apply(lambda x: pd.to_datetime(x[0].split('(')[1].replace(')','').strip()))

#rename column
df.rename(columns={'Release dates':'Release date'}, inplace=True)

df['Release date'].head()

0   1986-08-01
1   1998-08-21
2   2000-07-12
3   2002-03-22
4   2002-04-29
Name: Release date, dtype: datetime64[ns]

# Extract the movie length digits and convert to integer

In [10]:
import re

#find and extract only digits from every entry in the column
df['Running time'] = df['Running time'].apply(lambda x: x if pd.isnull(x) else round(int(re.findall(r"\d{2,3}",x)[0])),0)

df['Running time'].head()

0    111
1    120
2    104
3    117
4    121
Name: Running time, dtype: object

In [11]:
def convert_value(value):
  if pd.isnull(value):
    return value
  else:
    #extract any digits or the word million or billion
    digits_amount = re.findall(r"\d+\.?\d*|million|billion",value)
    
    #convert digits to float and amount to integer
    digits = float(digits_amount[0])
    amount = 1000000 if 'million' in digits_amount else 1000000000
    
    #multiply digits by amount to get the full value in int
    integer_value = digits * amount
    return integer_value

In [12]:
#apply function to box office and budget columns
for column in ['Box office','Budget']:
  df[column] = df[column].apply(convert_value)

df[['Box office','Budget']].head()

Unnamed: 0,Box office,Budget
0,38000000.0,30000000.0
1,131200000.0,45000000.0
2,296300000.0,75000000.0
3,155000000.0,54000000.0
4,825000000.0,139000000.0


In [13]:
# import pprint

# #specify API endpoint
# URL = 'http://www.omdbapi.com/?apikey=your_api_key'

# #retrieve movie details
# response = requests.get(URL,params={'t':'X-Men'}).json()

# #pretty print
# pprint.pprint(response)
# @siglimumuni

In [14]:
# def get_rotten_tomatoes(movie):
#   URL = 'http://www.omdbapi.com/?apikey=your_api_key'

#   response = requests.get(URL,params={'t':movie}).json()
#   try:
#     for dictionary in response['Ratings']:
#       if dictionary['Source'] == 'Rotten Tomatoes':
#         return int(dictionary['Value'].replace('%',''))
#   except KeyError:
#     return None

# #test function
# get_rotten_tomatoes('Iron Man')

In [15]:
#movie titles with issues
print(df.loc[58]['Title'])
print(df.loc[63]['Title'])

Shang-Chi and the Legendof the Ten Rings
Doctor Strange in theMultiverse of Madness


In [16]:
#replace values with correct titles
df['Title'] = df['Title'].replace(
                                  ['Shang-Chi and the Legendof the Ten Rings','Doctor Strange in theMultiverse of Madness'],
                                  ['Shang-Chi and the Legend of the Ten Rings','Doctor Strange in the Multiverse of Madness']
                                  )

In [18]:
# #create new column for rotten tomatoes scores
# df['Rotten Tomatoes'] = [get_rotten_tomatoes(movie) for movie in df['Title']]

# df['Rotten Tomatoes'].head()

In [19]:
#unreleased movies
df[['Title','Release date']].tail(6)

Unnamed: 0,Title,Release date
65,Black Panther: Wakanda Forever,2022-11-11
66,Kraven the Hunter,2023-01-13
67,Ant-Man and the Wasp: Quantumania,2023-02-17
68,The Marvels,2023-07-28
69,Guardians of the Galaxy Vol. 3,2023-05-05
70,Madame Web,2023-10-06


In [20]:
#drop last 6 rows 
df.drop(df.tail(6).index, inplace = True)

In [21]:
#saving the dataset as a CSV
df.to_csv('marvel_movies_clean.csv',index=False)