# MovieTinder

Notebook that performs the data cleaning; Jupyter is used because it has better visualization for tables.

In [10]:
import ast
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the raw movie metadata.
df = pd.read_csv('data/movies_metadata.csv')
print("Original shape: ", df.shape)


# Drop every column in which more than 55% of the values are missing.
toBeDropped = df.columns[df.isnull().mean() > 0.55]
df = df.drop(columns=toBeDropped)
print("Shape after dropping NaN columns: ", df.shape, " Columns deleted: ", len(toBeDropped))

# An earlier check (len(df.dropna(thresh=20))) indicated that nearly all rows
# are essentially complete, so no rows are dropped for missing values here.

# Keep the title aside, drop the columns that are not needed, then put the
# title back as the first column under the name 'Title'.
title = df['title']
df = df.drop(columns=['original_title', 'title', 'status', 'video', 'poster_path', 'production_countries', 'spoken_languages', 'vote_count'])
df.insert(0, 'Title', title)  # move title to the front
print("Shape after dropping unwanted columns: ", df.shape)

# Rows without any production_companies value cannot be cleaned further.
df = df[df['production_companies'].notna()]


def _is_list_literal(value):
    """Return True when `value` is text that parses to a Python list.

    Anything that fails to parse, or parses to a non-list (e.g. a boolean),
    is reported as False.
    """
    try:
        return isinstance(ast.literal_eval(value), list)
    except (ValueError, SyntaxError, TypeError):
        return False


# A few rows carry a boolean-like value instead of a list in
# production_companies. They were previously removed by hard-coded index
# labels (29503, 19730, 35587); detecting them automatically avoids a KeyError
# when a label is absent and also catches any other malformed row.
# .copy() gives an independent frame so later column/cell writes are safe.
df = df[df['production_companies'].apply(_is_list_literal)].copy()

# Clean the genres and production_companies columns: each cell holds the text
# of a list of dicts, which is reduced to a plain list of the 'name' values.

def _names_or_nan(raw):
    """Parse a list-of-dicts literal and return the 'name' of every entry.

    Parameters
    ----------
    raw : str
        Text of a Python literal that evaluates to an iterable of dicts,
        each having a 'name' key.

    Returns
    -------
    list or float
        The list of names, or np.nan when the parsed iterable is empty.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ast.literal_eval when `raw` is not a valid literal.
    """
    names = [entry['name'] for entry in ast.literal_eval(raw)]
    return names if names else np.nan


# Column-wise apply replaces the former row-by-row iterrows()/df.at loop;
# assign() returns a new frame with both columns replaced in their positions.
df = df.assign(
    production_companies=df['production_companies'].apply(_names_or_nan),
    genres=df['genres'].apply(_names_or_nan),
)

# Rows whose genre list was empty (now NaN) are discarded; rows with an empty
# production_companies list are kept with NaN in that column.
df = df[df['genres'].notna()]
df



Original shape:  (45466, 24)
Shape after dropping NaN columns:  (45466, 21)  Columns deleted:  3
Shape after dropping unwanted columns:  (45466, 14)


Unnamed: 0,Title,adult,budget,genres,id,imdb_id,original_language,overview,popularity,production_companies,release_date,revenue,runtime,vote_average
0,Toy Story,False,30000000,"[Animation, Comedy, Family]",862,tt0114709,en,"Led by Woody, Andy's toys live happily in his ...",21.9469,[Pixar Animation Studios],1995-10-30,373554033.0,81.0,7.7
1,Jumanji,False,65000000,"[Adventure, Fantasy, Family]",8844,tt0113497,en,When siblings Judy and Peter discover an encha...,17.0155,"[TriStar Pictures, Teitler Film, Interscope Co...",1995-12-15,262797249.0,104.0,6.9
2,Grumpier Old Men,False,0,"[Romance, Comedy]",15602,tt0113228,en,A family wedding reignites the ancient feud be...,11.7129,"[Warner Bros., Lancaster Gate]",1995-12-22,0.0,101.0,6.5
3,Waiting to Exhale,False,16000000,"[Comedy, Drama, Romance]",31357,tt0114885,en,"Cheated on, mistreated and stepped on, the wom...",3.85949,[Twentieth Century Fox Film Corporation],1995-12-22,81452156.0,127.0,6.1
4,Father of the Bride Part II,False,0,[Comedy],11862,tt0113041,en,Just when George Banks has recovered from his ...,8.38752,"[Sandollar Productions, Touchstone Pictures]",1995-02-10,76578911.0,106.0,5.7
5,Heat,False,60000000,"[Action, Crime, Drama, Thriller]",949,tt0113277,en,"Obsessive master thief, Neil McCauley leads a ...",17.9249,"[Regency Enterprises, Forward Pass, Warner Bros.]",1995-12-15,187436818.0,170.0,7.7
6,Sabrina,False,58000000,"[Comedy, Romance]",11860,tt0114319,en,An ugly duckling having undergone a remarkable...,6.67728,"[Paramount Pictures, Scott Rudin Productions, ...",1995-12-15,0.0,127.0,6.2
7,Tom and Huck,False,0,"[Action, Adventure, Drama, Family]",45325,tt0112302,en,"A mischievous young boy, Tom Sawyer, witnesses...",2.56116,[Walt Disney Pictures],1995-12-22,0.0,97.0,5.4
8,Sudden Death,False,35000000,"[Action, Adventure, Thriller]",9091,tt0114576,en,International action superstar Jean Claude Van...,5.23158,"[Universal Pictures, Imperial Entertainment, S...",1995-12-22,64350171.0,106.0,5.5
9,GoldenEye,False,58000000,"[Adventure, Action, Thriller]",710,tt0113189,en,James Bond must unmask the mysterious head of ...,14.686,"[United Artists, Eon Productions]",1995-11-16,352194034.0,130.0,6.6


In [9]:
# Write the cleaned frame to disk without the index column.
# The target directory is created first so the export also works on a fresh
# checkout; to_csv itself fails when the directory does not exist.
os.makedirs('dataClean', exist_ok=True)
df.to_csv('dataClean/movieDataClean.csv', sep=',', encoding='utf-8', index=False)

In [5]:
# Sanity check: reload the exported CSV and look at the first rows.
# Malformed lines are deliberately NOT skipped, so a broken export raises here
# instead of being hidden. The former `error_bad_lines=False` argument was
# deprecated in pandas 1.3 and removed in 2.0.
# Note: after the round trip the list columns (genres, production_companies)
# come back as strings such as "['Comedy']", not as Python lists.
df = pd.read_csv('dataClean/movieDataClean.csv')
df.head()

Unnamed: 0,Title,adult,budget,genres,id,imdb_id,original_language,overview,popularity,production_companies,release_date,revenue,runtime,vote_average
0,Toy Story,False,30000000,"['Animation', 'Comedy', 'Family']",862,tt0114709,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,['Pixar Animation Studios'],1995-10-30,373554033.0,81.0,7.7
1,Jumanji,False,65000000,"['Adventure', 'Fantasy', 'Family']",8844,tt0113497,en,When siblings Judy and Peter discover an encha...,17.015539,"['TriStar Pictures', 'Teitler Film', 'Intersco...",1995-12-15,262797249.0,104.0,6.9
2,Grumpier Old Men,False,0,"['Romance', 'Comedy']",15602,tt0113228,en,A family wedding reignites the ancient feud be...,11.7129,"['Warner Bros.', 'Lancaster Gate']",1995-12-22,0.0,101.0,6.5
3,Waiting to Exhale,False,16000000,"['Comedy', 'Drama', 'Romance']",31357,tt0114885,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,['Twentieth Century Fox Film Corporation'],1995-12-22,81452156.0,127.0,6.1
4,Father of the Bride Part II,False,0,['Comedy'],11862,tt0113041,en,Just when George Banks has recovered from his ...,8.387519,"['Sandollar Productions', 'Touchstone Pictures']",1995-02-10,76578911.0,106.0,5.7
