In [1]:
import pandas as pd
import numpy as np
import os
import json

from pymongo import MongoClient
from dotenv import load_dotenv

# Preprocessing

## Loading All CSV Data

In [2]:
# movies df 
df_moty = pd.read_csv("../Data/Data-2/Best Movie by Year Netflix.csv", index_col='index')
df_bmn = pd.read_csv("../Data/Data-2/Best Movies Netflix.csv", index_col='index')

# shows df 
df_soty = pd.read_csv('../Data/Data-2/Best Show by Year Netflix.csv', index_col='index')
df_bsn = pd.read_csv('../Data/Data-2/Best Shows Netflix.csv', index_col='index')

# all shows and movies
df_titles = pd.read_csv("../Data/Data-2/raw_titles.csv", index_col='index')
# credits
df_credits = pd.read_csv("../Data/Data-2/raw_credits.csv", index_col='index')

In [3]:
df_moty.head()

Unnamed: 0_level_0,TITLE,RELEASE_YEAR,SCORE,MAIN_GENRE,MAIN_PRODUCTION
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,White Christmas,1954,7.5,romance,US
1,The Guns of Navarone,1961,7.5,war,US
2,My Fair Lady,1964,7.8,drama,US
3,Bonnie and Clyde,1967,7.7,drama,US
4,Dirty Harry,1971,7.7,thriller,US


In [4]:
df_titles.head()

Unnamed: 0_level_0,id,title,type,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,ts300399,Five Came Back: The Reference Films,SHOW,1945,TV-MA,48,['documentation'],['US'],1.0,,,
1,tm84618,Taxi Driver,MOVIE,1976,R,113,"['crime', 'drama']",['US'],,tt0075314,8.3,795222.0
2,tm127384,Monty Python and the Holy Grail,MOVIE,1975,PG,91,"['comedy', 'fantasy']",['GB'],,tt0071853,8.2,530877.0
3,tm70993,Life of Brian,MOVIE,1979,R,94,['comedy'],['GB'],,tt0079470,8.0,392419.0
4,tm190788,The Exorcist,MOVIE,1973,R,133,['horror'],['US'],,tt0070047,8.1,391942.0


## Checking and Cleaning some Data

### Simplifying the data by adding the is_awarded column to the titles Dataframe

This aims to reduce the number of collections in our MongoDB database by enriching a single collection without losing information. By doing this we achieve workaround for MongoDB's limitations on performing JOIN queries between collections

In [None]:
awarded_movies = list(df_moty['TITLE'])
awarded_shows = list(df_soty['TITLE'])

In [None]:
total_awarded = len(awarded_movies) + len(awarded_shows)

print("Expected number of awarded titles: ", int(total_awarded))

Expected number of awarded titles:  80


In [7]:
def set_isAwarded(title):
    return title['title'] in awarded_movies or title['title'] in awarded_shows

In [8]:
df_titles['title'] = df_titles['title'].astype(str)
df_titles['release_year'] = df_titles['release_year'].astype(int)
df_titles['is_awarded'] = df_titles.apply(set_isAwarded, axis=1)

In [9]:
df_titles.is_awarded.value_counts()

is_awarded
False    5722
True       84
Name: count, dtype: int64

We can observe that the is_awarded = True has more values then expected. Therefore we will need to take a closer look.

In [10]:
print("Are There duplicate combinations of title and type?:",df_titles[['title', 'type']].duplicated().any()) 
print("Are all ID values unique?:", df_titles['id'].is_unique)
print("Are there duplicate combinations of title, type and release year?:",df_titles[['title', 'type', 'release_year']].duplicated().any()) 


Are There duplicate combinations of title and type?: True
Are all ID values unique?: True
Are there duplicate combinations of title, type and release year?: False


As we can see, there are some combinations of *title* and *type* (meaning there are movie/shows) with the same title, but, however seeing that the *id* values are unique we discard the option that the items are exact copies of each other.\
Taking in consideration that we are working with movie / shows data it was thought that perhaps there has been a remake of a title. This hypothesis gained some more strength when we verified that there where no combinations of *type*, *title* - *year*\
\
For this project the remade titles will not be considered nor tagged as award-winning because that could disrupt some possible queries.

In [11]:
awarded_movies, year_movie = list(df_moty['TITLE']), list(df_moty['RELEASE_YEAR'])
awarded_shows, year_show = list(df_soty['TITLE']), list(df_soty['RELEASE_YEAR'])

awarded_movies = [[x,y] for x,y in zip(awarded_movies, year_movie)]
awarded_shows = [[x,y] for x,y in zip(awarded_shows, year_show)]

In [12]:
def set_isAwarded(title):
    return [title['title'], title['release_year']] in awarded_movies or [title['title'], title['release_year']] in awarded_shows

In [13]:
df_titles['title'] = df_titles['title'].astype(str)
df_titles['release_year'] = df_titles['release_year'].astype(int)
df_titles['is_awarded'] = df_titles.apply(set_isAwarded, axis=1)

In [14]:
df_titles.is_awarded.value_counts()

is_awarded
False    5726
True       80
Name: count, dtype: int64

### Simplifying the data by adding the is_topTitle column to the titles Dataframe

## 

Similarly to the is_awarded column, the is_topTitle aims to simplfy the schema and get df_bmn/df_bsn into df_titles\

assuming the same problem with remaked will happen, we start with best_movies already taking into account the release_year

In [15]:
best_movies, year_bestMovie = list(df_bmn['TITLE']), list(df_bmn['RELEASE_YEAR'])
best_shows, year_bestShows = list(df_bsn['TITLE']), list(df_bsn['RELEASE_YEAR'])

best_movies = [[x,y] for x,y in zip(best_movies, year_bestMovie)]
best_shows = [[x,y] for x,y in zip(best_shows, year_bestShows)]

In [16]:
total_best = len(best_movies) + len(best_shows)

print("Expected number of best titles: ", int(total_best))

Expected number of best titles:  633


In [17]:
print(len(best_movies))
print(len(best_shows))
print(len(best_movies) + len(best_shows))

387
246
633


In [18]:
def set_isTopTitle(title):
    return [title['title'], title['release_year']] in best_movies or [title['title'], title['release_year']] in best_shows

In [19]:
df_titles['title'] = df_titles['title'].astype(str)
df_titles['release_year'] = df_titles['release_year'].astype(int)
df_titles['is_topTitle'] = df_titles.apply(set_isTopTitle, axis=1)

In [20]:
print(df_titles.shape)

(5806, 14)


In [21]:
df_titles['is_topTitle'].value_counts()

is_topTitle
False    5172
True      634
Name: count, dtype: int64

In [22]:
print("Are There duplicate combinations of title and type?:",df_titles[['title', 'type']].duplicated().any()) 
print("Are all ID values unique?:", df_titles['id'].is_unique)
print("Are there duplicate combinations of title, type and release year?:",df_titles[['title', 'type', 'release_year']].duplicated().any()) 

Are There duplicate combinations of title and type?: True
Are all ID values unique?: True
Are there duplicate combinations of title, type and release year?: False


In [23]:
top_titles = df_titles[df_titles['is_topTitle']]

# Check if any rows in top_titles are not in best_movies or best_shows
mismatches = top_titles[
    ~top_titles.apply(lambda x: [x['title'], x['release_year']] in best_movies or [x['title'], x['release_year']] in best_shows, axis=1)
]

print("Mismatched entries in df_titles where is_topTitle is True but no exact match in best_movies or best_shows:")
print(mismatches)

Mismatched entries in df_titles where is_topTitle is True but no exact match in best_movies or best_shows:
Empty DataFrame
Columns: [id, title, type, release_year, age_certification, runtime, genres, production_countries, seasons, imdb_id, imdb_score, imdb_votes, is_awarded, is_topTitle]
Index: []


In [24]:
df_titles['is_topTitle'] = df_titles.apply(set_isTopTitle, axis=1)
print(df_titles['is_topTitle'].value_counts())

is_topTitle
False    5172
True      634
Name: count, dtype: int64


In [25]:
# weird bug. there is one miss idendentified TopTitle as the total count is correct but there is one extra True and missing one False. Will need to check later.

In [26]:
df_titles = df_titles[['type', 'title', 'release_year', 'is_topTitle']]

In [27]:
df_titles.to_csv('./yes.csv')

# MongoDB

In [28]:
load_dotenv('../mySecrets.env')
PASSWORD = os.getenv('PASSWORD')

# Connect to MongoDB atlas
client = MongoClient(f"mongodb+srv://fc64335:{PASSWORD}@fcul.l0hna.mongodb.net/?retryWrites=true&w=majority&appName=FCUL")

#drop database if you need
client.drop_database("DV_project")
print("Database dropped")

# Create a new database or connect to an existing one
db = client['DV_project']

Database dropped


Lets start by merging the movie and show of the year into the best movies/shows dataframes, adding the is_awarded, boolean value to the top title

In [29]:
df_bsn['is_awarded'] = df_bsn['TITLE'].isin(df_soty['TITLE'])
df_bsn.head()

Unnamed: 0_level_0,TITLE,RELEASE_YEAR,SCORE,NUMBER_OF_VOTES,DURATION,NUMBER_OF_SEASONS,MAIN_GENRE,MAIN_PRODUCTION,is_awarded
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,Breaking Bad,2008,9.5,1727694,48,5,drama,US,True
1,Avatar: The Last Airbender,2005,9.3,297336,24,3,scifi,US,True
2,Our Planet,2019,9.3,41386,50,1,documentary,GB,False
3,Kota Factory,2019,9.3,66985,42,2,drama,IN,True
4,The Last Dance,2020,9.1,108321,50,1,documentary,US,True


In [30]:
df_bsn.is_awarded.value_counts()
# Not all Shows of the Year are in Netflix's Best shows selection. In part because some shows althouhg have been awarded in their release year, do not reach current SCORE threshold.

is_awarded
False    218
True      28
Name: count, dtype: int64

In [31]:
df_bmn['is_awarded'] = df_bmn['TITLE'].isin(df_moty['TITLE'])
df_bmn.head()

Unnamed: 0_level_0,TITLE,RELEASE_YEAR,SCORE,NUMBER_OF_VOTES,DURATION,MAIN_GENRE,MAIN_PRODUCTION,is_awarded
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,David Attenborough: A Life on Our Planet,2020,9.0,31180,83,documentary,GB,True
1,Inception,2010,8.8,2268288,148,scifi,GB,True
2,Forrest Gump,1994,8.8,1994599,142,drama,US,True
3,Anbe Sivam,2003,8.7,20595,160,comedy,IN,False
4,Bo Burnham: Inside,2021,8.7,44074,87,comedy,US,True


In [32]:
df_bmn.is_awarded.value_counts()

is_awarded
False    340
True      47
Name: count, dtype: int64

In [33]:
df_titles = pd.read_csv("../Data/Data-2/raw_titles.csv", index_col='index')
df_titles.head()

Unnamed: 0_level_0,id,title,type,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,ts300399,Five Came Back: The Reference Films,SHOW,1945,TV-MA,48,['documentation'],['US'],1.0,,,
1,tm84618,Taxi Driver,MOVIE,1976,R,113,"['crime', 'drama']",['US'],,tt0075314,8.3,795222.0
2,tm127384,Monty Python and the Holy Grail,MOVIE,1975,PG,91,"['comedy', 'fantasy']",['GB'],,tt0071853,8.2,530877.0
3,tm70993,Life of Brian,MOVIE,1979,R,94,['comedy'],['GB'],,tt0079470,8.0,392419.0
4,tm190788,The Exorcist,MOVIE,1973,R,133,['horror'],['US'],,tt0070047,8.1,391942.0


In [34]:
df_moty['type'] = 'MOVIE'
df_bsy['type'] = 'SHOW'

NameError: name 'df_bsy' is not defined

In [None]:
df_awarded = pd.concat([df_moty[['TITLE', 'type']], df_bsy[['TITLE', 'type']]])
df_awarded.head()

In [None]:
print(df_moty.shape) #49
print(df_bsy.shape)  #31


In [None]:
df_titles['is_awarded'] = df_titles[['title']].isin(df_awarded['TITLE'])
df_titles.head()

In [None]:
df_titles['is_awarded'].value_counts()

In [None]:
# # Drop the collection
db['movies'].drop()
db['shows'].drop()
db['credits'].drop()
print("Collection dropped")

In [None]:
db.create_collection("cities", validator={
    '$jsonSchema': {
        'bsonType': 'object',
        'required': ['name', 'country', 'foundationYear'],
        'properties': {
            'name': {
                'bsonType': 'string',
                'description': 'must be a string and is required'
            },
            'country': {
                'bsonType': 'string',
                'description': 'must be a string and is required'
            },
            'is_capital': {
                'bsonType': 'bool',
                'description': 'must be a boolean if provided'
            },
            'population': {
                'bsonType': 'int',
                'minimum': 0,
                'description': 'must be a non-negative integer if provided'
            },
            'foundationYear': {
                'bsonType': 'int',
                'maximum': 2024,
                'description': "must be a integer, is required and can't be in the future"
            }
        }
    }
})

In [None]:
collection_cities = db["Movie of the Year"]
initial_count = collection_cities.count_documents({})
print(f"# documents: {initial_count}") 
try:
    data_dict = df_cleaned.to_dict(orient="records")
    result = collection_cities.insert_many(data_dict, ordered=False)
    valid_count = collection_cities.count_documents({}) - initial_count
    print(f"{valid_count} documents inserted.")

except Exception as e:
    attempted_count = len(data_dict)
    valid_count = collection_cities.count_documents({}) - initial_count
    print(f"{valid_count} documents inserted")
    print(f"{attempted_count - valid_count} documents failed to insert")


print(f"# documents: {collection_cities.count_documents({})}") 