In [2]:
# This is a simple script to preprocess the data from the file netflix1.csv into insert statements for a MongoDB database.

# It will split the date_added field from its month/day/year format into 3 separate fields.
# The listed_in field will be split into a list of strings for MongoDB.

import csv

db.netflix.insertOne({'show_id': 's1', 'type': 'Movie', 'title': 'Dick Johnson Is Dead', 'director': 'Kirsten Johnson', 'country': 'United States', 'release_year': '2020', 'rating': 'PG-13', 'duration': '90 min', 'listed_in': ['Documentaries'], 'day': '9', 'month': '25', 'year': '2021'})
db.netflix.insertOne({'show_id': 's3', 'type': 'TV Show', 'title': 'Ganglands', 'director': 'Julien Leclercq', 'country': 'France', 'release_year': '2021', 'rating': 'TV-MA', 'duration': '1 Season', 'listed_in': ['Crime TV Shows', 'International TV Shows', 'TV Action & Adventure'], 'day': '9', 'month': '24', 'year': '2021'})
db.netflix.insertOne({'show_id': 's6', 'type': 'TV Show', 'title': 'Midnight Mass', 'director': 'Mike Flanagan', 'country': 'United States', 'release_year': '2021', 'rating': 'TV-MA', 'duration': '1 Season', 'listed_in': ['TV Dramas', 'TV Horror', 'TV Mysteries'], 'day': '9', 'month': '24', 'year': '2021'})
db.netflix.insertOne({'show_id': 's14', 'type': 'Movie', 'title': 'Confessi

In [3]:
# function that given the day, month, year returns the day of the week
import datetime

def get_day_of_week(day, month, year):
    """Return the weekday name of the given calendar date.

    Args:
        day: Day of the month, as an int.
        month: Month number, as an int.
        year: Year, as an int.

    Returns:
        The weekday name as rendered by the ``%A`` format code
        (for example ``"Saturday"``).

    Raises:
        ValueError: If the three values do not form a valid date.
    """
    # Keyword arguments make the (day, month, year) -> (year, month, day)
    # reordering explicit.
    moment = datetime.datetime(year=year, month=month, day=day)
    weekday_name = format(moment, "%A")
    return weekday_name

In [35]:
# Output the MongoDB insert statements into a python file.
#
# Reads netflix1.csv, reshapes every row, and emits a single
# `db.netflix.insertMany([...])` command both to stdout and to
# netflixToMongoDB.py.

# A single `with` manages both files, so the output is flushed and closed even
# if a row fails to parse. The input is opened first so that a missing CSV does
# not leave a truncated output file behind.
with open('netflix1.csv', newline='') as csvfile, open("netflixToMongoDB.py", "w") as f:
    f.write("#!/usr/bin/env python3\n")
    f.write("# This file contains the MongoDB insert statements for the netflix1.csv file\n")

    # these are the column headers: show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
    reader = csv.DictReader(csvfile)

    # MongoDB insert many
    print("db.netflix.insertMany([")
    f.write("db.netflix.insertMany([\n")

    for row in reader:
        # split director field into a list of strings
        row['director'] = row['director'].split(', ')

        # split the date_added field (month/day/year) into 3 integer fields
        date = row['date_added'].split('/')
        row['month_added'] = int(date[0])
        row['day_added'] = int(date[1])
        row['year_added'] = int(date[2])
        # reuse the already-converted integers instead of parsing them again
        row['day_of_week_added'] = get_day_of_week(
            row['day_added'], row['month_added'], row['year_added']
        )
        del row['date_added']

        # release year to int
        row['release_year'] = int(row['release_year'])

        # the leading number of duration is minutes for a movie and a season
        # count otherwise; store it under the matching field name
        duration_count = int(row['duration'].split(' ')[0])
        if row['type'] == 'Movie':
            row['movie_min'] = duration_count
        else:
            row['tv_seasons'] = duration_count

        # split the listed_in field into a list of strings
        row['listed_in'] = row['listed_in'].split(', ')

        # emit the document as one element of the insertMany array
        line = str(row) + ","
        print(line)
        f.write(line + "\n")

    print("])")
    f.write("])\n")

db.netflix.insertMany([
{'show_id': 's1', 'type': 'Movie', 'title': 'Dick Johnson Is Dead', 'director': ['Kirsten Johnson'], 'country': 'United States', 'release_year': 2020, 'rating': 'PG-13', 'duration': '90 min', 'listed_in': ['Documentaries'], 'month_added': 9, 'day_added': 25, 'year_added': 2021, 'day_of_week_added': 'Saturday', 'movie_min': 90},
{'show_id': 's3', 'type': 'TV Show', 'title': 'Ganglands', 'director': ['Julien Leclercq'], 'country': 'France', 'release_year': 2021, 'rating': 'TV-MA', 'duration': '1 Season', 'listed_in': ['Crime TV Shows', 'International TV Shows', 'TV Action & Adventure'], 'month_added': 9, 'day_added': 24, 'year_added': 2021, 'day_of_week_added': 'Friday', 'tv_seasons': 1},
{'show_id': 's6', 'type': 'TV Show', 'title': 'Midnight Mass', 'director': ['Mike Flanagan'], 'country': 'United States', 'release_year': 2021, 'rating': 'TV-MA', 'duration': '1 Season', 'listed_in': ['TV Dramas', 'TV Horror', 'TV Mysteries'], 'month_added': 9, 'day_added': 24, '