In [1]:
!pip install pymongo pandas jupyter ipykernel



In [2]:
from pymongo import MongoClient
from bson.json_util import dumps
import pandas as pd
import gzip

In [3]:
client = MongoClient('mongodb://localhost:27017/')
db = client['imdb_database']

In [4]:
def create_collections():
    collections = {
        "title.akas": {
            "validator": {
                "$jsonSchema": {
                    "bsonType": "object",
                    "required": ["titleId", "ordering", "title", "region", "language", "types", "attributes", "isOriginalTitle"],
                    "properties": {
                        "titleId": {"bsonType": "string"},
                        "ordering": {"bsonType": "int"},
                        "title": {"bsonType": "string"},
                        "region": {"bsonType": "string"},
                        "language": {"bsonType": "string"},
                        "types": {"bsonType": "array", "items": {"bsonType": "string"}},
                        "attributes": {"bsonType": "array", "items": {"bsonType": "string"}},
                        "isOriginalTitle": {"bsonType": "bool"}
                    }
                }
            },
            "indexes": ["titleId"]
        },
        "title.basics": {
            "validator": {
                "$jsonSchema": {
                    "bsonType": "object",
                    "required": ["tconst", "titleType", "primaryTitle", "originalTitle", "isAdult", "startYear", "runtimeMinutes", "genres"],
                    "properties": {
                        "tconst": {"bsonType": "string"},
                        "titleType": {"bsonType": "string"},
                        "primaryTitle": {"bsonType": "string"},
                        "originalTitle": {"bsonType": "string"},
                        "isAdult": {"bsonType": "bool"},
                        "startYear": {"bsonType": "string"},
                        "endYear": {"bsonType": "string"},
                        "runtimeMinutes": {"bsonType": "int"},
                        "genres": {"bsonType": "array", "items": {"bsonType": "string"}}
                    }
                }
            },
            "indexes": ["tconst"]
        },
        "title.crew": {
            "validator": {
                "$jsonSchema": {
                    "bsonType": "object",
                    "required": ["tconst", "directors", "writers"],
                    "properties": {
                        "tconst": {"bsonType": "string"},
                        "directors": {"bsonType": "array", "items": {"bsonType": "string"}},
                        "writers": {"bsonType": "array", "items": {"bsonType": "string"}}
                    }
                }
            },
            "indexes": ["tconst"]
        },
        "title.episode": {
            "validator": {
                "$jsonSchema": {
                    "bsonType": "object",
                    "required": ["tconst", "parentTconst", "seasonNumber", "episodeNumber"],
                    "properties": {
                        "tconst": {"bsonType": "string"},
                        "parentTconst": {"bsonType": "string"},
                        "seasonNumber": {"bsonType": "int"},
                        "episodeNumber": {"bsonType": "int"}
                    }
                }
            },
            "indexes": ["tconst"]
        },
        "title.principals": {
            "validator": {
                "$jsonSchema": {
                    "bsonType": "object",
                    "required": ["tconst", "ordering", "nconst", "category", "job", "characters"],
                    "properties": {
                        "tconst": {"bsonType": "string"},
                        "ordering": {"bsonType": "int"},
                        "nconst": {"bsonType": "string"},
                        "category": {"bsonType": "string"},
                        "job": {"bsonType": "string"},
                        "characters": {"bsonType": "array", "items": {"bsonType": "string"}}
                    }
                }
            },
            "indexes": ["tconst"]
        },
        "title.ratings": {
            "validator": {
                "$jsonSchema": {
                    "bsonType": "object",
                    "required": ["tconst", "averageRating", "numVotes"],
                    "properties": {
                        "tconst": {"bsonType": "string"},
                        "averageRating": {"bsonType": "double"},
                        "numVotes": {"bsonType": "int"}
                    }
                }
            },
            "indexes": ["tconst"]
        },
        "name.basics": {
            "validator": {
                "$jsonSchema": {
                    "bsonType": "object",
                    "required": ["nconst", "primaryName", "birthYear", "primaryProfession", "knownForTitles"],
                    "properties": {
                        "nconst": {"bsonType": "string"},
                        "primaryName": {"bsonType": "string"},
                        "birthYear": {"bsonType": "string"},
                        "deathYear": {"bsonType": "string"},
                        "primaryProfession": {"bsonType": "array", "items": {"bsonType": "string"}},
                        "knownForTitles": {"bsonType": "array", "items": {"bsonType": "string"}}
                    }
                }
            },
            "indexes": ["nconst"]
        }
    }

    for collection_name, options in collections.items():
        try:
            db.create_collection(collection_name, validator=options["validator"])
            for index in options["indexes"]:
                db[collection_name].create_index([(index, 1)])
        except Exception as e:
            print(f"Collection {collection_name} already exists or error occurred: {e}")

# Function to load data from TSV files into MongoDB collections

In [5]:
def load_data(file_path, collection_name):
    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        df = pd.read_csv(file, delimiter='\t', na_values='\\N')
        df = df.fillna('')
        data = df.to_dict(orient='records')
        db[collection_name].insert_many(data)

In [6]:
# Create collections
create_collections()

import os
print(os.getcwd())

# Load data into collections
load_data('imdb/title.akas.tsv.gz', 'title.akas')


Collection title.akas already exists or error occurred: collection title.akas already exists
Collection title.basics already exists or error occurred: collection title.basics already exists
Collection title.crew already exists or error occurred: collection title.crew already exists
Collection title.episode already exists or error occurred: collection title.episode already exists
Collection title.principals already exists or error occurred: collection title.principals already exists
Collection title.ratings already exists or error occurred: collection title.ratings already exists
Collection name.basics already exists or error occurred: collection name.basics already exists
/workspaces/SimilityVectorEmbedding


: 

: 

In [None]:
load_data('imdb/title.basics.tsv.gz', 'title.basics')

In [None]:
load_data('imdb/title.crew.tsv.gz', 'title.crew')

In [None]:
load_data('imdb/title.episode.tsv.gz', 'title.episode')

In [None]:
load_data('imdb/title.principals.tsv.gz', 'title.principals')

In [None]:
load_data('imdb/title.ratings.tsv.gz', 'title.ratings')

In [None]:
load_data('imdb/name.basics.tsv.gz', 'name.basics')

In [None]:
# Verify data
print(db['title.akas'].find_one())
print(db['title.basics'].find_one())
print(db['title.crew'].find_one())
print(db['title.episode'].find_one())
print(db['title.principals'].find_one())
print(db['title.ratings'].find_one())
print(db['name.basics'].find_one())
