In [None]:
import os
import requests
import zipfile
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from pprint import pprint as pp

In [None]:
# Import data from MovieLens

In [None]:
# Download the MovieLens "ml-latest-small" archive and unpack it under the data directory.
url = "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

# Create the 'data' directory if it doesn't exist
data_dir = '../data'
os.makedirs(data_dir, exist_ok=True)

# Save the ZIP file inside the 'data' directory
zip_path = os.path.join(data_dir, 'ml-latest-small.zip')

# Stream the archive to disk in small chunks.
# - timeout: a stalled connection raises instead of hanging the kernel.
# - raise_for_status(): an HTTP error fails here rather than being written to
#   disk as if it were the ZIP and surfacing later as a BadZipFile error.
# - the `with` block releases the connection once the body has been consumed.
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(zip_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:  # skip empty keep-alive chunks
                file.write(chunk)

# Extract every member of the ZIP file into the 'data' directory
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(data_dir)

print(f"Dataset downloaded and extracted to: {data_dir}")

In [None]:
## Check the links data

In [None]:
# Read the movieId -> imdbId / tmdbId mapping. Both external ids are read as
# strings rather than numbers; movieId is read as an int.
links_csv_path = '../data/ml-latest-small/links.csv'
links_column_types = {'movieId': int, 'imdbId': str, 'tmdbId': str}
links_df = pd.read_csv(
    links_csv_path,
    usecols=list(links_column_types),
    dtype=links_column_types,
)
links_df

In [None]:
"""
When I initially loaded the imdbId values as ints, some had only 6 digits and were missing
a preceding 0 which caused them to no correspond to the correct IMDB url. When loaded as 
strings, they appear to have the preceding 0s but we need to check this.
"""

# Check for imdbIds of the wrong length.
links_df[links_df.imdbId.apply(lambda x:len(x)!=7)]

In [None]:
# Count the missing values in each column of the links table
print(links_df.isna().sum(axis=0))

In [None]:
'''
These movieIds must be filtered out of the other datasets: the Matrix Factorization model is
trained on the tmdb data (see web-scraping section below), and all the models need to work on the same data.
'''

# Show the rows of the links table whose tmdbId is missing
links_df[links_df['tmdbId'].isna()]

In [None]:
## Filter out movies with missing tmdbIds

In [None]:
### movies.csv

In [None]:
# Keep the links rows that have no tmdbId; the later cells use this frame to
# decide which movies to drop from each table.
movies_missing_tmdbIds = links_df.loc[links_df['tmdbId'].isna()]

# Load the movies table and preview its first rows
movies_df = pd.read_csv('../data/ml-latest-small/movies.csv')
movies_df.head(5)

In [None]:
# (rows, columns) of the movies table as loaded, before any movies are dropped
movies_df.shape

In [None]:
""" These don't look like popular movies anyway. """
movies_df[movies_df.index.isin(movies_missing_tmdbIds.index)]

In [None]:
""" The eight movies have been removed. """
movies_df = movies_df[~movies_df.index.isin(movies_missing_tmdbIds.index)]
movies_df.shape

In [None]:
### ratings.csv

In [None]:
# Load the ratings table and preview its first rows
ratings_csv_path = '../data/ml-latest-small/ratings.csv'
ratings_df = pd.read_csv(ratings_csv_path)
ratings_df.head(5)

In [None]:
# (rows, columns) of the ratings table as loaded, before any ratings are dropped
ratings_df.shape

In [None]:
"""
Let's just double check how much data we're dropping.
-> No big deal. These 8 movies have only been rated once or twice anyway.
"""
ratings_df[ratings_df.movieId.isin(movies_missing_tmdbIds.movieId)]

In [None]:
""" Remove the movies. """
ratings_df = ratings_df[~ratings_df.movieId.isin(movies_missing_tmdbIds.movieId)]
ratings_df.shape

In [None]:
### tags.csv

In [None]:
# Load the tags table and preview its first rows
tags_csv_path = '../data/ml-latest-small/tags.csv'
tags_df = pd.read_csv(tags_csv_path)
tags_df.head(5)

In [None]:
# (rows, columns) of the tags table as loaded, before any tags are dropped
tags_df.shape

In [None]:
""" Remove the movies. """
tags_df = tags_df[~tags_df.movieId.isin(movies_missing_tmdbIds.movieId)]
tags_df.shape

In [None]:
### links.csv

In [None]:
# Drop the rows without a tmdbId from the links table itself, printing the row
# count on either side of the filter.
print(f"Num links before: {links_df.shape[0]}")
link_lacks_tmdbId = links_df['movieId'].isin(movies_missing_tmdbIds['movieId'])
links_df = links_df.loc[~link_lacks_tmdbId]
print(f"Num links after: {links_df.shape[0]}")

In [None]:
## Overwrite the CSVs

In [None]:
# Write each filtered table back over the CSV it was loaded from, without the
# DataFrame index column.
for filtered_frame, csv_path in (
    (movies_df, '../data/ml-latest-small/movies.csv'),
    (ratings_df, '../data/ml-latest-small/ratings.csv'),
    (tags_df, '../data/ml-latest-small/tags.csv'),
    (links_df, '../data/ml-latest-small/links.csv'),
):
    filtered_frame.to_csv(csv_path, index=False)

In [None]:
## Reload the data and check the size

In [None]:
# Re-read the four CSVs from disk. The links ids are read with the same
# column types as the first load (movieId as int, imdbId/tmdbId as strings).
reloaded_links_types = {'movieId': int, 'imdbId': str, 'tmdbId': str}

movies_df = pd.read_csv('../data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('../data/ml-latest-small/ratings.csv')
tags_df = pd.read_csv('../data/ml-latest-small/tags.csv')
links_df = pd.read_csv(
    '../data/ml-latest-small/links.csv',
    dtype=reloaded_links_types,
    usecols=list(reloaded_links_types),
)

# Report the size of every reloaded table
print(f"movies_df shape: {movies_df.shape}")
print(f"ratings_df shape: {ratings_df.shape}")
print(f"tags_df shape: {tags_df.shape}")
print(f"links_df shape: {links_df.shape}")

links_df.head(5)