In [1]:
import os
import requests
import zipfile
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from pprint import pprint as pp

# Import data from MovieLens

In [2]:
# Download the MovieLens "ml-latest-small" archive (if a valid copy is not
# already on disk) and unpack it into the data directory.
url = "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

# Create the 'data' directory if it doesn't exist
data_dir = '../data'
os.makedirs(data_dir, exist_ok=True)

# Save the ZIP file inside the 'data' directory
zip_path = os.path.join(data_dir, 'ml-latest-small.zip')

# Only hit the network when there is no usable archive yet. `is_zipfile`
# returns False both for a missing file and for a truncated one left behind
# by an interrupted run, so either case triggers a fresh download.
if not zipfile.is_zipfile(zip_path):
    # Stream into a temporary name and move it into place only when complete,
    # so an interrupted download can never masquerade as the finished archive.
    partial_path = zip_path + '.part'
    with requests.get(url, stream=True, timeout=30) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as the ZIP.
        response.raise_for_status()
        with open(partial_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    file.write(chunk)
    os.replace(partial_path, zip_path)

# Extract the ZIP file within the 'data' directory
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(path=data_dir)

print(f"Dataset downloaded and extracted to: {data_dir}")

KeyboardInterrupt: 

## Check the links data

In [3]:
# Load the links table. The two external ids are read as strings so that
# leading zeros in imdbId are not stripped by numeric parsing.
link_column_types = {'movieId': int, 'imdbId': str, 'tmdbId': str}
links_df = pd.read_csv(
    '../data/ml-latest-small/links.csv',
    usecols=list(link_column_types),
    dtype=link_column_types,
)
links_df

Unnamed: 0,movieId,imdbId,tmdbId
0,1,0114709,862
1,2,0113497,8844
2,3,0113228,15602
3,4,0114885,31357
4,5,0113041,11862
...,...,...,...
9729,193581,5476944,432131
9730,193583,5914996,445030
9731,193585,6397426,479308
9732,193587,8391976,483455


In [4]:
"""
When I initially loaded the imdbId values as ints, some had only 6 digits and were missing
a preceding 0, which caused them to not correspond to the correct IMDb URL. When loaded as
strings, they appear to keep the preceding 0s, but we need to check this.
"""

# Check for imdbIds of the wrong length.
# `.str.len()` is vectorised and yields NaN for a missing id, so such rows are
# flagged here (NaN != 7) instead of `len()` raising a TypeError on them.
links_df[links_df.imdbId.str.len() != 7]

Unnamed: 0,movieId,imdbId,tmdbId


In [5]:
# Count the missing values in each column of the links table.
missing_per_column = links_df.isna().sum()
print(missing_per_column)

movieId    0
imdbId     0
tmdbId     0
dtype: int64


In [6]:
# Show the links rows that have no tmdbId.
# These movieIds must be filtered out of the other datasets too: the Matrix
# Factorization model is trained on the tmdb data (see the web-scraping
# section below), and all the models need to work on the same set of movies.
links_df[links_df['tmdbId'].isna()]

Unnamed: 0,movieId,imdbId,tmdbId


## Filter out movies with missing tmdbIds

### movies.csv

In [7]:
# Links rows lacking a tmdbId; the filtering cells that follow use this frame
# to decide which movies to drop from each table.
movies_missing_tmdbIds = links_df[links_df['tmdbId'].isna()]

# Load the movies table and preview its first rows.
movies_df = pd.read_csv('../data/ml-latest-small/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# (rows, columns) of the movies table as loaded, before the filter is applied.
movies_df.shape

(9734, 3)

In [9]:
""" These don't look like popular movies anyway. """
# Match on movieId, as the ratings/tags/links filters do, rather than on row
# position: index matching is only correct while movies.csv and links.csv
# happen to list the same movies in the same order.
movies_df[movies_df.movieId.isin(movies_missing_tmdbIds.movieId)]

Unnamed: 0,movieId,title,genres


In [10]:
""" The eight movies have been removed. """
# Drop the movies that have no tmdbId. Keyed on movieId (not the row index) so
# the result stays correct even if movies.csv and links.csv are not row-aligned,
# and so this filter is consistent with the ratings/tags/links filters.
movies_df = movies_df[~movies_df.movieId.isin(movies_missing_tmdbIds.movieId)]
movies_df.shape

(9734, 3)

### ratings.csv

In [11]:
# Load the ratings table and preview its first rows.
ratings_df = pd.read_csv('../data/ml-latest-small/ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [12]:
# (rows, columns) of the ratings table as loaded, before the filter is applied.
ratings_df.shape

(100823, 4)

In [13]:
"""
Let's just double check how much data we're dropping.
-> No big deal. These 8 movies have only been rated once or twice anyway.
"""
# Ratings that belong to a movie without a tmdbId (i.e. the rows about to be dropped).
rating_lacks_tmdb = ratings_df['movieId'].isin(movies_missing_tmdbIds['movieId'])
ratings_df[rating_lacks_tmdb]

Unnamed: 0,userId,movieId,rating,timestamp


In [14]:
""" Remove the movies. """
# Keep only the ratings whose movieId is not in the missing-tmdbId set.
rating_has_tmdb = ~ratings_df['movieId'].isin(movies_missing_tmdbIds['movieId'])
ratings_df = ratings_df[rating_has_tmdb]
ratings_df.shape

(100823, 4)

### tags.csv

In [15]:
# Load the tags table and preview its first rows.
tags_df = pd.read_csv('../data/ml-latest-small/tags.csv')
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [16]:
# (rows, columns) of the tags table as loaded, before the filter is applied.
tags_df.shape

(3683, 4)

In [17]:
""" Remove the movies. """
# Keep only the tags whose movieId is not in the missing-tmdbId set.
tag_has_tmdb = ~tags_df['movieId'].isin(movies_missing_tmdbIds['movieId'])
tags_df = tags_df[tag_has_tmdb]
tags_df.shape

(3683, 4)

### links.csv

In [18]:
# Report the row count, drop the links whose movieId is in the missing-tmdbId
# set, then report the row count again.
print(f"Num links before: {len(links_df)}")
link_has_tmdb = ~links_df['movieId'].isin(movies_missing_tmdbIds['movieId'])
links_df = links_df[link_has_tmdb]
print(f"Num links after: {len(links_df)}")

Num links before: 9734
Num links after: 9734


## Overwrite the CSVs

In [19]:
# Write each filtered table back over its source CSV, without the row index.
# NOTE(review): this replaces the extracted raw files in place, so re-running
# the cells above afterwards reads already-filtered data — confirm that is intended.
for csv_path, frame in (
    ('../data/ml-latest-small/movies.csv', movies_df),
    ('../data/ml-latest-small/ratings.csv', ratings_df),
    ('../data/ml-latest-small/tags.csv', tags_df),
    ('../data/ml-latest-small/links.csv', links_df),
):
    frame.to_csv(csv_path, index=False)

## Reload the data and check the size

In [20]:
# Read the four tables back from disk to confirm what was actually written.
movies_df = pd.read_csv('../data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('../data/ml-latest-small/ratings.csv')
tags_df = pd.read_csv('../data/ml-latest-small/tags.csv')

# The external ids are read as strings again, exactly as in the first load.
link_dtypes = {'movieId': int, 'imdbId': str, 'tmdbId': str}
links_df = pd.read_csv(
    '../data/ml-latest-small/links.csv',
    usecols=list(link_dtypes),
    dtype=link_dtypes,
)

# Print one "<name> shape: (rows, cols)" line per table.
for label, frame in (
    ('movies_df', movies_df),
    ('ratings_df', ratings_df),
    ('tags_df', tags_df),
    ('links_df', links_df),
):
    print(f"{label} shape: {frame.shape}")

links_df.head()

movies_df shape: (9734, 3)
ratings_df shape: (100823, 4)
tags_df shape: (3683, 4)
links_df shape: (9734, 3)


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862
