In [1]:
import pandas as pd

# Read every MovieLens table from the ml-latest folder. The generator is
# consumed left to right, so the files are read in the order listed and each
# frame is bound to the matching name on the left-hand side.
movies, ratings, tags, genome_scores, genome_tags, links = (
    pd.read_csv(csv_path)
    for csv_path in (
        "ml-latest/movies.csv",
        "ml-latest/ratings.csv",
        "ml-latest/tags.csv",
        "ml-latest/genome-scores.csv",
        "ml-latest/genome-tags.csv",
        "ml-latest/links.csv",
    )
)

# Preview the first rows of each table under its own heading. Every heading
# except the first starts with a newline so the previews are separated by a
# blank line.
previews = [
    ("Movies:", movies),
    ("\nRatings:", ratings),
    ("\nTags:", tags),
    ("\nGenome Scores:", genome_scores),
    ("\nGenome Tags:", genome_tags),
    ("\nLinks:", links),
]
for heading, frame in previews:
    print(heading)
    print(frame.head())


Movies:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings:
   userId  movieId  rating   timestamp
0       1        1     4.0  1225734739
1       1      110     4.0  1225865086
2       1      158     4.0  1225733503
3       1      260     4.5  1225735204
4       1      356     5.0  1225735119

Tags:
   userId  movieId            tag   timestamp
0      10      260   good vs evil  1430666558
1      10      260  Harrison Ford  1430666505
2      1

In [2]:
# Print the row count of each loaded table, one number per line, in the order:
# ratings, movies, tags, genome_scores, genome_tags, links.
for table in (ratings, movies, tags, genome_scores, genome_tags, links):
    print(len(table))

33832162
86537
2328315
18472128
1128
86537


In [2]:
# Report the number of null cells per column for every loaded table.
# Headings after the first carry a leading newline so that consecutive
# reports are separated by a blank line.
null_reports = [
    ("Missing values in Movies:", movies),
    ("\nMissing values in Ratings:", ratings),
    ("\nMissing values in Tags:", tags),
    ("\nMissing values in Genome Scores:", genome_scores),
    ("\nMissing values in Genome Tags:", genome_tags),
    ("\nMissing values in Links:", links),
]
for heading, frame in null_reports:
    print(heading)
    print(frame.isna().sum())


Missing values in Movies:
movieId    0
title      0
genres     0
dtype: int64

Missing values in Ratings:
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

Missing values in Tags:
userId        0
movieId       0
tag          17
timestamp     0
dtype: int64

Missing values in Genome Scores:
movieId      0
tagId        0
relevance    0
dtype: int64

Missing values in Genome Tags:
tagId    0
tag      0
dtype: int64

Missing values in Links:
movieId      0
imdbId       0
tmdbId     126
dtype: int64


In [3]:
# Keep only the tag rows whose `tag` value is present. `tags` itself is not
# modified; the filtered rows are bound to a new name.
has_tag = tags["tag"].notna()
tags_cleaned = tags.loc[has_tag]

# `links` is deliberately left as-is. If rows without a tmdbId ever need to
# go, the equivalent step would be: links.dropna(subset=["tmdbId"]).

# Verify that no null values remain in the cleaned tags table.
print("Missing values in Tags after cleaning:")
print(tags_cleaned.isna().sum())


Missing values in Tags after cleaning:
userId       0
movieId      0
tag          0
timestamp    0
dtype: int64


In [4]:
# Attach movie metadata (title, genres) to every rating row.
movies_ratings = pd.merge(ratings, movies, on="movieId", how="inner")

# tags.csv holds one row per tag a user applied, so the same (userId, movieId)
# pair can appear several times. Left-joining it directly onto the ratings
# would emit the rating once per tag and silently inflate the dataset.
# Collapse the tags to a single row per pair first:
#   - tag:       all of the user's tags for that movie, joined with "|"
#   - timestamp: the most recent tagging time for that pair
tags_per_rating = (
    tags_cleaned
    # Cast defensively so that any tag parsed as a number can still be joined.
    .assign(tag=tags_cleaned["tag"].astype(str))
    .groupby(["userId", "movieId"], as_index=False)
    .agg(tag=("tag", "|".join), timestamp=("timestamp", "max"))
)

# Merge Movies+Ratings with the per-pair tags (optional enrichment).
# `validate` makes pandas raise if the right-hand side is ever not unique per
# key, instead of quietly multiplying rating rows.
movies_ratings_tags = pd.merge(
    movies_ratings,
    tags_per_rating,
    on=["movieId", "userId"],
    how="left",
    validate="many_to_one",
)

# A many-to-one left join must keep exactly the rows of the left frame.
assert len(movies_ratings_tags) == len(movies_ratings), "tag merge changed the number of rating rows"

# Display a sample of the consolidated dataset
print("Sample consolidated dataset:")
print(movies_ratings_tags.head())

# Check for missing values in the consolidated dataset
# (tag / timestamp_y are expected to be null for ratings without any tag).
print("\nMissing values in consolidated dataset:")
print(movies_ratings_tags.isnull().sum())


Sample consolidated dataset:
   userId  movieId  rating  timestamp_x  \
0       1        1     4.0   1225734739   
1       1      110     4.0   1225865086   
2       1      158     4.0   1225733503   
3       1      260     4.5   1225735204   
4       1      356     5.0   1225735119   

                                       title  \
0                           Toy Story (1995)   
1                          Braveheart (1995)   
2                              Casper (1995)   
3  Star Wars: Episode IV - A New Hope (1977)   
4                        Forrest Gump (1994)   

                                        genres  tag  timestamp_y  
0  Adventure|Animation|Children|Comedy|Fantasy  NaN          NaN  
1                             Action|Drama|War  NaN          NaN  
2                           Adventure|Children  NaN          NaN  
3                      Action|Adventure|Sci-Fi  NaN          NaN  
4                     Comedy|Drama|Romance|War  NaN          NaN  

Missing values in co

In [7]:
from sklearn.preprocessing import MultiLabelBinarizer

# One-hot encode genres once per movie rather than once per rating row.
# A movie's genres are identical on every one of its rating rows, so splitting
# and binarising the full ratings-level frame repeats the same work for each
# rating and builds a much larger intermediate array than necessary.
movie_genres = (
    movies_ratings_tags[["movieId", "genres"]]
    .drop_duplicates(subset="movieId")
    .set_index("movieId")
)

# Binarise the "|"-separated genre strings. The result has one row per movieId
# and one 0/1 column per genre; int8 is enough to hold 0/1 flags.
mlb = MultiLabelBinarizer()
genres_encoded = pd.DataFrame(
    mlb.fit_transform(movie_genres["genres"].str.split("|")),
    columns=mlb.classes_,
    index=movie_genres.index,
    dtype="int8",
)

# Drop the columns that are not needed for modeling, then attach the genre
# flags by looking each row's movieId up in `genres_encoded`.
# `movies_ratings_tags` itself is left untouched, so re-running this cell
# cannot append a second copy of the genre columns.
columns_to_drop = ["title", "genres", "tag"]
movies_ratings_tags_cleaned = (
    movies_ratings_tags
    .drop(columns=columns_to_drop)
    .join(genres_encoded, on="movieId")
)

# Display a sample of the cleaned dataset
print("Cleaned dataset ready for modeling:")
print(movies_ratings_tags_cleaned.head())


In [1]:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

# Build the user-movie interaction matrix directly in sparse form.
#
# Pivoting to a dense users x movies table and filling the gaps with 0 stores
# a cell for every (user, movie) pair, rated or not. Only the observed ratings
# are stored here instead.

# One rating per (user, movie) pair. pivot_table averages duplicate pairs by
# default, whereas csr_matrix would *sum* them, so collapse them explicitly.
ratings_per_pair = (
    movies_ratings_tags_cleaned
    .groupby(["userId", "movieId"], as_index=False)["rating"]
    .mean()
)

# Map the raw ids to contiguous row / column positions. np.unique returns the
# ids sorted, so row i is user_ids[i] and column j is movie_ids[j].
user_ids, user_pos = np.unique(ratings_per_pair["userId"].to_numpy(), return_inverse=True)
movie_ids, movie_pos = np.unique(ratings_per_pair["movieId"].to_numpy(), return_inverse=True)

# Unrated pairs are implicit zeros in the sparse matrix.
user_movie_sparse = csr_matrix(
    (ratings_per_pair["rating"].to_numpy(), (user_pos, movie_pos)),
    shape=(len(user_ids), len(movie_ids)),
)

# Kept under the original name too; row / column labels live in
# `user_ids` / `movie_ids` rather than in a DataFrame index.
user_movie_matrix = user_movie_sparse

# Display basic statistics
num_users, num_movies = user_movie_sparse.shape
non_zero_ratings = user_movie_sparse.count_nonzero()
sparsity = 1 - (non_zero_ratings / (num_users * num_movies))

print(f"User-Movie Matrix Shape: {num_users} users, {num_movies} movies")
print(f"Total Non-Zero Ratings: {non_zero_ratings}")
print(f"Matrix Sparsity: {sparsity:.2%}")

# Split the dataset into training and testing sets.
# The split is over matrix rows, i.e. whole users go to either train or test.
# train_test_split accepts scipy sparse matrices, so nothing is densified.
train, test = train_test_split(
    user_movie_sparse, test_size=0.2, random_state=42
)

print("Training and testing data prepared:")
print(f"Training set shape: {train.shape}")
print(f"Testing set shape: {test.shape}")


NameError: name 'movies_ratings_tags_cleaned' is not defined