In [1]:
import pandas as pd

# Load datasets
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
tags = pd.read_csv("tags.csv")

# Display first few rows of each dataset
print("Movies:")
print(movies.head())

print("\nRatings:")
print(ratings.head())

print("\nTags:")
print(tags.head())

Movies:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings:
   userId  movieId  rating   timestamp
0       1        1     4.0  1225734739
1       1      110     4.0  1225865086
2       1      158     4.0  1225733503
3       1      260     4.5  1225735204
4       1      356     5.0  1225735119

Tags:
   userId  movieId            tag   timestamp
0      10      260   good vs evil  1430666558
1      10      260  Harrison Ford  1430666505
2      1

In [2]:
print(len(ratings))
print(len(movies))
print(len(tags))

33832162
86537
2328316


In [3]:
# Check for missing values in each dataset
print("Missing values in Movies:")
print(movies.isnull().sum())

print("\nMissing values in Ratings:")
print(ratings.isnull().sum())

print("\nMissing values in Tags:")
print(tags.isnull().sum())

Missing values in Movies:
movieId    0
title      0
genres     0
dtype: int64

Missing values in Ratings:
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

Missing values in Tags:
userId        0
movieId       0
tag          17
timestamp     0
dtype: int64


In [4]:
movies_title = movies[['movieId', 'title']]
movies=movies.drop(columns=['title'])
movies_title

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
86532,288967,State of Siege: Temple Attack (2021)
86533,288971,Ouija Japan (2021)
86534,288975,The Men Who Made the Movies: Howard Hawks (1973)
86535,288977,Skinford: Death Sentence (2023)


In [5]:
ratings["movieId"] = ratings["movieId"].astype('int32')
ratings["rating"] = ratings["rating"].astype('float32')
ratings = ratings.drop(columns=["timestamp"])
ratings["userId"] = ratings["userId"].astype('int32')
movies["movieId"] = movies["movieId"].astype('int32')
tags["movieId"] = tags["movieId"].astype('int32')
tags["userId"] = tags["userId"].astype('int32')
tags = tags.drop(columns=["timestamp"])
print(movies.info())
print(ratings.info())
print(tags.info()) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86537 entries, 0 to 86536
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  86537 non-null  int32 
 1   genres   86537 non-null  object
dtypes: int32(1), object(1)
memory usage: 1014.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int32  
 1   movieId  int32  
 2   rating   float32
dtypes: float32(1), int32(2)
memory usage: 387.2 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2328316 entries, 0 to 2328315
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   userId   int32 
 1   movieId  int32 
 2   tag      object
dtypes: int32(2), object(1)
memory usage: 35.5+ MB
None


In [6]:
# Drop rows with missing tags
tags_cleaned = tags.dropna(subset=["tag"])

# Confirm cleaning
print("Missing values in Tags after cleaning:")
print(tags_cleaned.isnull().sum())

Missing values in Tags after cleaning:
userId     0
movieId    0
tag        0
dtype: int64


In [8]:
tags_cleaned

Unnamed: 0,userId,movieId,tag
0,10,260,good vs evil
1,10,260,Harrison Ford
2,10,260,sci-fi
3,14,1221,Al Pacino
4,14,1221,mafia
...,...,...,...
2328311,330923,176599,politically correct
2328312,330933,3317,coming of age
2328313,330933,3317,sexuality
2328314,330947,5782,Not Luc Besson


In [9]:
# Merge Movies with Ratings
movies_ratings = pd.merge(ratings, movies, on="movieId", how="inner")
movies_ratings

Unnamed: 0,userId,movieId,rating,genres
0,1,1,4.0,Adventure|Animation|Children|Comedy|Fantasy
1,1,110,4.0,Action|Drama|War
2,1,158,4.0,Adventure|Children
3,1,260,4.5,Action|Adventure|Sci-Fi
4,1,356,5.0,Comedy|Drama|Romance|War
...,...,...,...,...
33832157,330975,8340,2.0,Drama|Thriller
33832158,330975,8493,2.5,Action|Drama|War
33832159,330975,8622,4.0,Documentary
33832160,330975,8665,3.0,Action|Crime|Thriller


In [10]:
# Find common movieId and userId
common_movie_ids = set(movies_ratings['movieId']).intersection(set(tags_cleaned['movieId']))
common_user_ids = set(movies_ratings['userId']).intersection(set(tags_cleaned['userId']))

# Filter movies_ratings
filtered_movies_ratings = movies_ratings[
    (movies_ratings['movieId'].isin(common_movie_ids)) &
    (movies_ratings['userId'].isin(common_user_ids))
]

# Filter tags_cleaned
filtered_tags_cleaned = tags_cleaned[
    (tags_cleaned['movieId'].isin(common_movie_ids)) &
    (tags_cleaned['userId'].isin(common_user_ids))
]

In [11]:
movies_ratings_tags = pd.merge(
    filtered_movies_ratings,
    filtered_tags_cleaned,
    on=["movieId", "userId"],
    how="inner"  
)
movies_ratings_tags

Unnamed: 0,userId,movieId,rating,genres,tag
0,10,260,4.5,Action|Adventure|Sci-Fi,good vs evil
1,10,260,4.5,Action|Adventure|Sci-Fi,Harrison Ford
2,10,260,4.5,Action|Adventure|Sci-Fi,sci-fi
3,14,58559,5.0,Action|Crime|Drama|IMAX,Atmospheric
4,14,58559,5.0,Action|Crime|Drama|IMAX,Batman
...,...,...,...,...,...
1729286,330923,166643,0.5,Drama,revisionist
1729287,330923,176599,0.5,Comedy,black actor
1729288,330923,176599,0.5,Comedy,cringe
1729289,330923,176599,0.5,Comedy,political correct


In [12]:
movies_ratings_tags = (
    movies_ratings_tags
    .groupby(['userId', 'movieId', 'rating', 'genres'], as_index=False)
    .agg({'tag': lambda x: '|'.join(x.dropna().unique())})  # Combine tags with '|'
)

In [13]:
movies_ratings_tags

Unnamed: 0,userId,movieId,rating,genres,tag
0,10,260,4.5,Action|Adventure|Sci-Fi,good vs evil|Harrison Ford|sci-fi
1,14,58559,5.0,Action|Crime|Drama|IMAX,Atmospheric|Batman|comic book|dark|Heath Ledge...
2,16,57183,4.5,Drama,family
3,26,296,4.5,Comedy|Crime|Drama|Thriller,crime|cult film|quentin tarantino
4,37,47,5.0,Mystery|Thriller,Kevin Spacey|Morgan Freeman|powerful ending|tw...
...,...,...,...,...,...
333347,330923,161634,5.0,Thriller,intense|rape|sexual assault
333348,330923,162536,0.5,Thriller,gay romance|lgbt|political correct|politically...
333349,330923,166528,0.5,Action|Adventure|Fantasy|Sci-Fi,feminism|politically correct
333350,330923,166643,0.5,Drama,Feel good|female lead|feminism|feminist|identi...


In [15]:
from sklearn.preprocessing import MultiLabelBinarizer

# Split genres into lists
movies_ratings_tags["genres_split"] = movies_ratings_tags["genres"].str.split('|')

# One-hot encode genres
mlb_genres = MultiLabelBinarizer()
genres_encoded = pd.DataFrame(
    mlb_genres.fit_transform(movies_ratings_tags["genres_split"]),
    columns=mlb_genres.classes_,
    index=movies_ratings_tags.index
)

# Add one-hot encoded genres back to the dataset
movies_ratings_tags = pd.concat([movies_ratings_tags, genres_encoded], axis=1)
movies_ratings_tags.drop(columns=['genres_split', '(no genres listed)', 'genres'])
movies_ratings_tags

Unnamed: 0,userId,movieId,rating,genres,tag,genres_split,tag_split,(no genres listed),Action,Adventure,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,10,260,4.5,Action|Adventure|Sci-Fi,good vs evil|Harrison Ford|sci-fi,"[Action, Adventure, Sci-Fi]","[good vs evil, Harrison Ford, sci-fi]",0,1,1,...,0,0,0,0,0,0,1,0,0,0
1,14,58559,5.0,Action|Crime|Drama|IMAX,Atmospheric|Batman|comic book|dark|Heath Ledge...,"[Action, Crime, Drama, IMAX]","[Atmospheric, Batman, comic book, dark, Heath ...",0,1,0,...,0,0,1,0,0,0,0,0,0,0
2,16,57183,4.5,Drama,family,[Drama],[family],0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,26,296,4.5,Comedy|Crime|Drama|Thriller,crime|cult film|quentin tarantino,"[Comedy, Crime, Drama, Thriller]","[crime, cult film, quentin tarantino]",0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,37,47,5.0,Mystery|Thriller,Kevin Spacey|Morgan Freeman|powerful ending|tw...,"[Mystery, Thriller]","[Kevin Spacey, Morgan Freeman, powerful ending...",0,0,0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333347,330923,161634,5.0,Thriller,intense|rape|sexual assault,[Thriller],"[intense, rape, sexual assault]",0,0,0,...,0,0,0,0,0,0,0,1,0,0
333348,330923,162536,0.5,Thriller,gay romance|lgbt|political correct|politically...,[Thriller],"[gay romance, lgbt, political correct, politic...",0,0,0,...,0,0,0,0,0,0,0,1,0,0
333349,330923,166528,0.5,Action|Adventure|Fantasy|Sci-Fi,feminism|politically correct,"[Action, Adventure, Fantasy, Sci-Fi]","[feminism, politically correct]",0,1,1,...,0,0,0,0,0,0,1,0,0,0
333350,330923,166643,0.5,Drama,Feel good|female lead|feminism|feminist|identi...,[Drama],"[Feel good, female lead, feminism, feminist, i...",0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
import scipy.sparse as sp

def process_tags_in_chunks(dataframe, chunk_size=30000):
    # Get all unique tags
    unique_tags = set('|'.join(dataframe['tag'].dropna()).split('|'))
   
    # Initialize MultiLabelBinarizer with all tags
    mlb = MultiLabelBinarizer(classes=list(unique_tags))
   
    results = []
    for i in range(0, len(dataframe), chunk_size):
        # Process chunk
        chunk = dataframe.iloc[i:i+chunk_size]
       
        # Split tags for the chunk
        chunk_tag_lists = chunk['tag'].str.split('|')
       
        # Fit and transform in one step for each chunk
        chunk_encoded = sp.csr_matrix(mlb.fit_transform(chunk_tag_lists))
       
        # Convert to sparse DataFrame
        chunk_tag_df = pd.DataFrame.sparse.from_spmatrix(
            chunk_encoded,
            columns=mlb.classes_,
            index=chunk.index
        )
       
        # Combine original chunk data with encoded tags
        chunk_result = pd.concat([chunk, chunk_tag_df], axis=1)
        results.append(chunk_result)
   
    # Concatenate all processed chunks
    return pd.concat(results)

movies_ratings_tags_cleaned = process_tags_in_chunks(movies_ratings_tags, chunk_size=25000).drop(columns=["tag", 'genres', 'genres_split', '(no genres listed)'])
movies_ratings_tags_cleaned 

Unnamed: 0,userId,movieId,rating,tag_split,Action,Adventure,Animation,Children,Comedy,Crime,...,kind woman,ebonics,no point of view,fuel crisis,movie actor,Rune Temte,feeding a pig,matrx copy,handheld communicator,corrupt police detective
0,10,260,4.5,"[good vs evil, Harrison Ford, sci-fi]",1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,14,58559,5.0,"[Atmospheric, Batman, comic book, dark, Heath ...",1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,16,57183,4.5,[family],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,26,296,4.5,"[crime, cult film, quentin tarantino]",0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,37,47,5.0,"[Kevin Spacey, Morgan Freeman, powerful ending...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333347,330923,161634,5.0,"[intense, rape, sexual assault]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
333348,330923,162536,0.5,"[gay romance, lgbt, political correct, politic...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
333349,330923,166528,0.5,"[feminism, politically correct]",1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
333350,330923,166643,0.5,"[Feel good, female lead, feminism, feminist, i...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Based on Genre and tags

In [13]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

class MovieRecommendationDataset(Dataset):
    def __init__(self, dataframe, batch_size=10000):
        self.dataframe = dataframe
        self.batch_size = batch_size
        self.total_rows = len(dataframe)
        
        # Prepare feature columns
        self.feature_columns = [col for col in dataframe.columns 
                                if col not in ['userId', 'movieId', 'rating']]
    
    def __iter__(self):
        # Shuffle the dataframe
        shuffled_df = self.dataframe.sample(frac=1).reset_index(drop=True)
        
        for start in range(0, self.total_rows, self.batch_size):
            end = min(start + self.batch_size, self.total_rows)
            batch = shuffled_df.iloc[start:end]
            
            # Prepare features and targets
            features = batch[self.feature_columns].values
            targets = batch["rating"].values
            
            # Convert to tensors and ensure correct shapes
            features_tensor = torch.tensor(features, dtype=torch.float32)
            targets_tensor = torch.tensor(targets, dtype=torch.float32)
            
            yield features_tensor, targets_tensor
    
    def __len__(self):
        return (self.total_rows + self.batch_size - 1) // self.batch_size
    
    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        
        features = torch.tensor(row.iloc[3:-1].values, dtype=torch.float32)
        target = torch.tensor(row.iloc[2], dtype=torch.float32)
        return features, target

# Prepare train and test datasets
train_data, test_data = train_test_split(movies_ratings_tags_cleaned, test_size=0.2, random_state=42)

# Create DataLoader
train_dataset = MovieRecommendationDataset(train_data, batch_size=64)
test_dataset = MovieRecommendationDataset(test_data, batch_size=64)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


In [59]:
import torch.nn as nn
import torch.optim as optim

class MovieRecommendationModel(nn.Module):
    def __init__(self, input_dim):
        super(MovieRecommendationModel, self).__init__()
        # Define the layers
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 1) 
        
        # Dropout for regularization
        self.dropout = nn.Dropout(0.3)
        
        # Activation functions
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.fc4(x)
        return x

# Input dimension is the number of features (one-hot encoded genres + tag relevance)
input_dim = len(train_dataset.feature_columns) 

# Initialize the model
model = MovieRecommendationModel(input_dim)


In [15]:
# Move model to CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 35
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for batch_idx, (features, targets) in enumerate(train_loader):
        # Move data to the same device as the model
        features, targets = features.to(device), targets.to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(features)

        # Compute loss
        loss = criterion(outputs.squeeze(), targets)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Update running loss
        running_loss += loss.item()

        # Calculate accuracy
        # Assuming binary or multi-class classification:
        predicted = outputs.squeeze().round()  # Round predictions for binary classification
        correct += (predicted == targets).sum().item()
        total += targets.size(0)

    # Calculate average loss and accuracy for this epoch
    avg_loss = running_loss / len(train_loader)
    accuracy = correct / total * 100

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")


Epoch 1/35, Loss: 3.5777, Accuracy: 14.58%
Epoch 2/35, Loss: 0.8700, Accuracy: 24.62%
Epoch 3/35, Loss: 0.6210, Accuracy: 30.51%
Epoch 4/35, Loss: 0.5117, Accuracy: 33.66%
Epoch 5/35, Loss: 0.4541, Accuracy: 36.19%
Epoch 6/35, Loss: 0.4255, Accuracy: 37.42%
Epoch 7/35, Loss: 0.3985, Accuracy: 38.35%
Epoch 8/35, Loss: 0.3772, Accuracy: 39.15%
Epoch 9/35, Loss: 0.3557, Accuracy: 40.23%
Epoch 10/35, Loss: 0.3336, Accuracy: 41.44%
Epoch 11/35, Loss: 0.3070, Accuracy: 42.65%
Epoch 12/35, Loss: 0.2907, Accuracy: 43.37%
Epoch 13/35, Loss: 0.2718, Accuracy: 44.50%
Epoch 14/35, Loss: 0.2546, Accuracy: 45.10%
Epoch 15/35, Loss: 0.2353, Accuracy: 46.34%
Epoch 16/35, Loss: 0.2202, Accuracy: 46.88%
Epoch 17/35, Loss: 0.2074, Accuracy: 47.59%
Epoch 18/35, Loss: 0.1943, Accuracy: 48.50%
Epoch 19/35, Loss: 0.1814, Accuracy: 49.16%
Epoch 20/35, Loss: 0.1704, Accuracy: 49.62%
Epoch 21/35, Loss: 0.1609, Accuracy: 50.01%
Epoch 22/35, Loss: 0.1531, Accuracy: 50.73%
Epoch 23/35, Loss: 0.1483, Accuracy: 50.8

In [16]:
import numpy as np

# Evaluate the model
model.eval()
predictions = []
targets = []

with torch.no_grad():
    for batch_idx, (features, targets) in enumerate(train_loader):
        # Move data to the same device as the model
        features, target = features.to(device), target.to(device)

        # Forward pass
        outputs = model(features)

        predictions.extend(outputs.squeeze().cpu().numpy())
        targets.extend(target.cpu().numpy())

# Convert predictions and targets to NumPy arrays
predictions = np.array(predictions)
targets = np.array(targets)

# Calculate RMSE
rmse = np.sqrt(np.mean((predictions - targets) ** 2))

# Calculate accuracy (assuming a tolerance for correct predictions)
tolerance = 0.5  # Adjust tolerance as needed
accuracy = np.mean(np.abs(predictions - targets) <= tolerance) * 100

print(f"RMSE on test set: {rmse:.4f}")
print(f"Accuracy within ±{tolerance}: {accuracy:.2f}%")

RMSE on test set: 1.0912
Accuracy within ±0.5: 36.99%


### For ratings

In [18]:
# Check userId and movieId ranges
print(f"Max userId: {max(movies_ratings_tags_cleaned['userId'])}, Min userId: {min(movies_ratings_tags_cleaned['userId'])}")
print(f"Max movieId: {max(movies_ratings_tags_cleaned['movieId'])}, Min movieId: {min(movies_ratings_tags_cleaned['movieId'])}")


Max userId: 15496, Min userId: 10
Max movieId: 286903, Min movieId: 1


In [19]:
movies_ratings_tags_cleaned['userId'] = pd.factorize(movies_ratings_tags_cleaned['userId'])[0]
movies_ratings_tags_cleaned['movieId'] = pd.factorize(movies_ratings_tags_cleaned['movieId'])[0]

In [20]:
print(f"Max userId: {max(movies_ratings_tags_cleaned['userId'])}, Min userId: {min(movies_ratings_tags_cleaned['userId'])}")
print(f"Max movieId: {max(movies_ratings_tags_cleaned['movieId'])}, Min movieId: {min(movies_ratings_tags_cleaned['movieId'])}")

Max userId: 1040, Min userId: 0
Max movieId: 5447, Min movieId: 0


In [60]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Define a PyTorch Dataset class
class MovieRecommendationDataset_Rating(Dataset):
    def __init__(self, dataframe, batch_size=10000):
        self.dataframe = dataframe
        self.batch_size = batch_size
        self.total_rows = len(dataframe)

    def __iter__(self):
        # Shuffle the dataframe
        shuffled_df = self.dataframe.sample(frac=1).reset_index(drop=True)

        for start in range(0, self.total_rows, self.batch_size):
            end = min(start + self.batch_size, self.total_rows)
            batch = shuffled_df.iloc[start:end]

            # Extract userId, movieId, and rating
            user_ids = batch["userId"].values
            movie_ids = batch["movieId"].values
            targets = batch["rating"].values

            # Convert to tensors and ensure correct shapes
            user_ids_tensor = torch.tensor(user_ids, dtype=torch.long)
            movie_ids_tensor = torch.tensor(movie_ids, dtype=torch.long)
            targets_tensor = torch.tensor(targets, dtype=torch.float32)

            yield user_ids_tensor, movie_ids_tensor, targets_tensor

    def __len__(self):
        return (self.total_rows + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]

        user_id = torch.tensor(row["userId"], dtype=torch.long)
        movie_id = torch.tensor(row["movieId"], dtype=torch.long)
        target = torch.tensor(row["rating"], dtype=torch.float32)
        return user_id, movie_id, target

train_data, test_data = train_test_split(movies_ratings_tags_cleaned, test_size=0.2, random_state=42)

train_dataset = MovieRecommendationDataset_Rating(train_data, batch_size=64)
test_dataset = MovieRecommendationDataset_Rating(test_data, batch_size=64)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [22]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class CollaborativeMovieRecommendationModel(nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim=500):
        super(CollaborativeMovieRecommendationModel, self).__init__()

        # Embedding layers for users and movies
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)

        # Genres and tags input (after one-hot encoding)
        self.fc1 = nn.Linear(embedding_dim * 2, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

        self.relu = nn.ReLU()

    def forward(self, user_id, movie_id):
        # Get user and movie embeddings
        user_emb = self.user_embedding(user_id)
        movie_emb = self.movie_embedding(movie_id)

        # Concatenate embeddings with genres/tags features
        x = torch.cat([ user_emb, movie_emb], dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the model, move to CUDA if available
num_users = max(movies_ratings_tags_cleaned['userId']) +1
num_movies = max(movies_ratings_tags_cleaned['movieId']) +1

model_ratings = CollaborativeMovieRecommendationModel(num_users, num_movies)
model_ratings = model_ratings.to(device)

In [25]:
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model_ratings.parameters(), lr=0.001)

# Define tolerance for accuracy
tolerance = 0.5  # Adjust this as needed for your use case

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model_ratings.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for batch_idx, (user_id, movie_id, target) in enumerate(train_loader):
        # Move data to the same device as the model
        user_id, movie_id, target = user_id.to(device), movie_id.to(device), target.to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model_ratings(user_id, movie_id)

        # Compute loss
        loss = criterion(outputs.squeeze(), target)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Update running loss
        running_loss += loss.item()

        # Calculate accuracy
        predictions = outputs.squeeze()
        correct_predictions += torch.sum(torch.abs(predictions - target) <= tolerance).item()
        total_predictions += target.size(0)

    # Calculate average loss and accuracy for this epoch
    avg_loss = running_loss / len(train_loader)
    accuracy = (correct_predictions / total_predictions) * 100

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")


Epoch 1/10, Loss: 1.9704, Accuracy: 33.88%
Epoch 2/10, Loss: 0.7297, Accuracy: 48.03%
Epoch 3/10, Loss: 0.5161, Accuracy: 56.01%
Epoch 4/10, Loss: 0.3903, Accuracy: 62.48%
Epoch 5/10, Loss: 0.2790, Accuracy: 70.11%
Epoch 6/10, Loss: 0.2122, Accuracy: 75.90%
Epoch 7/10, Loss: 0.1553, Accuracy: 81.73%
Epoch 8/10, Loss: 0.1201, Accuracy: 86.30%
Epoch 9/10, Loss: 0.0961, Accuracy: 89.49%
Epoch 10/10, Loss: 0.0743, Accuracy: 93.20%


In [26]:
# Define tolerance for accuracy
tolerance = 0.5 

# Evaluate the model
model_ratings.eval()
predictions = []
targets = []
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for batch_idx, (user_id, movie_id, target) in enumerate(train_loader):
        # Move data to the same device as the model
        user_id, movie_id, target = user_id.to(device), movie_id.to(device), target.to(device)

        # Forward pass
        outputs = model_ratings(user_id, movie_id)

        # Store predictions and targets
        predictions.extend(outputs.squeeze().cpu().numpy())
        targets.extend(target.cpu().numpy())

        # Calculate accuracy
        correct_predictions += torch.sum(torch.abs(outputs.squeeze() - target) <= tolerance).item()
        total_predictions += target.size(0)

# Convert predictions and targets to NumPy arrays
predictions = np.array(predictions)
targets = np.array(targets)

# Calculate RMSE
rmse = np.sqrt(np.mean((predictions - targets) ** 2))

# Calculate accuracy
accuracy = (correct_predictions / total_predictions) * 100

# Print results
print(f"RMSE on test set: {rmse:.4f}")
print(f"Accuracy on test set: {accuracy:.2f}%")


RMSE on test set: 1.0536
Accuracy on test set: 39.74%


In [27]:
def predict_for_user_id(id, model, df):
    model.eval()

    # Example user ID
    user_id = torch.tensor(id, dtype=torch.long).unsqueeze(0).to(device)

    # Get all movie IDs
    all_movie_ids = torch.tensor(range(1, max(df['movieId'])+1), dtype=torch.long).to(device) 

    # Create a tensor of the same user ID repeated for all movies
    user_ids = user_id.repeat(max(df['movieId']), 1).squeeze().to(device)

    rt = []
    # Forward pass to get predicted ratings for all movies
    with torch.no_grad():
        predicted_ratings = model(user_ids, all_movie_ids).squeeze()

    # Get top-N movies with highest predicted ratings
    top_n = 10
    top_n_ratings, top_n_indices = torch.topk(predicted_ratings, top_n)

    # Convert tensor indices back to movie IDs (adjusting for 0-based index)
    recommended_movie_ids = all_movie_ids[top_n_indices].cpu().numpy()

    # Map movie IDs to titles using the 'movieId' column in the DataFrame
    recommended_movie_titles = movies[movies['movieId'].isin(recommended_movie_ids)]['title'].values

    print(f"Top-{top_n} Recommended movies for User {id}:")
    for i, movie in enumerate(recommended_movie_titles, 1):
        print(f"{i}. {movie} (Predicted Rating: {top_n_ratings[i-1].item():.2f})")

In [28]:
predict_for_user_id(1, model_ratings, movies_ratings_tags_cleaned)

Top-10 Recommended movies for User 1:
1. Braveheart (1995) (Predicted Rating: 6.08)
2. Sirens (1994) (Predicted Rating: 5.84)
3. Vampire in Venice (Nosferatu a Venezia) (Nosferatu in Venice) (1986) (Predicted Rating: 5.84)
4. Pink Flamingos (1972) (Predicted Rating: 5.72)
5. Gossip (2000) (Predicted Rating: 5.56)
6. Jennifer 8 (1992) (Predicted Rating: 5.53)
7. White Christmas (1954) (Predicted Rating: 5.52)
8. Baraka (1992) (Predicted Rating: 5.46)
9. Breaker! Breaker! (1977) (Predicted Rating: 5.43)
10. Monster's Ball (2001) (Predicted Rating: 5.40)


## For Randomized Scaling

In [18]:
movies_ratings_tags_sample = movies_ratings_tags_cleaned.sample(frac=0.2, random_state=42)

In [21]:
movies_ratings_tags_sample

Unnamed: 0,userId,movieId,rating,tag_split,Action,Adventure,Animation,Children,Comedy,Crime,...,kind woman,ebonics,no point of view,fuel crisis,movie actor,Rune Temte,feeding a pig,matrx copy,handheld communicator,corrupt police detective
124302,124699,6713,4.5,"[romantic epic, Satoshi Kon]",0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
185455,185887,3119,3.5,"[giallo, italian horror, mario bava, slasher]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48298,48766,112868,2.0,"[aliens, poor writing, twist ending]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
251999,243155,2528,5.0,[dystopia],1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198130,198317,6874,5.0,[martial arts],1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95950,96553,48516,3.0,[remake of an asian film],0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
266225,259542,207313,4.0,"[clever script, deception, murder mystery, mys...",0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
86773,88928,31921,4.0,[Robert Duvall],0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
292793,289905,66171,3.0,"[action, Chris Evans, Dakota Fanning, psychic ...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### For tags and genre

In [71]:
# Prepare train and test datasets
train_data, test_data = train_test_split(movies_ratings_tags_sample, test_size=0.2, random_state=42)

# Create DataLoader
train_dataset = MovieRecommendationDataset(train_data)
test_dataset = MovieRecommendationDataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [72]:
train_data

Unnamed: 0,userId,movieId,rating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,zoo,zookeeper,zoologist,zoophilia,zyzek,Unnamed: 17,Álex de la Iglesia,励志,坚强,奋斗
4546,260,2623,5.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4693,285,149,3.5,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10149,626,4300,1.0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
13537,824,851,4.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15251,1011,1020,3.5,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2029,106,1351,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3214,179,2032,4.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6421,465,3173,2.5,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
9538,609,392,5.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [77]:
input_dim = len(train_dataset.feature_columns) 

# Initialize the model
model_scaled_tagsGenre = MovieRecommendationModel(input_dim)
model_scaled_tagsGenre.to(device)

MovieRecommendationModel(
  (fc1): Linear(in_features=14775, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
)

In [78]:
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model_scaled_tagsGenre.parameters(), lr=0.001)

# Training loop
num_epochs = 35
for epoch in range(num_epochs):
    model_scaled_tagsGenre.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for batch_idx, (features, targets) in enumerate(train_loader):
        # Move data to the same device as the model
        features, targets = features.to(device), targets.to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model_scaled_tagsGenre(features)

        # Compute loss
        loss = criterion(outputs.squeeze(), targets)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Update running loss
        running_loss += loss.item()

        # Calculate accuracy
        # Assuming binary or multi-class classification:
        predicted = outputs.squeeze().round()  # Round predictions for binary classification
        correct += (predicted == targets).sum().item()
        total += targets.size(0)

    # Calculate average loss and accuracy for this epoch
    avg_loss = running_loss / len(train_loader)
    accuracy = correct / total * 100

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")

Epoch 1/35, Loss: 11.3135, Accuracy: 1.57%
Epoch 2/35, Loss: 2.7118, Accuracy: 12.75%
Epoch 3/35, Loss: 1.3462, Accuracy: 19.53%
Epoch 4/35, Loss: 0.9174, Accuracy: 26.15%
Epoch 5/35, Loss: 0.6586, Accuracy: 32.00%
Epoch 6/35, Loss: 0.4933, Accuracy: 37.57%
Epoch 7/35, Loss: 0.3979, Accuracy: 41.40%
Epoch 8/35, Loss: 0.3407, Accuracy: 43.95%
Epoch 9/35, Loss: 0.3083, Accuracy: 45.56%
Epoch 10/35, Loss: 0.2837, Accuracy: 46.85%
Epoch 11/35, Loss: 0.2597, Accuracy: 47.34%
Epoch 12/35, Loss: 0.2358, Accuracy: 48.14%
Epoch 13/35, Loss: 0.2247, Accuracy: 49.19%
Epoch 14/35, Loss: 0.2129, Accuracy: 49.19%
Epoch 15/35, Loss: 0.1930, Accuracy: 50.44%
Epoch 16/35, Loss: 0.1847, Accuracy: 50.52%
Epoch 17/35, Loss: 0.1746, Accuracy: 51.74%
Epoch 18/35, Loss: 0.1704, Accuracy: 51.94%
Epoch 19/35, Loss: 0.1660, Accuracy: 52.02%
Epoch 20/35, Loss: 0.1605, Accuracy: 52.10%
Epoch 21/35, Loss: 0.1504, Accuracy: 52.70%
Epoch 22/35, Loss: 0.1489, Accuracy: 53.03%
Epoch 23/35, Loss: 0.1444, Accuracy: 53.2

In [79]:
import numpy as np

# Evaluate the model
model_scaled_tagsGenre.eval()
predictions = []
targets = []

with torch.no_grad():
    for batch_idx, (features, targets) in enumerate(train_loader):
        # Move data to the same device as the model
        features, target = features.to(device), target.to(device)

        # Forward pass
        outputs = model_scaled_tagsGenre(features)

        predictions.extend(outputs.squeeze().cpu().numpy())
        targets.extend(target.cpu().numpy())

# Convert predictions and targets to NumPy arrays
predictions = np.array(predictions)
targets = np.array(targets)

# Calculate RMSE
rmse = np.sqrt(np.mean((predictions - targets) ** 2))

# Calculate accuracy (assuming a tolerance for correct predictions)
tolerance = 0.5  # Adjust tolerance as needed
accuracy = np.mean(np.abs(predictions - targets) <= tolerance) * 100

print(f"RMSE on test set: {rmse:.4f}")
print(f"Accuracy within ±{tolerance}: {accuracy:.2f}%")

RMSE on test set: 1.1986
Accuracy within ±0.5: 35.00%


### For Ratings

In [80]:
movies_ratings_tags_sample['userId'] = pd.factorize(movies_ratings_tags_sample['userId'])[0]
movies_ratings_tags_sample['movieId'] = pd.factorize(movies_ratings_tags_sample['movieId'])[0]

In [89]:
train_data, test_data = train_test_split(movies_ratings_tags_sample, test_size=0.2, random_state=42)

# Create DataLoader
train_dataset = MovieRecommendationDataset_Rating(train_data)
test_dataset = MovieRecommendationDataset_Rating(test_data)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

num_users = max(movies_ratings_tags_sample['userId']) +1
num_movies = max(movies_ratings_tags_sample['movieId']) +1

model_scaled_ratings = CollaborativeMovieRecommendationModel(num_users, num_movies)
model_scaled_ratings = model_scaled_ratings.to(device)

In [90]:
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model_scaled_ratings.parameters(), lr=0.001)

# Define tolerance for accuracy
tolerance = 0.5  # Adjust this as needed for your use case

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model_scaled_ratings.train()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for batch_idx, (user_id, movie_id, target) in enumerate(train_loader):
        # Move data to the same device as the model
        user_id, movie_id, target = user_id.to(device), movie_id.to(device), target.to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model_scaled_ratings(user_id, movie_id)

        # Compute loss
        loss = criterion(outputs.squeeze(), target)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Update running loss
        running_loss += loss.item()

        # Calculate accuracy
        predictions = outputs.squeeze()
        correct_predictions += torch.sum(torch.abs(predictions - target) <= tolerance).item()
        total_predictions += target.size(0)

    # Calculate average loss and accuracy for this epoch
    avg_loss = running_loss / len(train_loader)
    accuracy = (correct_predictions / total_predictions) * 100

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")


Epoch 1/10, Loss: 6.3428, Accuracy: 12.31%
Epoch 2/10, Loss: 1.0468, Accuracy: 38.26%
Epoch 3/10, Loss: 0.5365, Accuracy: 55.73%
Epoch 4/10, Loss: 0.3129, Accuracy: 67.68%
Epoch 5/10, Loss: 0.1964, Accuracy: 79.26%
Epoch 6/10, Loss: 0.1303, Accuracy: 86.12%
Epoch 7/10, Loss: 0.0813, Accuracy: 92.01%
Epoch 8/10, Loss: 0.0537, Accuracy: 95.92%
Epoch 9/10, Loss: 0.0367, Accuracy: 98.02%
Epoch 10/10, Loss: 0.0283, Accuracy: 98.79%


In [91]:
# Define tolerance for accuracy
tolerance = 0.5 

# Evaluate the model
model_scaled_ratings.eval()
predictions = []
targets = []
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
    for batch_idx, (user_id, movie_id, target) in enumerate(train_loader):
        # Move data to the same device as the model
        user_id, movie_id, target = user_id.to(device), movie_id.to(device), target.to(device)

        # Forward pass
        outputs = model_scaled_ratings(user_id, movie_id)

        # Store predictions and targets
        predictions.extend(outputs.squeeze().cpu().numpy())
        targets.extend(target.cpu().numpy())

        # Calculate accuracy
        correct_predictions += torch.sum(torch.abs(outputs.squeeze() - target) <= tolerance).item()
        total_predictions += target.size(0)

# Convert predictions and targets to NumPy arrays
predictions = np.array(predictions)
targets = np.array(targets)

# Calculate RMSE
rmse = np.sqrt(np.mean((predictions - targets) ** 2))

# Calculate accuracy
accuracy = (correct_predictions / total_predictions) * 100

# Print results
print(f"RMSE on test set: {rmse:.4f}")
print(f"Accuracy on test set: {accuracy:.2f}%")

RMSE on test set: 1.1231
Accuracy on test set: 33.71%


In [92]:
predict_for_user_id(1, model_scaled_ratings, movies_ratings_tags_sample)

Top-10 Recommended movies for User 1:
1. Johnny Mnemonic (1995) (Predicted Rating: 6.35)
2. Naked (1993) (Predicted Rating: 6.32)
3. Truth About Cats & Dogs, The (1996) (Predicted Rating: 6.29)
4. Pharaoh's Army (1995) (Predicted Rating: 6.05)
5. That Old Feeling (1997) (Predicted Rating: 6.03)
6. MURDER and murder (1996) (Predicted Rating: 5.99)
7. Air Force One (1997) (Predicted Rating: 5.99)
8. Year of the Horse (1997) (Predicted Rating: 5.98)
9. No Looking Back (1998) (Predicted Rating: 5.96)
10. Halloween 4: The Return of Michael Myers (1988) (Predicted Rating: 5.91)
