# Movie Model Training Script

The following script demonstrates how we preprocessed our data and then trained our machine learning model. 
Note: in the script below we only used a small part of our dataset for the training, since it would have taken too much time and storage to train and save it to our local machines. Therefore, the following script just demonstrates a working example of how it works for a small dataset.
We trained the model for the whole dataset on a different machine with small adjustments in our code. We uploaded this model to Hugging Face and fetched it for our prediction in our project. 

### Data loading and Preprocessing

In [None]:
from torch.utils.data import Dataset, DataLoader
import os
import pandas as pd
import numpy as np

class MoviePairDataset(Dataset):
    def __init__(self, preferred_movie_ids, similar_movie_ids):
        """
        Args:
            preferred_movie_ids (list of int): The IDs of the preferred movies.
            similar_movie_ids (list of int): The IDs of movies similar to the preferred ones.
        """
        assert len(preferred_movie_ids) == len(similar_movie_ids), "The lists must have the same length."
        self.preferred_movie_ids = preferred_movie_ids
        self.similar_movie_ids = similar_movie_ids
        
    def __len__(self):
        return len(self.preferred_movie_ids)
        
    def __getitem__(self, idx):
        preferred_id = torch.tensor(self.preferred_movie_ids[idx], dtype=torch.long)
        similar_id = torch.tensor(self.similar_movie_ids[idx], dtype=torch.long)
        return preferred_id, similar_id

In [None]:
# path to the dataset folder
data_folder = "data/"

# list all CSV files in the data directory
csv_files = [f for f in os.listdir(data_folder) if f.endswith('.csv')]

# load all CSV files into a single DataFrame
all_movies_df = pd.concat(
    (pd.read_csv(f"{data_folder}{file}") for file in csv_files),
    ignore_index=True
) #[:10000]

all_movies_df['year'] = pd.to_numeric(all_movies_df['year'], errors='coerce')

all_movies_df.fillna(0, inplace=True)

all_movies_df = all_movies_df.dropna()

print(all_movies_df)

# Adjust these thresholds based on your dataset and preferences
RATING_DIFF_THRESHOLD = 1.0  # Movies within this rating difference are considered similar
YEAR_DIFF_THRESHOLD = 5  # Movies within this range of years are considered similar

preferred_movie_ids = []
similar_movie_ids = []

ignored = 0
count = 0

for row_idx, row in all_movies_df.iterrows():
    # Find movies in the same genre
    same_genre_movies = all_movies_df[all_movies_df['genre'] == row['genre']]

    # Further filter movies by rating and year proximity
    similar_movies = same_genre_movies[
        (np.abs(same_genre_movies['rating'] - row['rating']) <= RATING_DIFF_THRESHOLD) &
        (np.abs(same_genre_movies['year'] - row['year']) <= YEAR_DIFF_THRESHOLD)
    ]
    
    # Exclude the current movie from its similar list
    similar_movies = similar_movies[similar_movies['movie_id'] != row['movie_id']]
    if len(similar_movies) > 0:
        similar_movies = similar_movies.sample(n=1)
    else:
        ignored+=1
        continue
    
    # Append pairs to the lists
    for similar_row_idx, similar_row in similar_movies.iterrows():
        preferred_movie_ids.append(row_idx)
        similar_movie_ids.append(similar_row_idx)

    count += 1
    if count % 1000 == 0:
        print(count)

# Deduplicate pairs while maintaining order
pairs = list(dict.fromkeys(zip(preferred_movie_ids, similar_movie_ids)))
preferred_movie_ids, similar_movie_ids = zip(*pairs)

# Convert back to lists if necessary
preferred_movie_ids = list(preferred_movie_ids)
similar_movie_ids = list(similar_movie_ids)

# Now you can create the dataset
#dataset = MoviePairDataset(preferred_movie_ids=preferred_movie_ids, similar_movie_ids=similar_movie_ids)
#dataloader = DataLoader(dataset, batch_size=512, shuffle=True)

SPLIT_POS = 7
BATCH_SIZE = 256

dataset = MoviePairDataset(preferred_movie_ids=preferred_movie_ids[:SPLIT_POS], similar_movie_ids=similar_movie_ids[:SPLIT_POS])
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

eval_dataset = MoviePairDataset(preferred_movie_ids=preferred_movie_ids[SPLIT_POS:], similar_movie_ids=similar_movie_ids[SPLIT_POS:])
eval_dataloader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=True)

### Neural Network Model Definition

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim

class MovieIDPredictor(nn.Module):
    def __init__(self, num_movie_ids, movie_id_embedding_dim=64, transformer_heads=8, transformer_layers=1, transformer_dim=64):
        super(MovieIDPredictor, self).__init__()
        # New movie_id embedding dimension
        self.movie_id_embedding_dim = movie_id_embedding_dim
        self.transformer_dim = transformer_dim

        # Ensure the embedding dimension for movie_id matches the transformer dimension
        self.movie_id_embedding = nn.Embedding(num_movie_ids, self.movie_id_embedding_dim)

        # TransformerEncoder expects the input dimension (d_model) to match the transformer_dim
        # Assuming transformer_dim is set to match movie_id_embedding_dim or vice versa
        self.transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=self.transformer_dim, nhead=transformer_heads), num_layers=transformer_layers)
        
        # Output layer to classify movie IDs (stays unchanged)
        self.fc_out = nn.Linear(self.transformer_dim, num_movie_ids)
        
    def forward(self, movie_id):
        movie_id_emb = self.movie_id_embedding(movie_id).view(-1, 1, self.movie_id_embedding_dim)
        
        x = self.transformer(movie_id_emb)
        x = x.view(-1, self.transformer_dim)  # Reshape for the output layer
        output = self.fc_out(x)
        return output


### Training Loop

In [88]:
# set the device to be used for training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model initialization
num_movie_ids = 368300 
model = MovieIDPredictor(num_movie_ids=num_movie_ids, movie_id_embedding_dim=64, transformer_heads=8, transformer_layers=1, transformer_dim=64)

model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# Training loop setup
num_epochs = 200
lowest_val_loss = float('inf')
best_model_state = None  # To save the best model state

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    train_loss = 0.0
    for preferred_id, similar_id in dataloader:
        preferred_id, similar_id = preferred_id.to(device), similar_id.to(device)
        optimizer.zero_grad()
        outputs = model(preferred_id)
        loss = criterion(outputs, similar_id)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    avg_train_loss = train_loss / len(dataloader)

    # Evaluation step
    model.eval()  # Set the model to evaluation mode
    val_loss = 0.0
    with torch.no_grad():
        for preferred_id, similar_id in dataloader:
            preferred_id, similar_id = preferred_id.to(device), similar_id.to(device)
            outputs = model(preferred_id)
            loss = criterion(outputs, similar_id)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(eval_dataloader)

    print(f'Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')

    # Check if the current validation loss is the lowest
    if avg_val_loss < lowest_val_loss:
        print(f'Validation loss decreased ({lowest_val_loss:.4f} --> {avg_val_loss:.4f}). Saving model ...')
        lowest_val_loss = avg_val_loss
        best_model_state = model.state_dict()

# After training completes, you can save the best model state
#torch.save(best_model_state, 'best_model_state.pth')

best_model_state_cpu = {k: v.cpu() for k, v in best_model_state.items()}
torch.save(best_model_state_cpu, 'movie_predictor_model.pth') # Save the best model state to a file



Epoch 1, Training Loss: 9.3937, Validation Loss: 0.2317
Validation loss decreased (inf --> 0.2317). Saving model ...
Epoch 2, Training Loss: 8.8171, Validation Loss: 0.2164
Validation loss decreased (0.2317 --> 0.2164). Saving model ...
Epoch 3, Training Loss: 8.2710, Validation Loss: 0.2028
Validation loss decreased (0.2164 --> 0.2028). Saving model ...
Epoch 4, Training Loss: 7.7054, Validation Loss: 0.1910
Validation loss decreased (0.2028 --> 0.1910). Saving model ...
Epoch 5, Training Loss: 7.3196, Validation Loss: 0.1809
Validation loss decreased (0.1910 --> 0.1809). Saving model ...
Epoch 6, Training Loss: 7.0070, Validation Loss: 0.1724
Validation loss decreased (0.1809 --> 0.1724). Saving model ...
Epoch 7, Training Loss: 6.5765, Validation Loss: 0.1650
Validation loss decreased (0.1724 --> 0.1650). Saving model ...
Epoch 8, Training Loss: 6.3207, Validation Loss: 0.1586
Validation loss decreased (0.1650 --> 0.1586). Saving model ...
Epoch 9, Training Loss: 6.1697, Validation 

### Validation

In [101]:
correct = 0
total = 0
with torch.no_grad():  # No need to track gradients during evaluation
    for preferred_id, similar_id in dataloader:  # Assuming you have a dataloader for your evaluation dataset
        outputs = loaded_model(preferred_id)
        _, predicted = torch.max(outputs.data, 1)  # Get the index of the max log-probability
        total += similar_id.size(0)
        correct += (predicted == similar_id).sum().item()

accuracy = 100 * correct / total
print(f'Accuracy of the model on the evaluation dataset: {accuracy:.2f}%')


Accuracy of the model on the evaluation dataset: 100.00%


### Load model weights from local filesystem

In [None]:
loaded_model = MovieIDPredictor(num_movie_ids=num_movie_ids)

# Load the state dictionary
loaded_model.load_state_dict(torch.load('movie_predictor_model.pth'))

# Set the model to evaluation mode
loaded_model.eval()

### Example of a Movie Recommendation

In [65]:
import torch

def predict(movie_id, n):
    """
    Predicts the top n recommended movie_ids for a given movie_id.

    Args:
    movie_id (int): The movie ID for which recommendations are to be made.
    n (int): The number of recommendations to return.

    Returns:
    list: A list of the top n recommended movie IDs.
    """
    # Convert movie_id to a tensor and add a batch dimension (batch size = 1)
    movie_id_tensor = torch.tensor([movie_id], dtype=torch.long)
    
    # Ensure the model is in evaluation mode
    loaded_model.eval()
    
    with torch.no_grad():  # Inference doesn't require gradient calculation
        # Get model output for the given movie_id
        outputs = loaded_model(movie_id_tensor)
        
        # Get the scores, ignore the first recommendation as it's the movie itself
        _, recommended_ids = torch.topk(outputs, n + 1, dim=1)
        
        # Convert to a list and remove the input movie_id from the recommendations
        recommended_ids = recommended_ids[0].tolist()
        if movie_id in recommended_ids:
            recommended_ids.remove(movie_id)
        else:  # If the movie_id is not in the top n+1, remove the last to keep n recommendations
            recommended_ids.pop()

    return recommended_ids[:n]

# Example usage
# Ensure your model and environment are properly set up before calling this function.
movie_id = 20  # Example movie ID
n = 5  # Number of recommendations
recommended_movie_ids = predict(movie_id, n)
print(f"Recommended Movie IDs for Movie ID {movie_id}: {recommended_movie_ids}")


Recommended Movie IDs for Movie ID 20: [338502, 319299, 2971, 69313, 65428]


### Uploading the Model to Hugging Face 

Note: We trained the model for the whole dataset on a different machine with small adjustments in the code, since it would have taken a lot of time on our local machines. Therefore, we uploaded the whole model to Hugging Face and fetched it later on for our prediction.

### Clone Repository from Hugging Face

In [None]:
from huggingface_hub import Repository

model_path = "movie_predictor_model_368K.pth"

# clone the repository with the model
local_repo_path = os.path.join(os.getcwd(), repo_name)
repo = Repository(local_repo_path, clone_from=repo_url)

### Load model weights from Huggingface

In [None]:
from huggingface_hub import hf_hub_download

model_name = "Emm180/movie_match_model"
filename = "movie_predictor_model_368K.pth"  # This should match the filename you used to upload

# Download the model file
model_path = hf_hub_download(repo_id=model_name, filename=filename)

# Assuming `YourModel` is the class of your model
num_movie_ids = 368300
model = MovieIDPredictor(num_movie_ids=num_movie_ids)

# Load the weights into your model
model.load_state_dict(torch.load(model_path))

# If your model requires it, switch to evaluation mode
model.eval()