In [None]:
import torch
import torch.nn as nn
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Load the raw MovieLens-style tables (paths are defined earlier in the notebook).
movies = pd.read_csv(movies_small_path)
ratings = pd.read_csv(ratings_small_path)

# Extract the first parenthesised 4-digit number in the title as the release
# year; titles without one fall back to 1990.
movies['year'] = (
    movies['title'].str.extract(r'\((\d{4})\)', expand=False).fillna(1990).astype(int)
)
# Genres arrive as a single '|'-separated string; turn them into lists.
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))

# Encode genres as multi-hot indicator columns (one 0/1 column per genre).
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(movies['genres'])
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)
movies = pd.concat([movies[['movieId', 'year']], genre_df], axis=1)

# Compute each movie's mean rating.
# NOTE(review): this uses *all* ratings, including rows that the later
# train/validation split assigns to validation, so the feature leaks target
# information into validation. Consider computing it on the training split only.
movie_avg_rating = ratings.groupby('movieId')['rating'].mean().reset_index(name='avg_rating')
movies = movies.merge(movie_avg_rating, on='movieId', how='left')
# Movies nobody rated get a neutral 3.0. Only 'avg_rating' is filled, so a
# missing value in any other column is not silently overwritten with 3.0.
movies['avg_rating'] = movies['avg_rating'].fillna(3.0)

# Normalize and finalize movie features (indexed by movieId).
genre_cols = mlb.classes_
movie_features = movies.set_index('movieId')
movie_features['year'] = (movie_features['year'] - 1900) / 150  # Normalize year
movie_features['avg_rating'] = movie_features['avg_rating'] / 5.0  # Normalize avg_rating

# Attach the movie feature columns to every rating row.
ratings = ratings.merge(movie_features, on='movieId')

# Step 1 (continued): Create user profiles.
# Each profile is the per-user mean of the genre indicator columns over the
# movies that user rated, i.e. the share of their rated movies in each genre.
# It is NOT weighted by the rating value. Like avg_rating above, it is computed
# from all ratings, before the train/validation split.
user_profiles = ratings.groupby('userId')[genre_cols].mean().reset_index()
user_profiles = user_profiles.set_index('userId')
# The user-side copies of the genre columns get the '_user' suffix.
ratings = ratings.merge(user_profiles, on='userId', suffixes=('', '_user'))

In [None]:
# ------------------------------
# Step 2: Dataset Class
# ------------------------------

class MovieDataset(Dataset):
    """Torch dataset over a merged ratings frame.

    Each item is a ``(user_features, movie_features, rating)`` triple of
    float32 NumPy values:

    * user features  -- the ``<genre>_user`` columns, in ``genre_cols`` order;
    * movie features -- the genre columns in ``genre_cols`` order, followed by
      ``'year'`` and ``'avg_rating'``;
    * rating         -- the ``'rating'`` column.

    Args:
        df: DataFrame containing all of the columns listed above.
        genre_cols: Genre column names. Any sequence works (list, tuple,
            NumPy array, pandas Index), not only a NumPy array.
    """

    def __init__(self, df, genre_cols):
        # Normalise to a plain list of str so list / ndarray / Index inputs
        # all behave the same way.
        genre_names = [str(col) for col in genre_cols]
        user_cols = [col + '_user' for col in genre_names]
        movie_cols = genre_names + ['year', 'avg_rating']

        self.user_feats = df[user_cols].values.astype(np.float32)
        self.movie_feats = df[movie_cols].values.astype(np.float32)
        self.ratings = df['rating'].values.astype(np.float32)

    def __len__(self):
        """Number of rating rows."""
        return len(self.ratings)

    def __getitem__(self, idx):
        """Return the (user_features, movie_features, rating) triple at ``idx``."""
        return self.user_feats[idx], self.movie_feats[idx], self.ratings[idx]

In [None]:
# ------------------------------
# Step 3: Train/val split and DataLoaders
# ------------------------------

# 80/20 split of the merged ratings rows; the fixed random_state keeps the
# split identical between runs.
train_df, val_df = train_test_split(ratings, test_size=0.2, random_state=42)
train_dataset = MovieDataset(train_df, genre_cols)
val_dataset = MovieDataset(val_df, genre_cols)
# Only the training loader shuffles; validation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [None]:
# ------------------------------
# Step 4: Neural Network Model
# ------------------------------

class RecommenderNet(nn.Module):
    """Two-branch rating regressor.

    User features and movie features are each passed through their own small
    MLP to produce an embedding; the two embeddings are concatenated and a
    final MLP maps them to a single predicted rating per row.

    Args:
        user_dim: Number of user input features.
        movie_dim: Number of movie input features.
        hidden_dim: Width of the hidden layer in each branch (default 32).
        embed_dim: Size of each branch's embedding (default 16). The
            prediction head takes ``2 * embed_dim`` inputs and uses
            ``embed_dim`` hidden units.

    With the default sizes the layer shapes and state-dict keys are the same
    as the original fixed 32/16 architecture.
    """

    def __init__(self, user_dim, movie_dim, hidden_dim=32, embed_dim=16):
        super().__init__()

        # User Feature Input -> Dense Layer -> User Embedding
        self.user_branch = nn.Sequential(
            nn.Linear(user_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embed_dim),  # User embedding
        )

        # Movie Feature Input -> Dense Layer -> Movie Embedding
        self.movie_branch = nn.Sequential(
            nn.Linear(movie_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embed_dim),  # Movie embedding
        )

        # Combine User & Movie Embeddings -> Predict Rating
        self.predict_layer = nn.Sequential(
            nn.Linear(2 * embed_dim, embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, 1),
        )

    def forward(self, user_input, movie_input):
        """Predict one rating per row.

        Args:
            user_input: Tensor of shape (batch, user_dim).
            movie_input: Tensor of shape (batch, movie_dim).

        Returns:
            Tensor of shape (batch,) with the predicted ratings.
        """
        user_embedding = self.user_branch(user_input)
        movie_embedding = self.movie_branch(movie_input)
        combined = torch.cat((user_embedding, movie_embedding), dim=1)
        # Drop the trailing size-1 dimension so the output lines up with a
        # 1-D tensor of target ratings.
        rating_pred = self.predict_layer(combined).squeeze(-1)
        return rating_pred

# ------------------------------
# Step 5: Training Setup
# ------------------------------

# Seed torch's global RNG before the model is built so that weight
# initialisation (and the shuffling order of DataLoaders that draw from the
# global RNG) is reproducible under Restart & Run All. 42 matches the
# random_state already used for the train/validation split.
torch.manual_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# User input: one value per genre. Movie input: genres + 'year' + 'avg_rating'.
model = RecommenderNet(len(genre_cols), len(genre_cols) + 2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Halve the learning rate after 2 epochs without improvement in the monitored value.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
loss_fn = nn.MSELoss()

# Training and validation loops
def train_epoch(loader):
    """Run one optimisation pass over ``loader`` with the module-level model.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_fn`` and
    ``device``. Each batch's loss is weighted by its batch size so the result
    is the average loss per sample across the loader's dataset.

    Returns:
        Sum of (batch loss * batch size) divided by ``len(loader.dataset)``.
    """
    model.train()
    weighted_total = 0
    for batch in loader:
        user_x, movie_x, target = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        batch_loss = loss_fn(model(user_x, movie_x), target)
        batch_loss.backward()
        optimizer.step()

        weighted_total += batch_loss.item() * len(target)
    return weighted_total / len(loader.dataset)

def validate(loader):
    """Evaluate the module-level model on ``loader`` without gradient tracking.

    Uses the notebook-level ``model``, ``loss_fn`` and ``device``. Each batch's
    loss is weighted by its batch size before averaging.

    Returns:
        Sum of (batch loss * batch size) divided by ``len(loader.dataset)``.
    """
    model.eval()
    weighted_total = 0
    with torch.no_grad():
        for batch in loader:
            user_x, movie_x, target = (tensor.to(device) for tensor in batch)
            batch_loss = loss_fn(model(user_x, movie_x), target)
            weighted_total += batch_loss.item() * len(target)
    return weighted_total / len(loader.dataset)

# ------------------------------
# Step 6: Train Model with Early Stopping
# ------------------------------

# Early-stopping state: best validation loss seen so far, and how many
# consecutive epochs have failed to beat it.
best_val_loss = float('inf')
patience = 3
patience_counter = 0

for epoch in range(20):
    train_loss = train_epoch(train_loader)
    val_loss = validate(val_loader)
    # The plateau scheduler is driven by the validation loss.
    scheduler.step(val_loss)

    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    if val_loss < best_val_loss:
        # New best: checkpoint the weights and reset the patience counter.
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pt")
        patience_counter = 0
        continue

    # No improvement this epoch.
    patience_counter += 1
    if patience_counter >= patience:
        print("Early stopping triggered.")
        break

# ------------------------------
# Step 7: Cosine Similarity Baseline (TF-IDF style)
# ------------------------------

def tfidf_baseline(user_profiles, movie_features, ratings, genre_cols):
    """MSE of a genre-cosine-similarity rating baseline.

    For every rating row, the predicted rating is
    ``5.0 * cosine(user genre profile, movie genre vector)``; the function
    returns the mean squared error of those predictions against the actual
    ratings.

    Rows whose ``userId`` is missing from ``user_profiles.index`` or whose
    ``movieId`` is missing from ``movie_features.index`` are skipped. If no
    row can be scored, ``float('inf')`` is returned. A pair where either
    vector is all zeros gets similarity 0.

    The whole computation is vectorised instead of looping over rows.

    Args:
        user_profiles: DataFrame indexed by user id, containing ``genre_cols``.
            Assumed to have a unique index.
        movie_features: DataFrame indexed by movie id, containing
            ``genre_cols``. Assumed to have a unique index.
        ratings: DataFrame with ``'userId'``, ``'movieId'`` and ``'rating'``.
        genre_cols: Sequence of genre column names shared by both frames.

    Returns:
        The mean squared error as a float, or ``inf`` when nothing was scored.
    """
    genre_names = list(genre_cols)

    # Keep only rows for which both lookups will succeed.
    has_user = ratings['userId'].isin(user_profiles.index)
    has_movie = ratings['movieId'].isin(movie_features.index)
    scored = ratings.loc[has_user & has_movie]
    if scored.empty:
        return float('inf')

    # One row per rating, aligned by position.
    user_mat = user_profiles.loc[scored['userId'].to_numpy(), genre_names].to_numpy(dtype=np.float64)
    movie_mat = movie_features.loc[scored['movieId'].to_numpy(), genre_names].to_numpy(dtype=np.float64)

    # Row-wise cosine similarity; zero-norm rows yield 0 rather than NaN.
    dots = (user_mat * movie_mat).sum(axis=1)
    norms = np.linalg.norm(user_mat, axis=1) * np.linalg.norm(movie_mat, axis=1)
    sims = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    preds = sims * 5.0
    actuals = scored['rating'].to_numpy(dtype=np.float64)
    return float(np.mean((actuals - preds) ** 2))

# Evaluate baseline
# Evaluate the cosine-similarity baseline on the validation rows.
baseline_mse = tfidf_baseline(user_profiles, movie_features, val_df, genre_cols)
print(f"TF-IDF Cosine Baseline MSE: {baseline_mse:.4f}")

# Evaluate the trained model using the best checkpoint written during training.
# map_location=device lets a checkpoint saved on GPU load on a CPU-only kernel
# (and vice versa).
model.load_state_dict(torch.load("best_model.pt", map_location=device))
final_val_loss = validate(val_loader)
print(f"Neural Network Validation MSE: {final_val_loss:.4f}")

# ------------------------------
# Step 8: Final Evaluation / Comparison
# ------------------------------

# Both numbers are MSEs over the same validation rows, so they compare directly.
print("\n--- Final Analysis ---")
if final_val_loss < baseline_mse:
    print("Neural Network outperformed TF-IDF baseline. It captured more complex preferences.")
else:
    print("TF-IDF baseline performed better. Neural model might be underfitting or needs tuning.")

In [None]:
# ------------------------------
# Step 9: Making Predictions for User 1
# ------------------------------

def recommend_for_user(user_id, top_k=5):
    """Score every movie for one user and return the highest-scoring ones.

    Uses the notebook-level ``model``, ``device``, ``user_profiles``,
    ``movie_features`` and ``genre_cols``. The best checkpoint
    (``best_model.pt``) is loaded on every call.

    Args:
        user_id: Label to look up in ``user_profiles.index``.
        top_k: Maximum number of recommendations to return. Values <= 0 give
            an empty list.

    Returns:
        A list of ``(movie_id, predicted_rating)`` tuples sorted by predicted
        rating, highest first, or ``None`` (after printing a message) when
        ``user_id`` is not in ``user_profiles``.
    """
    # Load the best model; map_location keeps this working when the checkpoint
    # was written on a different device type than the current one.
    model.load_state_dict(torch.load("best_model.pt", map_location=device))
    model.eval()

    # Get user features
    try:
        user_features = user_profiles.loc[user_id].values
    except KeyError:
        print(f"User {user_id} not found in database")
        return None

    # Prepare all movie features in the SAME column order the model was
    # trained on (genres, then 'year', then 'avg_rating' -- see MovieDataset).
    # movie_features itself stores 'year' first, so taking raw row values
    # would feed the year into the first genre slot and shift every genre.
    feature_order = list(genre_cols) + ['year', 'avg_rating']
    movie_ids = movie_features.index.values
    movie_matrix = movie_features[feature_order].values.astype(np.float32)

    # Convert to tensors: the single user vector is repeated once per movie.
    user_tensor = (
        torch.tensor(user_features, dtype=torch.float32)
        .unsqueeze(0)
        .repeat(len(movie_ids), 1)
        .to(device)
    )
    movie_tensor = torch.from_numpy(movie_matrix).to(device)

    # Make predictions
    with torch.no_grad():
        predictions = model(user_tensor, movie_tensor).cpu().numpy()

    # Highest predictions first. Slicing after the reversal (rather than
    # [-top_k:]) makes top_k == 0 return nothing instead of every movie.
    top_indices = np.argsort(predictions)[::-1][:max(top_k, 0)]
    return [(movie_ids[idx], predictions[idx]) for idx in top_indices]

# Get recommendations for user 1
# Get recommendations for user 1
user1_recommendations = recommend_for_user(1)
# recommend_for_user returns None (and prints its own message) when the user
# is unknown; guard so this cell does not fail iterating over None.
if user1_recommendations is not None:
    print("\nTop Recommendations for User 1:")
    for movie_id, rating in user1_recommendations:
        print(f"{movie_id} (predicted rating: {rating:.2f})")