In [1]:
#  Install & Import Libraries

# Install required libraries
!pip install -q transformers scikit-learn joblib

import os
import re
import ast
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import pipeline
import joblib


In [2]:
#  Download MovieLens & Clean Movie Data

# 1. Download & unzip MovieLens "latest small" dataset
!wget -q http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -O ml-latest-small.zip
!unzip -o -q ml-latest-small.zip

# 2. Load movies & ratings
raw_movies_path = "ml-latest-small/movies.csv"
raw_ratings_path = "ml-latest-small/ratings.csv"

movies_df = pd.read_csv(raw_movies_path)
ratings_df = pd.read_csv(raw_ratings_path)

print("Original movies shape:", movies_df.shape)
print("Original ratings shape:", ratings_df.shape)

# 3. Helper to clean title & extract year
def clean_title(title: str):
    """
    Extract the release year in brackets, e.g. 'Toy Story (1995)',
    and remove the ' (1995)' part from the title.
    Returns: (clean_title, year:int or None)
    """
    if not isinstance(title, str):
        return "", None

    # Find "(1995)" pattern
    year_match = re.search(r"\((\d{4})\)", title)
    if year_match:
        year = int(year_match.group(1))
        # correct regex - remove ' (1995)' safely
        title_cleaned = re.sub(r"\s*\(\d{4}\)", "", title).strip()
    else:
        year = None
        title_cleaned = title.strip()
    return title_cleaned, year

# Apply cleaning to all titles
title_year_df = movies_df["title"].apply(
    lambda x: pd.Series(clean_title(x), index=["title_cleaned", "release_year"])
)
movies_df = pd.concat([movies_df, title_year_df], axis=1)

# 4. Clean genres
# Replace "(no genres listed)" with NaN, then split on '|'
movies_df["genres"] = movies_df["genres"].replace("(no genres listed)", np.nan)
movies_df["genres"] = movies_df["genres"].apply(
    lambda x: x.split("|") if pd.notna(x) else []
)

# 5. Handle missing values & normalize types
movies_df["release_year"] = movies_df["release_year"].fillna(0).astype(int)

# 6. Save cleaned versions to a /data folder
os.makedirs("data", exist_ok=True)

movies_clean_path = "data/movies_cleaned.csv"
ratings_clean_path = "data/ratings_cleaned.csv"

movies_df.to_csv(movies_clean_path, index=False)
ratings_df.to_csv(ratings_clean_path, index=False)

print("Saved cleaned movies to:", movies_clean_path)
print("Saved cleaned ratings to:", ratings_clean_path)


Original movies shape: (9742, 3)
Original ratings shape: (100836, 4)
Saved cleaned movies to: data/movies_cleaned.csv
Saved cleaned ratings to: data/ratings_cleaned.csv


In [3]:
# Build TF-IDF Features, Similarity Matrix & Save Artifacts

# Reload cleaned movies to be safe
movies_df = pd.read_csv("data/movies_cleaned.csv")
ratings_df = pd.read_csv("data/ratings_cleaned.csv")

# If genres were saved as list-like strings, convert safely
def safe_parse_genres(val):
    """
    Makes sure genres are a Python list.
    Uses ast.literal_eval instead of eval for safety.
    """
    if isinstance(val, list):
        return val
    if isinstance(val, str) and val.startswith("[") and "]" in val:
        try:
            parsed = ast.literal_eval(val)  # SAFE (no eval)
            if isinstance(parsed, list):
                return parsed
        except Exception:
            pass
    # Fallback: if it's still a plain string like "Action|Adventure"
    if isinstance(val, str):
        return val.split("|")
    return []

movies_df["genres"] = movies_df["genres"].apply(safe_parse_genres)

# Build a text version of genres for TF-IDF
movies_df["genres_str"] = movies_df["genres"].apply(
    lambda lst: " ".join(g.lower().replace(" ", "") for g in lst)
)

# Combined text = cleaned title + genres_str
movies_df["title_cleaned"] = movies_df["title_cleaned"].fillna("")
movies_df["combined_features"] = (
    movies_df["title_cleaned"].astype(str) + " " + movies_df["genres_str"]
)

# 1. TF-IDF vectorizer on combined text
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_df["combined_features"])

# 2. Cosine similarity matrix
cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

print("TF-IDF matrix shape:", tfidf_matrix.shape)
print("Cosine similarity matrix shape:", cosine_sim_matrix.shape)

# 3. Compute average rating per movie
average_ratings = ratings_df.groupby("movieId")["rating"].mean().rename("avg_rating")

# Align ratings with movies_df index by movieId
movies_with_ratings = movies_df.set_index("movieId").join(average_ratings)
movies_with_ratings["avg_rating"] = movies_with_ratings["avg_rating"].fillna(
    movies_with_ratings["avg_rating"].mean()
)

# 4. Save all artifacts into /models
os.makedirs("models", exist_ok=True)

joblib.dump(tfidf_vectorizer, "models/tfidf_vectorizer.pkl")
np.save("models/tfidf_matrix.npy", tfidf_matrix.toarray())
np.save("models/cosine_sim_matrix.npy", cosine_sim_matrix)

movies_with_ratings.reset_index().to_csv("models/loaded_movies_df.csv", index=False)
# If you still want a separate rating file, you can, but we don't strictly need it:
movies_with_ratings["avg_rating"].to_csv("models/average_movie_ratings.csv")

print("Saved vectorizer, matrices, and movie data into /models")


TF-IDF matrix shape: (9742, 8969)
Cosine similarity matrix shape: (9742, 9742)
Saved vectorizer, matrices, and movie data into /models


In [15]:
# ðŸ¤– Cell 4 â€“ Create backend/ai/emotion_detection.py (final version with explanations & optimized mapping)

import os

os.makedirs("backend/ai", exist_ok=True)

script_path = os.path.join("backend", "ai", "emotion_detection.py")

script_code = '''"""Emotion detection and movie recommendation utilities for Vyber.

This module exposes three main helpers for the FastAPI backend:

- load_movies()        â†’ returns the movies DataFrame with ratings and genres
- detect_mood(text)    â†’ maps free-text input to one of our moods
- recommend(mood, ...) â†’ returns a list of recommended movie titles with explanations
"""

import os
import ast
import numpy as np
import pandas as pd
from transformers import pipeline
import joblib

# --- Load precomputed artifacts (vectorizer, similarity matrix, movies) ---

MODELS_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "models")
MODELS_DIR = os.path.abspath(MODELS_DIR)

TFIDF_VECTORIZER_PATH = os.path.join(MODELS_DIR, "tfidf_vectorizer.pkl")
COSINE_SIM_MATRIX_PATH = os.path.join(MODELS_DIR, "cosine_sim_matrix.npy")
MOVIES_DF_PATH = os.path.join(MODELS_DIR, "loaded_movies_df.csv")

# Load TF-IDF vectorizer (kept for future use)
tfidf_vectorizer = joblib.load(TFIDF_VECTORIZER_PATH)

# Load cosine similarity matrix
cosine_sim_matrix = np.load(COSINE_SIM_MATRIX_PATH)

# Load movies with ratings
movies_df = pd.read_csv(MOVIES_DF_PATH)

# Ensure genres are a proper Python list
def _ensure_genres_list(val):
    if isinstance(val, list):
        return val
    if isinstance(val, str) and val.startswith("[") and "]" in val:
        try:
            parsed = ast.literal_eval(val)
            if isinstance(parsed, list):
                return parsed
        except Exception:
            pass
    if isinstance(val, str):
        return val.split("|")
    return []

if "genres" in movies_df.columns:
    movies_df["genres"] = movies_df["genres"].apply(_ensure_genres_list)

# --- Emotion model and mapping ---

# Pretrained HuggingFace model for emotion classification
emotion_pipeline = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base"
)

# Map fine-grained emotions â†’ 6 Vyber moods
emotion_to_mood_map = {
    "joy": "happy",
    "optimism": "happy",
    "admiration": "happy",
    "amusement": "happy",
    "surprise": "happy",
    "trust": "happy",
    "contentment": "happy",
    "love": "romantic",
    "caring": "romantic",
    "sadness": "sad",
    "grief": "sad",
    "disappointment": "sad",
    "anger": "action",
    "annoyance": "action",
    "disgust": "scary",
    "fear": "scary",
    "nervousness": "scary",
    "anticipation": "fantasy",
    "curiosity": "fantasy",
    "excitement": "fantasy",
}

# Fallback if nothing matches
DEFAULT_MOOD = "happy"

# Mood â†’ genres mapping (optimized)
mood_to_genres_map = {
    "happy": [
        "Comedy",
        "Family",
        "Animation",
        "Romance"
    ],
    "sad": [
        "Drama",
        "Romance"
    ],
    "romantic": [
        "Romance",
        "Drama"
    ],
    "action": [
        "Action",
        "Adventure",
        "Crime",
        "Sci-Fi"
    ],
    "scary": [
        "Horror",
        "Thriller"
    ],
    "fantasy": [
        "Fantasy",
        "Sci-Fi",
        "Animation",
        "Adventure"
    ]
}

def load_movies():
    """Return the full movies DataFrame used by the recommender."""
    return movies_df.copy()

def _extract_top_dict(results):
    """Safely pull out the top {label, score} dict from any nested list structure."""
    obj = results
    # Sometimes it's already a dict
    if isinstance(obj, dict):
        return obj
    # If it's a list, keep going into the first element until we hit a dict or fail
    while isinstance(obj, list) and len(obj) > 0:
        obj = obj[0]
        if isinstance(obj, dict):
            return obj
    return None

def detect_mood(text: str) -> str:
    """Detect a coarse mood (one of 6) from free-text input.

    Handles different output shapes from the HuggingFace pipeline.
    If anything goes wrong, returns DEFAULT_MOOD.
    """
    if not isinstance(text, str) or not text.strip():
        return DEFAULT_MOOD

    try:
        results = emotion_pipeline(text)
    except Exception:
        return DEFAULT_MOOD

    top = _extract_top_dict(results)
    if top is None:
        return DEFAULT_MOOD

    label = str(top.get("label", "")).lower()
    mood = emotion_to_mood_map.get(label, DEFAULT_MOOD)
    return mood

def recommend(mood: str, top_n: int = 5, weight_sim: float = 0.7, weight_rating: float = 0.3):
    """Recommend movies for a given mood.

    Combines cosine similarity (based on title + genres)
    and average rating to score movies, then returns a list
    of dicts with title, genres, mood, rating, and explanation.
    """
    mood = (mood or "").lower()
    if mood not in mood_to_genres_map:
        mood = DEFAULT_MOOD

    target_genres = mood_to_genres_map[mood]

    # Simple genre filter: keep movies that contain at least one target genre
    def has_genre(genres):
        if not isinstance(genres, (list, tuple, set)):
            return False
        genres_lower = [str(g).lower() for g in genres]
        return any(tg.lower() in genres_lower for tg in target_genres)

    mask = movies_df["genres"].apply(has_genre)
    candidate_indices = movies_df.index[mask].tolist()

    if not candidate_indices:
        # Fallback: if no movie matches, just take all movies
        candidate_indices = list(movies_df.index)

    # Slice similarity matrix & ratings for the candidates
    sim_submatrix = cosine_sim_matrix[np.ix_(candidate_indices, candidate_indices)]

    # For simplicity, use the average similarity of each candidate to all others
    sim_scores = sim_submatrix.mean(axis=1)

    # Use avg_rating column if present, else fallback to ones
    if "avg_rating" in movies_df.columns:
        ratings = movies_df.loc[candidate_indices, "avg_rating"].values
    else:
        ratings = np.ones(len(candidate_indices))

    # Normalize scores to [0,1] to combine them
    def _normalize(x):
        x = np.asarray(x, dtype=float)
        if x.max() == x.min():
            return np.ones_like(x)
        return (x - x.min()) / (x.max() - x.min())

    sim_norm = _normalize(sim_scores)
    rating_norm = _normalize(ratings)

    final_scores = weight_sim * sim_norm + weight_rating * rating_norm

    # Sort candidates by score
    sorted_idx = np.argsort(final_scores)[::-1]  # descending
    top_idx = sorted_idx[:top_n]

    top_movie_indices = [candidate_indices[i] for i in top_idx]
    top_movies = movies_df.loc[top_movie_indices]

    # Build rich output: title, genres, mood, rating, explanation
    results = []
    for _, row in top_movies.iterrows():
        genres_val = row.get("genres", [])
        # Make sure genres is a list for JSON
        if isinstance(genres_val, str):
            try:
                parsed = ast.literal_eval(genres_val)
                if isinstance(parsed, list):
                    genres_val = parsed
                else:
                    genres_val = [genres_val]
            except Exception:
                genres_val = [genres_val]
        elif not isinstance(genres_val, (list, tuple, set)):
            genres_val = [str(genres_val)]

        genres_list = [str(g) for g in genres_val if g is not None]
        main_genre = genres_list[0] if genres_list else None

        if main_genre:
            explanation = (
                f"Because you're feeling {mood}, we picked this {main_genre} movie "
                f"that matches your current vibe."
            )
        else:
            explanation = (
                f"Because you're feeling {mood}, we picked this highly-rated movie "
                f"that many people enjoy in this mood."
            )

        avg_rating = None
        if "avg_rating" in row and not pd.isna(row["avg_rating"]):
            avg_rating = float(row["avg_rating"])

        results.append({
            "title": row["title"],
            "genres": genres_list,
            "mood": mood,
            "avg_rating": avg_rating,
            "explanation": explanation
        })

    return results
'''

with open(script_path, "w", encoding="utf-8") as f:
    f.write(script_code)

print(f"Written backend AI script to: {script_path}")


Written backend AI script to: backend/ai/emotion_detection.py


In [16]:
# ðŸ§ª Test â€“ detect_mood(), recommend() with explanations

import os, sys, importlib

# 1) Make sure we can import from backend/
sys.path.append(os.path.abspath("."))

# 2) Import and reload the module so we get the latest changes
import backend.ai.emotion_detection as emo
importlib.reload(emo)

# 3) Test 1: Detect mood from a sample text
sample_text = "I feel very low and lonely today. Nothing seems exciting."
detected = emo.detect_mood(sample_text)
print("Input text:", sample_text)
print("Detected mood:", detected)

# 4) Test 2: Get recommendations for that detected mood
recs_for_detected = emo.recommend(detected, top_n=5)
print("\nMovie recommendations for detected mood (with explanations):")
for i, rec in enumerate(recs_for_detected, start=1):
    print(f"{i}. {rec['title']}")
    print(f"   Genres: {rec['genres']}")
    print(f"   Rating: {rec['avg_rating']}")
    print(f"   Explanation: {rec['explanation']}\n")

# 5) Test 3: Try each mood manually
test_moods = ["happy", "sad", "romantic", "action", "scary", "fantasy"]

print("\nDirect mood â†’ movie test (with explanations):")
for m in test_moods:
    movies = emo.recommend(m, top_n=3)
    print(f"\nMood: {m}")
    for i, rec in enumerate(movies, start=1):
        print(f"  {i}. {rec['title']}")
        print(f"     Genres: {rec['genres']}")
        print(f"     Explanation: {rec['explanation']}")


Device set to use cpu


Input text: I feel very low and lonely today. Nothing seems exciting.
Detected mood: sad

Movie recommendations for detected mood (with explanations):
1. All Yours (2016)
   Genres: ['Comedy', 'Drama', 'Romance']
   Rating: 5.0
   Explanation: Because you're feeling sad, we picked this Comedy movie that matches your current vibe.

2. L.I.E. (2001)
   Genres: ['Drama']
   Rating: 4.333333333333333
   Explanation: Because you're feeling sad, we picked this Drama movie that matches your current vibe.

3. Call Me by Your Name (2017)
   Genres: ['Drama', 'Romance']
   Rating: 3.75
   Explanation: Because you're feeling sad, we picked this Drama movie that matches your current vibe.

4. Last Detail, The (1973)
   Genres: ['Comedy', 'Drama']
   Rating: 4.25
   Explanation: Because you're feeling sad, we picked this Comedy movie that matches your current vibe.

5. All or Nothing (2002)
   Genres: ['Drama']
   Rating: 3.6666666666666665
   Explanation: Because you're feeling sad, we picked this

In [17]:
#  trying different similarity vs rating weights

import os, sys, importlib

# make sure we have the latest version
sys.path.append(os.path.abspath("."))
import backend.ai.emotion_detection as emo
importlib.reload(emo)

test_mood = "happy"   # you can change to "sad", "romantic", etc.
weight_pairs = [
    (0.5, 0.5),
    (0.7, 0.3),
    (0.8, 0.2)
]

for w_sim, w_rat in weight_pairs:
    print(f"\n=== Testing weights: similarity={w_sim}, rating={w_rat} ===")
    recs = emo.recommend(test_mood, top_n=5, weight_sim=w_sim, weight_rating=w_rat)
    for i, rec in enumerate(recs, start=1):
        print(f"{i}. {rec['title']} | Genres: {rec['genres']} | Rating: {rec['avg_rating']}")
    print("-" * 60)


Device set to use cpu



=== Testing weights: similarity=0.5, rating=0.5 ===
1. All Yours (2016) | Genres: ['Comedy', 'Drama', 'Romance'] | Rating: 5.0
2. It Can't Be! (1975) | Genres: ['Comedy'] | Rating: 4.5
3. Ten, The (2007) | Genres: ['Comedy'] | Rating: 4.5
4. Last Detail, The (1973) | Genres: ['Comedy', 'Drama'] | Rating: 4.25
5. P.S. (2004) | Genres: ['Comedy', 'Drama', 'Fantasy', 'Romance'] | Rating: 4.25
------------------------------------------------------------

=== Testing weights: similarity=0.7, rating=0.3 ===
1. All Yours (2016) | Genres: ['Comedy', 'Drama', 'Romance'] | Rating: 5.0
2. You Can't Take It with You (1938) | Genres: ['Comedy', 'Romance'] | Rating: 3.8333333333333335
3. Ten, The (2007) | Genres: ['Comedy'] | Rating: 4.5
4. It Can't Be! (1975) | Genres: ['Comedy'] | Rating: 4.5
5. Last Detail, The (1973) | Genres: ['Comedy', 'Drama'] | Rating: 4.25
------------------------------------------------------------

=== Testing weights: similarity=0.8, rating=0.2 ===
1. All Yours (2016) |

In [18]:
!zip -r vyber_ai.zip backend data models vyber_ai_mood_recommender_grifith.ipynb


  adding: backend/ (stored 0%)
  adding: backend/ai/ (stored 0%)
  adding: backend/ai/__pycache__/ (stored 0%)
  adding: backend/ai/__pycache__/emotion_detection.cpython-312.pyc (deflated 43%)
  adding: backend/ai/emotion_detection.py (deflated 64%)
  adding: data/ (stored 0%)
  adding: data/ratings_cleaned.csv (deflated 71%)
  adding: data/movies_cleaned.csv (deflated 73%)
  adding: models/ (stored 0%)
  adding: models/tfidf_vectorizer.pkl (deflated 60%)
  adding: models/cosine_sim_matrix.npy (deflated 61%)
  adding: models/average_movie_ratings.csv (deflated 69%)
  adding: models/loaded_movies_df.csv (deflated 77%)
  adding: models/tfidf_matrix.npy (deflated 100%)


In [19]:
!ls
!ls *.ipynb


backend  ml-latest-small      models	   vyber_ai.zip
data	 ml-latest-small.zip  sample_data
ls: cannot access '*.ipynb': No such file or directory
