In [426]:
!pip install openai 
!pip uninstall umap -y
!pip install umap-learn
!pip install sentence-transformers
from openai import OpenAI
import pandas as pd
import numpy as np
import re
import random

from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from umap.umap_ import UMAP
from difflib import SequenceMatcher







In [None]:
import os
os.environ["OPENAI_API_KEY"] = "Your OpenAI API Key"

In [428]:
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [429]:
def preprocess_text(text):
    text = text.lower().strip()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [430]:
def split_train_test(df, test_size=0.2, seed=42):
    train_list = []
    test_list = []
    grouped = df.groupby('userid')
    for uid, group in grouped:
        if len(group) < 5:
            train_list.append(group)
            continue
        train_grp, test_grp = train_test_split(group, test_size=test_size, random_state=seed)
        train_list.append(train_grp)
        test_list.append(test_grp)
    train_df = pd.concat(train_list).reset_index(drop=True)
    test_df = pd.concat(test_list).reset_index(drop=True)
    return train_df, test_df

In [431]:
def build_movie_clusters_umap(train_df, num_clusters=10, model_name='all-mpnet-base-v2', umap_dim=10):
    required_columns = ['movieid', 'title', 'movie_description', 'genre', 'Gender', 'Age', 'Occupation']
    for col in required_columns:
        if col not in train_df.columns:
            train_df[col] = np.nan if col == 'Age' else ''

    unique_movies = train_df[['movieid', 'title', 'movie_description', 'genre']].drop_duplicates(subset='movieid').reset_index(drop=True)

    user_info_df = train_df[['movieid', 'Gender', 'Age', 'Occupation']].dropna()

    def summarize_users(group):
        genders = group['Gender'].value_counts(normalize=True).to_dict()
        occupations = group['Occupation'].value_counts().head(2).index.tolist()
        age_mean = group['Age'].mean()
        gender_str = ', '.join([f"{g}: {round(p, 2)}" for g, p in genders.items()])
        return f"avg age: {int(age_mean)}, top jobs: {', '.join(occupations)}, gender dist: {gender_str}"

    user_summary = user_info_df.groupby('movieid').apply(summarize_users).to_dict()


    def build_rich_text(row):
        title = row.get('title', '')
        desc = row.get('movie_description', '')
 
        if pd.isnull(desc) or desc.strip() == '':
            desc = title
        genre_value = row.get('genre', '')
        if pd.notnull(genre_value):
            genre_str = genre_value.replace('|', ', ')
        else:
            genre_str = ''
        user_demo = user_summary.get(row['movieid'], '')
        return f"{title}. {desc}. Genres: {genre_str}. Watched mostly by {user_demo}."

    unique_movies['text'] = unique_movies.apply(build_rich_text, axis=1)
    unique_movies['text'] = unique_movies['text'].apply(preprocess_text)

    print("Load the SBERT model:", model_name)
    model = SentenceTransformer(model_name)
    embeddings = model.encode(unique_movies['text'].tolist(), show_progress_bar=True, convert_to_numpy=True)
    unique_movies['embedding'] = list(embeddings)

    freq = train_df['title'].value_counts().to_dict()
    unique_movies['frequency'] = unique_movies['title'].apply(lambda x: freq.get(x, 0))

    print(f"Use UMAP to reduce dimension to {umap_dim} dimension...")
    umap_model = UMAP(n_components=umap_dim, random_state=42)
    reduced_embeddings = umap_model.fit_transform(embeddings)

    print(f"Do KMeans clustering, K = {num_clusters}")
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(reduced_embeddings)
    unique_movies['cluster'] = cluster_labels

    final_required_cols = ['movieid', 'title', 'movie_description', 'genre', 'text', 'embedding', 'frequency', 'cluster']
    unique_movies = unique_movies[final_required_cols]

    unique_movies['title_normalized'] = unique_movies['title'].str.strip().str.lower()

    return unique_movies, kmeans.cluster_centers_, model, reduced_embeddings

In [432]:
def get_user_history(user_id, train_df, num_examples=3):
    user_history = train_df[train_df['userid'] == user_id].sort_values(by='rating', ascending=False).head(num_examples)
    history_texts = [f"Movie: {row['title']} (Rating: {row['rating']})" for _, row in user_history.iterrows()]
    return ". ".join(history_texts)

def compute_user_cluster_preference(user_id, train_df, movie_cluster_df, min_rating=4):
    user_history = train_df[(train_df['userid'] == user_id) & (train_df['rating'] >= min_rating)]
    merged = pd.merge(user_history, movie_cluster_df[['title', 'cluster']], on='title', how='inner')
    if merged.empty:
        return {}
    cluster_counts = merged['cluster'].value_counts(normalize=True).to_dict()
    return cluster_counts

In [433]:
def normalize_title(title, remove_parentheses=True):
    title = title.lower().strip()
    if remove_parentheses:
        title = re.sub(r'\([^)]*\)', '', title)
    title = re.sub(r'[^a-z0-9\s]', '', title)
    title = re.sub(r'\s+', ' ', title).strip()
    return title

In [434]:
def parse_recommendations(recommendation_text):
    lines = recommendation_text.split('\n')
    recommendations = []
    for line in lines:
        m = re.match(r'^\d+\.\s*(.*)$', line.strip())
        if m:
            title = m.group(1).strip()
            if title and len(title) >= 4:
                recommendations.append(title)
    if recommendations:
        return recommendations

    candidates = [s.strip() for s in recommendation_text.split(',')]
    return [s for s in candidates if len(s) >= 4]

In [435]:
def restore_title(title):
    m = re.match(r"^(.*),\s*(The|A|An)$", title)
    if m:
        return f"{m.group(2)} {m.group(1)}"
    return title

In [436]:
def get_user_profile_summary(user_id, train_df, movie_cluster_df, top_n_genres=3, top_clusters=3):
    user_data = train_df[(train_df['userid'] == user_id) & (train_df['rating'] >= 4)]

    user_data = user_data.drop(columns=['genre'], errors='ignore')

    merged = pd.merge(user_data, movie_cluster_df[['title', 'genre', 'cluster']], on='title', how='left')

    if merged.empty:
        return "User profile unavailable due to insufficient data."

    all_genres = "|".join(merged['genre'].dropna().tolist()).split("|")
    genre_freq = pd.Series(all_genres).value_counts().head(top_n_genres).to_dict()
    genre_summary = ", ".join([f"{g} (x{c})" for g, c in genre_freq.items()])

    cluster_counts = merged['cluster'].value_counts(normalize=True).head(top_clusters).to_dict()
    cluster_summary = ", ".join([f"Style Cluster {cid} ({round(p, 2)*100:.0f}%)" for cid, p in cluster_counts.items()])

    top_titles = merged['title'].tolist()[:10]
    movie_str = ", ".join(top_titles)

    return (
        f"The user has strong preferences for the following genres: {genre_summary}.\n"
        f"They are especially interested in the following style clusters: {cluster_summary}.\n"
        f"Examples of movies the user rated highly include: {movie_str}.\n"
        f"AVOID recommending movies purely based on popularity; focus on style and genre fit."
    )





In [437]:
def is_similar(a, b, threshold=0.7):
    return SequenceMatcher(None, a, b).ratio() >= threshold


def fuzzy_match(rec_title, ground_truth_set, movie_cluster_df=None, genre_boost=False, embedding_boost=False, embedding_model=None):
    rec_norm = normalize_title(rec_title)

    for gt in ground_truth_set:
        gt_norm = normalize_title(gt)

        if is_similar(rec_norm, gt_norm, threshold=0.3):
            return True

        if genre_boost and movie_cluster_df is not None:
            rec_title_norm = rec_title.strip().lower()
            gt_norm_title = gt.strip().lower()

            rec_genre_row = movie_cluster_df[movie_cluster_df['title_normalized'] == rec_title_norm]
            gt_genre_row = movie_cluster_df[movie_cluster_df['title_normalized'] == gt_norm_title]

            if not rec_genre_row.empty and not gt_genre_row.empty:
                if 'genre' in rec_genre_row.columns and 'genre' in gt_genre_row.columns:
                    rec_genre = rec_genre_row['genre'].values[0]
                    gt_genre = gt_genre_row['genre'].values[0]
                    if len(set(rec_genre.split('|')) & set(gt_genre.split('|'))) > 0:
                        return True

        if embedding_boost and embedding_model is not None:
            rec_emb = embedding_model.encode([rec_title])[0]
            gt_emb = embedding_model.encode([gt])[0]
            score = cosine_similarity([rec_emb], [gt_emb])[0][0]
            if score >= 0.85:
                return True

    return False


In [438]:
def generate_movie_recommendation(user_id, train_df, movie_cluster_df, sbert_model, k=5, candidate_pool_size=20, ground_truth_titles=None):
    candidate_titles = generate_candidate_pool_high_recall(
        train_df=train_df,
        user_id=user_id,
        movie_cluster_df=movie_cluster_df,
        sbert_model=sbert_model,
        pool_size=candidate_pool_size,
        ground_truth_titles=ground_truth_titles
    )

    if not candidate_titles:
        return []

    user_summary = get_user_profile_summary(user_id, train_df, movie_cluster_df)
    if not user_summary:
        return []

    candidate_titles_restored = [restore_title(t) for t in candidate_titles]
    candidate_block = "\n".join([f"{i+1}. {title}" for i, title in enumerate(candidate_titles_restored)])

    prompt = (
        f"Based on the user's viewing history and preferences, please recommend the top {k} movies.\n\n"
        f"{user_summary}\n"
        "Candidate movies (only choose from this list):\n"
        f"{candidate_block}\n\n"
        "IMPORTANT: Do NOT invent or guess movie titles. Only use exact titles from the candidate list.\n"
        "Try to match the user's preferred genres and past favorites when recommending."
    )

    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=150,
            temperature=0.6
        )
        text = response.choices[0].message.content.strip()
        rec_list = parse_recommendations(text)
    except Exception:
        return []

    valid_title_set = set(normalize_title(t) for t in candidate_titles_restored)
    rec_list = [t for t in rec_list if normalize_title(t) in valid_title_set]

    return rec_list[:k]


In [439]:
def evaluate_topk_sample_fuzzy(
    test_df, 
    train_df, 
    movie_cluster_df, 
    sbert_model, 
    k=5, 
    rating_threshold=4.0, 
    sample_size=10, 
    pool_size=45, 
    similarity_threshold=0.3
):
    if 'genre' not in movie_cluster_df.columns:
        return 0, 0

    precision_list = []
    recall_list = []

    user_ids = list(test_df['userid'].unique())
    if sample_size is None:
        sample_user_ids = user_ids 
    else:
        sample_user_ids = random.sample(user_ids, min(sample_size, len(user_ids)))

    for uid in sample_user_ids:
        user_test = test_df[test_df['userid'] == uid]
        ground_truth_raw = user_test[user_test['rating'] >= rating_threshold]['title'].tolist()
        if not ground_truth_raw:
            continue

        ground_truth = set(normalize_title(title) for title in ground_truth_raw if title)


        if not ground_truth:
            continue

        rec_list = generate_movie_recommendation(
            uid, train_df, movie_cluster_df, sbert_model,
            k=k, candidate_pool_size=pool_size,
            ground_truth_titles=ground_truth_raw
        )
        if not rec_list:
            continue


        rec_list_norm = list(set(normalize_title(title) for title in rec_list if title))

        num_hit = sum(
            1 for rec in rec_list_norm
            if fuzzy_match(
                rec,
                ground_truth,
                movie_cluster_df=movie_cluster_df,
                genre_boost=True,
                embedding_boost=True,
                embedding_model=sbert_model
            )
        )

        precision = num_hit / k
        recall = min(num_hit / len(ground_truth), 1.0)  

        precision_list.append(precision)
        recall_list.append(recall)

    avg_precision = np.mean(precision_list) if precision_list else 0
    avg_recall = np.mean(recall_list) if recall_list else 0
    return avg_precision, avg_recall



In [440]:
def predict_rating_sbert(
    user_id, 
    item_title, 
    train_df, 
    movie_cluster_df, 
    sbert_model, 
    top_n=10, 
    sim_pow=2.0
):
    global_mean = train_df['rating'].mean()


    row_item = movie_cluster_df[movie_cluster_df['title'] == item_title]
    if row_item.empty:
        return global_mean  

    target_emb = row_item.iloc[0]['embedding']


    user_history = train_df[train_df['userid'] == user_id]
    if user_history.empty:
        return global_mean  

    user_mean_rating = user_history['rating'].mean() 

    merged = pd.merge(
        user_history,
        movie_cluster_df[['title', 'embedding']],
        on='title',
        how='inner'
    )

    if merged.empty:
        return user_mean_rating  

    user_embs = np.stack(merged['embedding'].tolist())  
    sims = cosine_similarity([target_emb], user_embs)[0]  
    merged['similarity'] = sims


    merged = merged.sort_values(by='similarity', ascending=False).head(top_n)

    if merged.iloc[0]['similarity'] < 0.1:
        return user_mean_rating


    sim_values = merged['similarity'].values
    sim_weights = np.power(sim_values, sim_pow)  
    ratings = merged['rating'].values

    weight_sum = sim_weights.sum()
    if weight_sum == 0:
        return user_mean_rating

    predicted_rating = np.average(ratings, weights=sim_weights)
    return predicted_rating

In [441]:
def evaluate_rmse(test_df, train_df, movie_cluster_df, sbert_model, top_n=10, sim_pow=2.0):
    """
    Evaluate RMSE on the test set by comparing predicted ratings vs. actual ratings.
    Using improved SBERT rating prediction that includes fallback, similarity^2, etc.
    """
    y_true = []
    y_pred = []

    for idx, row in test_df.iterrows():
        user_id = row['userid']
        item_title = row['title']
        actual_rating = row['rating']

        pred_rating = predict_rating_sbert(
            user_id,
            item_title,
            train_df,
            movie_cluster_df,
            sbert_model,
            top_n=top_n,
            sim_pow=sim_pow
        )
        if np.isnan(pred_rating):
            continue

        y_true.append(actual_rating)
        y_pred.append(pred_rating)

    if not y_true:
        return None

    mse = np.mean((np.array(y_true) - np.array(y_pred)) ** 2)
    rmse = np.sqrt(mse)
    return rmse

In [442]:
def main():
    dataset_path = r"D:\1513Project\ece1513project\ml-1m\llm_dataset.csv"
    df = pd.read_csv(dataset_path, encoding='utf-8', engine='python')

    train_df, test_df = split_train_test(df, test_size=0.2, seed=42)

    num_clusters = 19
    movie_cluster_df, centers, sbert_model, reduced = build_movie_clusters_umap(
        train_df,
        num_clusters=num_clusters,
        model_name='all-mpnet-base-v2',
        umap_dim=10
    )

    movie_cluster_df['title_normalized'] = movie_cluster_df['title'].str.strip().str.lower()


    seeds = [42, 43, 44, 45, 46]
    precision_list = []
    recall_list = []

    print("Start multi-seed evaluation (Fuzzy Matching)...\n")
    for seed in seeds:
        random.seed(seed)
        avg_p, avg_r = evaluate_topk_sample_fuzzy(
            test_df=test_df,
            train_df=train_df,
            movie_cluster_df=movie_cluster_df,
            sbert_model=sbert_model,
            k=10,
            rating_threshold=4.0,
            sample_size=None,  
            pool_size=46,
            similarity_threshold=0.3
        )
        precision_list.append(avg_p)
        recall_list.append(avg_r)
        print(f"[Seed {seed}] Precision: {avg_p:.4f}, Recall: {avg_r:.4f}")

    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)
    f1_score = (
        2 * avg_precision * avg_recall / (avg_precision + avg_recall)
        if (avg_precision + avg_recall) > 0 else 0
    )

    print("\n Averaged Fuzzy Evaluation Results:")
    print(f"Average Precision:{avg_precision:.4f}")
    print(f"Average Recall:{avg_recall:.4f}")
    print(f"F1 Score:{f1_score:.4f}")


    rmse_list = []
    print("\n Start RMSE Evaluation (Multi-seed)...")
    for seed in seeds:
        random.seed(seed)
        rmse_value = evaluate_rmse(
            test_df=test_df,
            train_df=train_df,
            movie_cluster_df=movie_cluster_df,
            sbert_model=sbert_model,
            top_n=20,
            sim_pow=2.5
        )
        if rmse_value is not None:
            rmse_list.append(rmse_value)
            print(f"[Seed {seed}] RMSE: {rmse_value:.4f}")
        else:
            print(f"[Seed {seed}] RMSE: Unable to calculate (no predictions).")

    if len(rmse_list) > 0:
        avg_rmse = np.mean(rmse_list)
        print(f"\nAverage RMSE over {len(rmse_list)} seeds: {avg_rmse:.4f}")
    else:
        print("\nAverage RMSE: Unable to calculate (no valid predictions).")



In [443]:
if __name__ == "__main__":
    main()

  user_summary = user_info_df.groupby('movieid').apply(summarize_users).to_dict()


Load the SBERT model: all-mpnet-base-v2


Batches:   0%|          | 0/115 [00:00<?, ?it/s]

Use UMAP to reduce dimension to 10 dimension...


  warn(


Do KMeans clustering, K = 19
Start multi-seed evaluation (Fuzzy Matching)...





KeyboardInterrupt: 