In [None]:
# Load all MovieLens CSV files
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# Surprise is a separate package (installed as `scikit-surprise`, imported as
# `surprise`); it is not a submodule of scikit-learn.
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split as surprise_split



# Read each MovieLens table; paths are relative to the working directory and
# the files are read in the same order as the names on the left-hand side.
ratings, movies, tags, genome_scores, genome_tags, links = map(
    pd.read_csv,
    (
        "rating.csv",
        "movie.csv",
        "tag.csv",
        "genome_scores.csv",
        "genome_tags.csv",
        "link.csv",
    ),
)


# Replace the pipe separators in the genre column with spaces so the
# vectorizer receives whitespace-separated text.
movies["genres"] = movies["genres"].str.replace("|", " ", regex=False)

# Fit TF-IDF on the genre text; the rows of the resulting matrix follow the
# row order of `movies`.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(movies["genres"])

# Hold out 20% of the raw ratings frame with scikit-learn (seeded so the
# split is repeatable).
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

# Wrap the ratings for Surprise; the reader is told ratings span 0.5 to 5.
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)

# Fit SVD on every available rating. The model is seeded with the same value
# as the data splits: SVD initialises its factors randomly, so without a seed
# the estimates (and any RMSE derived from them) differ from run to run.
trainset = data.build_full_trainset()
model = SVD(random_state=42)
model.fit(trainset)

# Pairwise cosine similarity between the TF-IDF rows of all movies.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Map each movie title to its row label in `movies`.
# Series.drop_duplicates() compares *values* (here the row labels), not the
# index, so it leaves repeated titles in place and a lookup for such a title
# returns several rows instead of one. Deduplicate on the index instead,
# keeping the first occurrence of each title.
indices = pd.Series(movies.index, index=movies["title"])
indices = indices[~indices.index.duplicated(keep="first")]

def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Args:
        title: Movie title to look up in the module-level ``indices`` mapping.
        cosine_sim: Similarity matrix in which row/column ``i`` corresponds to
            row ``i`` of ``movies``. Defaults to the module-level matrix.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        A ``pandas.Series`` of titles ordered from most to least similar. The
        queried movie itself is never included.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series; use its first entry.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Exclude the query movie explicitly. Skipping "the first sorted entry" is
    # unreliable: other movies can tie with it at the top similarity and sort
    # ahead of it, which would leave the query movie in its own results.
    candidates = [
        (pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [pos for pos, _ in candidates[:max(top_n, 0)]]
    return movies['title'].iloc[movie_indices]

# Build a (users x movies) grid of SVD estimates for the first ten distinct
# user ids that appear in `ratings`.
hybrid_users = ratings["userId"].unique()[:10]
raw_estimates = [
    [model.predict(uid, iid).est for iid in movies["movieId"]]
    for uid in hybrid_users
]

# Rescale both score sources with the same scaler object (refit each time).
# NOTE(review): MinMaxScaler rescales every column independently, so each
# movie is normalised across only these ten rows -- confirm that is intended.
scaler = MinMaxScaler()
predicted_ratings = scaler.fit_transform(raw_estimates)
# NOTE(review): the first ten similarity rows belong to the first ten movies,
# not to the ten users above -- confirm this pairing is what is wanted.
sim_scores = scaler.fit_transform(cosine_sim[:10])

# Equal-weight blend, with the similarity block cropped to the prediction
# grid's shape.
grid_rows, grid_cols = predicted_ratings.shape
hybrid_scores = 0.5 * predicted_ratings + 0.5 * sim_scores[:grid_rows, :grid_cols]

# Hold out 20% of the Surprise dataset, refit the model on the remaining
# ratings, and score its predictions for the held-out part.
trainset, testset = surprise_split(data, random_state=42, test_size=0.2)
predictions = model.fit(trainset).test(testset)  # fit() returns the algorithm
rmse = accuracy.rmse(predictions)

# Bar chart comparing RMSE across the listed approaches.
models = ["Baseline", "User-CF", "Item-CF", "Content-Based", "Hybrid"]
# Hard-coded values -- replace with measured scores if available.
rmse_scores = [1.05, 0.94, 0.91, 0.97, 0.88]

plt.figure(figsize=(8, 5))
# barplot() draws on the current figure and returns the Axes it used.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` -- confirm against the installed version.
rmse_ax = sns.barplot(x=models, y=rmse_scores, palette="Blues_d")
rmse_ax.set_title("RMSE Comparison Across Models")
rmse_ax.set_ylabel("RMSE")
rmse_ax.set_xlabel("Model")
plt.tight_layout()
plt.show()

# Scatter plot of Precision@10 against coverage, one point per entry in
# `models` (values are hard-coded).
precision_at_10 = [0.18, 0.26, 0.29, 0.23, 0.32]
coverage = [8.4, 34.2, 36.8, 41.5, 45.7]

plt.figure(figsize=(8, 5))
plt.scatter(coverage, precision_at_10, color='darkorange', s=100)
# The loop variable must not be named `model`: at module level that name
# holds the trained recommender and would be overwritten with a plain string.
for label, x_pos, y_pos in zip(models, coverage, precision_at_10):
    # Shift the text slightly right so it does not sit on top of its marker.
    plt.annotate(label, (x_pos + 0.5, y_pos))
plt.title("Precision@10 vs Coverage")
plt.xlabel("Coverage (%)")
plt.ylabel("Precision@10")
plt.grid(True)
plt.tight_layout()
plt.show()