<a href="https://colab.research.google.com/github/AkashB-13/NaanMudhalvan1/blob/main/app.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import ast
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')
import scipy.sparse as sparse
import implicit
import requests

# Load data: every input is a CSV expected in the current working directory.
movies, ratings, tmdb_movies, tmdb_credits, links = (
    pd.read_csv(csv_name)
    for csv_name in (
        'movie.csv',
        'rating.csv',
        'tmdb_5000_movies.csv',
        'tmdb_5000_credits.csv',
        'link.csv',
    )
)

# Merge TMDB data: attach the credits row to each TMDB movie; the left join
# keeps movies that have no matching credits row.
tmdb = tmdb_movies.merge(
    tmdb_credits, how='left', left_on='id', right_on='movie_id'
).astype({'id': int})

# Link rows without a tmdbId cannot be joined, so they are dropped before the
# id is cast to int (both join keys then share an integer dtype).
links = links.dropna(subset=['tmdbId']).astype({'tmdbId': int})

# Inner join: only movies present in both the link table and TMDB survive.
df = links.merge(tmdb, left_on='tmdbId', right_on='id')

# Helper functions
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    Args:
        x: String containing a Python-literal list of crew dicts (parsed
            with ``ast.literal_eval``). Non-string values such as NaN or
            None are treated as "no crew".

    Returns:
        The director's name, or '' when there is no director entry, the
        entry has no 'name' key, or the input is missing.
    """
    # Missing cells (NaN/None) are not strings; literal_eval would raise on them.
    if not isinstance(x, str):
        return ''
    for member in ast.literal_eval(x):
        # .get() so that an entry without a 'job' key is skipped, not a KeyError.
        if member.get('job') == 'Director':
            return member.get('name', '')
    return ''

def get_top_cast(x):
    """Return the names of the first three cast entries.

    Args:
        x: String containing a Python-literal list of cast dicts (parsed
            with ``ast.literal_eval``). Non-string values such as NaN or
            None are treated as "no cast".

    Returns:
        A list of up to three names, in the order they appear in the
        input. Entries among the first three that lack a 'name' key are
        skipped. Empty list when the input is missing.
    """
    # Missing cells (NaN/None) are not strings; literal_eval would raise on them.
    if not isinstance(x, str):
        return []
    return [
        member['name']
        for member in ast.literal_eval(x)[:3]
        if 'name' in member
    ]

def parse_genres(x):
    """Return the genre names listed in a serialized genre list.

    Args:
        x: String containing a Python-literal list of genre dicts (parsed
            with ``ast.literal_eval``). Non-string values such as NaN or
            None are treated as "no genres".

    Returns:
        A list of genre names in input order. Entries without a 'name'
        key are skipped. Empty list when the input is missing.
    """
    # Missing cells (NaN/None) are not strings; literal_eval would raise on them.
    if not isinstance(x, str):
        return []
    return [genre['name'] for genre in ast.literal_eval(x) if 'name' in genre]

# Fill missing values and extract features.
# The list-valued columns hold serialized lists, so a missing cell becomes the
# text of an empty list and still parses cleanly in the helpers below.
for _list_col in ('crew', 'cast', 'genres'):
    df[_list_col] = df[_list_col].fillna('[]')
df['overview'] = df['overview'].fillna('')

# (target column, source column, extractor). 'cast' and 'genres' are replaced
# in place by their parsed lists; 'director' is a new column derived from 'crew'.
for _target, _source, _extract in (
    ('director', 'crew', get_director),
    ('cast', 'cast', get_top_cast),
    ('genres', 'genres', parse_genres),
):
    df[_target] = df[_source].apply(_extract)

def create_tags(row):
    """Build the text document for one movie row.

    Args:
        row: Mapping-like row providing 'cast', 'director', 'genres' and
            'overview'.

    Returns:
        A single string: cast names, director, genre names and overview,
        separated by single spaces. A 'cast' or 'genres' value that is
        not a list contributes an empty string.
    """
    def _joined(names):
        # Only real lists are joined; anything else contributes nothing.
        if not isinstance(names, list):
            return ''
        return ' '.join(names)

    cast_text = _joined(row['cast'])
    genre_text = _joined(row['genres'])
    return f"{cast_text} {row['director']} {genre_text} {row['overview']}"

# One text document per movie, built row by row with create_tags.
df['tags'] = df.apply(create_tags, axis=1)

# TF-IDF and cosine similarity.
# The vocabulary is capped at 5000 terms and English stop words are removed.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)
tfidf_matrix = vectorizer.fit_transform(df['tags'])

# Pairwise similarity of every movie against every other movie; row and
# column i both correspond to the i-th row of df['tags'].
cosine_sim = cosine_similarity(tfidf_matrix)

# ALS model prep.
# The explicit rating is used unchanged as the interaction strength.
ratings['implicit_rating'] = ratings['rating']

# Map raw user/movie ids to contiguous 0-based indices, numbered in order of
# first appearance in the ratings table.
user_ids = ratings['userId'].unique()
movie_ids = ratings['movieId'].unique()
user_to_idx = dict(zip(user_ids, range(len(user_ids))))
movie_to_idx = dict(zip(movie_ids, range(len(movie_ids))))
ratings['user_idx'] = ratings['userId'].map(user_to_idx)
ratings['movie_idx'] = ratings['movieId'].map(movie_to_idx)

# Rows are movies and columns are users.
# NOTE(review): whether fit() expects an item-user or a user-item matrix
# depends on the installed `implicit` version -- confirm this orientation
# matches the version in use.
sparse_matrix = sparse.coo_matrix(
    (
        ratings['implicit_rating'].astype(float),
        (ratings['movie_idx'], ratings['user_idx']),
    )
)
sparse_matrix_csr = sparse_matrix.tocsr()

als_model = implicit.als.AlternatingLeastSquares(
    factors=50,
    regularization=0.1,
    iterations=20,
    random_state=42,
)
als_model.fit(sparse_matrix_csr)

# Save data: each artifact is pickled to its own file in the working directory.
_pickle_targets = (
    ('cosine_sim.pkl', cosine_sim),
    ('tfidf_vectorizer.pkl', vectorizer),
    ('als_model.pkl', als_model),
    ('user_to_idx.pkl', user_to_idx),
    ('movie_to_idx.pkl', movie_to_idx),
)
for _path, _artifact in _pickle_targets:
    with open(_path, 'wb') as f:
        pickle.dump(_artifact, f)

# The movie table is written through pandas' own pickle helper.
df.to_pickle('movies_df.pkl')

# Fetch posters
import os

# The TMDB key is read from the TMDB_API_KEY environment variable so it does
# not have to live in source control.
# NOTE(security): the literal fallback below is kept only so existing runs
# keep working; it is already exposed in this file and should be rotated and
# then removed.
tmdb_api_key = os.environ.get("TMDB_API_KEY") or "fd09c6f07ac096efb6bf5af91fa69803"

def fetch_poster_url(tmdb_id, api_key):
    """Return the poster image URL for a TMDB movie, or a placeholder.

    Requests the movie's details from the TMDB API and builds a w500 image
    URL from the 'poster_path' field. This is best-effort: request or
    decoding failures are printed and the placeholder is returned, so one
    bad id does not abort a bulk fetch.

    Args:
        tmdb_id: TMDB movie id, interpolated into the request path.
        api_key: TMDB API key, sent as the 'api_key' query parameter.

    Returns:
        The poster URL string, or the placeholder image URL when the
        request fails, the response is not valid JSON, or the movie has
        no poster path.
    """
    placeholder = "https://via.placeholder.com/500x750?text=No+Image"
    try:
        response = requests.get(
            f"https://api.themoviedb.org/3/movie/{tmdb_id}",
            # Let requests build and encode the query string.
            params={"api_key": api_key},
            timeout=5,
        )
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection, timeout and HTTP status errors;
        # ValueError covers a body that is not valid JSON.
        print(f"Error fetching poster for TMDB ID {tmdb_id}: {e}")
        return placeholder

    # Guard against a JSON payload that is not an object.
    poster_path = data.get('poster_path') if isinstance(data, dict) else None
    if poster_path:
        return f"https://image.tmdb.org/t/p/w500{poster_path}"
    return placeholder

print("Fetching poster URLs for movies...")

# One lookup per distinct TMDB id; rows sharing an id reuse the same URL.
poster_urls = {}
for tmdb_id in df['tmdbId'].unique():
    poster_urls[tmdb_id] = fetch_poster_url(tmdb_id, tmdb_api_key)

df['poster_url'] = df['tmdbId'].map(poster_urls)
df.to_pickle('movies_with_posters.pkl')
print("Done! Saved movies_with_posters.pkl")

SyntaxError: invalid syntax (<ipython-input-1-f22b974cab45>, line 110)