In [12]:
import pandas as pd
import numpy as np
import ast
from sklearn.metrics.pairwise import cosine_similarity
import joblib

In [2]:
df = pd.read_csv("D:/Project/Untitled Folder/final_movie_with_Sent_embedding.csv")

In [4]:
df=df[df['votes']>5000]
df.shape

(16102, 13)

In [5]:
df.drop(columns=['Unnamed: 0'],inplace=True)
df.drop(columns=['Unnamed: 0.1'],inplace=True)

In [6]:
from sklearn.preprocessing import MinMaxScaler, MultiLabelBinarizer

# Normalize numeric features
scaler = MinMaxScaler()
df['votes'] = scaler.fit_transform(df[['votes']])

In [7]:
df["plot_embedding"] = df["plot_embedding"].apply(lambda x : np.fromstring(x.strip('[]'), sep=' '))

In [8]:
df.iloc[1]['plot_embedding']

array([ 1.24864271e-02,  1.26574710e-02, -2.79607270e-02, -2.22760271e-02,
       -4.85204607e-02,  2.18362343e-02,  1.64649710e-01,  1.87041871e-02,
        1.92957791e-03, -3.28024030e-02,  2.90308800e-02,  7.38063082e-02,
       -3.70278098e-02,  5.78086525e-02,  3.54700796e-02, -1.44459065e-02,
        1.66332182e-02,  6.08105073e-03, -1.59857571e-02,  9.44518447e-02,
       -4.45305295e-02, -8.21410641e-02, -3.55049632e-02, -7.56704360e-02,
        8.02756473e-03, -3.74906287e-02,  6.89512044e-02,  5.55603988e-02,
       -5.19546680e-02, -1.60463136e-02,  5.50378338e-02, -1.57633517e-02,
        5.96319027e-02,  3.42928134e-02,  1.13728590e-01, -1.76504310e-02,
        9.11109243e-03,  6.68795556e-02,  5.94067611e-02,  1.77730881e-02,
       -1.32880425e-02, -7.87728652e-02,  2.12012939e-02,  8.35853964e-02,
       -1.11727268e-01, -1.47380847e-02,  3.38299908e-02,  4.62554134e-02,
        1.58568900e-02, -5.13496362e-02, -4.13405523e-02,  3.74391228e-02,
        5.99090345e-02, -

In [9]:
plot_embeddings = np.vstack(df["plot_embedding"].values)
plot_embeddings.shape

(16102, 384)

In [10]:
norms = np.linalg.norm(plot_embeddings, axis=1)
print(f"Min norm: {norms.min()}, Max norm: {norms.max()}")

Min norm: 0.9999998581246479, Max norm: 1.0000001409909178


In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import joblib


plot_sim = cosine_similarity(plot_embeddings)
joblib.dump(plot_sim, "D:/Project/Untitled Folder/plot_sim.joblib")
# Save the matrix to disk (optional)
#np.save("plot_sim.npy", plot_sim)
plot_sim.shape

(16102, 16102)

In [13]:
plot_sim=joblib.load("D:/Project/Untitled Folder/plot_sim.joblib")

In [14]:
def hybrid_similarity(idx_a, idx_b, df, plot_sim):
    movie_a = df.iloc[idx_a]
    movie_b = df.iloc[idx_b]

    # Genre overlap
    genres_a = set(str(movie_a['genre']).split(', '))
    genres_b = set(str(movie_b['genre']).split(', '))
    genre_overlap = len(genres_a & genres_b) / max(len(genres_a), 1)

    # Year score
    year_distance = abs(movie_a['year'] - movie_b['year'])
    year_score = max(0, 1 - year_distance / 10)

    # Certificate match
    certificate_score = 1 if movie_a['certificates'] == movie_b['certificates'] else 0

    # Metascore similarity
    if pd.isna(movie_a['metascore']) or pd.isna(movie_b['metascore']):
        metascore_score = 0.5
    else:
        meta_diff = abs(movie_a['metascore'] - movie_b['metascore']) / 100
        metascore_score = 1 - meta_diff

    # Popularity
    if movie_a['votes'] > 0:
        pop_score = min(np.log1p(movie_b['votes']) / np.log1p(movie_a['votes']), 1.0)
    else:
        pop_score = 0

    # Plot similarity from matrix
    plot_score = plot_sim[idx_a][idx_b]

    # Final weighted score
    final_score = (
        0.3 * plot_score +
        0.2 * genre_overlap +
        0.1 * year_score +
        0.1 * certificate_score +
        0.1 * metascore_score +
        0.2 * pop_score
    )

    return final_score


In [15]:
def recommend_hybrid(title, df, plot_sim, top_k=5):
    title_to_index = {t.lower(): i for i, t in enumerate(df['title'])}
    idx = title_to_index.get(title.lower())
    
    if idx is None:
        return f"Movie '{title}' not found."

    scores = []
    for i in range(len(df)):
        if i == idx:
            continue
        score = hybrid_similarity(idx, i, df, plot_sim)
        scores.append((i, score))

    top_indices = sorted(scores, key=lambda x: x[1], reverse=True)[:top_k]
    top_ids = [i for i, _ in top_indices]
    
    return df.iloc[top_ids][['title', 'year', 'genre', 'imdb_rating', 'votes', 'metascore']]


In [16]:
recommend_hybrid("Inception", df, plot_sim, top_k=5)

Unnamed: 0,title,year,genre,imdb_rating,votes,metascore
12002,Iron Man,2008,"['Action', 'Adventure', 'Sci-Fi']",7.9,0.384195,79.0
13251,Iron Man Three,2013,"['Action', 'Adventure', 'Sci-Fi']",7.1,0.304286,62.0
11754,Transformers,2007,"['Action', 'Adventure', 'Sci-Fi']",7.1,0.226807,61.0
12266,Terminator Salvation,2009,"['Action', 'Adventure', 'Sci-Fi']",6.5,0.124514,49.0
11757,Spider-Man 3,2007,"['Action', 'Adventure', 'Sci-Fi']",6.3,0.218351,59.0


In [17]:
def hybrid_similarity(idx_a, idx_b, df, plot_sim):
    movie_a = df.iloc[idx_a]
    movie_b = df.iloc[idx_b]

    # Genre overlap
    genres_a = set(str(movie_a['genre']).split(', '))
    genres_b = set(str(movie_b['genre']).split(', '))
    genre_overlap = len(genres_a & genres_b) / max(len(genres_a), 1)

    # Year score
    year_distance = abs(movie_a['year'] - movie_b['year'])
    year_score = max(0, 1 - year_distance / 10)

    # Certificate match
    certificate_score = 1 if movie_a['certificates'] == movie_b['certificates'] else 0

    # Metascore similarity
    if pd.isna(movie_a['metascore']) or pd.isna(movie_b['metascore']):
        metascore_score = 0.5
    else:
        meta_diff = abs(movie_a['metascore'] - movie_b['metascore']) / 100
        metascore_score = 1 - meta_diff

    # IMDb rating similarity
    if pd.isna(movie_a['imdb_rating']) or pd.isna(movie_b['imdb_rating']):
        imdb_score = 0.5
    else:
        imdb_diff = abs(movie_a['imdb_rating'] - movie_b['imdb_rating']) / 10
        imdb_score = 1 - imdb_diff

    # Popularity
    if movie_a['votes'] > 0:
        pop_score = min(np.log1p(movie_b['votes']) / np.log1p(movie_a['votes']), 1.0)
    else:
        pop_score = 0

    # Plot similarity
    plot_score = plot_sim[idx_a][idx_b]

    # Final weighted hybrid score
    final_score = (
        0.30 * plot_score +
        0.15 * genre_overlap +
        0.05 * year_score +
        0.10 * certificate_score +
        0.10 * metascore_score +
        0.20 * imdb_score +
        0.10 * pop_score
    )

    return final_score


In [18]:
def recommend_hybrid(title, df, plot_sim, top_k=5):
    title_to_index = {t.lower(): i for i, t in enumerate(df['title'])}
    idx = title_to_index.get(title.lower())
    
    if idx is None:
        return f"Movie '{title}' not found."

    scores = []
    for i in range(len(df)):
        if i == idx:
            continue
        score = hybrid_similarity(idx, i, df, plot_sim)
        scores.append((i, score))

    top_indices = sorted(scores, key=lambda x: x[1], reverse=True)[:top_k]
    top_ids = [i for i, _ in top_indices]
    
    return df.iloc[top_ids][['title', 'year', 'genre', 'imdb_rating', 'votes', 'metascore']]


In [19]:
recommend_hybrid("Dune: Part One", df, plot_sim, top_k=5)

Unnamed: 0,title,year,genre,imdb_rating,votes,metascore
16000,Dune: Part Two,2024,"['Action', 'Adventure', 'Drama']",8.5,0.210522,79.0
14251,Thor: Ragnarok,2017,"['Action', 'Adventure', 'Comedy']",7.9,0.27981,74.0
15251,Spider-Man: No Way Home,2021,"['Action', 'Adventure', 'Fantasy']",8.2,0.312966,71.0
12251,Avatar,2009,"['Action', 'Adventure', 'Fantasy']",7.9,0.467036,83.0
14275,War for the Planet of the Apes,2017,"['Action', 'Adventure', 'Drama']",7.4,0.102069,82.0


In [28]:
df[df['title'] == "Dune: Part One"]

Unnamed: 0,title,year,imdb_rating,metascore,votes,genre,runtime,certificates,img_url,plot,plot_embedding
15250,Dune: Part One,2021,8.0,74.0,0.313816,"['Action', 'Adventure', 'Drama']",9300.0,PG-13,https://m.media-amazon.com/images/M/MV5BNWIyNm...,Paul Atreides arrives on Arrakis after his fat...,"[-0.0104016718, 0.0161152314, -0.0604734235, 0..."


In [20]:
import streamlit as st

st.title("🎬 Hybrid Movie Recommender")
movie_list = sorted(df['title'].unique())
selected_movie = st.selectbox("Select a movie:", movie_list)

top_k = st.slider("Number of recommendations", 1, 20, 5)

if st.button("Get Recommendations"):
    results, error = recommend_hybrid(selected_movie, df, plot_sim, top_k=top_k)
    if error:
        st.error(error)
    else:
        st.subheader(f"Top {top_k} movies similar to '{selected_movie}':")
        st.dataframe(results.reset_index(drop=True))

2025-08-02 16:25:55.223 
  command:

    streamlit run d:\Project\RecSys\RecSys\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-08-02 16:25:55.369 Session state does not function when running a script without `streamlit run`


In [30]:
pip install streamlit

Collecting streamlitNote: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip



  Using cached streamlit-1.47.1-py3-none-any.whl.metadata (9.0 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Using cached altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.5.0 (from streamlit)
  Using cached blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting cachetools<7,>=4.0 (from streamlit)
  Using cached cachetools-6.1.0-py3-none-any.whl.metadata (5.4 kB)
Collecting click<9,>=7.0 (from streamlit)
  Using cached click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting protobuf<7,>=3.20 (from streamlit)
  Using cached protobuf-6.31.1-cp310-abi3-win_amd64.whl.metadata (593 bytes)
Collecting pyarrow>=7.0 (from streamlit)
  Downloading pyarrow-21.0.0-cp310-cp310-win_amd64.whl.metadata (3.4 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Using cached toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Using cached watchdog-6.0.0-py3-none-win_amd64.whl.metadata (44 kB)
Collecting gitpython!=3.1.19,<4,>=3

In [32]:
pip freeze > D:/Project/Untitled Folder/requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [33]:
cd D:/Project/Untitled Folder

D:\Project\Untitled Folder


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [34]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [26]:
df[df['votes']>0.2][['title', 'year', 'genre', 'imdb_rating', 'votes', 'metascore']]

Unnamed: 0,title,year,genre,imdb_rating,votes,metascore
0,Psycho,1960,"['Drama', 'Horror', 'Mystery']",8.5,0.245582,97.0
1500,"Il buono, il brutto, il cattivo",1966,"['Adventure', 'Drama', 'Western']",8.8,0.279406,90.0
2000,2001: A Space Odyssey,1968,"['Adventure', 'Sci-Fi']",8.3,0.245705,84.0
2750,A Clockwork Orange,1971,"['Crime', 'Sci-Fi']",8.2,0.297047,77.0
3000,The Godfather,1972,"['Crime', 'Drama']",9.2,0.697088,100.0
...,...,...,...,...,...,...
15252,Don't Look Up,2021,"['Comedy', 'Drama', 'Sci-Fi']",7.2,0.208544,49.0
15500,The Batman,2022,"['Action', 'Crime', 'Drama']",7.8,0.286380,72.0
15501,Top Gun: Maverick,2022,"['Action', 'Drama']",8.2,0.256642,78.0
15750,Oppenheimer,2023,"['Biography', 'Drama', 'History']",8.3,0.296766,90.0


In [25]:
df.head()

Unnamed: 0,title,year,imdb_rating,metascore,votes,genre,runtime,certificates,img_url,plot,plot_embedding
0,Psycho,1960,8.5,97.0,0.245582,"['Drama', 'Horror', 'Mystery']",6540.0,R,https://m.media-amazon.com/images/M/MV5BYjZhMz...,A secretary on the run for embezzlement takes ...,"[-0.0254985727, 0.0469600223, -0.0414056852, 0..."
1,The Apartment,1960,8.3,94.0,0.067003,"['Comedy', 'Drama', 'Romance']",7500.0,Approved,https://m.media-amazon.com/images/M/MV5BNDdhMz...,A Manhattan insurance clerk tries to rise in h...,"[0.0124864271, 0.012657471, -0.027960727, -0.0..."
2,Spartacus,1960,7.9,87.0,0.046868,"['Adventure', 'Biography', 'Drama']",11820.0,PG-13,https://m.media-amazon.com/images/M/MV5BMTcyOT...,The slave Spartacus survives brutal training a...,"[-0.0340616852, 0.0296150967, -0.0190523751, -..."
3,The Magnificent Seven,1960,7.7,74.0,0.033408,"['Action', 'Adventure', 'Drama']",7680.0,Approved,https://m.media-amazon.com/images/M/MV5BMzYyNz...,Seven gunfighters are hired by Mexican peasant...,"[-0.0328211859, 0.0200974066, -0.104040436, 0...."
4,À bout de souffle,1960,7.7,96.0,0.028243,"['Crime', 'Drama']",5400.0,Not Rated,https://m.media-amazon.com/images/M/MV5BZGI5MW...,"A small-time crook, hunted by the authorities ...","[0.00968873408, 0.0687137395, -0.0214084052, 0..."
