In [1]:
# Load & merge data
import pandas as pd

# Read the two input tables.
# NOTE(review): both calls use the same placeholder path — confirm that the
# real movies and credits files are distinct before running.
movies = pd.read_csv('/path/to.csv')
credits = pd.read_csv('/path/to.csv')

# Join the credits columns onto the movie rows by matching title
# (DataFrame.merge performs an inner join unless `how` is given).
# NOTE(review): nothing here guarantees titles are unique, so films that share
# a title may yield extra, cross-matched rows — the repeated "Batman" in the
# recommendation output further down may be a symptom; verify.
movies = movies.merge(credits, on="title")

In [2]:
# Keep useful features 
# Keep only the identifier, the title and the descriptive columns that the
# later cells read; every other column is dropped from `movies`.
feature_columns = ["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]
movies = movies[feature_columns]

In [3]:
# Conversion of JSON strings → lists
import ast

def convert(text):
    """Parse a stringified list of records and return each record's name.

    Parameters
    ----------
    text : str
        A Python-literal string that ``ast.literal_eval`` can parse into an
        iterable of mappings, each of which has a ``'name'`` key.

    Returns
    -------
    list
        The ``'name'`` values in their original order (empty if the parsed
        iterable is empty).

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when ``text`` is not a valid
        literal.
    KeyError
        If a parsed record has no ``'name'`` key.
    """
    return [record['name'] for record in ast.literal_eval(text)]

In [4]:
# Parse the genres and keywords strings into lists of names via convert().
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)

In [5]:
# Extract cast (top actors)
def convert_cast(text, top_n=3):
    """Return the names of the first ``top_n`` records in a stringified list.

    Parameters
    ----------
    text : str
        A Python-literal string that ``ast.literal_eval`` can parse into a
        list (or tuple) of mappings, each with a ``'name'`` key.
    top_n : int, optional
        Maximum number of names to return, taken from the start of the parsed
        sequence. Defaults to 3, the limit that used to be hard-coded.

    Returns
    -------
    list
        Up to ``top_n`` ``'name'`` values in their original order; fewer when
        the parsed sequence is shorter.

    Raises
    ------
    ValueError
        If ``top_n`` is negative, or (from ``ast.literal_eval``) if ``text``
        is not a valid literal.
    KeyError
        If one of the selected records has no ``'name'`` key.
    """
    if top_n < 0:
        # A negative slice bound would silently mean "all but the last k".
        raise ValueError("top_n must be non-negative")
    records = ast.literal_eval(text)
    # Only the leading records are touched, so later malformed entries are
    # ignored exactly as the early `break` did.
    return [record['name'] for record in records[:top_n]]

# Replace each row's raw cast string with the list returned by convert_cast
# (the names of the first three listed cast entries).
movies['cast'] = movies['cast'].apply(convert_cast)

In [6]:
# Extract director from crew
def fetch_director(text):
    """Return the names of every record whose job is ``'Director'``.

    Parameters
    ----------
    text : str
        A Python-literal string that ``ast.literal_eval`` can parse into an
        iterable of mappings, each with ``'job'`` and ``'name'`` keys.

    Returns
    -------
    list
        Names of the matching records in their original order; empty when no
        record has the job ``'Director'``.
    """
    crew_members = ast.literal_eval(text)
    return [member['name'] for member in crew_members if member['job'] == 'Director']

# Replace each row's raw crew string with the list of names whose job is
# 'Director' (fetch_director returns an empty list when there is none).
movies['crew'] = movies['crew'].apply(fetch_director)

In [7]:
# Overview → list
# Missing overviews become empty strings, then every overview is split on
# whitespace into a list of words (an empty overview gives an empty list).
movies['overview'] = movies['overview'].fillna('').apply(str.split)

In [8]:
# Build one combined token list per movie by concatenating the five list
# columns in a fixed order (Series `+` concatenates the lists element-wise).
# NOTE(review): multi-word entries such as people's names are kept as-is here,
# so once the list is joined on spaces and vectorized they become separate
# word tokens and unrelated people sharing a first name will share a token —
# confirm this is intended.
combined = movies['overview']
for column in ('genres', 'keywords', 'cast', 'crew'):
    combined = combined + movies[column]
movies['tags'] = combined

In [9]:
# Convert tags → string
# Collapse each token list into a single space-separated string.
# NOTE: this overwrites the column in place, so the cell is not idempotent —
# re-running it on the already-joined strings would space out every character.
movies['tags'] = movies['tags'].apply(" ".join)

In [10]:
# Vectorization
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model over the tags text: English stop words are dropped and
# the vocabulary is capped at 5000 features.
cv = CountVectorizer(stop_words="english", max_features=5000)
tag_counts = cv.fit_transform(movies["tags"])
# Densify the count matrix into a NumPy array (one row per movie).
# NOTE(review): the dense copy is large; cosine_similarity is documented to
# accept sparse input as well — consider skipping this step if memory is tight.
vectors = tag_counts.toarray()

In [11]:
# Similarity matrix
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of `vectors`; recommend()
# reads one row of this matrix to rank every movie against the chosen one.
similarity = cosine_similarity(vectors)

In [12]:
# Recommendation function
def recommend(movie, top_n=5):
    """Print the titles of the ``top_n`` movies most similar to ``movie``.

    Reads the module-level ``movies`` frame (needs a ``title`` column) and the
    ``similarity`` matrix, whose row/column order is expected to match the row
    order of ``movies``.

    Parameters
    ----------
    movie : str
        Exact title to look up. If several rows share the title, the first
        one is used.
    top_n : int, optional
        How many recommendations to print. Defaults to 5.

    Raises
    ------
    IndexError
        If no row of ``movies`` has the given title.
    """
    # Work with row *positions*, not index labels: `similarity` and `.iloc`
    # are both positional, so this stays correct even if the frame's index is
    # not a clean 0..n-1 range.
    matches = (movies["title"] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"movie title not found: {movie!r}")
    position = int(matches[0])

    scores = similarity[position]
    # Exclude the query row explicitly instead of assuming it sorts first;
    # with tied scores (e.g. duplicated rows) the query is not guaranteed to
    # be the top entry. The sort is stable, so ties keep row order.
    ranked = sorted(
        (candidate for candidate in range(len(scores)) if candidate != position),
        key=lambda candidate: scores[candidate],
        reverse=True,
    )

    for candidate in ranked[:top_n]:
        print(movies.iloc[candidate].title)

In [13]:
# Recommendation test
# Smoke-test the recommender on three known titles; each call prints its
# suggestions one per line.
for title in ("Avatar", "Batman Begins", "Titanic"):
    recommend(title)

Aliens
Moonraker
Alien
Alien³
Silent Running
The Dark Knight
The Dark Knight Rises
Batman
Batman
Batman & Robin
The Notebook
Romance & Cigarettes
Captain Phillips
Veer-Zaara
Four Weddings and a Funeral


In [28]:
import nltk
from nltk.stem.porter import PorterStemmer

# Shared Porter stemmer instance; stem() below calls ps.stem on every word.
ps = PorterStemmer()

def stem(text):
    """Stem every whitespace-separated word of ``text``.

    Each word is passed through the module-level stemmer ``ps`` and the
    results are re-joined with single spaces, so runs of whitespace in the
    input collapse to one space.

    Parameters
    ----------
    text : str
        The text to stem.

    Returns
    -------
    str
        The stemmed words joined by single spaces (empty for empty input).
    """
    return " ".join(ps.stem(word) for word in text.split())

# Overwrite each tags string with its stemmed form. NOTE: this mutates the
# column in place, so any vectors/similarity computed from the unstemmed tags
# are stale until they are recomputed.
movies['tags'] = movies['tags'].apply(stem)


In [30]:
# Rebuild the bag-of-words counts from the current (stemmed) tags, using the
# same 5000-feature cap and English stop-word list as before.
cv = CountVectorizer(stop_words='english', max_features=5000)
stemmed_counts = cv.fit_transform(movies['tags'])
vectors = stemmed_counts.toarray()

# Refresh the pairwise cosine similarities that recommend() reads.
similarity = cosine_similarity(vectors)


In [32]:
def recommend(movie, top_n=5):
    """Return titles and posters of the ``top_n`` movies most similar to ``movie``.

    Reads the module-level ``movies`` frame (needs ``title`` and ``movie_id``
    columns) and the ``similarity`` matrix, whose row/column order is expected
    to match the row order of ``movies``.

    NOTE(review): this calls ``fetch_poster(movie_id)``, which is not defined
    in the visible notebook cells — it must be defined before this function is
    called, otherwise a ``NameError`` is raised.

    Parameters
    ----------
    movie : str
        Exact title to look up. If several rows share the title, the first
        one is used.
    top_n : int, optional
        How many recommendations to return. Defaults to 5.

    Returns
    -------
    tuple of (list, list)
        ``(recommended_movies, recommended_posters)``: the titles and, in the
        same order, whatever ``fetch_poster`` returns for each movie id.

    Raises
    ------
    IndexError
        If no row of ``movies`` has the given title.
    """
    # Work with row *positions*, not index labels: `similarity` and `.iloc`
    # are both positional, so this stays correct even if the frame's index is
    # not a clean 0..n-1 range.
    matches = (movies["title"] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"movie title not found: {movie!r}")
    position = int(matches[0])

    scores = similarity[position]
    # Exclude the query row explicitly instead of assuming it sorts first;
    # with tied scores (e.g. duplicated rows) the query is not guaranteed to
    # be the top entry. The sort is stable, so ties keep row order.
    ranked = sorted(
        (candidate for candidate in range(len(scores)) if candidate != position),
        key=lambda candidate: scores[candidate],
        reverse=True,
    )

    recommended_movies = []
    recommended_posters = []
    for candidate in ranked[:top_n]:
        row = movies.iloc[candidate]
        recommended_movies.append(row.title)
        recommended_posters.append(fetch_poster(row.movie_id))

    return recommended_movies, recommended_posters


In [38]:
pip install streamlit

Collecting streamlit
  Obtaining dependency information for streamlit from https://files.pythonhosted.org/packages/48/1d/40de1819374b4f0507411a60f4d2de0d620a9b10c817de5925799132b6c9/streamlit-1.54.0-py3-none-any.whl.metadata
  Downloading streamlit-1.54.0-py3-none-any.whl.metadata (9.8 kB)
Collecting altair!=5.4.0,!=5.4.1,<7,>=4.0 (from streamlit)
  Obtaining dependency information for altair!=5.4.0,!=5.4.1,<7,>=4.0 from https://files.pythonhosted.org/packages/db/33/ef2f2409450ef6daa61459d5de5c08128e7d3edb773fefd0a324d1310238/altair-6.0.0-py3-none-any.whl.metadata
  Downloading altair-6.0.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.5.0 (from streamlit)
  Obtaining dependency information for blinker<2,>=1.5.0 from https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl.metadata
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting cachetools<7,>=5.5 (from streamlit)
  O

### STREAMLIT UI

In [40]:
import streamlit as st

# Minimal Streamlit front end for the recommender.
# NOTE: Streamlit widgets only become interactive when this code runs as a
# script via `streamlit run`; executed inside the notebook kernel, the button
# never fires and Streamlit logs the warnings shown in this cell's output.
st.title("Movie Recommender System")

selected_movie = st.selectbox(
    "Choose a movie",
    movies['title'].values
)

if st.button("Recommend"):
    names, posters = recommend(selected_movie)

    # Walk the two result lists in step instead of indexing a fixed range(5),
    # so the UI shows however many recommendations came back and cannot raise
    # IndexError when there are fewer than five.
    for name, poster in zip(names, posters):
        st.text(name)
        st.image(poster)


2026-02-11 17:34:15.468 
  command:

    streamlit run /Users/anna/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]
2026-02-11 17:34:15.484 Session state does not function when running a script without `streamlit run`


In [45]:
import pickle

# Persist the processed frame and the similarity matrix for reuse outside the
# notebook. The `with` blocks guarantee each file is flushed and closed even
# if pickling fails part-way (the bare open() calls leaked the handles).
# NOTE: pickle files must only ever be loaded from trusted sources.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(movies, movies_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)