<a href="https://colab.research.google.com/github/Anushka0108/Movie-Recommender/blob/main/movierecommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

url = "https://www.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm"
headers = {"User-Agent": "Mozilla/5.0"}

page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, "html.parser")

data = []

movies = soup.find_all("li", class_="ipc-metadata-list-summary-item")

for movie in movies[:10]:
    try:
        title = movie.find("h3").get_text(strip=True)
    except:
        title = "N/A"

    try:
        metadata = movie.find_all("span", class_="cli-title-metadata-item")
        year = metadata[0].get_text(strip=True) if len(metadata) > 0 else "N/A"
        duration = metadata[1].get_text(strip=True) if len(metadata) > 1 else "N/A"
        rating = metadata[2].get_text(strip=True) if len(metadata) > 2 else "N/A"
    except:
        year, duration, rating = "N/A", "N/A", "N/A"

    try:
        link = "https://www.imdb.com" + movie.find("a")["href"].split("?")[0]
    except:
        link = "N/A"

    if link != "N/A":
        movie_page = requests.get(link, headers=headers)
        movie_soup = BeautifulSoup(movie_page.text, "html.parser")
        try:
            genre = ", ".join([g.get_text(strip=True)
                               for g in movie_soup.find_all("span", class_="ipc-chip__text")])
        except:
            genre = "N/A"
        time.sleep(1)
    else:
        genre = "N/A"

    data.append({
        "Title": title,
        "Year": year,
        "Duration": duration,
        "Rating": rating,
        "Genre": genre,
        "Link": link
    })

df = pd.DataFrame(data)
df.to_excel("imdb_movies.xlsx", index=False)

print("Data saved to imdb_movies.xlsx")


✅ Data saved to imdb_movies.xlsx


In [28]:
import pandas as pd

df = pd.read_excel("imdb_movies.xlsx")
df


Unnamed: 0,Title,Year,Duration,Rating,Genre,Link
0,Weapons,2025,2h 8m,R,"Dark Comedy, Psychological Horror, Supernatura...",https://www.imdb.com/title/tt26581740/
1,Superman,2025,2h 9m,PG-13,"Action Epic, Adventure Epic, Globetrotting Adv...",https://www.imdb.com/title/tt5950044/
2,Night Always Comes,2025,1h 48m,R,"Crime, Drama, Thriller, Back to top",https://www.imdb.com/title/tt31567422/
3,Happy Gilmore 2,2025,1h 54m,PG-13,"Farce, High-Concept Comedy, Quest, Raunchy Com...",https://www.imdb.com/title/tt31868189/
4,The Naked Gun,2025,1h 25m,PG-13,"Bumbling Detective, Parody, Satire, Slapstick,...",https://www.imdb.com/title/tt3402138/
5,Nobody 2,2025,1h 29m,R,"Dark Comedy, One-Person Army Action, Action, C...",https://www.imdb.com/title/tt28996126/
6,Jurassic World: Rebirth,2025,2h 13m,PG-13,"Dinosaur Adventure, Action, Adventure, Sci-Fi,...",https://www.imdb.com/title/tt31036941/
7,Together,2025,1h 42m,R,"Body Horror, Psychological Horror, Horror, Rom...",https://www.imdb.com/title/tt31184028/
8,The Fantastic Four: First Steps,2025,1h 54m,PG-13,"Space Sci-Fi, Superhero, Action, Adventure, Sc...",https://www.imdb.com/title/tt10676052/
9,KPop Demon Hunters,2025,1h 35m,PG,"Computer Animation, Pop Musical, Action, Adven...",https://www.imdb.com/title/tt14205554/


In [42]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_excel("imdb_movies.xlsx")
df['Genre'] = df['Genre'].fillna("")

vectorizer = TfidfVectorizer(stop_words="english")
genre_matrix = vectorizer.fit_transform(df['Genre'])
cosine_sim = cosine_similarity(genre_matrix, genre_matrix)

def recommend_movies(title, n=5):
    if title not in df['Title'].values:
        return f"Movie '{title}' not found in dataset"

    idx = df.index[df['Title'] == title][0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    top_movies = [df.iloc[i[0]]['Title'] for i in sim_scores[1:n+1]]
    recommend_df = df[df['Title'].isin(top_movies)][
        ["Title", "Year", "Duration", "Rating", "Genre", "Link"]
    ]

    recommend_df.to_excel("recommended.xlsx", index=False)
    return recommend_df

movie_title = input("Enter a movie title: ")
print(f"Recommendations for {movie_title}:")
result_df = recommend_movies(movie_title, n=5)
print(result_df)


Enter a movie title: Weapons
Recommendations for Weapons:
                Title  Year Duration Rating  \
3     Happy Gilmore 2  2025   1h 54m  PG-13   
4       The Naked Gun  2025   1h 25m  PG-13   
5            Nobody 2  2025   1h 29m      R   
7            Together  2025   1h 42m      R   
9  KPop Demon Hunters  2025   1h 35m     PG   

                                               Genre  \
3  Farce, High-Concept Comedy, Quest, Raunchy Com...   
4  Bumbling Detective, Parody, Satire, Slapstick,...   
5  Dark Comedy, One-Person Army Action, Action, C...   
7  Body Horror, Psychological Horror, Horror, Rom...   
9  Computer Animation, Pop Musical, Action, Adven...   

                                     Link  
3  https://www.imdb.com/title/tt31868189/  
4   https://www.imdb.com/title/tt3402138/  
5  https://www.imdb.com/title/tt28996126/  
7  https://www.imdb.com/title/tt31184028/  
9  https://www.imdb.com/title/tt14205554/  
