In [1]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import re

# Setup WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# IMDb URL - Change to your target search / genre page
base_url = "https://www.imdb.com/search/title/?genres=drama&start={}&explore=title_type,genres"

movie_names = []
storylines = []

# Start from page 1
start = 1

while True:
    url = base_url.format(start)
    print(f"Scraping: {url}")
    driver.get(url)
    time.sleep(3)

    # Find all movie containers on current page
    movies = driver.find_elements(By.XPATH, '//li[@class="ipc-metadata-list-summary-item"]')

    # If no movies found, we reached the last page → break loop
    if not movies:
        print("Reached the end of pages.")
        break

    for movie in movies:
        # Extract movie title
        try:
            title_text = movie.find_element(By.XPATH, './/h3[contains(@class,"ipc-title__text")]').text
            # Remove number prefix (e.g., "1. The Shawshank Redemption" → "The Shawshank Redemption")
            title = re.sub(r"^\d+\.\s*", "", title_text)
        except:
            title = "N/A"

        # Extract storyline / description
        try:
            storyline = movie.find_element(By.XPATH, './/div[contains(@class,"ipc-html-content-inner-div")]').text
        except:
            storyline = "N/A"

        movie_names.append(title)
        storylines.append(storyline)

    # Go to next page → IMDb shows 50 movies per page
    start += 50

# Save all scraped data into CSV
df = pd.DataFrame({"Movie Name": movie_names, "Storyline": storylines})
df.to_csv("imdb_all_movies_storylines.csv", index=False, encoding="utf-8")
print(f"✅ Scraped {len(df)} movies successfully!")

driver.quit()


Scraping: https://www.imdb.com/search/title/?genres=drama&start=1&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start=51&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start=101&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start=151&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start=201&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start=251&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start=301&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start=351&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start=401&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start=451&explore=title_type,genres
Scraping: https://www.imdb.com/search/title/?genres=drama&start

InvalidSessionIdException: Message: invalid session id; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#invalidsessionidexception
Stacktrace:
	GetHandleVerifier [0x0x107a8a3+63283]
	GetHandleVerifier [0x0x107a8e4+63348]
	(No symbol) [0x0xeb3ca0]
	(No symbol) [0x0xeefb48]
	(No symbol) [0x0xf212d6]
	(No symbol) [0x0xf1ceb5]
	(No symbol) [0x0xf1c436]
	(No symbol) [0x0xe85755]
	(No symbol) [0x0xe85cae]
	(No symbol) [0x0xe8613d]
	GetHandleVerifier [0x0x12ebb43+2623955]
	GetHandleVerifier [0x0x12e6daa+2604090]
	GetHandleVerifier [0x0x10a069a+218410]
	GetHandleVerifier [0x0x1090ed8+154984]
	GetHandleVerifier [0x0x109742d+180925]
	(No symbol) [0x0xe85420]
	(No symbol) [0x0xe84c36]
	GetHandleVerifier [0x0x14268fc+3913612]
	BaseThreadInitThunk [0x0x7644fcc9+25]
	RtlGetAppContainerNamedObjectPath [0x0x77dc82ae+286]
	RtlGetAppContainerNamedObjectPath [0x0x77dc827e+238]


In [2]:
import os
file_path = os.path.abspath("imdb_all_movies_storylines.csv")
print(f"CSV saved at: {file_path}")


CSV saved at: C:\Users\Hp\Downloads\IMDB_recommendation_system-main\IMDB_recommendation_system-main\imdb_all_movies_storylines.csv


In [3]:
import os

file_path = os.path.abspath("imdb_all_movies_storylines.csv")
print(f"📂 File saved at: {file_path}")


📂 File saved at: C:\Users\Hp\Downloads\IMDB_recommendation_system-main\IMDB_recommendation_system-main\imdb_all_movies_storylines.csv


In [4]:
import os
import pandas as pd

# Create DataFrame from scraped data
df = pd.DataFrame({"Movie Name": movie_names, "Storyline": storylines})

# Save CSV
df.to_csv("imdb_storylines.csv", index=False, encoding="utf-8")

# ✅ Check saved file path and number of movies
file_path = os.path.abspath("imdb_storylines.csv")
print(f"📂 CSV saved at: {file_path}")
print(f"📊 Total movies saved: {len(df)}")

# Optional: print first 5 rows to verify
print(df.head())


📂 CSV saved at: C:\Users\Hp\Downloads\IMDB_recommendation_system-main\IMDB_recommendation_system-main\imdb_storylines.csv
📊 Total movies saved: 3600
                     Movie Name  \
0          Dexter: Resurrection   
1                 Thunderbolts*   
2  The Terminal List: Dark Wolf   
3                       Hostage   
4                 F1: The Movie   

                                           Storyline  
0  Dexter Morgan awakens from a coma and sets out...  
1  After finding themselves ensnared in a death t...  
2  Navy SEAL Ben Edwards' abrupt discharge from t...  
3  When the PM's husband is kidnapped and the vis...  
4  A Formula One driver comes out of retirement t...  


## Preprocessing using NLP

In [5]:
print(df['Movie Name'].head(20))


0             Dexter: Resurrection
1                    Thunderbolts*
2     The Terminal List: Dark Wolf
3                          Hostage
4                    F1: The Movie
5       The Summer I Turned Pretty
6     My Life with the Walter Boys
7                           Dexter
8                  Game of Thrones
9                 King & Conqueror
10                    Breaking Bad
11                      Foundation
12                    Eenie Meanie
13               The Terminal List
14                         Fallout
15                   Twisted Metal
16                      Two Graves
17                         Sinners
18                         Untamed
19                         Dept. Q
Name: Movie Name, dtype: object


In [6]:
# 1. Import Libraries
# ================================
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# 2️⃣ Load Dataset & Rename Columns
# ================================
df = pd.read_csv("imdb_storylines.csv")
df = df[['Movie Name', 'Storyline']].dropna().reset_index(drop=True)
df.rename(columns={'Movie Name': 'title', 'Storyline': 'overview'}, inplace=True)

print("Dataset Shape:", df.shape)
print(df.head(3))

Dataset Shape: (3550, 2)
                          title  \
0          Dexter: Resurrection   
1                 Thunderbolts*   
2  The Terminal List: Dark Wolf   

                                            overview  
0  Dexter Morgan awakens from a coma and sets out...  
1  After finding themselves ensnared in a death t...  
2  Navy SEAL Ben Edwards' abrupt discharge from t...  


In [8]:
import pandas as pd

# Load your CSV
df = pd.read_csv("imdb_storylines.csv")

# Remove exact duplicates based on both 'Movie Name' and 'Storyline'
df_clean = df.drop_duplicates(subset=['Movie Name', 'Storyline']).reset_index(drop=True)

print("Original rows:", df.shape[0])
print("Rows after removing duplicates:", df_clean.shape[0])

# Save cleaned CSV
df_clean.to_csv("imdb_storylines_cleaned.csv", index=False)
print("✅ Cleaned CSV saved as 'imdb_storylines_cleaned.csv'")


Original rows: 3600
Rows after removing duplicates: 51
✅ Cleaned CSV saved as 'imdb_storylines_cleaned.csv'


In [9]:
# 2️⃣ Load Dataset & Rename Columns
# ================================
df = pd.read_csv("imdb_storylines_cleaned.csv")
df = df[['Movie Name', 'Storyline']].dropna().reset_index(drop=True)
df.rename(columns={'Movie Name': 'title', 'Storyline': 'overview'}, inplace=True)

print("Dataset Shape:", df.shape)
print(df.head(3))

Dataset Shape: (50, 2)
                          title  \
0          Dexter: Resurrection   
1                 Thunderbolts*   
2  The Terminal List: Dark Wolf   

                                            overview  
0  Dexter Morgan awakens from a coma and sets out...  
1  After finding themselves ensnared in a death t...  
2  Navy SEAL Ben Edwards' abrupt discharge from t...  


In [10]:
print("Total unique movies:", df.shape[0])
print(df.head(3))

Total unique movies: 50
                          title  \
0          Dexter: Resurrection   
1                 Thunderbolts*   
2  The Terminal List: Dark Wolf   

                                            overview  
0  Dexter Morgan awakens from a coma and sets out...  
1  After finding themselves ensnared in a death t...  
2  Navy SEAL Ben Edwards' abrupt discharge from t...  


In [11]:
# 3️⃣ Text Cleaning Function
STOP_WORDS = set(ENGLISH_STOP_WORDS)

def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = text.split()
    tokens = [t for t in tokens if t not in STOP_WORDS and len(t) > 2]
    return " ".join(tokens)

In [12]:
# Apply cleaning
df['clean_overview'] = df['overview'].apply(clean_text)
print("\nSample cleaned overview:")
print(df[['title', 'clean_overview']].head(3))


Sample cleaned overview:
                          title  \
0          Dexter: Resurrection   
1                 Thunderbolts*   
2  The Terminal List: Dark Wolf   

                                      clean_overview  
0  dexter morgan awakens coma sets new york city ...  
1  finding ensnared death trap unconventional tea...  
2  navy seal ben edwards abrupt discharge militar...  


In [13]:
# 4️⃣ TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
tfidf_matrix = tfidf.fit_transform(df['clean_overview'])

In [14]:
# 5️⃣ Cosine Similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [15]:
# 6️⃣ Recommendation Function with Fuzzy Matching
def recommend_movies(title, df=df, similarity_matrix=cosine_sim, top_n=5):
    # Find closest match
    all_titles = df['title'].tolist()
    matches = difflib.get_close_matches(title, all_titles, n=1, cutoff=0.6)
    if not matches:
        return f"❌ Movie '{title}' not found in dataset."
    
    resolved_title = matches[0]
    idx = df[df['title'] == resolved_title].index[0]

    # Cosine similarity scores
    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Exclude the movie itself and get unique recommendations
    sim_scores = [s for s in sim_scores if s[0] != idx]
    recommendations = []
    seen = set()
    for i, _ in sim_scores:
        t = df['title'].iloc[i]
        if t not in seen:
            recommendations.append(t)
            seen.add(t)
        if len(recommendations) >= top_n:
            break

    return resolved_title, recommendations

In [16]:
import difflib

# 7️⃣ Example Usage
query = "Hostage"
resolved_title, recommended = recommend_movies(query, top_n=5)

print(f"\n🎬 Movies similar to '{resolved_title}':")
for m in recommended:
    print("👉", m)


🎬 Movies similar to 'Hostage':
👉 Severance
👉 Countdown
👉 Two Graves
👉 Invasion
👉 Dept. Q


In [17]:
import difflib

# 7️⃣ Example Usage
query = "Dexter"
resolved_title, recommended = recommend_movies(query, top_n=5)

print(f"\n🎬 Movies similar to '{resolved_title}':")
for m in recommended:
    print("👉", m)


🎬 Movies similar to 'Dexter':
👉 Dexter: Resurrection
👉 Monster
👉 MobLand
👉 Thunderbolts*
👉 The Terminal List: Dark Wolf


In [18]:
import pandas as pd

df = pd.read_csv("imdb_storylines_cleaned.csv")
df.columns = df.columns.str.strip().str.lower()

print("Columns in CSV:", df.columns)


Columns in CSV: Index(['movie name', 'storyline'], dtype='object')
