In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

def setup_driver():
    """Start a headless Chrome WebDriver with reduced automation fingerprints.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for calling ``quit()`` when done.

    Raises:
        Exception: whatever driver installation, browser start-up or the CDP
            registration raises. If the browser was already started it is
            shut down before the error propagates.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless=new')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    # Chrome expects "width,height" here; the "1920x1080" spelling is not parsed.
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    try:
        # execute_script() would only patch the document loaded right now (the
        # blank start page) and the override would vanish on the first
        # driver.get(). Registering the script through CDP makes Chrome run it
        # at the start of every new document instead.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
        )
    except Exception:
        # Do not leave an orphaned browser process behind if setup fails.
        driver.quit()
        raise
    return driver

def scrape_movie_details(driver, url):
    """Open a movie's detail page and extract its genres and first credited name.

    Genres and director are extracted independently, so a page that lacks one
    of them still yields the other instead of returning two empty strings.

    Args:
        driver: a Selenium WebDriver; it is navigated to ``url`` and left there.
        url: address of the movie's detail page.

    Returns:
        tuple[str, str]: ``(genres, director)`` where ``genres`` is up to three
        genre names joined by ", ". Either value is "" when it could not be
        read. Errors are printed, never raised.
    """
    credit_locator = (By.XPATH, "//li[@data-testid='title-pc-principal-credit']//a")

    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    except Exception as e:
        print(f"Error fetching movie details: {e}")
        return "", ""

    director = ""
    try:
        # Waiting for the credit element replaces the former fixed one-second
        # sleep: it returns as soon as the element exists and tolerates slower
        # page loads (up to the 10 s timeout).
        director = wait.until(EC.presence_of_element_located(credit_locator)).text
    except Exception as e:
        print(f"Error fetching movie details: {e}")

    genres = []
    try:
        genre_elements = driver.find_elements(By.CSS_SELECTOR, ".ipc-chip-list__scroller a span")
        # Skip chips that render no text, then keep at most the first three.
        genres = [name for name in (g.text.strip() for g in genre_elements) if name][:3]
    except Exception as e:
        print(f"Error fetching movie details: {e}")

    return ", ".join(genres), director

def scrape_imdb_top_250(limit=None):
    """Scrape the IMDb Top 250 chart, then genre and director for each movie.

    Works in two passes. First every chart row is read while the chart page is
    loaded; then each movie's detail page is visited by URL. Because the chart
    is never navigated away from while its rows are being read, there is no
    need for ``driver.back()`` or for re-querying rows to dodge stale element
    references.

    Args:
        limit: optional maximum number of chart rows to scrape (e.g. ``5`` for
            a quick trial run). ``None`` (the default) scrapes every row found.

    Returns:
        list[dict]: one dict per movie with the keys ``Rank``, ``Title``,
        ``Year``, ``Rating``, ``Genre``, ``Director`` and ``URL``. Rows that
        fail to parse are reported and skipped. If an unexpected error aborts
        the run, whatever was collected up to that point is returned (an empty
        list when the chart itself could not be loaded).
    """
    row_selector = ".ipc-metadata-list-summary-item"
    driver = setup_driver()
    data = []

    try:
        driver.get("https://www.imdb.com/chart/top/")
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, row_selector))
        )

        movies = driver.find_elements(By.CSS_SELECTOR, row_selector)
        if limit is not None:
            movies = movies[:max(limit, 0)]

        # Pass 1: read the chart rows in one go, before any navigation.
        for idx, movie in enumerate(movies):
            try:
                title_text = movie.find_element(By.CSS_SELECTOR, "h3").text
                # The heading looks like "12. Some Title". Strip only a leading
                # numeric rank so titles that themselves contain ". "
                # (e.g. "Dr. Strangelove ...") are kept intact.
                rank_prefix, separator, remainder = title_text.partition(". ")
                title = remainder if separator and rank_prefix.isdigit() else title_text
                year = movie.find_element(By.CSS_SELECTOR, "[class*='cli-title-metadata-item']").text
                rating = movie.find_element(By.CSS_SELECTOR, ".ipc-rating-star").text.split()[0]
                url = movie.find_element(By.CSS_SELECTOR, "a").get_attribute("href")

                data.append({
                    'Rank': idx + 1,
                    'Title': title,
                    'Year': year,
                    'Rating': rating,
                    'Genre': "",      # filled in during pass 2
                    'Director': "",   # filled in during pass 2
                    'URL': url
                })

            except Exception as e:
                print(f"Error processing movie at rank {idx+1}: {e}")

        # Pass 2: visit each detail page. scrape_movie_details reports its own
        # errors and returns empty strings, so one bad page cannot drop a row.
        for entry in data:
            print(f"Scraping {entry['Rank']}. {entry['Title']}...")
            entry['Genre'], entry['Director'] = scrape_movie_details(driver, entry['URL'])

        return data

    except Exception as e:
        print(f"Error during scraping: {e}")
        return data
    finally:
        driver.quit()

# Kick off the scrape; an empty result means nothing could be collected.
print("🎬 Scraping IMDB Top 250 Movies with Genre and Director...")
movie_data = scrape_imdb_top_250()

if not movie_data:
    print("No data scraped.")
else:
    # `df` stays a notebook-level name because later cells read it.
    df = pd.DataFrame(data=movie_data)
    print("\nScraped Data (Top 5 rows):")
    print(df.head())
    # Persist the table so it can be reloaded without scraping again.
    df.to_csv(path_or_buf="imdb_top_250_detailed.csv", index=False)
    print("\nData saved to imdb_top_250_detailed.csv")


🎬 Scraping IMDB Top 250 Movies with Genre and Director...
Scraping 1. The Shawshank Redemption...
Scraping 2. The Godfather...
Scraping 3. The Dark Knight...
Scraping 4. The Godfather Part II...
Scraping 5. 12 Angry Men...
Scraping 6. The Lord of the Rings: The Return of the King...
Scraping 7. Schindler's List...
Scraping 8. Pulp Fiction...
Scraping 9. The Lord of the Rings: The Fellowship of the Ring...
Scraping 10. The Good, the Bad and the Ugly...
Scraping 11. Forrest Gump...
Scraping 12. The Lord of the Rings: The Two Towers...
Scraping 13. Fight Club...
Scraping 14. Inception...
Scraping 15. Star Wars: Episode V - The Empire Strikes Back...
Scraping 16. The Matrix...
Scraping 17. Goodfellas...
Scraping 18. Interstellar...
Scraping 19. One Flew Over the Cuckoo's Nest...
Scraping 20. Se7en...
Scraping 21. It's a Wonderful Life...
Scraping 22. The Silence of the Lambs...
Scraping 23. Seven Samurai...
Scraping 24. Saving Private Ryan...
Scraping 25. City of God...
Scraping 26. The Gr

In [6]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install pandas pymongo openpyxl


Collecting pymongo
  Downloading pymongo-4.12.0-cp312-cp312-win_amd64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.12.0-cp312-cp312-win_amd64.whl (896 kB)
   ---------------------------------------- 0.0/896.7 kB ? eta -:--:--
   ---------------------------------------- 0.0/896.7 kB ? eta -:--:--
   ----------- ---------------------------- 262.1/896.7 kB ? eta -:--:--
   ----------- ---------------------------- 262.1/896.7 kB ? eta -:--:--
   ---------------------- --------------- 524.3/896.7 kB 509.0 kB/s eta 0:00:01
   ---------------------- --------------- 524.3/896.7 kB 509.0 kB/s eta 0:00:01
   -------------------------------------- 896.7/896.7 kB 738.7 kB/s eta 0:00:00
Downloading dnspython-2.7.0-py3-none-any.whl (313 kB)
Installing collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.12.0


In [31]:
import os
from getpass import getpass

from pymongo import MongoClient
import pandas as pd

# The connection URI carries a username and password, so it must not be written
# into the notebook. It is read from the MONGO_URI environment variable, with a
# hidden interactive prompt as fallback.
# NOTE(review): a URI with real credentials used to be hardcoded in this cell;
# treat that password as leaked and rotate it.
MONGO_URI = os.environ.get("MONGO_URI") or getpass("MongoDB connection URI: ")

# Target database and collection names
DB_NAME = "movie_data"
COLLECTION_NAME = "imdb_top_250"

# Connect to MongoDB
client = MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]

# Use the DataFrame from the scraping cell when it exists; on a fresh kernel
# fall back to the CSV that cell wrote, so this cell does not depend on
# leftover kernel state.
if "df" not in globals():
    df = pd.read_csv("imdb_top_250_detailed.csv")

# Convert DataFrame to a list of dictionaries (MongoDB documents)
documents = df.to_dict(orient='records')

if documents:
    # NOTE(review): insert_many only appends, so re-running this cell stores
    # the same movies again — confirm whether the collection should be cleared
    # or upserted instead.
    result = collection.insert_many(documents)
    # Report what the server acknowledged rather than what was sent.
    print(f"✅ Successfully inserted {len(result.inserted_ids)} movies into MongoDB Atlas!")
else:
    # insert_many rejects an empty list, so skip the call entirely.
    print("No movies to insert.")


  _crypto.X509.from_cryptography(x509.load_der_x509_certificate(cert))


✅ Successfully inserted 245 movies into MongoDB Atlas!
