# In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import mysql.connector as db

class IMDbScraper:
    """Scrape 2024 feature films from IMDb's title search, one genre at a time.

    For each configured genre the scraper opens the search page in Chrome,
    expands the result list via the "show more" button, extracts title,
    rating, vote count and duration for every entry, inserts the rows into
    MySQL and writes them to ``<genre>.csv``.
    """

    # Search URL; the genre name is appended to it.
    _SEARCH_URL = "https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres="

    # Absolute XPaths into the search page. They are tied to the page layout
    # and will stop matching if IMDb changes its markup.
    _SHOW_MORE_XPATH = '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/div[2]/div/span/button/span/span'
    _MOVIE_ITEMS_XPATH = '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul/li'

    # XPaths relative to a single result ``<li>`` element.
    _TITLE_XPATH = './div/div/div/div[1]/div[2]/div[1]'
    _RATING_XPATH = './div/div/div/div[1]/div[2]/span/div/span/span[1]'
    _VOTING_XPATH = './div/div/div/div[1]/div[2]/span/div/span/span[2]'
    _DURATION_XPATH = './div/div/div/div[1]/div[2]/div[2]/span[2]'

    # Multipliers for abbreviated counts: thousand, lakh, million.
    _COUNT_SUFFIXES = {'K': 1000, 'L': 100000, 'M': 1000000}

    # Column order shared by the DataFrame, the CSV file and the INSERT.
    _COLUMNS = ['MOVIE_NAME', 'GENRE', 'RATINGS', 'VOTING_COUNTS', 'DURATION']

    def __init__(self, genres, db_config):
        """Store the genres to scrape and the MySQL connection settings.

        Args:
            genres: Iterable of genre names; each is appended to the search URL.
            db_config: Mapping with the keys ``host``, ``user``, ``password``
                and ``database`` used to open the MySQL connection.
        """
        self.genres = genres
        self.db_config = db_config

    def click_show_more(self, driver):
        """Click the "show more" button once.

        Returns:
            True if the button was found and clicked, False otherwise. A False
            result is how the caller detects that the list is fully expanded,
            so any failure is reported and swallowed on purpose.
        """
        try:
            show_more_button = driver.find_element(By.XPATH, self._SHOW_MORE_XPATH)
            # Scroll the button into view before clicking it.
            ActionChains(driver).move_to_element(show_more_button).perform()
            show_more_button.click()
            # Give the page time to load the next batch of results.
            time.sleep(3)
            return True
        except Exception as e:
            print("Button is not available or could not be clicked:", e)
            return False

    def to_number(self, value):
        """Convert a count such as ``"(12K)"``, ``"1.5M"`` or ``"1,234"`` to a float.

        Parentheses, thousands separators and surrounding whitespace are
        ignored, the suffix is case-insensitive, and the sign is dropped.

        Raises:
            ValueError: If the remaining text is not a number.
        """
        cleaned = value.strip().upper().replace('(', '').replace(')', '').replace(',', '')
        multiplier = self._COUNT_SUFFIXES.get(cleaned[-1:])
        if multiplier is None:
            return abs(float(cleaned))
        # Abbreviated counts denote whole numbers; rounding removes binary
        # float noise (4.1 * 1000000 would otherwise be 4099999.9999999995).
        return float(round(abs(float(cleaned[:-1]) * multiplier)))

    def remove_prefix(self, value):
        """Strip a leading rank such as ``"12. "`` from a title.

        The text is returned unchanged when it does not start with digits
        followed by a dot, including titles that consist only of digits.
        """
        rank, dot, rest = value.partition('.')
        if dot and rank.strip().isdigit():
            return rest.strip()
        return value

    def to_minutes(self, value):
        """Convert a duration such as ``"2h 46m"``, ``"2h"`` or ``"45m"`` to minutes.

        Text containing neither ``h`` nor ``m`` yields 0.

        Raises:
            ValueError: If the hour or minute part is not an integer.
        """
        text = value.strip().lower()
        hours = 0
        minutes = 0
        if 'h' in text:
            parts = text.split('h')
            hours = int(parts[0].strip())
            text = parts[1].strip()
        if 'm' in text:
            minutes = int(text.replace('m', '').strip())
        return hours * 60 + minutes

    def get_db_connection(self):
        """Open a new MySQL connection from ``self.db_config``."""
        return db.connect(
            host=self.db_config['host'],
            user=self.db_config['user'],
            password=self.db_config['password'],
            database=self.db_config['database']
        )

    def insert_data(self, df):
        """Insert every row of *df* into ``mydb.MOVIE_DATA_TMP``.

        The columns of *df* must be in the order MOVIE_NAME, GENRE, RATINGS,
        VOTING_COUNTS, DURATION. The cursor and connection are closed even
        when the insert fails; nothing is committed in that case.
        """
        data_tuples = [tuple(row) for row in df.values.tolist()]
        if not data_tuples:
            # Nothing was scraped, so there is no reason to open a connection.
            print("No rows to insert")
            return

        db_con = self.get_db_connection()
        try:
            curr = db_con.cursor()
            try:
                curr.executemany("INSERT INTO mydb.MOVIE_DATA_TMP (MOVIE_NAME,GENRE,RATINGS,VOTING_COUNTS,DURATION) VALUES (%s,%s,%s,%s,%s)", data_tuples)
                db_con.commit()
            finally:
                curr.close()
        finally:
            db_con.close()

    def _parse_movie_item(self, movie_item, genre):
        """Return one result entry as a tuple ordered like ``_COLUMNS``.

        Raises whatever Selenium or the converters raise when a field is
        missing or malformed; the caller skips such entries.
        """
        title = movie_item.find_element(By.XPATH, self._TITLE_XPATH).text
        rating = movie_item.find_element(By.XPATH, self._RATING_XPATH).text
        voting = movie_item.find_element(By.XPATH, self._VOTING_XPATH).text
        duration = movie_item.find_element(By.XPATH, self._DURATION_XPATH).text
        return (
            self.remove_prefix(title),
            genre.upper(),
            rating,  # kept as the text shown on the page
            self.to_number(voting),
            self.to_minutes(duration),
        )

    def scrape_data(self, genre):
        """Scrape one genre, insert the rows into MySQL and write ``<genre>.csv``.

        Entries that cannot be parsed are reported and skipped. The browser
        is always closed, even when scraping fails part-way.
        """
        print("Processing genre: "+genre)
        driver = webdriver.Chrome()
        try:
            driver.get(self._SEARCH_URL + genre)
            time.sleep(3)

            # Keep expanding the list until the button can no longer be clicked.
            while self.click_show_more(driver):
                print('Clicked on Show More Button')

            print('End of content')

            rows = []
            for movie_item in driver.find_elements(By.XPATH, self._MOVIE_ITEMS_XPATH):
                try:
                    rows.append(self._parse_movie_item(movie_item, genre))
                except Exception as e:
                    print("Error extracting data for movie:", e)
        finally:
            # Release the Chrome process no matter how scraping ended.
            driver.quit()

        df = pd.DataFrame(rows, columns=self._COLUMNS)

        self.insert_data(df)
        df.to_csv(f'{genre}.csv', index=False)
        print(f"Data saved to {genre}.csv")

    def extract_imdb_data(self):
        """Scrape every genre in ``self.genres``, one after the other."""
        for genre in self.genres:
            self.scrape_data(genre)


if __name__ == "__main__":
    import os

    # Scraping all of ['action', 'crime', 'romance', 'comedy', 'horror',
    # 'fantasy'] in one run takes a long time, so one genre is passed per run.
    genres = ['fantasy']

    # Connection settings can be overridden through environment variables so
    # real credentials need not live in the source. The defaults are the
    # original development values.
    # NOTE(review): the default password is still in the source; remove it
    # once every environment sets IMDB_DB_PASSWORD.
    db_config = {
        'host': os.environ.get('IMDB_DB_HOST', 'localhost'),
        'user': os.environ.get('IMDB_DB_USER', 'rino'),
        'password': os.environ.get('IMDB_DB_PASSWORD', 'admin@123'),
        'database': os.environ.get('IMDB_DB_NAME', 'mydb')
    }

    scraper = IMDbScraper(genres, db_config)
    scraper.extract_imdb_data()