In [1]:
import json
import os
import re

import numpy as np
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


In [3]:
# Load the three raw tables. None of the files carries a header row
# (header=None), so pandas numbers the columns 0..n-1; readable names are
# attached later by Movies.assign_column_names().
data = pd.read_csv("u.data", header=None, sep='\t')    # tab-separated
genre = pd.read_csv("u.genre", header=None, sep='|')   # pipe-separated
item = pd.read_csv(
    "u.item",
    header=None,
    sep='|',
    encoding='ISO-8859-1',  # file is read as Latin-1 rather than the UTF-8 default
)


In [4]:
def fetch_news(api_key, query, page_size=10, timeout=10):
    """Query the NewsAPI ``/v2/everything`` endpoint and return the decoded JSON.

    The query-string is assembled by ``requests`` from a ``params`` mapping, so
    characters such as ``&``, ``#``, ``+`` or spaces inside ``query`` are
    percent-encoded instead of corrupting the URL.

    Args:
        api_key: Key sent as the ``apiKey`` query parameter.
        query: Search expression sent as the ``q`` query parameter.
        page_size: Value sent as the ``pageSize`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The response body parsed as JSON. The HTTP status is not checked here,
        so whatever JSON the server returns is passed to the caller as-is.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    params = {'q': query, 'pageSize': page_size, 'apiKey': api_key}
    response = requests.get(
        'https://newsapi.org/v2/everything',
        params=params,
        timeout=timeout,
    )
    return response.json()

In [5]:
class Movies:
    """Prepare a single ratings-per-movie frame from the raw rating/item/genre tables.

    The methods are meant to be called in this order:
    ``assign_column_names()`` -> ``merge_frames()`` -> ``split_columns()`` ->
    ``drop_columns()``. ``merge_frames()`` creates ``self.df``; the two later
    steps modify it.

    Args:
        ratings: Frame with 4 columns (named by ``data_headers``). When omitted,
            the notebook-level ``data`` frame is used.
        items: Frame with 24 columns (named by ``item_headers``). When omitted,
            the notebook-level ``item`` frame is used.
        genres: Frame with 2 columns (named by ``genre_headers``). When omitted,
            the notebook-level ``genre`` frame is used.

    Note:
        The frames are stored by reference, not copied, so
        ``assign_column_names()`` also renames the columns of the objects that
        were passed in (or of the notebook-level frames).
    """

    def __init__(self, ratings=None, items=None, genres=None):
        # The globals are looked up only when an argument is missing, and at
        # call time, so a plain ``Movies()`` keeps working as before.
        self.data = data if ratings is None else ratings
        self.item = item if items is None else items
        self.genre = genre if genres is None else genres
        self.data_headers = ['userID', 'itemID', 'rating', 'timestamp']
        self.genre_headers = ['Genre', 'Index']
        self.item_headers = ['MovieId', 'movie_title', 'release_date', 'video_release_date', 'IMDB_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

    def assign_column_names(self):
        """Replace the positional column labels of all three frames with names.

        Raises:
            ValueError: If a frame's column count differs from its header list.
        """
        self.data.columns = self.data_headers
        self.genre.columns = self.genre_headers
        self.item.columns = self.item_headers

    def merge_frames(self):
        """Inner-join items with ratings on ``MovieId == itemID`` into ``self.df``."""
        self.df = pd.merge(self.item, self.data, left_on='MovieId', right_on='itemID', how='inner')

    def split_columns(self):
        """Add ``day``, ``month`` and ``year`` columns split from ``release_date``.

        ``release_date`` is split on ``'-'`` into at most three parts. Rows with
        a missing date get missing values in all three new columns.
        """
        parts = self.df['release_date'].str.split('-', n=2, expand=True)
        # An empty frame (or one where no date has all three parts) yields fewer
        # than three columns, which made the original multi-column assignment
        # raise ValueError. Pad to exactly three positional columns instead.
        parts = parts.reindex(columns=range(3))
        for position, column in enumerate(('day', 'month', 'year')):
            self.df[column] = parts[position]

    def drop_columns(self):
        """Remove the columns that are no longer needed from ``self.df``.

        Raises:
            KeyError: If any of the listed columns is absent (for example when
                this method is called twice on the same ``self.df``).
        """
        columns_to_drop = ['unknown', 'itemID', 'video_release_date', 'release_date', 'userID', 'timestamp']
        self.df = self.df.drop(columns=columns_to_drop)

# Called without arguments, Movies() works on the notebook-level `data`, `item`
# and `genre` frames, so the loading cell must have been run first.
movie = Movies()

In [6]:
# Run the preparation steps in order; each one depends on the step before it.
movie.assign_column_names()  # name the raw columns so the merge keys exist
movie.merge_frames()         # creates movie.df (items inner-joined with ratings)
movie.split_columns()        # adds day / month / year taken from release_date
movie.drop_columns()         # removes the columns listed in Movies.drop_columns

In [7]:
class reviews:
    """Collect IMDb user reviews for every distinct movie title in a frame.

    Intended call order: ``make_tagword()`` -> ``initiate_driver()`` -> then
    for each entry of ``tag_words``: ``get_search_box()``, ``make_search(tag)``,
    ``click_reviews()``, ``get_reviews(tag)``.

    Args:
        df: Frame with a ``movie_title`` column.
    """

    # Locators kept exactly as the notebook defined them.
    # NOTE(review): all of them depend on IMDb's current markup; the absolute
    # XPath in particular breaks on any layout change - confirm before a long run.
    _SEARCH_BOX_ID = "suggestion-search"
    _RESULT_CLASS = 'ipc-metadata-list-summary-item__t'
    _REVIEWS_LINK_XPATH = '/html/body/div[2]/main/div/section[1]/div/section/div/div[1]/section[7]/div[1]/div/a'
    _REVIEW_TEXT_CSS = 'div.text.show-more__control.clickable'

    def __init__(self, df):
        self.movies = df
        self.url = "https://www.imdb.com/"
        self.tag_words = []   # one search string per distinct title
        self.reviews = {}     # tag -> list of review texts (at most 10)

    def make_tagword(self):
        """Build one search string per distinct ``movie_title``.

        Titles with a parenthesised part followed by a parenthesised four-digit
        year, e.g. ``"Shanghai Triad (Yao a yao ...) (1995)"``, are reduced to
        ``"Shanghai Triad (1995)"``. All other titles are used unchanged. The
        list is rebuilt from scratch, so calling this twice does not duplicate
        entries.
        """
        # group 1: text before the first '(' ; group 2: the year in the last '(....)'
        pattern = re.compile(r'^(.*?)\s*\(.*\((\d{4})\)\s*$')
        self.unique_movies = list(self.movies['movie_title'].unique())
        self.tag_words = []
        for title in self.unique_movies:
            match = pattern.search(title)
            if match and match.group(1):
                self.tag_words.append(f"{match.group(1)} ({match.group(2)})")
            else:
                self.tag_words.append(title)

    def initiate_driver(self, driver_path='chromedriver.exe'):
        """Start a headless Chrome session and open ``self.url``.

        Args:
            driver_path: Path to a chromedriver binary. It is used only if the
                file exists; otherwise Selenium is left to locate a driver
                itself. Passing a non-existent path to ``Service`` is what
                produced ``NoSuchDriverException`` in the recorded run.
        """
        options = Options()
        # ``options.headless = True`` is no longer honoured by current Selenium
        # releases; the command-line switch is the supported way.
        options.add_argument('--headless=new')
        if driver_path and os.path.isfile(driver_path):
            service = Service(driver_path)
        else:
            service = Service()
        self.driver = webdriver.Chrome(service=service, options=options)
        self.wait = WebDriverWait(self.driver, 10)
        try:
            self.driver.get(self.url)
        except Exception:
            # Do not leave an orphaned browser behind if the first page load fails.
            self.driver.quit()
            raise

    def _scroll_and_click(self, element):
        """Scroll ``element`` into view and click it via JavaScript."""
        self.driver.execute_script("arguments[0].scrollIntoView();", element)
        self.driver.execute_script("arguments[0].click();", element)

    def get_search_box(self):
        """Locate the search input on the current page and keep it in ``self.search_box``."""
        self.search_box = self.wait.until(
            EC.presence_of_element_located((By.ID, self._SEARCH_BOX_ID))
        )

    def make_search(self, tag):
        """Submit ``tag`` in the search box and open the first matching element.

        Args:
            tag: Search string, normally an entry of ``self.tag_words``.
        """
        print(tag)
        self.search_box.clear()  # avoid appending to text left from a failed attempt
        self.search_box.send_keys(tag)
        self.search_box.send_keys(Keys.RETURN)
        # Wait for the result page instead of querying the DOM immediately.
        link = self.wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, self._RESULT_CLASS))
        )
        self._scroll_and_click(link)

    def click_reviews(self):
        """Follow the user-reviews link of the page that is currently open."""
        user_review_links = self.wait.until(
            EC.presence_of_element_located((By.XPATH, self._REVIEWS_LINK_XPATH))
        )
        self._scroll_and_click(user_review_links)

    def get_reviews(self, tag):
        """Store the text of up to ten review elements under ``self.reviews[tag]``.

        The original loop assigned each text to the same key, so only the last
        review survived; all of them are now kept, in page order, as a list.

        Args:
            tag: Key under which the reviews are stored.
        """
        div_elements = self.wait.until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, self._REVIEW_TEXT_CSS))
        )
        self.reviews[tag] = [div.text for div in div_elements[:10]]

    def go_back(self):
        """Navigate one step back in the browser history."""
        self.driver.back()

    
        
# movie.df is created by movie.merge_frames(), so the preparation cell must
# have been run before this line.
rev = reviews(movie.df)

In [8]:

# Scrape reviews for every tag word. A failure for one title is reported and
# skipped so that the remaining titles are still processed.
rev.make_tagword()
rev.initiate_driver()
try:
    for tag in rev.tag_words:
        try:
            rev.get_search_box()
            rev.make_search(tag)
            rev.click_reviews()
            rev.get_reviews(tag)
        except Exception as exc:
            # `Exception` rather than a bare `except:` so that interrupting the
            # kernel (KeyboardInterrupt) still stops this long-running loop.
            # make_search() has already printed the tag this refers to.
            print("Exception", type(exc).__name__)
finally:
    # Always release the browser process, even when the loop is interrupted.
    rev.driver.quit()

NoSuchDriverException: Message: Unable to obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


In [None]:
# Number of tag words for which get_reviews() stored an entry.
len(rev.reviews)