## IMDB Dataset - Capstone Project

### Phase 1

-  Webscrape the provided URL  - IMDB dataset:

https://www.imdb.com/search/title/?genres=action&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=f11158cc-b50b-4c4d-b0a2-40b32863395b&pf_rd_r=XZ8X52H1R40B7KG5SNZ9&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_1

- Store the entire data in two different CSV files as per the given fields:

### The first CSV file data contains :

- Sno, Movie Name, Director Name, Duration, year, ratings, Metascore

- Bifurcate the Director field into subfields as per the number of directors of the movie belongs to such as Director1, director2


### The second CSV file contains the following:

- Movie Name, stars, votes, Genre, Gross collection, popularity, Certification

- Bifurcate the stars field into 4 subfields as per the number of stars worked in the movie such as star1, star2, star3, star4

- Bifurcate the genre into 3 subfields as per the number of genres the movie belongs to such as :

- Genre1, genre2, genre3

In [1]:
# Importing Libraries
import selenium
import pandas as pd
import numpy as np
import time
from bs4 import BeautifulSoup

# Importing selenium webdriver 
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Importing required Exceptions which needs to handled
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException,ElementNotInteractableException
from selenium.webdriver.common.by import By

#Importing requests
import requests

# importing regex
import re

### `Connecting to the Webdriver and Url`

In [2]:
driver_path = r'C:\Users\chizz\Downloads\chromedriver_win32.zip\chromedriver.exe'
service = Service(driver_path)
driver = webdriver.Chrome(service=service)
driver.get('https://www.imdb.com/search/title/?genres=action&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=f11158cc-b50b-4c4d-b0a2-40b32863395b&pf_rd_r=XZ8X52H1R40B7KG5SNZ9&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_1')
time.sleep(1)

#Maximising the window
driver.maximize_window()

### `Scraping the Entire Content`

In [3]:
Content = []
URls = []


while True:
    
    time.sleep(2)
    
    ## Getting entire content
    for i in driver.find_elements(By.XPATH,'//div[@class="lister-item-content"]'):
        Content.append(i.text)
    
    ## Getting URl
    url = driver.find_elements(By.XPATH,'//*[@id="main"]/div/div[3]/div/div/div[3]/h3/a')
    for i in url:
        URls.append(i.get_attribute('href'))

    try:
        next_button = driver.find_element(By.XPATH, '//a[@class="lister-page-next next-page"]')
        next_button.click()
        time.sleep(2)
    except NoSuchElementException:
        # Break the loop if the "Next" button is not found
        break

#### `Spiltting the derived content`

In [14]:
split_list = [i.split('\n') for i in Content]
for i in split_list:
    del i[-3]

#### `Storing the values accroding to instructions`

In [16]:
# Extract movie titles
Movie_Name = [item[0].split('. ')[1].split(' (')[0] for item in split_list]

# Extract release years
Year_Released = [item[0].split('(')[-1].split(')')[0] for item in split_list]

# Extract ratings
Ratings = [item[2].split(' ')[0] for item in split_list]

## For Metascores
Metascore = []

for sublist in split_list:
    if len(sublist) == 6:
        item = sublist[3]
        item = item.replace(' Metascore', '')  # Remove ' Metascore' from the item
        Metascore.append(item)
    elif len(sublist) == 5:
        Metascore.append('N/A')
        
        
        
## For Certificate and Duration

Certificate = []
Duration_in_min = []

for sublist in split_list:
    info = sublist[1].split(' | ')
    if len(info) == 2:
        Certificate.append('N/A')
        Duration_in_min.append(info[0])
    else:
        Certificate.append(info[0])
        Duration_in_min.append(info[1])
        
        
        
## For Votes

Votes = []

for sublist in split_list:
    last_element = sublist[-1]
    if isinstance(last_element, list):
        votes = 'N/A'
    else:
        votes = last_element.split(' | ')[0].split(': ')[1]
    Votes.append(votes)
    
    
    
    
## For Directors 

Director1 = []
Director2 = []
Director3 = []

for sublist in split_list:
    directors = sublist[-2].split(' | ')[0].split(': ')[1]
    director_list = directors.split(', ')
    
    if len(director_list) >= 1:
        Director1.append(director_list[0])
    else:
        Director1.append('N/A')
    
    if len(director_list) >= 2:
        Director2.append(director_list[1])
    else:
        Director2.append('N/A')
        
    if len(director_list) >= 3:
        Director3.append(director_list[2])
    else:
        Director3.append('N/A')
        
        
        
## Stars

Star1 = []
Star2 = []
Star3 = []
Star4 = []

for sublist in split_list:
    stars = sublist[-2].split(' | ')[1].split(': ')[1]
    star_list = stars.split(', ')
    
    if len(star_list) >= 1:
        Star1.append(star_list[0])
    else:
        Star1.append('N/A')
    
    if len(star_list) >= 2:
        Star2.append(star_list[1])
    else:
        Star2.append('N/A')
    
    if len(star_list) >= 3:
        Star3.append(star_list[2])
    else:
        Star3.append('N/A')
    
    if len(star_list) >= 4:
        Star4.append(star_list[3])
    else:
        Star4.append('N/A')

        
        
## For Genre

Genre1 = []
Genre2 = []
Genre3 = []

for sublist in split_list:
    genres = sublist[1].split(' | ')[-1].split(', ')
    num_genres = len(genres)
    
    if num_genres >= 1:
        Genre1.append(genres[0])
    else:
        Genre1.append('N/A')
    
    if num_genres >= 2:
        Genre2.append(genres[1])
    else:
        Genre2.append('N/A')
    
    if num_genres >= 3:
        Genre3.append(genres[2])
    else:
        Genre3.append('N/A')

### `Since Popularity and Gross Collection are only given when we open the URLS we extract them seperately`

In [33]:
## For Gross collection, popularity

Gross = []
Popularity = []

for i in URls:
    driver.get(i)
    time.sleep(1)    
        
        ## For Gross Collection
    try:

        gross = driver.find_element(By.XPATH,'//li[@data-testid="title-boxoffice-cumulativeworldwidegross"]')
        Gross.append(gross.text) if i else Gross.append('N/A')

    except NoSuchElementException:
        Gross.append('N/A')
        
        
        ## For popularity
    try:
        
        pop = driver.find_element(By.XPATH,'//div[@data-testid="hero-rating-bar__popularity__score"]')
        Popularity.append(pop.text) if i else Popularity.append('N/A')
        
    except NoSuchElementException:
        Popularity.append('N/A')

        

## `CSV File 1`

In [27]:
data1 = {
    'Movie_Name':Movie_Name,
    'Director1':Director1,
    'Director2':Director2,
    'Director3':Director3,
    'Duration':Duration_in_min,
    'year':Year_Released,    
    'Ratings':Ratings,
    'Metascore':Metascore
}

df1 = pd.DataFrame(data1)
df1['Sno'] = range(1, len(df1) + 1)

In [54]:
df1

Unnamed: 0,Movie_Name,Director1,Director2,Director3,Duration,year,Ratings,Metascore,Sno
0,Spider-Man: Across the Spider-Verse,Joaquim Dos Santos,Kemp Powers,Justin K. Thompson,140 min,2023,9.1,86,1
1,The Dark Knight,Christopher Nolan,,,152 min,2008,9.0,84,2
2,The Lord of the Rings: The Return of the King,Peter Jackson,,,201 min,2003,9.0,94,3
3,Inception,Christopher Nolan,,,148 min,2010,8.8,74,4
4,The Lord of the Rings: The Fellowship of the Ring,Peter Jackson,,,178 min,2001,8.8,92,5
...,...,...,...,...,...,...,...,...,...
1746,Radhe,Prabhu Deva,,,109 min,2021,1.9,,1747
1747,Race 3,Remo D'Souza,,,160 min,2018,1.9,,1748
1748,Angels Apocalypse,Sean Cain,Enzo Zelocchi,,85 min,2015,1.7,,1749
1749,The Cost of Deception,Keith English,,,125 min,2021,1.5,,1750


#### `Converting to CSV`

In [29]:
df1.to_csv('Movies_file_1.csv', index=True)

### `CSV File 2`

In [45]:
Gross_Collection_Dollars = []

for item in Gross:
    number = item.split('$')[-1].replace(',', '')
    Gross_Collection_Dollars.append(number)

In [51]:
data2 = {
    'Movie_Name':Movie_Name,
    'Star1':Star1,
    'Star2':Star2,
    'Star3':Star3,
    'Star4':Star4,
    'Votes':Votes,
    'Genre1': Genre1,
    'Genre2': Genre3,
    'Genre3': Genre3,
    'Gross_Collection':Gross_Collection_Dollars,
    'Popularity':Popularity,
    'Certification':Certificate
      
}

df2 = pd.DataFrame(data2)

In [52]:
df2

Unnamed: 0,Movie_Name,Star1,Star2,Star3,Star4,Votes,Genre1,Genre2,Genre3,Gross_Collection,Popularity,Certification
0,Spider-Man: Across the Spider-Verse,Shameik Moore,Hailee Steinfeld,Oscar Isaac,Jake Johnson,29474,Animation,Adventure,Adventure,69100000,23,PG
1,The Dark Knight,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2719284,Action,Drama,Drama,1006234167,117,PG-13
2,The Lord of the Rings: The Return of the King,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,1887739,Action,Drama,Drama,1147633833,407,PG-13
3,Inception,Leonardo DiCaprio,Joseph Gordon-Levitt,Elliot Page,Ken Watanabe,2413938,Action,Sci-Fi,Sci-Fi,870110523,103,PG-13
4,The Lord of the Rings: The Fellowship of the Ring,Elijah Wood,Ian McKellen,Orlando Bloom,Sean Bean,1916363,Action,Drama,Drama,898204420,184,PG-13
...,...,...,...,...,...,...,...,...,...,...,...,...
1746,Radhe,Salman Khan,Disha Patani,Randeep Hooda,Jackie Shroff,178810,Action,Thriller,Thriller,1900312,,TV-MA
1747,Race 3,Anil Kapoor,Salman Khan,Bobby Deol,Jacqueline Fernandez,47589,Action,Thriller,Thriller,29969693,,Not Rated
1748,Angels Apocalypse,Enzo Zelocchi,Jana Rochelle,Ryan C.F. Buckley,William Kirkham,42911,Action,Sci-Fi,Sci-Fi,,,
1749,The Cost of Deception,Vivianne Bánovits,András Mózes,Barna Bokor,Gabriella Gubás,39492,Action,Drama,Drama,600610,,


#### `Converting to CSV file` 

In [53]:
df2.to_csv('Movies_file_2.csv',index=True)
driver.close()