In [41]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [42]:

# Start Chrome with the notification content setting forced to 2
# (presumably Chrome's "block" value -- TODO confirm), then open the listing page.
LISTING_URL = 'https://www.rottentomatoes.com/browse/movies_at_home/sort:popular'

prefs = {}
prefs["profile.default_content_setting_values.notifications"] = 2

chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("prefs", prefs)

driver = webdriver.Chrome(options=chrome_options)
driver.get(LISTING_URL)


In [43]:
# Text-cleaning helper
def filter_array(array_elements):
    """Return the cleaned text of every element in ``array_elements``.

    Each item must expose a ``text`` attribute. Newline characters are removed
    and leading/trailing whitespace is stripped; the result is a new list in
    the same order as the input.
    """
    return [item.text.replace('\n', '').strip() for item in array_elements]
# Wait for the "Load more" button to become visible
LOAD_MORE_SELECTOR = "button[data-qa='dlp-load-more-button']"
# Prefix match on data-qa: counts every listing element, which includes the
# "discovery-media-list-item-caption" links collected later in the notebook.
LISTING_ITEM_SELECTOR = "[data-qa^='discovery-media-list-item']"
# Upper bound on how many extra batches are requested.
MAX_LOAD_MORE_CLICKS = 30

wait = WebDriverWait(driver, 10)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
load_more_button = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, LOAD_MORE_SELECTOR)))

# Click "Load more" at most MAX_LOAD_MORE_CLICKS times
click_count = 0
while click_count < MAX_LOAD_MORE_CLICKS:
    items_before = len(driver.find_elements(By.CSS_SELECTOR, LISTING_ITEM_SELECTOR))
    try:
        ActionChains(driver).move_to_element(load_more_button).perform()
        # JavaScript click, as in the original, so an overlay cannot intercept it.
        driver.execute_script("arguments[0].click();", load_more_button)
        click_count += 1
        # The button is still visible right after the click, so waiting on the
        # button alone returns immediately; wait for the listing to actually grow.
        wait.until(
            lambda d: len(d.find_elements(By.CSS_SELECTOR, LISTING_ITEM_SELECTOR)) > items_before
        )
        # Re-locate the button: the page may have re-rendered it.
        load_more_button = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, LOAD_MORE_SELECTOR)))
    except (TimeoutException, StaleElementReferenceException):
        # No new items appeared or the button is gone: treat as end of the listing.
        break

In [44]:
# Helper that clicks the "Show More" links
def click_show_more_button(driver):
    """Click every cast/crew "Show More" link on the current page via JavaScript.

    Pauses two seconds after each click. A ``NoSuchElementException`` raised
    while doing so is ignored.
    """
    selector = 'rt-link[data-modulecastcrewmanager="showMoreBtn:click"][slot="ctaOpen"]'
    try:
        for show_more_link in driver.find_elements(By.CSS_SELECTOR, selector):
            driver.execute_script("arguments[0].click();", show_more_link)
            # Fixed pause; a dynamic wait would be preferable.
            time.sleep(2)
    except NoSuchElementException:
        return

# Helpers and main routine that extract data from a movie page

# XPath of the value <rt-text> that follows the <dt class="key"> carrying a given label.
_INFO_VALUE_XPATH = '//dt[@class="key"]/rt-text[text()="{label}"]/ancestor::dt/following-sibling::dd/rt-text'


def _info_text_by_xpath(driver, label):
    """Return the text of the first value element for ``label``, or None if it is absent."""
    try:
        return driver.find_element(By.XPATH, _INFO_VALUE_XPATH.format(label=label)).text
    except NoSuchElementException:
        return None


def _index_info_labels(driver):
    """Map each ``dt.key rt-text`` label text to the first element showing it."""
    label_elements = {}
    for label_element in driver.find_elements(By.CSS_SELECTOR, 'dt.key rt-text'):
        # setdefault keeps the first occurrence, like a first-match scan.
        label_elements.setdefault(label_element.text, label_element)
    return label_elements


def _info_value_elements(label_elements, label, tag_name):
    """Return the ``tag_name`` elements inside the <dd> that follows ``label``.

    Returns None when the label is missing or no <dd> follows it; otherwise a
    (possibly empty) list of elements.
    """
    label_element = label_elements.get(label)
    if label_element is None:
        return None
    try:
        dd_element = label_element.find_element(By.XPATH, './../following-sibling::dd')
    except NoSuchElementException:
        return None
    return dd_element.find_elements(By.TAG_NAME, tag_name)


def _first_text(elements):
    """Return the text of the first element, or None for None / an empty list."""
    return elements[0].text if elements else None


def extract_data_from_movie_page(driver, movie_link):
    """Open ``movie_link``, scrape the movie's details and record them.

    Each scraped value is printed and then appended to the module-level result
    lists (``movies_name``, ``poster_links``, ``release_date``, ...), which
    must already exist when this function is called. ``country``, ``budget``
    and ``taglines`` always receive None. A missing poster or critics score is
    stored as "N/A"; the other missing fields are stored as None.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        movie_link: URL of the movie's detail page.
    """
    print("Movie Link:", movie_link)
    driver.get(movie_link)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        # Wait for actor names to appear
        WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[slot="insetText"] p.name')))
    except TimeoutException:
        # No visible cast within the timeout: keep scraping the remaining
        # fields instead of aborting the whole crawl on one page.
        pass

    movie_title = driver.find_element(By.CSS_SELECTOR, 'h1').text
    print('Movie:', movie_title)

    # Actors
    actors = driver.find_elements(By.CSS_SELECTOR, 'div[slot="insetText"] p.name')
    actor_names_str = ', '.join(actor.text for actor in actors)
    print("Actors:", actor_names_str)

    # Descriptions
    description_elements = driver.find_elements(By.CSS_SELECTOR, 'div.synopsis-wrap rt-text:not(.key)')
    descriptions_text = ', '.join(description.text for description in description_elements)
    print("Descriptions:", descriptions_text)

    # Poster link
    try:
        poster_link = driver.find_element(By.CSS_SELECTOR, 'img[slot="poster"]').get_attribute("src")
    except Exception:
        poster_link = "N/A"
    print('Poster link:',poster_link)

    # Duration
    duration_text = _info_text_by_xpath(driver, "Runtime")
    print("Duration:", duration_text)

    # Critics rating (read from the element's shadow root)
    try:
        critics_element = driver.find_element(By.CSS_SELECTOR, "rt-text[slot='criticsScore']")
        shadow_root = driver.execute_script("return arguments[0].shadowRoot", critics_element)
        rating = shadow_root.find_element(By.CSS_SELECTOR, "span").text
    except Exception:
        # Best effort: any failure here (including a missing shadow root) means "no score".
        rating = "N/A"
    print("Rating:", rating)

    # Release date
    release_date_text = _info_text_by_xpath(driver, "Release Date (Streaming)")
    print("Release Date:", release_date_text)

    # Read the info-list labels once; the lookups below all reuse this index.
    label_elements = _index_info_labels(driver)

    # Box office
    boxoffice_text = _first_text(_info_value_elements(label_elements, 'Box Office (Gross USA)', 'rt-text'))
    print("Box Office:", boxoffice_text)

    # Certificate
    certificate_text = _first_text(_info_value_elements(label_elements, 'Rating', 'rt-text'))
    print("Certificate (MPAA):", certificate_text)

    # Original language
    original_language_text = _first_text(_info_value_elements(label_elements, 'Original Language', 'rt-text'))
    print("Original Language:", original_language_text)

    # Production companies: texts are concatenated without a separator
    # and printed only when the label exists.
    companies_text = None
    companies_elements = _info_value_elements(label_elements, 'Production Co', 'rt-text')
    if companies_elements is not None:
        companies_text = ''.join(element.text for element in companies_elements)
        print("Companies:", companies_text)

    # Movie category (genres)
    movie_category_elements = None
    try:
        dt_element = driver.find_element(By.XPATH, '//dt[@class="key"]/rt-text[text()="Genre"]')
        dd_element = dt_element.find_element(By.XPATH, './../following-sibling::dd')
        movie_category_elements = dd_element.find_elements(By.TAG_NAME, 'rt-link')
    except NoSuchElementException:
        pass
    if movie_category_elements:
        # dict.fromkeys removes duplicates while keeping page order (a set would shuffle it).
        movie_category_text = ', '.join(dict.fromkeys(element.text for element in movie_category_elements))
    else:
        movie_category_text = None
    print("Movie Category:", movie_category_text)

    # Directors
    director_elements = _info_value_elements(label_elements, 'Director', 'rt-link')
    director_text = ', '.join(element.text for element in director_elements) if director_elements else None
    print("Directors:", director_text)

    # Writing credits (empty link texts are skipped)
    writing_credits_text = None
    writing_credits_elements = _info_value_elements(label_elements, 'Screenwriter', 'rt-link')
    if writing_credits_elements is not None:
        writing_credits_text = ', '.join(element.text for element in writing_credits_elements if element.text)
    print("Writing Credits:", writing_credits_text)

    # Record the scraped values; every list receives exactly one entry per call.
    movies_name.append(movie_title)
    poster_links.append(poster_link)
    release_date.append(release_date_text)
    film_actors.append(actor_names_str)
    duration.append(duration_text)
    country.append(None)
    companies.append(companies_text)
    movie_category.append(movie_category_text)
    ratings.append(rating)
    directors.append(director_text)
    certificate.append(certificate_text)
    budget.append(None)
    boxoffice.append(boxoffice_text)
    original_language.append(original_language_text)
    taglines.append(None)
    writing_credits.append(writing_credits_text)
    descriptions.append(descriptions_text)

# Heading of the listing page (this is the page title, not a per-movie value)
movie_elements = driver.find_element(By.CSS_SELECTOR, 'h1').text
movies = [movie_elements]

# Detail-page URL of every movie caption link currently on the listing
movie_link_elements = driver.find_elements(By.CSS_SELECTOR,'a[data-qa="discovery-media-list-item-caption"]')
movie_links = list(map(lambda link_element: link_element.get_attribute('href'), movie_link_elements))

# One empty result list per attribute (17 independent lists)
(movies_name, poster_links, release_date, film_actors, duration, country,
 companies, movie_category, ratings, directors, certificate, budget,
 boxoffice, original_language, taglines, writing_credits,
 descriptions) = ([] for _ in range(17))

# Visit the movie pages in listing order, stopping after MAX_MOVIES of them
MAX_MOVIES = 1000
count = 0
for movie_link in movie_links[:MAX_MOVIES]:
    extract_data_from_movie_page(driver, movie_link)
    count += 1
# Text-cleaning helper
# NOTE(review): this redefines the filter_array already defined in an earlier cell
# with the same behaviour; the later definition is the one that stays bound.
def filter_array(array_elements):
    """Return each element's ``text`` with newlines removed and outer whitespace stripped."""
    def _clean(item):
        # Splitting on '\n' and re-joining drops every newline character.
        return ''.join(item.text.split('\n')).strip()

    return list(map(_clean, array_elements))

# Helper for missing values
def fill_missing_values(array, length):
    """Pad ``array`` in place with None until it holds at least ``length`` items.

    Lists that are already long enough are left untouched. The same list
    object is returned.
    """
    shortfall = length - len(array)
    if shortfall > 0:
        array.extend([None] * shortfall)
    return array

# Placeholder only; the real frame is built from the lists in a later cell.
data = pd.DataFrame()

# Pad every scraped column with None up to one entry per collected link.
# The target used to be len(movies), but `movies` only ever holds the listing-page
# heading (length 1), so nothing was ever padded. The later DataFrame needs every
# column to be as long as `movie_links`.
expected_rows = len(movie_links)
for column in (movies_name, poster_links, ratings, movie_category, film_actors,
               release_date, duration, original_language, companies, descriptions,
               boxoffice, certificate, directors, writing_credits):
    # Slice assignment updates the existing list object whether the helper
    # pads in place or returns a new list.
    column[:] = fill_missing_values(column, expected_rows)


Movie Link: https://www.rottentomatoes.com/m/juror_2
Movie: Juror #2
Actors: Clint Eastwood, Nicholas Hoult, Toni Collette, J.K. Simmons, Chris Messina, Gabriel Basso
Descriptions: "Juror #2" follows family man Justin Kemp (Nicholas Hoult) who, while serving as a juror in a high profile murder trial, finds himself struggling with a serious moral dilemma... one he could use to sway the jury verdict and potentially convict--or free--the accused killer.
Poster link: https://resizing.flixster.com/Uweg-aB2RW7m1tiYinK68ZtRqeg=/68x102/v2/https://resizing.flixster.com/65hB_oIyMeqRvppAAA5EH09Quz0=/ems.cHJkLWVtcy1hc3NldHMvbW92aWVzL2IyMDBjMDk1LTgxN2ItNDUwZi05ODVlLTM0M2U5YmI0OTk1ZS5qcGc=
Duration: 1h 54m
Rating: 93%
Release Date: Dec 3, 2024
Box Office: None
Certificate (MPAA): PG-13 (Some Violent Images|Strong Language)
Original Language: English
Companies: Malpaso Productions,Dichotomy Films
Movie Category: Drama, Crime, Mystery & Thriller
Directors: Clint Eastwood
Writing Credits: Jonathan Abra

In [45]:
len(movie_links)

222

In [46]:
movie_links

['https://www.rottentomatoes.com/m/juror_2',
 'https://www.rottentomatoes.com/m/conclave',
 'https://www.rottentomatoes.com/m/the_substance',
 'https://www.rottentomatoes.com/m/dear_santa_2024',
 'https://www.rottentomatoes.com/m/nutcrackers',
 'https://www.rottentomatoes.com/m/our_little_secret',
 'https://www.rottentomatoes.com/m/that_christmas',
 'https://www.rottentomatoes.com/m/smile_2',
 'https://www.rottentomatoes.com/m/heretic_2024',
 'https://www.rottentomatoes.com/m/hot_frosty',
 'https://www.rottentomatoes.com/m/the_wild_robot',
 'https://www.rottentomatoes.com/m/the_best_christmas_pageant_ever_2024',
 'https://www.rottentomatoes.com/m/fly_me_to_the_moon_2024',
 'https://www.rottentomatoes.com/m/blitz',
 'https://www.rottentomatoes.com/m/the_piano_lesson',
 'https://www.rottentomatoes.com/m/beetlejuice_beetlejuice',
 'https://www.rottentomatoes.com/m/christmas_eve_in_millers_point',
 'https://www.rottentomatoes.com/m/gladiator',
 'https://www.rottentomatoes.com/m/elevation',

In [47]:
len(poster_links)

222

In [48]:
poster_links

['https://resizing.flixster.com/Uweg-aB2RW7m1tiYinK68ZtRqeg=/68x102/v2/https://resizing.flixster.com/65hB_oIyMeqRvppAAA5EH09Quz0=/ems.cHJkLWVtcy1hc3NldHMvbW92aWVzL2IyMDBjMDk1LTgxN2ItNDUwZi05ODVlLTM0M2U5YmI0OTk1ZS5qcGc=',
 'https://resizing.flixster.com/qjfmXMJ6hiJ7x1WBFSJr-8_MIZQ=/68x102/v2/https://resizing.flixster.com/EuHvfeWcRbhSOlcUxLa4n_yhNtQ=/ems.cHJkLWVtcy1hc3NldHMvbW92aWVzLzY4ZWI3MGFjLWNlZTEtNGYyOC04NGY1LWFlOGQzMDZmZThmNS5qcGc=',
 'https://resizing.flixster.com/1UnYNJn2dZk4aAiRFIcOl9RcdYU=/68x102/v2/https://resizing.flixster.com/jR2K79Abw-XbUPWBfdjzMStAHSA=/ems.cHJkLWVtcy1hc3NldHMvbW92aWVzLzFjNzgyYzkwLWZkNGItNDEwZi04MTViLTRmOGRlN2M4NTg2OC5qcGc=',
 'https://resizing.flixster.com/uJQmTX-G8kmT9ro3GKFOWINUifs=/68x102/v2/https://resizing.flixster.com/9UpMcPglTmQZMezfPXOWN9QDZ7k=/ems.cHJkLWVtcy1hc3NldHMvbW92aWVzLzU3ZDRmN2JjLTZkYjQtNGMxNC04OTE3LWU5NTIwZmI4ZjE3OC5qcGc=',
 'https://resizing.flixster.com/pL0CpZc9elVbI3cwPOLd_iB3wEk=/68x102/v2/https://resizing.flixster.com/j5gRsC_C3SJJ-We

In [49]:
len(movies)

1

In [50]:
movies

['Best Movies to Stream at Home (2024)']

In [51]:
len(movies_name)

222

In [52]:
movies_name

['Juror #2',
 'Conclave',
 'The Substance',
 'Dear Santa',
 'Nutcrackers',
 'Our Little Secret',
 'That Christmas',
 'Smile 2',
 'Heretic',
 'Hot Frosty',
 'The Wild Robot',
 'The Best Christmas Pageant Ever',
 'Fly Me to the Moon',
 'Blitz',
 'The Piano Lesson',
 'Beetlejuice Beetlejuice',
 "Christmas Eve in Miller's Point",
 'Gladiator',
 'Elevation',
 'Alien: Romulus',
 'Speak No Evil',
 'Moana',
 'We Live in Time',
 'Maria',
 'The Holdovers',
 'My Old Ass',
 'Deadpool & Wolverine',
 'AfrAId',
 'Joker: Folie à Deux',
 'Venom: The Last Dance',
 'Terrifier 3',
 'Emilia Pérez',
 'Woman of the Hour',
 'Here',
 'Interstellar',
 'Blink Twice',
 'Midway',
 'Hit Man',
 'Twisters',
 'Megalopolis',
 'Rebel Ridge',
 'Spellbound',
 'Transformers One',
 'The Fall Guy',
 'Trap',
 'Civil War',
 'It Ends With Us',
 'Sweethearts',
 'Weekend in Taipei',
 'Lake George',
 'Klaus',
 "It's What's Inside",
 'The Merry Gentlemen',
 'The Dark Tower',
 'Exhibiting Forgiveness',
 'A Different Man',
 'Spirited

In [53]:
len(ratings)

222

In [54]:
ratings

['93%',
 '93%',
 '90%',
 '22%',
 '43%',
 '40%',
 '58%',
 '86%',
 '91%',
 '81%',
 '97%',
 '91%',
 '65%',
 '80%',
 '88%',
 '76%',
 '78%',
 '80%',
 '55%',
 '79%',
 '83%',
 '95%',
 '78%',
 '75%',
 '97%',
 '90%',
 '78%',
 '22%',
 '32%',
 '41%',
 '77%',
 '76%',
 '91%',
 '35%',
 '73%',
 '75%',
 '42%',
 '95%',
 '75%',
 '45%',
 '96%',
 '45%',
 '89%',
 '81%',
 '57%',
 '81%',
 '56%',
 '58%',
 '52%',
 '95%',
 '95%',
 '79%',
 '44%',
 '16%',
 '95%',
 '92%',
 '70%',
 '86%',
 '100%',
 '',
 '95%',
 '72%',
 '79%',
 '93%',
 '67%',
 '43%',
 '',
 '99%',
 '83%',
 '77%',
 '66%',
 '96%',
 '92%',
 '93%',
 '76%',
 '69%',
 '64%',
 '94%',
 '84%',
 '97%',
 '0%',
 '56%',
 '89%',
 '98%',
 '79%',
 '74%',
 '86%',
 '52%',
 '88%',
 '10%',
 '53%',
 '74%',
 '89%',
 '85%',
 '90%',
 '89%',
 '85%',
 '71%',
 '66%',
 '94%',
 '83%',
 '90%',
 '95%',
 '25%',
 '54%',
 '91%',
 '98%',
 '87%',
 '46%',
 '82%',
 '95%',
 '99%',
 '67%',
 '71%',
 '76%',
 '92%',
 '18%',
 '68%',
 '96%',
 '82%',
 '83%',
 '93%',
 '95%',
 '97%',
 '90%',
 '25%'

In [55]:
len(movie_category)

222

In [56]:
movie_category

['Drama, Crime, Mystery & Thriller',
 'Drama, Mystery & Thriller',
 'Drama, Horror',
 'Comedy, Holiday, Fantasy',
 'Drama, Comedy, Holiday',
 'Comedy, Holiday, Romance',
 'Animation, Kids & Family, Holiday, Adventure, Comedy',
 'Horror, Mystery & Thriller',
 'Horror, Mystery & Thriller',
 'Comedy, Holiday, Fantasy, Romance',
 'Kids & Family, Animation, Adventure',
 'Kids & Family, Comedy, Holiday',
 'Drama, Comedy, History, Romance',
 'Drama, History, War',
 'Drama, Music',
 'Comedy, Fantasy',
 'Drama, Comedy, Holiday',
 'Drama, Adventure, History, Action',
 'Sci-Fi, Drama, Mystery & Thriller, Action',
 'Sci-Fi, Horror',
 'Drama, Horror, Mystery & Thriller',
 'Animation, Kids & Family, Adventure, Musical, Comedy',
 'Drama, Romance',
 'Music, Drama, Biography',
 'Drama, Comedy, Holiday',
 'Drama, Comedy, Fantasy',
 'Comedy, Adventure, Action',
 'Horror, Mystery & Thriller',
 'Drama, Crime, Musical',
 'Sci-Fi, Adventure, Action',
 'Horror, Holiday, Mystery & Thriller',
 'Drama, Crime, Mu

In [57]:
len(film_actors)

222

In [58]:
film_actors

['Clint Eastwood, Nicholas Hoult, Toni Collette, J.K. Simmons, Chris Messina, Gabriel Basso',
 "Edward Berger, Ralph Fiennes, Stanley Tucci, John Lithgow, Lucian Msamati, Brian F. O'Byrne",
 'Coralie Fargeat, Demi Moore, Margaret Qualley, Dennis Quaid, Hugo Diego Garcia, Gore Abrams',
 'Robert Farrelly, Jack Black, Keegan-Michael Key, Brianne Howey, Post Malone, Hayes MacArthur',
 'David Gordon Green, Ben Stiller, Linda Cardellini, Edi Patterson, Tim Heidecker, Toby Huss',
 'Stephen Herek, Lindsay Lohan, Ian Harding, Kristin Chenoweth, Jonathan Rudnitsky, Katie Baker',
 'Simon Otto, Brian Cox, Fiona Shaw, Jodie Whittaker, Bill Nighy, Guz Khan',
 'Parker Finn, Naomi Scott, Rosemarie DeWitt, Kyle Gallner, Lukas Gage, Miles Gutierrez-Riley',
 'Scott Beck, Bryan Woods, Hugh Grant, Sophie Thatcher, Chloe East, Topher Grace',
 'Jerry Ciccoritti, Lacey Chabert, Dustin Milligan, Katy Mixon, Lauren Holly, Chrishell Stause',
 "Christopher Sanders, Lupita Nyong'O, Pedro Pascal, Catherine O'Hara, 

In [59]:
len(release_date)

222

In [60]:
release_date

['Dec 3, 2024',
 'Nov 26, 2024',
 'Oct 31, 2024',
 'Nov 25, 2024',
 'Nov 29, 2024',
 'Nov 27, 2024',
 'Dec 4, 2024',
 'Nov 19, 2024',
 'Dec 10, 2024',
 'Nov 13, 2024',
 'Oct 15, 2024',
 'Dec 10, 2024',
 'Aug 13, 2024',
 'Nov 22, 2024',
 'Nov 22, 2024',
 'Oct 8, 2024',
 'Dec 3, 2024',
 'Jun 15, 2011',
 'Nov 26, 2024',
 'Oct 15, 2024',
 'Oct 1, 2024',
 'Feb 21, 2017',
 'Nov 22, 2024',
 'Dec 11, 2024',
 'Nov 28, 2023',
 'Nov 7, 2024',
 'Oct 1, 2024',
 'Sep 17, 2024',
 'Oct 29, 2024',
 'Dec 10, 2024',
 'Nov 26, 2024',
 'Nov 13, 2024',
 'Oct 18, 2024',
 'Nov 26, 2024',
 'May 24, 2016',
 'Sep 17, 2024',
 'Nov 8, 2019',
 'Jun 7, 2024',
 'Aug 13, 2024',
 'Nov 12, 2024',
 'Sep 6, 2024',
 'Nov 22, 2024',
 'Oct 22, 2024',
 'May 21, 2024',
 'Aug 30, 2024',
 'May 24, 2024',
 'Sep 24, 2024',
 'Nov 28, 2024',
 'Dec 3, 2024',
 'Dec 6, 2024',
 'Nov 15, 2019',
 'Oct 4, 2024',
 'Nov 20, 2024',
 'Oct 17, 2017',
 'Dec 3, 2024',
 'Nov 5, 2024',
 'Nov 18, 2022',
 'Aug 23, 2024',
 'Nov 15, 2024',
 'Dec 13, 20

In [61]:
len(duration)

222

In [62]:
duration

['1h 54m',
 '2h 0m',
 '2h 21m',
 '1h 48m',
 '1h 44m',
 '1h 40m',
 '1h 31m',
 '2h 7m',
 '1h 51m',
 '1h 32m',
 '1h 42m',
 '1h 39m',
 '2h 12m',
 '2h 0m',
 '2h 5m',
 '1h 45m',
 '1h 47m',
 '2h 34m',
 '1h 30m',
 '1h 59m',
 '1h 50m',
 '1h 47m',
 '1h 48m',
 '2h 3m',
 '2h 13m',
 '1h 29m',
 '2h 8m',
 '1h 24m',
 '2h 18m',
 '1h 49m',
 '2h 8m',
 '2h 12m',
 '1h 29m',
 '1h 44m',
 '2h 45m',
 '1h 42m',
 '2h 18m',
 '1h 53m',
 '2h 2m',
 '2h 18m',
 '2h 11m',
 '1h 50m',
 '1h 44m',
 '2h 6m',
 '1h 45m',
 '1h 49m',
 '2h 10m',
 '1h 38m',
 '1h 41m',
 '1h 47m',
 '1h 38m',
 '1h 44m',
 '1h 26m',
 '1h 35m',
 '1h 57m',
 '1h 52m',
 '2h 7m',
 '1h 41m',
 '21m',
 None,
 '1h 36m',
 '1h 32m',
 '1h 56m',
 '1h 44m',
 '1h 48m',
 '1h 51m',
 '1h 37m',
 '1h 54m',
 '2h 2m',
 '1h 54m',
 '1h 42m',
 '1h 38m',
 '2h 46m',
 '2h 49m',
 '1h 45m',
 '1h 45m',
 '2h 15m',
 '1h 58m',
 '1h 40m',
 '1h 33m',
 '1h 29m',
 '1h 40m',
 '1h 35m',
 '1h 37m',
 '1h 49m',
 '2h 20m',
 '1h 37m',
 '1h 44m',
 '2h 11m',
 '1h 42m',
 '1h 52m',
 '1h 52m',
 '1h 5

In [63]:
len(original_language)

222

In [64]:
original_language

['English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'British English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'British English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'British English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'English',
 'Spanish',
 'English',
 'English',
 'British English',
 'English',
 'English',
 'English',
 'Englis

In [65]:
len(companies)

222

In [66]:
companies

['Malpaso Productions,Dichotomy Films',
 'FilmNation Entertainment,House Productions',
 'Working Title Films',
 'Conundrum Entertainment',
 'Rivulet Films,Rough House Pictures',
 'Capital Arts Entertainment,Good Entertainment Inc.,Netflix',
 'Locksmith Animation,Double Negative (DNEG)',
 'Paramount Players,Temple Hill Entertainment,Bad Feeling',
 'Catchlight Studios,Shiny Penny,Beck Woods',
 'Muse Entertainment Enterprises,Muse Entertainment LLC',
 'DreamWorks Animation',
 'Kingdom Story Company',
 'Apple Original Films',
 'New Regency Productions,Working Title Films,Lammas Park,Apple Studios',
 'Lord Miller,Netflix Studios',
 'Tommy Harper,Plan B Entertainment,Marc Toberoff,Tim Burton Productions',
 'Omnes Films',
 'Universal Pictures,Scott Free Productions,DreamWorks Pictures',
 'Lyrical Media',
 'Brandywine Productions,Scott Free Productions',
 'Blumhouse Productions,Universal Pictures',
 'Walt Disney Animation Studios',
 'SunnyMarch,Shoebox Films',
 'Komplizen Film,The Apartment,Fa

In [67]:
len(descriptions)

222

In [68]:
descriptions

['"Juror #2" follows family man Justin Kemp (Nicholas Hoult) who, while serving as a juror in a high profile murder trial, finds himself struggling with a serious moral dilemma... one he could use to sway the jury verdict and potentially convict--or free--the accused killer.',
 'CONCLAVE follows one of the world’s most secretive and ancient events -- selecting the new Pope. Cardinal Lawrence (Ralph Fiennes) is tasked with running this covert process after the unexpected death of the beloved Pope. Once the Catholic Church’s most powerful leaders have gathered from around the world and are locked together in the Vatican halls, Lawrence uncovers a trail of deep secrets left in the dead Pope’s wake, secrets which could shake the foundations of the Church.',
 "Have you ever dreamt of a better version of yourself? You, only better in every way. You should try this new product, it's called The Substance. IT CHANGED MY LIFE. With The Substance, you can generate another you: younger, more beaut

In [69]:
len(boxoffice)

222

In [70]:
boxoffice

[None,
 '$15.0M',
 '$15.5M',
 None,
 None,
 None,
 None,
 '$52.6M',
 None,
 None,
 None,
 None,
 '$20.3M',
 None,
 None,
 '$292.1M',
 None,
 '$187.7M',
 None,
 '$105.3M',
 '$36.9M',
 '$248.8M',
 None,
 None,
 '$20.3M',
 '$5.2M',
 '$636.7M',
 '$6.2M',
 '$58.2M',
 '$90.0M',
 '$50.5M',
 None,
 None,
 '$4.8M',
 '$188.0M',
 '$22.8M',
 '$56.8M',
 None,
 '$267.7M',
 '$7.4M',
 None,
 None,
 '$58.6M',
 '$92.8M',
 '$42.7M',
 '$68.6M',
 '$148.3M',
 None,
 None,
 None,
 None,
 None,
 None,
 '$50.7M',
 '$494.4K',
 '$49.0K',
 None,
 '$74.2M',
 None,
 None,
 None,
 None,
 '$105.9M',
 '$1.6M',
 None,
 None,
 None,
 None,
 None,
 None,
 '$281.5M',
 '$1.2M',
 '$282.0M',
 '$44.1K',
 None,
 None,
 '$60.1M',
 '$12.7M',
 '$4.5M',
 '$10.0M',
 None,
 '$125.9M',
 '$28.7M',
 '$9.0M',
 '$9.3M',
 None,
 '$177.1M',
 '$60.1M',
 '$50.1M',
 '$15.5M',
 None,
 '$50.1M',
 '$25.0M',
 None,
 '$34.2M',
 '$20.3K',
 '$369.3M',
 '$70.0M',
 '$42.7M',
 '$82.3M',
 '$25.9M',
 '$67.4M',
 None,
 '$166.1M',
 '$38.6M',
 '$653.0M',
 '

In [71]:
len(certificate)

222

In [72]:
certificate

['PG-13 (Some Violent Images|Strong Language)',
 'PG (Thematic Material and Smoking)',
 'R (Graphic Nudity|Gore|Language|Strong Violent Content)',
 'PG-13 (Some Language|Suggestive Material)',
 None,
 None,
 'PG (Rude Humor|Thematic Elements|Some Language)',
 'R (Grisly Images|Drug Use|Language Throughout|Strong Violent Content)',
 'R (Some Bloody Violence)',
 None,
 'PG (Thematic Elements|Action/Peril)',
 'PG (Brief Underage Smoking|Thematic Material)',
 'PG-13 (Some Strong Language|Smoking)',
 'PG-13',
 'PG-13',
 'PG-13 (Macabre and Bloody Images|Brief Drug Use|Some Suggestive Material|Strong Language|Violent Content)',
 'PG-13 (Strong Language|Some Suggestive Material|Smoking|Teen Drinking)',
 'R (Intense Graphic Combat)',
 'R (Language)',
 'R (Language|Bloody Violent Content)',
 'R',
 'PG (Peril|Brief Thematic Elements|Some Scary Images)',
 'R',
 'R (A Sexual Reference|Some Language)',
 'R (Language|Brief Sexual Material|Some Drug Use)',
 'R (Sexual Material|Language Throughout|Dru

In [73]:
len(directors)

222

In [74]:
directors

['Clint Eastwood',
 'Edward Berger',
 'Coralie Fargeat',
 'Robert Farrelly',
 'David Gordon Green',
 'Stephen Herek',
 'Simon Otto',
 'Parker Finn',
 'Scott Beck, Bryan Woods',
 'Jerry Ciccoritti',
 'Christopher Sanders',
 'Dallas Jenkins',
 'Greg Berlanti',
 'Steve McQueen',
 'Malcolm Washington',
 'Tim Burton',
 'Tyler Taormina',
 'Ridley Scott',
 'George Nolfi',
 'Fede Alvarez',
 'James Watkins',
 'John Musker, Ron Clements',
 'John Crowley',
 'Pablo Larraín',
 'Alexander Payne',
 'Megan Park',
 'Shawn Levy',
 'Chris Weitz',
 'Todd Phillips',
 'Kelly Marcel',
 'Damien Leone',
 'Jacques Audiard',
 'Anna Kendrick',
 'Robert Zemeckis',
 'Christopher Nolan',
 'Zoë Kravitz',
 'Roland Emmerich',
 'Richard Linklater',
 'Lee Isaac Chung',
 'Francis Ford Coppola',
 'Jeremy Saulnier',
 'Vicky Jenson',
 'Josh Cooley',
 'David Leitch',
 'M. Night Shyamalan',
 'Alex Garland',
 'Justin Baldoni',
 'Jordan Weiss',
 'George Huang',
 'Jeffrey Reiner',
 'Sergio Pablos',
 'Greg Jardin',
 'Peter Sulliva

In [75]:
len(writing_credits)

222

In [76]:
writing_credits

['Jonathan Abrams',
 'Peter Straughan',
 'Coralie Fargeat',
 'Ricky Blitt, Peter Farrelly',
 'Leland Douglas',
 'Hailey DeDominicis',
 'Richard Curtis, Peter Souter',
 'Parker Finn',
 'Scott Beck, Bryan Woods',
 'Russell Hainline',
 'Christopher Sanders',
 'Ryan Swanson, Platte F. Clark, Darin McDaniel',
 'Rose Gilroy',
 'Steve McQueen',
 'Malcolm Washington, Virgil Williams',
 'Alfred Gough, Miles Millar',
 'Eric Berger, Tyler Taormina',
 'David Franzoni, John Logan, William Nicholson, David Franzoni',
 'John Glenn, Jacob Roman, Kenny Ryan',
 'Fede Alvarez, Rodo Sayagues Mendez',
 'James Watkins',
 'Jared Bush',
 'Nick Payne',
 'Steven Knight',
 'David Hemingson',
 'Megan Park',
 'Ryan Reynolds, Rhett Reese, Paul Wernick, Zeb Wells, Shawn Levy',
 'Chris Weitz',
 'Scott Silver, Todd Phillips',
 'Kelly Marcel',
 'Damien Leone',
 'Jacques Audiard, Thomas Bidegain',
 'Ian MacAllister McDonald',
 'Eric Roth, Robert Zemeckis',
 'Jonathan Nolan, Christopher Nolan',
 'Zoë Kravitz, E.T. Feigen

***$\Rightarrow$ Khi crawl dữ liệu, có những trường hợp các thuộc tính bị chênh lệch dữ liệu: ngoại trừ `movie_link` (luôn đầy đủ), dòng cuối của hầu hết các thuộc tính sẽ là `None` (riêng thuộc tính `movies` thì `None` nằm ở đầu). Vì vậy ta sẽ bỏ dòng cuối đi.***

In [77]:
# Assemble the scraped lists into one table, keyed by output column name
column_data = [
    ('Link', movie_links),
    ('Poster Links', poster_links),
    ('Movie_Name', movies_name),
    ('Ratings', ratings),
    ('Movie_category', movie_category),
    ('Film_Actor', film_actors),
    ('Release_Date', release_date),
    ('Duration', duration),
    ('Original_Language', original_language),
    ('Companies', companies),
    ('Description', descriptions),
    ('Box Office', boxoffice),
    ('Certificate (MPAA)', certificate),
    ('Directors', directors),
    ('Writing_Credits', writing_credits),
]
data = pd.DataFrame(dict(column_data))

# Save the table to a CSV file, without the index column
data.to_csv('Tomatoes_data.csv', index=False)

In [78]:
import pandas as pd
# Reload the saved CSV into `data` (replacing the in-memory frame).
data = pd.read_csv('Tomatoes_data.csv')
# Last expression of the cell: number of missing values per column.
data.isna().sum()

Link                   0
Poster Links           0
Movie_Name             0
Ratings                4
Movie_category         0
Film_Actor             0
Release_Date           0
Duration               1
Original_Language      0
Companies              3
Description            0
Box Office            79
Certificate (MPAA)    28
Directors              0
Writing_Credits        6
dtype: int64

In [79]:
data

Unnamed: 0,Link,Poster Links,Movie_Name,Ratings,Movie_category,Film_Actor,Release_Date,Duration,Original_Language,Companies,Description,Box Office,Certificate (MPAA),Directors,Writing_Credits
0,https://www.rottentomatoes.com/m/juror_2,https://resizing.flixster.com/Uweg-aB2RW7m1tiY...,Juror #2,93%,"Drama, Crime, Mystery & Thriller","Clint Eastwood, Nicholas Hoult, Toni Collette,...","Dec 3, 2024",1h 54m,English,"Malpaso Productions,Dichotomy Films","""Juror #2"" follows family man Justin Kemp (Nic...",,PG-13 (Some Violent Images|Strong Language),Clint Eastwood,Jonathan Abrams
1,https://www.rottentomatoes.com/m/conclave,https://resizing.flixster.com/qjfmXMJ6hiJ7x1WB...,Conclave,93%,"Drama, Mystery & Thriller","Edward Berger, Ralph Fiennes, Stanley Tucci, J...","Nov 26, 2024",2h 0m,English,"FilmNation Entertainment,House Productions",CONCLAVE follows one of the world’s most secre...,$15.0M,PG (Thematic Material and Smoking),Edward Berger,Peter Straughan
2,https://www.rottentomatoes.com/m/the_substance,https://resizing.flixster.com/1UnYNJn2dZk4aAiR...,The Substance,90%,"Drama, Horror","Coralie Fargeat, Demi Moore, Margaret Qualley,...","Oct 31, 2024",2h 21m,English,Working Title Films,Have you ever dreamt of a better version of yo...,$15.5M,R (Graphic Nudity|Gore|Language|Strong Violent...,Coralie Fargeat,Coralie Fargeat
3,https://www.rottentomatoes.com/m/dear_santa_2024,https://resizing.flixster.com/uJQmTX-G8kmT9ro3...,Dear Santa,22%,"Comedy, Holiday, Fantasy","Robert Farrelly, Jack Black, Keegan-Michael Ke...","Nov 25, 2024",1h 48m,English,Conundrum Entertainment,When a young boy mails his Christmas wish list...,,PG-13 (Some Language|Suggestive Material),Robert Farrelly,"Ricky Blitt, Peter Farrelly"
4,https://www.rottentomatoes.com/m/nutcrackers,https://resizing.flixster.com/pL0CpZc9elVbI3cw...,Nutcrackers,43%,"Drama, Comedy, Holiday","David Gordon Green, Ben Stiller, Linda Cardell...","Nov 29, 2024",1h 44m,English,"Rivulet Films,Rough House Pictures",A strait-laced man finds his life suddenly upe...,,,David Gordon Green,Leland Douglas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,https://www.rottentomatoes.com/m/five_nights_a...,https://resizing.flixster.com/LYJAHtmMWytRO5BX...,Five Nights at Freddy's,32%,"Horror, Mystery & Thriller","Emma Tammi, Josh Hutcherson, Elizabeth Lail, K...","Oct 27, 2023",1h 50m,English,"ScottGames,Universal Pictures,Striker Entertai...",The film follows a troubled security guard as ...,$137.3M,PG-13 (Language|Bloody Images|Strong Violent C...,Emma Tammi,"Scott Cawthon, Emma Tammi, Seth Cuddeback"
218,https://www.rottentomatoes.com/m/in_the_land_o...,https://resizing.flixster.com/LVnI-1FL8zRZ7mFN...,In the Land of Saints and Sinners,83%,"Mystery & Thriller, Action","Robert Lorenz, Liam Neeson, Kerry Condon, Jack...","Apr 16, 2024",1h 46m,English,"Saga Film,Facing East Entertainment,Bleiberg E...","Ireland, 1970s. Eager to leave his dark past b...",$56.8K,R (Violence|Language Throughout),Robert Lorenz,"Mark Michael McNally, Terry Loane"
219,https://www.rottentomatoes.com/m/didi_2024,https://resizing.flixster.com/DFH-4JKar738gMko...,Dìdi,96%,"Drama, Comedy","Sean Wang, Izaac Wang, Joan Chen, Shirley Chen...","Sep 3, 2024",1h 34m,English,Unapologetic Projects,"In 2008, during the last month of summer befor...",$4.8M,R (Sexual Material|Language Throughout|Drug an...,Sean Wang,Sean Wang
220,https://www.rottentomatoes.com/m/damsel_2023,https://resizing.flixster.com/sskaBgGy55RvP6NP...,Damsel,56%,"Adventure, Fantasy, Action","Juan Carlos Fresnadillo, Millie Bobby Brown, R...","Mar 8, 2024",1h 47m,English,"PCMA Management and Productions,Roth/Kirschenb...",A dutiful damsel agrees to marry a handsome pr...,,PG-13 (Strong Creature Violence|Bloody Images|...,Juan Carlos Fresnadillo,Dan Mazeau
