## **Project Title**  
### *Rotten Tomatoes Movie Scraper*

---

### Overview
This project contains a Python script to scrape TV series details from [Rotten Tomatoes](https://www.rottentomatoes.com). The script extracts key information such as show names, latest episodes, ratings, genres, and more, storing the data in a CSV file.

### Features
- Scrapes TV series names and their latest episodes.
- Extracts critic and audience scores.
- Retrieves additional details like network, rating, genre, original language, and release date.
- Saves the collected data into a `movies.csv` file.

---

In [4]:
# Import necessary libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
from itertools import product
from urllib.parse import urljoin

In [5]:
# Placeholder values written to the CSV when a field cannot be found.
_MISSING = 'Missing'
_IMAGE_MISSING = 'Image not found'

# Labels shown on a show page, mapped to the CSV column each one fills.
_DETAIL_COLUMNS = {
    'Network': 'Network',
    'Genre': 'Genre',
    'Rating': 'Rating',
    'Original Language': 'Language',
    'Release Date': 'Release Date',
}

# Listing links ending in /s00 ... /s99 are excluded from the crawl.
_SEASON_SUFFIXES = tuple(f'/s{i}{j}' for i, j in product(range(10), repeat=2))


def _tag_text(tag, default=_MISSING):
    """Return the tag's text, or *default* when the tag is absent or empty."""
    if tag is None:
        return default
    return tag.text or default


def _latest_episodes(listing_soup):
    """Return one latest-episode string per 'flex-container' div on the listing page."""
    episodes = []
    for container in listing_soup.find_all('div', class_='flex-container'):
        label = container.find('span', class_='smaller')
        # The value is the text between the first and second ':' of the span.
        parts = label.text.split(':') if label is not None else []
        episodes.append(parts[1].strip() if len(parts) > 1 else _MISSING)
    return episodes


def _show_attributes(body):
    """Return the Network/Genre/Rating/Language/Release Date values found in *body*."""
    values = dict.fromkeys(_DETAIL_COLUMNS.values(), _MISSING)
    for wrap in body.find_all('div', class_='category-wrap'):
        tokens = wrap.text.split()
        if not tokens:
            continue
        # "Original Language" and "Release Date" are two-word labels; every
        # other label is the first token only.
        label_len = 2 if tokens[0] in ('Original', 'Release') else 1
        column = _DETAIL_COLUMNS.get(' '.join(tokens[:label_len]))
        if column is not None:
            values[column] = ' '.join(tokens[label_len:])
    return values


def _show_record(page_soup, latest_episode):
    """Build one CSV row for a show page; fields that cannot be found keep a placeholder."""
    record = {
        'Name of TV show': _MISSING,
        'Latest Episode': latest_episode,
        'Critics Score': _MISSING,
        'Audience Score': _MISSING,
        'Movie Cover Image': _IMAGE_MISSING,
        'Synopsis': _MISSING,
        'Network': _MISSING,
        'Genre': _MISSING,
        'Rating': _MISSING,
        'Language': _MISSING,
        'Release Date': _MISSING,
    }
    if page_soup is None:
        return record

    bodies = page_soup.find_all('div', class_='container rt-layout__body')
    if not bodies:
        return record
    # If several containers match, the last one supplies the values.
    body = bodies[-1]

    critics = _tag_text(body.find('rt-text', slot='criticsScore')).strip(' ')
    audience = _tag_text(body.find('rt-text', slot='audienceScore')).replace(' ', '')
    poster = body.find('rt-img', slot='posterImage')

    record.update({
        'Name of TV show': _tag_text(body.find('rt-text', slot='title')),
        'Critics Score': critics or _MISSING,
        'Audience Score': audience or _MISSING,
        'Movie Cover Image': (
            poster.get('src', _IMAGE_MISSING) if poster is not None else _IMAGE_MISSING
        ),
        'Synopsis': _tag_text(body.find('rt-text', {'data-qa': 'synopsis-value'})),
    })
    record.update(_show_attributes(body))
    return record


def crawl_movies(
    listing_url='https://www.rottentomatoes.com/browse/tv_series_browse/?page=4',
    output_csv='movies.csv',
    delay=1,
    timeout=30,
):
    """Scrape TV series details from a Rotten Tomatoes listing page into a CSV file.

    Every link on the listing page that starts with '/tv' (and does not end
    in a season suffix such as '/s01') is fetched, and one row per show is
    written with these columns: name, latest episode, critics score, audience
    score, poster image URL, synopsis, network, genre, rating, language and
    release date. Fields that are not present on a page are written as
    'Missing' ('Image not found' for the poster).

    Args:
        listing_url: Browse page to read the show links from.
        output_csv: Path of the CSV file to write.
        delay: Seconds to sleep after each show page request.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        None. The results are written to *output_csv*.

    Raises:
        requests.RequestException: If the listing page cannot be fetched.
            Failures on individual show pages are reported with ``print``
            and produce a row of placeholder values instead.
    """
    listing_response = requests.get(listing_url, timeout=timeout)
    listing_response.raise_for_status()
    listing_soup = BeautifulSoup(listing_response.text, 'lxml')

    show_urls = [
        urljoin(listing_url, anchor['href'])
        for anchor in listing_soup.find_all('a', href=True)
        if anchor['href'].startswith('/tv')
        and not anchor['href'].endswith(_SEASON_SUFFIXES)
    ]
    latest_episodes = _latest_episodes(listing_soup)

    # NOTE(review): links and episode labels are collected independently and
    # paired by position; this assumes the page lists them in the same order
    # and that no unrelated '/tv' links precede the show tiles - confirm
    # against the live page markup.
    movie_details = []
    for link, latest_episode in zip(show_urls, latest_episodes):
        try:
            page_response = requests.get(link, timeout=timeout)
        except requests.RequestException as exc:
            print(f'Could not fetch {link}: {exc}')
            page_soup = None
        else:
            page_soup = BeautifulSoup(page_response.text, 'lxml')

        movie_details.append(_show_record(page_soup, latest_episode))
        time.sleep(delay)

    # Save data to CSV
    pd.DataFrame(movie_details).to_csv(output_csv, index=False)

In [6]:
# Run the scraper; it writes the collected rows to movies.csv.
crawl_movies()

### Author: *UGWUANYI, ANTHONY C.*

**LinkedIn:** <a href="https://www.linkedin.com/in/chibi-ugwuanyi-663835252/" target="_blank">chibi-ugwuanyi-663835252</a>

**E-mail:** <a href="https://mail.google.com" target="_blank">chibiugwuanyi@gmail.com</a>