In [None]:
pip install --quiet requests beautifulsoup4

In [None]:
import requests
from bs4 import BeautifulSoup
import re

def fetch_soup(url, timeout=10):
    """
    Fetches and returns BeautifulSoup object from the given URL.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight through to ``requests.get``.

    Returns:
        The response body parsed with the built-in 'html.parser'. The HTTP
        status code is not checked, so error pages are parsed like any
        other page.

    Raises:
        requests.exceptions.RequestException: if the request fails or the
            server does not answer within ``timeout`` seconds.
    """
    # requests waits forever when no timeout is given, so a single stalled
    # connection would hang the whole scrape.
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

def remove_unwanted_elements(content_div):
    """
    Strips non-article markup out of the content div, in place.

    Does nothing when ``content_div`` is falsy (e.g. None). Otherwise the
    following are removed, in this order:
      1. the first div with class 'vector-column-start';
      2. the span with id 'Notes' and every element after it;
      3. the span with id 'References' and every element after it;
      4. every table;
      5. the first hatnote div (role 'note').
    """
    if not content_div:
        return

    #sidebar column
    sidebar = content_div.find('div', class_='vector-column-start')
    if sidebar:
        sidebar.decompose()

    #trailing sections: the heading span plus everything that follows it
    for section_id in ('Notes', 'References'):
        heading = content_div.find('span', id=section_id)
        if not heading:
            continue
        for following in heading.find_all_next():
            following.decompose()
        heading.decompose()

    #tables
    for table in content_div.find_all('table'):
        table.decompose()

    #hatnote shown at the top of a section
    hatnote = content_div.find('div', {'role': 'note', 'class': 'hatnote navigation-not-searchable'})
    if hatnote:
        hatnote.decompose()

def preprocess_text(text):
    """
    Preprocesses the text content.

    Flattens the text onto a single line: every line is stripped, blank
    lines are dropped and the rest are joined with single spaces. Tabs and
    non-breaking spaces become ordinary spaces, and single-token bracketed
    markers such as '[1]' or '[a]' are deleted. Bracketed text containing
    spaces (e.g. '[citation needed]') is left alone.

    Args:
        text: Raw text, typically the output of ``get_text``.

    Returns:
        The cleaned, single-line string with surrounding whitespace removed.
    """
    #joining with a space directly replaces the old join-on-newline followed
    #by newline -> space; no newline survives, so the former
    #"remove line breaks within paragraphs" regex could never match
    stripped_lines = (line.strip() for line in text.splitlines())
    text = " ".join(line for line in stripped_lines if line)
    text = text.replace('\t', ' ')
    #drop citation / footnote markers like [12] or [b]
    text = re.sub(r'\[\w+\]', '', text)
    text = text.replace('\u00A0', ' ')

    return text.strip()

def process_links(links):
    """
    Processes each link and extracts text content.

    Every href is resolved against the English Wikipedia base URL. Links
    that do not lead to another page on that site are skipped: empty
    hrefs, in-page anchors ('#...'), and anything that resolves to a
    different host. Each remaining page is fetched, cleaned with
    ``remove_unwanted_elements`` and flattened with ``preprocess_text``.

    Args:
        links: Iterable of objects supporting ``link['href']``, such as
            the anchor tags returned by ``find_all('a', href=True)``.

    Returns:
        List with one processed text string per fetched page that has a
        'mw-content-text' content div, in the order of ``links``.
    """
    from urllib.parse import urljoin, urlsplit

    base_url = "https://en.wikipedia.org"
    base_host = urlsplit(base_url).netloc
    all_processed_text = []

    for link in links:
        href = link['href']

        #in-page anchors (e.g. footnote markers) would resolve to the site
        #root, not to an article
        if not href or href.startswith('#'):
            continue

        #urljoin handles site-relative, protocol-relative and absolute hrefs;
        #plain concatenation produced invalid URLs for the latter two
        full_url = urljoin(base_url, href)
        if urlsplit(full_url).netloc != base_host:
            continue

        #fetch soup for the link
        link_soup = fetch_soup(full_url)

        #extract the content div
        content_div = link_soup.find('div', {'id': 'mw-content-text', 'class': 'mw-body-content'})
        if content_div is None:
            continue

        #remove unwanted elements
        remove_unwanted_elements(content_div)

        #extract and preprocess the text content
        text_content = content_div.get_text(separator='\n')
        all_processed_text.append(preprocess_text(text_content).strip())

    return all_processed_text

In [None]:
#collect the processed text of every page linked from the first
#'wikitable' on each list page; complete_list gets one list per url
complete_list = []
urls = ["https://en.wikipedia.org/wiki/List_of_American_films_of_2024",
        "https://en.wikipedia.org/wiki/List_of_Malayalam_films_of_2024"]

for url in urls:
    soup = fetch_soup(url)

    #find all href links in the table
    table = soup.find('table', class_='wikitable')
    #find() returns None when the page has no such table; use an empty
    #link list so complete_list still has one entry per url
    links = table.find_all('a', href=True) if table is not None else []

    #process each link and get all processed text
    processed_text = process_links(links)
    complete_list.append(processed_text)

In [None]:
#save extracted content to txt file, one page per paragraph
#explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
#cannot represent all scraped text and would raise UnicodeEncodeError
with open('movie_data.txt', 'w', encoding='utf-8') as f:
    for language in complete_list:
        for movie in language:
            f.write(movie)
            f.write('\n\n')
