<a href="https://colab.research.google.com/github/EmreErdem-2/WikirougeWebScraper/blob/main/WikiRougeScraperMECW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#WikiRouge [Marx & Engels Collected Works](https://wikirouge.net/texts/en/Collection:Marx-Engels_Collected_Works) Scrapper

In [1]:
import requests
from bs4 import BeautifulSoup
import re

In [2]:
# Remove <ul> elements within each <td>
def remove_ul_elements(td):
    # Find all <ul> elements within the <td>
    ul_tags = td.find_all('ul')
    for ul in ul_tags:
        ul.decompose()  # This removes the <ul> tag and its content

    # Get the cleaned text, excluding the original <td> tags
    cleaned_text = ''.join(str(content) for content in td.contents if not content.name == 'ul').strip()
    return cleaned_text

In [20]:
def scrape_table_with_rowspan(table):

    rows = table.find_all('tr')[1:]
    major_contents = ""
    results = []

    for row in rows:
        cols = row.find_all('td')

        # Extract volume, period, and major_contents
        if cols[0].find('a') != None:
            if cols[0].find('a').get('title') != None:
                volume = cols[0].find('a', href=True).get('title').replace(" ", "_")
            volume = cols[0].find('a').text.strip().replace(" ", "_")
        elif cols[0].find('td') != None:
            volume = cols[0].find('td').text.strip().replace(" ", "_")
            print("inside col td width")
        else:
            volume = "No Content found in html"
            print(cols[0])
            print(cols[0].find('td', width="90%"))

        period = cols[3].text.strip() if len(cols) > 3 else None

        pattern = r'\[(\d+)\]'
        if len(cols) > 5:
            major_contents = cols[5].text.strip()
            major_contents = re.sub(pattern, '', major_contents).strip()

        if(cols[0].find('a', href=True) != None):
            title_link = cols[0].find('a', href=True)['href']
        else:
            title_link = "Could not find title link"
        try:
            results.append({
                'Volume': volume.replace('_','').replace('≣', 'Volume_'),
                'Period': period,
                'Major_Contents': major_contents,
                'Link': "https://wikirouge.net"+title_link,
            })
        except:
            print("Exception thrown at results.append")
            print("cols: ")
            print(cols[0])

    return results


In [4]:
def process_table_row(row_html):
    # Parse the row HTML using BeautifulSoup
    soup = BeautifulSoup(str(row_html), 'html.parser')

    # Remove any <ul> and <li> elements
    for ul in soup.find_all('ul'):
        ul.decompose()
    for li in soup.find_all('li'):
        li.decompose()

    # print(soup)

    # Initialize the result dictionary
    result = {
        'content': None,
        'link': None,
        'page': None
    }

    # Find the first <a> element
    a_tag = soup.find('a')
    if a_tag:
        result['link'] = "https://wikirouge.net"+a_tag.get('href')
        result['content'] = a_tag.get('title', a_tag.text.strip())
    else:
        # Extract content from <td> if no <a> tag is found
        td_element = soup.find('td', width="90%")
        if td_element:
            result['content'] = td_element.text.strip()

    # Extract the page number from the second <td> element
    page_td = soup.find_all('td', width="10%")
    if page_td and len(page_td) > 0:
        result['page'] = page_td[0].text.strip()

    # Return the result dictionary
    if result['content'] != None and result['content'] != '':
        result['content'] = result['content'].replace(' ', '_')
        return result

In [5]:
# Scrapes Rows of a given table
def scrape_content_table(table):
  results = []
  rows = table.find_all('tr')
  for row in rows:
    scraped_data = process_table_row(row)
    results.append(scraped_data)
    # print(row)

  return results

In [6]:
def scrape_headlines_and_wikitables(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    headlines_tables = []
    tables = soup.find_all('table', class_='wikitable')

    for table in tables:
        # Initialize the string to store concatenated headlines
        concatenated_headlines = ""

        # Get the previous two siblings
        previous_siblings = table.find_previous_siblings(limit=2)

        # Iterate over the previous siblings and find headlines
        for sibling in reversed(previous_siblings):
            if sibling.name == 'span' and 'mw-headline' in sibling.get('class', []):
                headline_text = sibling.get_text().strip()
                headline_text = re.sub(r'\[edit source\]', '', headline_text)
                concatenated_headlines += headline_text + " "
            elif sibling.name == 'center':
                headline_text = sibling.get_text().strip()
                headline_text = re.sub(r'\[edit source\]', '', headline_text)
                concatenated_headlines += headline_text + " "

        # Remove any trailing whitespace from the concatenated headlines
        concatenated_headlines = concatenated_headlines.strip()
        concatenated_headlines = re.sub(r'\n\n', ' --- ', concatenated_headlines)

        # Append the headlines and table to the results list only if both exist
        if concatenated_headlines and table:
            headlines_tables.append({
                'headlines': concatenated_headlines,
                'table': str(table)
            })

    return headlines_tables


#Test Nodes

Test everything

In [22]:
# Example usage
url = 'https://wikirouge.net/texts/en/Collection:Marx-Engels_Collected_Works'

response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

data_list = []
data = {
    'Volume': '', # Volume num + description
    'Headlines': '', # Headlines and year info in the volume
    'Page_Headline': '', # Headline of the actual text
    'Link': '', # Link to actual text
    'Page_Number': '' # Page number starting from
}
data_volume = ''
data_headlines = ''

table = soup.select_one('#mw-content-text > div > table')
results = scrape_table_with_rowspan(table)
# print(results)
for result in results:
  if(result['Link'] == ''):
    continue
  # print("\n")
  # print("Volume: " + result['Volume'])
  data_volume = result['Volume'] + ' --- ' + result['Period'] + ' --- ' + result['Major_Contents']
  centers_tables = scrape_headlines_and_wikitables(result['Link'])
  for table in centers_tables:
    headline = BeautifulSoup(table['headlines'], 'html.parser').text
    # print("\t"+headline)
    data_headlines = headline
    table = BeautifulSoup(table['table'], 'html.parser')
    content_results = scrape_content_table(table)
    for content_result in content_results:
      if(content_result == None):
        continue
      # print(content_result)
      data['Volume'] = data_volume
      data['Headlines'] = data_headlines
      data['Page_Headline'] = content_result['content']
      data['Link'] = content_result['link']
      data['Page_Number'] = content_result['page']
      data_list.append(data.copy())

for item in data_list:
  print(item)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{'Volume': 'Volume_21 --- 1867-1870 --- Articles and writings concerning the First International, Value, Price and Profit (V. 20)', 'Headlines': 'November 1867-mid-July 1870', 'Page_Headline': "The_Fenian_Prisoners_at_Manchester_and_the_International_Working_Men's_Association", 'Link': 'https://wikirouge.net/texts/en/The_Fenian_Prisoners_at_Manchester_and_the_International_Working_Men%27s_Association', 'Page_Number': '3'}
{'Volume': 'Volume_21 --- 1867-1870 --- Articles and writings concerning the First International, Value, Price and Profit (V. 20)', 'Headlines': 'November 1867-mid-July 1870', 'Page_Headline': 'The_Position_of_The_International_On_Prussian_Protectionist_Tariffs', 'Link': 'https://wikirouge.net/texts/en/The_Position_of_The_International_On_Prussian_Protectionist_Tariffs', 'Page_Number': '5'}
{'Volume': 'Volume_21 --- 1867-1870 --- Articles and writings concerning the First International, Value, Price and 

In [21]:
# Example of main collected works page
url = 'https://wikirouge.net/texts/en/Collection:Marx-Engels_Collected_Works'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Locate the table
table = soup.select_one('#mw-content-text > div > table')
scraped_data = scrape_table_with_rowspan(table)

for result in scraped_data:
    print(result)

{'Volume': 'Volume_1', 'Period': '1835-1843', 'Major_Contents': 'Early writings of Marx, including doctoral dissertation The Difference Between the Democritean and Epicurean Philosophy of Nature', 'Link': 'https://wikirouge.net/texts/en/Collection:Marx-Engels_Collected_Works/Volume_1'}
{'Volume': 'Volume_2', 'Period': '1838-1842', 'Major_Contents': 'Early writings of Engels', 'Link': 'https://wikirouge.net/texts/en/Collection:Marx-Engels_Collected_Works/Volume_2'}
{'Volume': 'Volume_3', 'Period': '1843-1844', 'Major_Contents': 'Early writings of both, including the Economic and Philosophic Manuscripts of 1844', 'Link': 'https://wikirouge.net/texts/en/Collection:Marx-Engels_Collected_Works/Volume_3'}
{'Volume': 'Volume_4', 'Period': '1844-1845', 'Major_Contents': 'The Holy Family, The Condition of the Working Class in England', 'Link': 'https://wikirouge.net/texts/en/Collection:Marx-Engels_Collected_Works/Volume_4'}
{'Volume': 'Volume_5', 'Period': '1845-1847', 'Major_Contents': 'The Ge

In [None]:
# Example usage
url = 'https://wikirouge.net/texts/en/Collection:Marx-Engels_Collected_Works/Volume_24'

headlines_tables = find_headlines_and_tables(url)

for item in headlines_tables:
    headline = BeautifulSoup(item['headline'], 'html.parser').text
    print(f"Headline: {headline}")

    table_parsed = BeautifulSoup(item['table'], 'html.parser')
    # Print table for debugging
    # print(table_parsed)

    results = scrape_content_table(table_parsed)

    for result in results:
        print(result)


Headline: Works of Marx and Engels (1874-83)
{'Content': 'Preface_to_Marx-Engels_Collected_Works_Volume_(24)', 'Page_Number': 'xiii', 'Link': 'https://wikirouge.net/texts/en/Preface_to_Marx-Engels_Collected_Works_Volume_(24)'}
{'Content': 'Collection:Refugee_Literature_(Engels)', 'Page_Number': '3', 'Link': 'https://wikirouge.net/texts/en/Collection:Refugee_Literature_(Engels)'}
{'Content': 'Epilogue_to_Revelations_Concerning_the_communist_Trial_in_Cologne', 'Page_Number': '51', 'Link': 'https://wikirouge.net/texts/en/Epilogue_to_Revelations_Concerning_the_communist_Trial_in_Cologne'}
{'Content': 'For_Poland_(1875)', 'Page_Number': '55', 'Link': 'https://wikirouge.net/texts/en/For_Poland_(1875)'}
{'Content': 'Semi-Official_War-Cries', 'Page_Number': '59', 'Link': 'https://wikirouge.net/texts/en/Semi-Official_War-Cries'}
{'Content': 'Letter_to_August_Bebel,_March_18-28,_1875', 'Page_Number': '67', 'Link': 'https://wikirouge.net/texts/en/Letter_to_August_Bebel,_March_18-28,_1875'}
{'Cont

In [None]:
# def find_headlines_and_tables(url):
#     response = requests.get(url)
#     soup = BeautifulSoup(response.content, 'html.parser')

#     headlines_tables = []
#     all_elements = soup.find_all(['span', 'table'])

#     current_headline = None
#     for element in all_elements:
#         if element.name == 'span' and element.get('class') == ['mw-headline']:
#             current_headline = str(element)
#         elif element.name == 'table' and current_headline:
#             headlines_tables.append({
#                 'headline': current_headline,
#                 'table': str(element)
#             })
#             current_headline = None

#     return headlines_tables