<a href="https://colab.research.google.com/github/EmreErdem-2/WikirougeWebScraper/blob/main/WikiRougeScraperLCW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WikiRouge [Lenin Collected Works](https://wikirouge.net/texts/en/Collection:Lenin_Collected_Works) *Scraper*

This notebook collects information about the contents of every volume of Vladimir Lenin's Collected Works, as listed on the site.

**TODO:**
The next step is to collect the actual text behind each of the headings we were able to scrape.

In [16]:
import requests
from bs4 import BeautifulSoup
import re
import pprint

In [2]:
# Remove <ul> elements within each <td>
def remove_ul_elements(td):
    """Strip every nested ``<ul>`` out of a table cell and return the rest as markup.

    The cell is modified in place: each ``<ul>`` returned by ``td.find_all('ul')``
    is decomposed.

    Args:
        td: A parsed table cell exposing ``find_all`` and ``contents``.

    Returns:
        str: The remaining direct children of ``td``, each rendered with
        ``str()``, concatenated and stripped of leading/trailing whitespace.
        The ``<td>`` wrapper itself is not part of the result.
    """
    # Destroy each nested list together with everything inside it.
    for nested_list in td.find_all('ul'):
        nested_list.decompose()

    # Serialise the surviving children one by one, still skipping any child
    # that reports itself as a <ul>.
    pieces = []
    for child in td.contents:
        if child.name == 'ul':
            continue
        pieces.append(str(child))

    return ''.join(pieces).strip()

In [3]:
# Function to scrape rows and follow links
def scrape_volume_table_row(row):
    """Turn one row of the volume index table into a record.

    Args:
        row: A parsed ``<tr>`` element exposing ``find_all``.

    Returns:
        dict: With keys ``'Volume'``, ``'Main_Title'``, ``'Link'`` and ``'Col'``.
        For a usable row, ``'Volume'`` is the part of the link's ``title``
        after the first "/" with spaces turned into underscores (the whole
        title when it contains no "/"), ``'Main_Title'`` is the text of the
        second cell with " —" replaced by "-", ``'Link'`` is the absolute URL
        of the volume page and ``'Col'`` is the list of ``<td>`` cells.
        Rows that cannot be used (fewer than two ``<td>`` cells, e.g. header
        rows, or no link in the first cell) yield a record whose values are
        all empty strings, so callers can skip them by testing ``'Link'``.
    """
    # Fresh dict per call so callers can never share/mutate a common blank.
    blank = {
        'Volume': "",
        'Main_Title': "",
        'Link': "",
        'Col': ""
    }

    cols = row.find_all('td')
    # Both cols[0] (link) and cols[1] (date range) are required below.
    if len(cols) < 2:
        return blank

    title_link = cols[0].find('a', href=True)
    if title_link is None:
        return blank

    date = cols[1].text.strip()

    # The title looks like "<collection>/<volume>"; keep the volume part, but
    # do not crash when the attribute is missing or has no "/".
    title = title_link.get('title') or ""
    segments = title.split("/")
    volume = segments[1] if len(segments) > 1 else title

    return {
        'Volume': volume.replace(" ", "_"),
        'Main_Title': date.replace(" —", "-"),
        'Link': "https://wikirouge.net" + title_link['href'],
        'Col': cols
    }

In [4]:
# Scrapes rows of a given table
def scrape_volume_table(table):
  """Scrape every ``<tr>`` of the given volume index table.

  Args:
      table: A parsed table exposing ``find_all``.

  Returns:
      list: One ``scrape_volume_table_row`` result per ``<tr>``, in the
      order ``table.find_all('tr')`` returns them.
  """
  return [scrape_volume_table_row(tr) for tr in table.find_all('tr')]

Each `<center>` tag holds the year information and is followed by the table of contents for that year.

In [21]:
def find_centers_and_tables(url, timeout=30):
    """Fetch a page and pair each ``<center>`` element with the next ``<table>``.

    The page's ``<center>`` and ``<table>`` elements are walked in the order
    ``find_all`` returns them. A ``<center>`` is remembered until a
    ``<table>`` shows up; the pair is then recorded and the remembered
    ``<center>`` is cleared, so a table that is not preceded by a fresh
    ``<center>`` is ignored, and when several ``<center>`` elements precede a
    table only the last one is kept.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        list[dict]: Records with keys ``'center'`` and ``'table'``, both
        holding the element's markup as a string.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    centers_tables = []
    current_center = None
    for element in soup.find_all(['center', 'table']):
        if element.name == 'center':
            current_center = str(element)
        elif element.name == 'table' and current_center:
            centers_tables.append({
                'center': current_center,
                'table': str(element)
            })
            # Each <center> is consumed by at most one table.
            current_center = None

    return centers_tables


def _is_headline_element(element):
    """Return True for a ``<center>`` or a ``<span class="mw-headline">``."""
    if element.name == 'center':
        return True
    return element.name == 'span' and 'mw-headline' in element.get('class', [])


def _headlines_before(table):
    """Build the headline string from the two siblings preceding ``table``."""
    parts = []
    # find_previous_siblings() is iterated in reverse so the collected texts
    # are concatenated in the opposite order to the one it returns.
    for sibling in reversed(table.find_previous_siblings(limit=2)):
        if _is_headline_element(sibling):
            headline_text = sibling.get_text().strip()
            parts.append(re.sub(r'\[edit source\]', '', headline_text))

    headlines = " ".join(parts).strip()
    # A blank line inside the text separates stacked headings.
    return re.sub(r'\n\n', ' --- ', headlines)


def scrape_headlines_and_wikitables(url, timeout=30):
    """Fetch a page and return each ``wikitable`` with its headlines.

    For every ``<table class="wikitable">`` the two preceding siblings are
    inspected; the text of those that are a ``<center>`` or a
    ``<span class="mw-headline">`` is cleaned of "[edit source]", joined with
    a space, and double newlines are replaced by " --- ". Tables for which no
    headline text is found are left out of the result.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        list[dict]: Records with keys ``'headlines'`` (str) and ``'table'``
        (the table's markup as a string).
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    headlines_tables = []
    for table in soup.find_all('table', class_='wikitable'):
        concatenated_headlines = _headlines_before(table)

        # Keep the table only when a headline was actually found for it.
        if concatenated_headlines:
            headlines_tables.append({
                'headlines': concatenated_headlines,
                'table': str(table)
            })

    return headlines_tables


In [6]:
def scrape_content_table_row(row):
  """Turn one row of a volume's content table into a record.

  Args:
      row: A parsed ``<tr>`` element exposing ``find_all``.

  Returns:
      dict: With keys ``'Content'``, ``'Link'`` and ``'Page_Number'``.
      Rows without any ``<td>`` (e.g. header rows) yield empty strings for
      all three. Otherwise ``'Content'`` is the link's ``title`` with spaces
      turned into underscores, or the first cell's markup without nested
      ``<ul>`` lists when there is no link (or the link has no title);
      ``'Link'`` is the absolute URL or a "missing" notice; ``'Page_Number'``
      is the text of the second cell with " —" replaced by "-" (empty when
      the row has only one cell).
  """
  cols = row.find_all('td')
  if not cols:
    return {
        'Content': "",
        'Link': "",
        'Page_Number': ""
    }

  first_cell = cols[0]

  # Default to only the regular heading text, without the list of sub-items.
  # The nested lists are removed BEFORE the link lookup: decomposing a <ul>
  # also destroys every anchor inside it, so a link found beforehand could
  # no longer be read reliably afterwards.
  content = remove_ul_elements(first_cell)
  title_link = first_cell.find('a', href=True)

  # Tolerate rows that have a heading cell but no page-number cell.
  date = cols[1].text.strip() if len(cols) > 1 else ""

  # Shown instead of an empty string when the heading has no page of its own.
  link = "Missing content. Could not find the link!!!"

  # Most of the time the heading links to its text, so this is the usual path.
  if title_link is not None:
    link = "https://wikirouge.net" + title_link['href']
    title = title_link.get('title')
    # Keep the cleaned heading markup when the anchor carries no title.
    if title:
      content = title.replace(" ", "_")

  return {
      'Content': content,
      'Link': link,
      'Page_Number': date.replace(" —", "-")
  }

In [7]:
# Scrapes Rows of a given table
def scrape_content_table(table):
  """Scrape every ``<tr>`` of the given content table.

  Args:
      table: A parsed table exposing ``find_all``.

  Returns:
      list: One ``scrape_content_table_row`` result per ``<tr>``, in the
      order ``table.find_all('tr')`` returns them.
  """
  return [scrape_content_table_row(tr) for tr in table.find_all('tr')]

# Test Nodes

Test everything

In [23]:
# Example usage: walk the volume index, then every content table of every volume,
# and flatten everything into one list of records.
url = 'https://wikirouge.net/texts/en/Collection:Lenin_Collected_Works'

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
soup = BeautifulSoup(response.content, 'html.parser')

table = soup.select_one('#mw-content-text > div > table')
results = scrape_volume_table(table)

# One dict per scraped text:
#   'Volume'        - volume number + description
#   'Headlines'     - headlines and year info in the volume
#   'Page_Headline' - headline of the actual text
#   'Link'          - link to the actual text
#   'Page_Number'   - page number the text starts from
data_list = []

for result in results:
  # Rows without cells (e.g. the header row) come back as blank records and
  # have no volume page to follow.
  if result['Link'] == '':
    continue

  data_volume = result['Volume'] + ' --- ' + result['Main_Title']

  for section in scrape_headlines_and_wikitables(result['Link']):
    data_headlines = BeautifulSoup(section['headlines'], 'html.parser').text
    section_table = BeautifulSoup(section['table'], 'html.parser')

    for content_result in scrape_content_table(section_table):
      # The row scraper never returns None; rows without cells are reported
      # as records with an empty 'Link', so that is what has to be skipped.
      if content_result['Link'] == '':
        continue

      data_list.append({
          'Volume': data_volume,
          'Headlines': data_headlines,
          'Page_Headline': content_result['Content'],
          'Link': content_result['Link'],
          'Page_Number': content_result['Page_Number']
      })

for item in data_list:
  print(item)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{'Volume': 'Volume_18 --- 1912—July 1913', 'Headlines': '1912', 'Page_Headline': 'The_Fox_and_the_Hen-Coop', 'Link': 'https://wikirouge.net/texts/en/The_Fox_and_the_Hen-Coop', 'Page_Number': '351'}
{'Volume': 'Volume_18 --- 1912—July 1913', 'Headlines': '1912', 'Page_Headline': 'A_Disgraceful_Resolution', 'Link': 'https://wikirouge.net/texts/en/A_Disgraceful_Resolution', 'Page_Number': '353'}
{'Volume': 'Volume_18 --- 1912—July 1913', 'Headlines': '1912', 'Page_Headline': 'Two_Utopias', 'Link': 'https://wikirouge.net/texts/en/Two_Utopias', 'Page_Number': '355'}
{'Volume': 'Volume_18 --- 1912—July 1913', 'Headlines': '1912', 'Page_Headline': 'Debates_in_Britain_on_Liberal_Labour_Policy', 'Link': 'https://wikirouge.net/texts/en/Debates_in_Britain_on_Liberal_Labour_Policy', 'Page_Number': '360'}
{'Volume': 'Volume_18 --- 1912—July 1913', 'Headlines': '1912', 'Page_Headline': 'A_Cadet_Professor', 'Link': 'https://wikirouge.ne

Test Only Collected Works scrape

In [None]:
# Example usage: scrape only the volume index table of the collection page.
url = 'https://wikirouge.net/texts/en/Collection:Lenin_Collected_Works'

response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# First element matching this selector; presumably the volume index table -- confirm against the page.
table = soup.select_one('#mw-content-text > div > table')
# One record per <tr>; rows without <td> cells come back as blank records.
results = scrape_volume_table(table)
print(results)

[{'Volume': '', 'Main_Title': '', 'Link': '', 'Col': ''}, {'Volume': 'Volume_1', 'Main_Title': '1893-1894', 'Link': 'https://wikirouge.net/texts/en/Collection:Lenin_Collected_Works/Volume_1', 'Col': [<td><a href="/texts/en/Collection:Lenin_Collected_Works/Volume_1" title="Collection:Lenin Collected Works/Volume 1">≣ 1</a>   <a class="external text" href="https://www.marxists.org/archive/lenin/works/cw/pdf/lenin-cw-vol-01.pdf" rel="nofollow noreferrer noopener" target="_blank"><img alt="PDF-horizontal.png" data-file-height="12" data-file-width="26" decoding="async" height="12" src="/texts/en/w/images/3/39/PDF-horizontal.png" width="26"/></a></td>, <td>1893 —1894</td>, <td align="center">1977</td>]}, {'Volume': 'Volume_2', 'Main_Title': '1895—1897', 'Link': 'https://wikirouge.net/texts/en/Collection:Lenin_Collected_Works/Volume_2', 'Col': [<td><a href="/texts/en/Collection:Lenin_Collected_Works/Volume_2" title="Collection:Lenin Collected Works/Volume 2">≣ 2</a>   <a class="external text"

Test Only Contents

In [10]:
# Example usage: print the headlines and content records of a single volume.
url = 'https://wikirouge.net/texts/en/Collection:Lenin_Collected_Works/Volume_45'

centers_tables = scrape_headlines_and_wikitables(url)
for section in centers_tables:
  headline = BeautifulSoup(section['headlines'], 'html.parser').text
  print("\t"+headline)
  section_table = BeautifulSoup(section['table'], 'html.parser')
  content_results = scrape_content_table(section_table)
  for content_result in content_results:
    # The row scraper never returns None; rows without cells are reported as
    # records with an empty 'Link', so that is what has to be skipped.
    if content_result['Link'] == '':
      continue
    print(content_result)

	November—December 1920
{'Content': 'Letter_to_Semyon_Sereda,_November_6,_1920', 'Link': 'https://wikirouge.net/texts/en/Letter_to_Semyon_Sereda,_November_6,_1920', 'Page_Number': '47'}
{'Content': 'Letter_to_the_Narrow_Council_Of_People’s_Commissars,_November_6,_1920', 'Link': 'https://wikirouge.net/texts/en/Letter_to_the_Narrow_Council_Of_People%E2%80%99s_Commissars,_November_6,_1920', 'Page_Number': '48'}
{'Content': 'Telegram_to_the_Revolutionary_Military_Council_of_the_Southern_Front,_November_12,_1920', 'Link': 'https://wikirouge.net/texts/en/Telegram_to_the_Revolutionary_Military_Council_of_the_Southern_Front,_November_12,_1920', 'Page_Number': '48'}
{'Content': 'Letter_to_Nikolai_Bryukhanov,_Pavel_Ivanovich_Popov,_Varlam_Avanesov_and_Mikhail_Vladimirsky,_November_12,_1920', 'Link': 'https://wikirouge.net/texts/en/Letter_to_Nikolai_Bryukhanov,_Pavel_Ivanovich_Popov,_Varlam_Avanesov_and_Mikhail_Vladimirsky,_November_12,_1920', 'Page_Number': '48'}
{'Content': 'Letter_to_Mikhail_K

Test Special Cases

In [None]:
# Special case: run remove_ul_elements on one hand-picked cell of Volume 38
# (presumably chosen because that cell contains a nested list -- confirm against the page).
url = 'https://wikirouge.net/texts/en/Collection:Lenin_Collected_Works/Volume_38'

response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# NOTE: despite its name, `table` holds a single <td> -- the selector ends in `td:nth-child(1)`.
table = soup.select_one('#mw-content-text > div > table:nth-child(14) > tbody > tr:nth-child(3) > td:nth-child(1)')
# Decomposes every <ul> inside the cell (in place) and returns the remaining markup.
res = remove_ul_elements(table)
print(res)

<a href="/texts/en/Conspectus_of_Hegel%E2%80%99s_Book_Lectures_On_the_History_of_Philosophy" title="Conspectus of Hegel’s Book Lectures On the History of Philosophy">Conspectus of Hegel’s Book <i>Lectures on the History of Philosophy</i></a>
