# Explainer Notebook

#### - Get all article links found in the paragraphs of the initial page
#### - Build a DataFrame with columns: [URL, Title, List of references, List of paragraph texts]

In [1]:
# Options
# When True, the scrape/save cells below fetch the pages and (re)write the CSV files.
# When False, those cells are skipped; the CSV-loading cell further down is not
# guarded by this flag and always reads the files from ./data.
OPTION_PERFORM_SCRAPE = False

In [2]:
# Imports
import requests
from bs4 import BeautifulSoup
import pandas as pd
from util import wiki_util
from tqdm import tqdm
from ast import literal_eval

In [3]:
# Static Variables
# Start pages of the two crawls: each scrape cell collects the links found on one
# of these articles and then fetches every linked page plus the start page itself.
LINK_WIKI_ENGLISH = "https://en.wikipedia.org/wiki/Cold_War"
LINK_WIKI_GERMAN = "https://de.wikipedia.org/wiki/Kalter_Krieg"

In [4]:
def get_content_soup(link_wikipedia, timeout=30, parser="html.parser"):
    """Download a page and return it parsed as a BeautifulSoup tree.

    Parameters
    ----------
    link_wikipedia : str
        URL of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        single stalled connection would block the whole crawl indefinitely.
    parser : str, optional
        Name of the BeautifulSoup parser. It is pinned explicitly so the parse
        result does not depend on which optional parsers happen to be installed.

    Returns
    -------
    BeautifulSoup
        Parsed document. The HTTP status is not checked, so the body of an
        error page is parsed and returned like any other page.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when the timeout expires.
    """
    website = requests.get(link_wikipedia, timeout=timeout)
    content_soup = BeautifulSoup(website.content, parser)
    return content_soup

def soup_get_title(wiki_content_soup):
    """Return the text of the page's ``<h1 id="firstHeading">`` element.

    Parameters
    ----------
    wiki_content_soup : BeautifulSoup
        Parsed page (any object with a compatible ``find`` method works).

    Returns
    -------
    str or None
        The heading text, or None when the page has no such heading. Returning
        None instead of raising keeps one unusual page from aborting a long
        scrape loop.
    """
    head = wiki_content_soup.find("h1", {"id": "firstHeading"})
    if head is None:
        return None
    return head.text

def soup_get_reference_links(wiki_content_soup):
    """Collect the cleaned-up hyperlinks found inside the page's paragraphs.

    Every ``<a>`` carrying an ``href`` inside a ``<p>`` element is passed through
    ``wiki_util.hyperlink_cleanup``; results that compare equal to False are
    discarded (presumably the helper's "not a usable link" signal -- verify in
    wiki_util). Order follows the document and duplicates are kept.
    """
    raw_hrefs = []
    for paragraph in wiki_content_soup.find_all("p"):
        for anchor in paragraph.find_all("a", href=True):
            raw_hrefs.append(anchor['href'])

    cleaned_hrefs = [wiki_util.hyperlink_cleanup(href) for href in raw_hrefs]
    return [href for href in cleaned_hrefs if href != False]

def soup_get_paragraph_texts(wiki_content_soup):
    """Return the ``.text`` of every ``<p>`` element of the page, in document order."""
    texts = []
    for paragraph in wiki_content_soup.find_all("p"):
        texts.append(paragraph.text)
    return texts

def get_all_reference_links(link_wikipedia):
    """Fetch a page and return the cleaned-up links found in its paragraphs.

    This is the fetch step (``get_content_soup``) followed by the extraction
    step (``soup_get_reference_links``). It delegates to those helpers instead
    of repeating their bodies, so the two code paths cannot drift apart.

    Parameters
    ----------
    link_wikipedia : str
        URL of the page whose paragraph links should be collected.

    Returns
    -------
    list
        Links as returned by ``soup_get_reference_links`` for the fetched page.
    """
    return soup_get_reference_links(get_content_soup(link_wikipedia))


In [5]:
# Scrape English version
if OPTION_PERFORM_SCRAPE:
    # Pages to visit: every link of the start article plus the article itself,
    # de-duplicated and sorted.
    links_to_scan = sorted(set(get_all_reference_links(LINK_WIKI_ENGLISH) + [LINK_WIKI_ENGLISH]))
    data = []
    for url in tqdm(links_to_scan):
        soup = get_content_soup(url)
        # One row per page: URL, title, outgoing links, paragraph texts.
        data.append([
            url,
            soup_get_title(soup),
            soup_get_reference_links(soup),
            soup_get_paragraph_texts(soup),
        ])


In [6]:
# Save English version
if OPTION_PERFORM_SCRAPE:
    COLUMN_NAMES = ['URL', 'TITLE', 'LIST_REFERENCES', 'LIST_PARAGRAPH_TEXTS']
    # Build the frame from the scraped rows and key it by page URL before writing.
    df_wikipedia_english = pd.DataFrame(data, columns=COLUMN_NAMES).set_index('URL')
    df_wikipedia_english.to_csv('./data/wiki_english.csv')

In [7]:
# Scrape German Version
if OPTION_PERFORM_SCRAPE:
    # NOTE(review): the saved German output lists German article names under
    # en.wikipedia.org URLs whose pages contain only error text, so the links
    # returned via wiki_util.hyperlink_cleanup appear to be pinned to the
    # English host. They are re-pointed to the host of LINK_WIKI_GERMAN here;
    # the root fix probably belongs in wiki_util -- confirm there.
    english_root = LINK_WIKI_ENGLISH.split("/wiki/")[0] + "/"
    german_root = LINK_WIKI_GERMAN.split("/wiki/")[0] + "/"

    def _to_german_wiki(link):
        """Swap a leading English-Wikipedia root for the German one; leave other links untouched."""
        if link.startswith(english_root):
            return german_root + link[len(english_root):]
        return link

    # Pages to visit: every (re-hosted) link of the start article plus the
    # article itself, de-duplicated and sorted.
    links_to_scan = [_to_german_wiki(link) for link in get_all_reference_links(LINK_WIKI_GERMAN)]
    links_to_scan = sorted(set(links_to_scan + [LINK_WIKI_GERMAN]))
    data = []
    for url in tqdm(links_to_scan):
        soup = get_content_soup(url)
        title = soup_get_title(soup)
        # Store the outgoing links with the German host as well, so they match
        # the URLs used as row keys.
        list_references = [_to_german_wiki(link) for link in soup_get_reference_links(soup)]
        list_paragraph_texts = soup_get_paragraph_texts(soup)
        data.append([url, title, list_references, list_paragraph_texts])


In [8]:
# Save German Version
if OPTION_PERFORM_SCRAPE:
    COLUMN_NAMES = ['URL', 'TITLE', 'LIST_REFERENCES', 'LIST_PARAGRAPH_TEXTS']
    # Build the frame from the scraped rows and key it by page URL before writing.
    df_wikipedia_german = pd.DataFrame(data, columns=COLUMN_NAMES).set_index('URL')
    df_wikipedia_german.to_csv('./data/wiki_german.csv')

In [9]:
def _read_scrape_csv(csv_path):
    """Load one scrape CSV, indexed by URL, parsing the two list columns with literal_eval."""
    # The list-valued columns are stored as text in the CSV; literal_eval
    # turns each cell back into a Python list on load.
    return pd.read_csv(
        csv_path,
        index_col='URL',
        converters={'LIST_REFERENCES': literal_eval, 'LIST_PARAGRAPH_TEXTS': literal_eval},
    )


df_wikipedia_english_fromCSV = _read_scrape_csv('./data/wiki_english.csv')
df_wikipedia_german_fromCSV = _read_scrape_csv('./data/wiki_german.csv')

In [10]:
# Show the English frame as loaded from the CSV file.
df_wikipedia_english_fromCSV

Unnamed: 0_level_0,TITLE,LIST_REFERENCES,LIST_PARAGRAPH_TEXTS
URL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"https://en.wikipedia.org/wiki/1,000,000,000_(number)",1000000000,"[https://en.wikipedia.org/wiki/Billion, https:...","[1,000,000,000 (one billion, short scale; one ..."
https://en.wikipedia.org/wiki/17th_parallel_north,17th parallel north,[https://en.wikipedia.org/wiki/Circle_of_latit...,[The 17th parallel north is a circle of latitu...
https://en.wikipedia.org/wiki/1947_Polish_legislative_election,1947 Polish legislative election,[https://en.wikipedia.org/wiki/Edward_Os%C3%B3...,"[\n, Edward Osóbka-Morawski\nPPS\n, Józef Cyra..."
https://en.wikipedia.org/wiki/1948_Czechoslovak_coup_d%27%C3%A9tat,1948 Czechoslovak coup d'état,[https://en.wikipedia.org/wiki/Communist_Party...,"[\n, In late February 1948, the Communist Part..."
https://en.wikipedia.org/wiki/1948_Italian_general_election,1948 Italian general election,[https://en.wikipedia.org/wiki/Alcide_De_Gaspe...,"[\n, Alcide De Gasperi\nChristian Democracy\n,..."
...,...,...,...
https://en.wikipedia.org/wiki/Yasser_Arafat,Yasser Arafat,[https://en.wikipedia.org/wiki/Help:IPA/Englis...,"[\n, Mohammed Abdel Rahman Abdel Raouf Arafat ..."
https://en.wikipedia.org/wiki/Yekaterinburg,Yekaterinburg,[https://en.wikipedia.org/wiki/Help:IPA/Englis...,"[\n, Yekaterinburg (/jɪˈkætərɪnbɜːrɡ/ yih-KAT-..."
https://en.wikipedia.org/wiki/Yugoslavia,Yugoslavia,[https://en.wikipedia.org/wiki/Geographic_coor...,"[\n, Coordinates: 44°49′N 20°27′E﻿ / ﻿44.817°N..."
https://en.wikipedia.org/wiki/Yuri_Andropov,Yuri Andropov,[https://en.wikipedia.org/wiki/Old_Style_and_N...,"[\n, Yuri Vladimirovich Andropov[a] (15 June [..."


In [11]:
# Show the German frame as loaded from the CSV file.
# NOTE(review): in the saved output below, many of the displayed rows have German
# article names under en.wikipedia.org URLs, an empty reference list and a single
# error-page paragraph. The German links look like they were resolved against the
# English host -- verify wiki_util.hyperlink_cleanup and re-scrape.
df_wikipedia_german_fromCSV

Unnamed: 0_level_0,TITLE,LIST_REFERENCES,LIST_PARAGRAPH_TEXTS
URL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
https://de.wikipedia.org/wiki/Kalter_Krieg,Kalter Krieg,[https://en.wikipedia.org/wiki/Westm%C3%A4chte...,[Als der Kalte Krieg wird der Konflikt zwische...
https://en.wikipedia.org/wiki/%C3%84gypten,Ägypten,[],[Other reasons this message may be displayed:\n]
https://en.wikipedia.org/wiki/%C3%84ra,Ära,[],[Other reasons this message may be displayed:\n]
https://en.wikipedia.org/wiki/%C3%96sterreich,Austria,[https://en.wikipedia.org/wiki/Geographic_coor...,[\nCoordinates: 47°20′N 13°20′E﻿ / ﻿47.333°N 1...
https://en.wikipedia.org/wiki/%C3%96sterreichische_Neutralit%C3%A4t,Österreichische Neutralität,[],[Other reasons this message may be displayed:\n]
...,...,...,...
https://en.wikipedia.org/wiki/Zentralverwaltungswirtschaft,Zentralverwaltungswirtschaft,[],[Other reasons this message may be displayed:\n]
https://en.wikipedia.org/wiki/Zerfall_der_Sowjetunion,Zerfall der Sowjetunion,[],[Other reasons this message may be displayed:\n]
https://en.wikipedia.org/wiki/Zivilperson,Zivilperson,[],[Other reasons this message may be displayed:\n]
https://en.wikipedia.org/wiki/Zweite_Polnische_Republik,Zweite Polnische Republik,[],[Other reasons this message may be displayed:\n]
