## Introduction and objectives

In order to analyze the article texts in Wikipedia without having to actively load data via the MediaWiki API each time, the article texts from a selected category within Wikipedia are to be accessed and stored as text files within a defined folder. In order to perform text analysis tasks, the article texts are to be cleaned of information irrelevant for the task (e.g. references, external links, etc.). The objectives are:
- Create a list with all articles within a defined subcategory in Wikipedia
- Access all articles within a selected subcategory via the MediaWiki-API
- Clean the article texts and remove irrelevant information
- Store the article texts within a textfile within a folder


IMPORTANT: Before running this notebook, insert your Wikipedia account login data in the cell under "Access article text" (In [11]) to access the Wikipedia API:

    'lgname': "[INSERT USERNAME HERE]", 
    'lgpassword': "[INSERT PASSWORD HERE]",

## B) Data Preparation

#### 1) Import libraries

In [1]:
import requests
import csv
from json import dumps
import re
import os

Variables

In [2]:
# Base directory: the current working directory of the notebook.
home_dir = os.getcwd()

# Relative output folders. They are built with os.path.join so the separator
# matches the host OS; a hard-coded '\\' only yields nested folders on Windows.
folder_path_cleaned = os.path.join('Data', 'cleaned')
folder_path_cleaned_articles = os.path.join(folder_path_cleaned, 'articles')
folder_path_cleaned_links = os.path.join(folder_path_cleaned, 'links')

# Absolute counterparts used by the download and read helpers.
abs_folder_path_cleaned = os.path.join(home_dir, folder_path_cleaned)
abs_folder_path_cleaned_articles = os.path.join(home_dir, folder_path_cleaned_articles)
abs_folder_path_cleaned_links = os.path.join(home_dir, folder_path_cleaned_links)

#### 2) Method definition


Get an overview of the relevant articles

In [3]:
def get_subcategories(category):
    """Return the names of the direct subcategories of a Wikipedia category.

    Queries the `categorymembers` list of the English Wikipedia API for
    ``Category:<category>`` and keeps the members in namespace 14, with the
    ``Category:`` prefix stripped from their titles.

    Args:
        category: Category name without the ``Category:`` prefix.

    Returns:
        A list of subcategory names (empty if the response holds no members).
    """
    url = "https://en.wikipedia.org/w/api.php"

    # Passing the query as a dict lets requests URL-encode the category name,
    # so names containing '&', '#', '+' etc. no longer corrupt the query.
    params = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": 500,
    }

    subcategories = []

    while True:
        data = requests.get(url, params=params).json()

        for member in data.get('query', {}).get('categorymembers', []):
            # Namespace 14 marks category pages
            if member['ns'] == 14:
                subcategories.append(member['title'].replace('Category:', ''))

        # One response holds at most `cmlimit` members; the API signals further
        # results with a 'continue' object whose entries are sent back as-is.
        if 'continue' not in data:
            break
        params.update(data['continue'])

    return subcategories

In [4]:
def get_articles_by_category(category):
    """Return the articles that are direct members of a Wikipedia category.

    Queries the `categorymembers` list of the English Wikipedia API for
    ``Category:<category>`` and keeps the members in namespace 0.

    Args:
        category: Category name without the ``Category:`` prefix.

    Returns:
        A dict mapping each article title to the `category` it was found in
        (empty if the response holds no members).
    """
    url = "https://en.wikipedia.org/w/api.php"

    # Passing the query as a dict lets requests URL-encode the category name,
    # so names containing '&', '#', '+' etc. no longer corrupt the query.
    params = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmlimit": 500,
    }

    articles = {}

    while True:
        data = requests.get(url, params=params).json()

        for member in data.get('query', {}).get('categorymembers', []):
            # Namespace 0 marks regular article pages
            if member['ns'] == 0:
                articles[member['title']] = category

        # One response holds at most `cmlimit` members; the API signals further
        # results with a 'continue' object whose entries are sent back as-is.
        if 'continue' not in data:
            break
        params.update(data['continue'])

    return articles

In [5]:
def get_articles_by_subcategory(subcategory):
    """Return the articles that are direct members of a subcategory.

    A subcategory is queried exactly like any other category, so this simply
    delegates to `get_articles_by_category` instead of repeating the request
    logic.

    Args:
        subcategory: Subcategory name without the ``Category:`` prefix.

    Returns:
        A dict mapping each article title to `subcategory`.
    """
    return get_articles_by_category(subcategory)

Specify method for purpose

In [6]:
def get_keys(dictionary):
    """Return the keys of `dictionary` as a list, in the mapping's own order."""
    return [key for key in dictionary.keys()]

In [7]:
def create_article_list(category):
    """Return the unique article titles of a category and its subcategories.

    Articles listed directly in `category` are collected first, followed by
    the articles of every subcategory returned by `get_subcategories`. Titles
    are dict keys while collecting, so each title appears only once.
    """
    titles_to_source = {}
    titles_to_source.update(get_articles_by_category(category))

    for subcategory in get_subcategories(category):
        titles_to_source.update(get_articles_by_subcategory(subcategory))

    return get_keys(titles_to_source)

In [8]:
def create_article_list_with_given_selection(category, subcategory_list):
    """Return the unique article titles for a category plus chosen subcategories.

    Args:
        category: Category whose direct articles are included; pass "" to
            skip the category lookup entirely.
        subcategory_list: Subcategory names whose articles are included.

    Returns:
        A list of article titles without duplicates.
    """
    titles_to_source = {}

    # An empty string means "no top-level category"
    if category != "":
        titles_to_source.update(get_articles_by_category(category))

    for subcategory in subcategory_list:
        titles_to_source.update(get_articles_by_subcategory(subcategory))

    return get_keys(titles_to_source)

Cleaning the article texts

In [9]:
def remove_references(string):
    """Strip reference markup and similar noise from raw article text.

    Removes every match of the alternatives listed below and then cuts the
    text at the first "{{Refbegin}}" marker, which drops the reference list.

    Args:
        string: Article text (wikitext, as produced by `convert_tostring`).

    Returns:
        The text without the matched markup. If no "{{Refbegin}}" marker is
        present the text is returned in full.
    """
    # One alternative per line; the concatenation is a single pattern.
    noise_pattern = (
        r"(<ref.*?>)"                # opening / self-closing <ref ...> tags
        r"|(<\/ref>)"                # closing </ref> tags
        r"|(\[\[File:.*?\]\])"       # embedded files
        r"|(\[\[Category:.*?\]\])"   # category links
        r"|(\\n?\*?)"                # a backslash, optionally followed by 'n' and '*'
        r"|(Help:List.*?)"
        r"|(wikt:.*?)"
        r"|(\{\{reflist.+\]\})"
        r"|('''.*?)"                 # bold markers
        r"|(<math>.*?<\/math>)"      # formulas
        r"|({pages.*?contentmodel: wikitext, \*: )"
    )
    red_string = re.sub(noise_pattern, "", string)

    # Additionally remove the reference list. partition() keeps the whole text
    # when the marker is missing; slicing with find()'s -1 result would cut
    # off the final character instead.
    result = red_string.partition("{{Refbegin}}")[0]

    return result

In [10]:
def clean_special_characters(input_string):
    """Replace or drop undesired character sequences in article text.

    The replacements are applied one after another in the order listed, so
    earlier entries can affect what later entries see.

    Note: "[[" and "]]" are deliberately left in place. The link extraction
    in `extract_nodes_from_article_list` runs `extract_links` on the output
    of this function and relies on those markers.

    Args:
        input_string: Text to clean.

    Returns:
        The cleaned text.
    """
    # Ordered (old, new) pairs
    replacements = (
        ('"\n', ''),  # relevant to avoid problem with "Twiddler's syndrome"
        ("\\\\", ""),
        ("u2013", "-"),
        ("u00f6", "ö"),
        ("u00e4", "ä"),
        # "u00fc" has to be handled before the bare "00fc" rule; the other way
        # round the bare rule consumes it first and leaves a stray "u" ("uü").
        ("u00fc", "ü"),
        ("00fc", "ü"),
        ("u00e9", "é"),
        ("u00e2", "â"),
        ("u00e8", "è"),
        ("u00e0", "à"),
        ("u00f3", "ó"),
        ("u00e1", "á"),
        ("u00f8", "ø"),
        ('"', ''),
    )

    formatted_string = input_string
    for old, new in replacements:
        formatted_string = formatted_string.replace(old, new)

    return formatted_string

Access article text

In [11]:
def get_article_text(title):
    """Fetch the current revision content of one article from the MediaWiki API.

    Each call opens its own session, requests a login token, logs in with the
    credentials entered below and then queries the revision content of
    `title`.

    Args:
        title: Title of the article to query.

    Returns:
        The parsed JSON response of the content query (a dict).
    """
    url = "https://en.wikipedia.org/w/api.php"

    # The session is used as a context manager so its pooled connections are
    # released when the call ends; this function runs once per article.
    with requests.Session() as session:

        # Step 1: request a login token
        token_params = {
            'action': "query",
            'meta': "tokens",
            'type': "login",
            'format': "json"
        }
        token_data = session.get(url=url, params=token_params).json()
        login_token = token_data['query']['tokens']['logintoken']

        # Step 2: log in (insert your account data here)
        login_params = {
            'action': "login",
            'lgname': "[INSERT USERNAME HERE]", 
            'lgpassword': "[INSERT PASSWORD HERE]",
            'lgtoken': login_token,
            'format': "json"
        }
        # The login reply is parsed but not inspected; a failed login is not
        # detected here.
        session.post(url, data=login_params).json()

        # Step 3: get article text
        content_params = {
            "action": "query",
            "prop": "revisions",
            "titles": title,
            "rvprop": "content",
            "format": "json"
        }
        content_response = session.get(url, params=content_params).json()

    return content_response

In [12]:
def convert_tostring(dictionary):
    """Serialize the 'query' entry of an API response dict to a JSON string."""
    return dumps(dictionary['query'])

Store extracted article text as text files within defined folder

In [13]:
def extract_text_and_save_data(links_list, abs_folder_path_cleaned):
    """Download, clean and store the text of every article in `links_list`.

    For each title the article is fetched, serialized to a string, stripped
    of references and special characters, and written to
    "<abs_folder_path_cleaned>/<title>.txt". A failed write is reported on
    stdout and the loop continues with the next title.

    Args:
        links_list: Iterable of article titles.
        abs_folder_path_cleaned: Target folder for the text files.
    """
    for title in links_list:
        raw_text = convert_tostring(get_article_text(title))
        cleaned_text = clean_special_characters(remove_references(raw_text))

        # Save article text under the article's title
        try:
            target_file = "/".join((abs_folder_path_cleaned, title + ".txt"))
            with open(target_file, 'w') as handle:
                handle.write(cleaned_text)
        except OSError:
            # Typically raised when the title is not a valid file name
            print("Fehler beim Schreiben der Datei: " + title)

In [14]:
def read_article_text(file_name, folder_path):
    """Return the content of "<folder_path>/<file_name>.txt" as a string.

    If no such file exists, a message is printed and an empty string is
    returned.
    """
    # File path incl. ending .txt
    file_path = os.path.join(folder_path, file_name) + ".txt"

    if not os.path.isfile(file_path):
        # The article is not within the folder - report and return an empty string
        print(f"Die Datei '{file_name}' wurde nicht im Ordner '{folder_path}' gefunden.")
        return ""

    with open(file_path, 'r') as handle:
        return handle.read()

Extract links from article text

In [15]:
def extract_links(string):
    """Return the targets of all '[[...]]' links in `string`, in text order.

    For a masked link such as '[[Target|shown text]]' only the part before
    the first '|' is kept. An opening '[[' without a later ']]' ends the scan.
    """
    # Lazy match from each '[[' to the nearest following ']]'; DOTALL lets a
    # link span line breaks.
    link_pattern = re.compile(r"\[\[(.*?)\]\]", flags=re.DOTALL)

    return [
        match.group(1).partition("|")[0]
        for match in link_pattern.finditer(string)
    ]

In [16]:
def remove_empty_entries(entry_list):
    """Remove every empty string from `entry_list` in place and return it.

    Args:
        entry_list: List to clean; it is modified in place.

    Returns:
        The same list object, without "" entries.
    """
    # Single pass with slice assignment keeps the caller's list object, and
    # avoids rescanning the list once per removed entry.
    entry_list[:] = [entry for entry in entry_list if entry != ""]

    return entry_list

In [17]:
def clean_multiple_entries(entry_list):
    """Return the entries of `entry_list` without duplicates.

    A node is unique within the linkage graph, so repeated entries are
    dropped. The first occurrence of each entry keeps its position, so the
    result is the same on every run. This matters when a later step resumes
    from a list index or keeps the first of several spellings.

    Args:
        entry_list: Iterable of hashable entries.

    Returns:
        A new list with each entry once, in order of first appearance.
    """
    # dict keys are unique and keep insertion order
    unique_list = list(dict.fromkeys(entry_list))

    return unique_list

In [18]:
def remove_duplicate_variables(string_list):
    """Return the entries of `string_list` that map to distinct neo4j variables.

    A link may appear as "Disease" or "disease"; both derive the same neo4j
    variable, so only a single node should be created for them. Of several
    entries sharing a variable name, the first one encountered is kept.
    """
    first_seen = {}

    for candidate in string_list:
        # setdefault stores the candidate only if its variable name is new
        first_seen.setdefault(convert_to_neo4j_variable(candidate), candidate)

    return list(first_seen.values())

In [19]:
def convert_to_neo4j_variable(string):
    """Convert an extracted link text into a neo4j variable name.

    Every character other than ASCII letters, digits and '_' becomes '_', a
    leading digit is prefixed with '_', and the result is lower-cased.

    Args:
        string: Link text or article title.

    Returns:
        The derived variable name; an empty input yields an empty string.
    """
    # Replace all characters not admitted for a variable name
    cleaned_string = re.sub(r'[^a-zA-Z0-9_]', '_', string)

    # If the first character is a digit, add "_". Slicing instead of indexing
    # keeps an empty string from raising IndexError.
    if cleaned_string[:1].isdigit():
        cleaned_string = '_' + cleaned_string

    # Variable names are compared case-insensitively, so normalise to lower case
    return cleaned_string.lower()

In [20]:
def extract_nodes_from_article_list(articles_list):
    """Collect the unique graph nodes for all articles in `articles_list`.

    For every article the cleaned title is added as a node, then the stored
    text is read from `abs_folder_path_cleaned_articles`, cleaned, and all
    link targets found in it are added. Finally duplicates, empty entries
    and entries sharing a neo4j variable name are removed.

    Args:
        articles_list: Iterable of article titles (also the stored file names).

    Returns:
        A list of unique node names.
    """
    collected = []

    for title in articles_list:
        # The article itself is a node -> cleaned title
        collected.append(clean_special_characters(title))

        # Stored text -> without references -> without special characters
        stored_text = read_article_text(title, abs_folder_path_cleaned_articles)
        prepared_text = clean_special_characters(remove_references(stored_text))

        # Every link target in the text is a node as well
        collected.extend(extract_links(prepared_text))

    # Each node is unique: drop repeats, then empty entries, then entries that
    # would derive the same neo4j variable
    deduplicated = clean_multiple_entries(collected)
    non_empty = remove_empty_entries(deduplicated)

    return remove_duplicate_variables(non_empty)

#### 3) Main-Method - Extraction of article titles and article texts

This variable is critical: Enter the name(s) of the categories to download here

In [21]:
# Subcategories whose articles are downloaded; the empty category string
# means that no top-level category is queried in addition
subcat_list_cat = ['Radioactive contamination', 'Pollutants']
article_list = create_article_list_with_given_selection(
    category="",
    subcategory_list=subcat_list_cat,
)

In [22]:
# Save the list of article titles as one fully quoted CSV row
# (the file keeps its .txt extension)
list_dir = os.path.join(abs_folder_path_cleaned, "list")
os.makedirs(list_dir, exist_ok=True)  # open() fails if the folder is missing
with open(os.path.join(list_dir, "Article_list_Pollution_n+1" + ".txt"), 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(article_list)

In [23]:
# Download every article in the list, clean its text and store it in the articles folder
extract_text_and_save_data(
    links_list=article_list,
    abs_folder_path_cleaned=abs_folder_path_cleaned_articles,
)

c) Extract all links within the article texts of articles from 'article_list_pollution.txt' and save it as list 'article_list_pollution_n+1'

In [24]:
# Unique node names: every article title plus all link targets found in the stored article texts
nodes_list = extract_nodes_from_article_list(article_list)

In [25]:
# Number of unique nodes collected
len(nodes_list)

3911

In [31]:
# Save the list of node names (article titles plus link targets) as one
# fully quoted CSV row (the file keeps its .txt extension)
list_dir = os.path.join(abs_folder_path_cleaned, "list")
os.makedirs(list_dir, exist_ok=True)  # open() fails if the folder is missing
with open(os.path.join(list_dir, "Article_list_Pollution_n+1_links" + ".txt"), 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(nodes_list)

In [32]:
# Download every node in the list, clean its text and store it in the links folder
extract_text_and_save_data(
    links_list=nodes_list,
    abs_folder_path_cleaned=abs_folder_path_cleaned_links,
)

Fehler beim Schreiben der Datei: :fr :Association pour le contru00f4le de la radioactivité dans l'Ouest
Fehler beim Schreiben der Datei: creativecommons:by/4.0/
Fehler beim Schreiben der Datei: :File:A RES 71 313 E.pdf
Fehler beim Schreiben der Datei: :File:N1529189.pdf
Fehler beim Schreiben der Datei: Bofors 40 mm Automatic Gun L/60
Fehler beim Schreiben der Datei: HIV/AIDS in the United States
Fehler beim Schreiben der Datei: : dilute
Fehler beim Schreiben der Datei: :clumping
Fehler beim Schreiben der Datei: MARPOL 73/78
Fehler beim Schreiben der Datei: :dilute
Fehler beim Schreiben der Datei: R/V
Fehler beim Schreiben der Datei: R/V Ocean Starr
Fehler beim Schreiben der Datei: MARPOL 73/78#Annexes


e) If exceptions occur due to the large number of articles, restart the method at the point where it failed

In [72]:
# Continue after an error: resume the download from index 12000 of nodes_list
# (adjust the index to the point where the previous run stopped).
# TODO: possibly add exception handling so the run does not have to be restarted by hand
extract_text_and_save_data(nodes_list[12000:], abs_folder_path_cleaned_links)

Fehler beim Schreiben der Datei: :fr :Association pour le contru00f4le de la radioactivité dans l'Ouest
Fehler beim Schreiben der Datei: ISO/IEC 17025


####  4) Testing and post preparation

The articles "HIV/AIDS" and "Opitz G/BBB syndrome" have been downloaded and stored as text files with the name "HIVAIDS" and "OPITZ GBBB syndrome" in the defined folder. A post processing of the article's text is necessary.

In [17]:
def read_article_text(file_name, folder_path):
    """Return the content of "<folder_path>/<file_name>.txt" as a string.

    If no such file exists, a message is printed and an empty string is
    returned.
    """
    # File path incl. ending .txt
    file_path = os.path.join(folder_path, file_name) + ".txt"

    if not os.path.isfile(file_path):
        # The article is not within the folder - report and return an empty string
        print(f"Die Datei '{file_name}' wurde nicht im Ordner '{folder_path}' gefunden.")
        return ""

    with open(file_path, 'r') as handle:
        return handle.read()

In [18]:
# Read the stored text of "HIV/AIDS" (file "HIVAIDS.txt") from the cleaned-data root folder
hivaids_new = read_article_text("HIVAIDS",abs_folder_path_cleaned )

In [19]:
# Read the stored text of "Opitz G/BBB syndrome" (file "Opitz GBBB syndrome.txt") from the cleaned-data root folder
opitz_new = read_article_text("Opitz GBBB syndrome",abs_folder_path_cleaned )

In [20]:
# Clean the article text for "HIV/AIDS": remove references first, then special characters
# (the cleaned string is written to the file "HIVAIDS" in a later cell)
hivaids_str_without_ref = remove_references(hivaids_new)
hivaids_str_clean =  clean_special_characters(hivaids_str_without_ref)    

In [21]:
# Clean the article text for "Opitz G/BBB syndrome": remove references first, then special characters
# (the cleaned string is written to the file "Opitz GBBB syndrome" in a later cell)
opitz_str_without_ref = remove_references(opitz_new)
opitz_str_clean =  clean_special_characters(opitz_str_without_ref)    

In [22]:
# Write the cleaned "HIV/AIDS" text back to the file "HIVAIDS.txt";
# a failed write is only reported, not raised
data_path = "/".join((abs_folder_path_cleaned, "HIVAIDS" + ".txt"))
try:
    with open(data_path, 'w') as data:
        data.write(hivaids_str_clean)
except OSError:
    print("Fehler beim Schreiben der Datei: " + "HIVAIDS")

In [23]:
# Write the cleaned "Opitz G/BBB syndrome" text back to the file
# "Opitz GBBB syndrome.txt"; a failed write is only reported, not raised
data_path = "/".join((abs_folder_path_cleaned, "Opitz GBBB syndrome" + ".txt"))
try:
    with open(data_path, 'w') as data:
        data.write(opitz_str_clean)
except OSError:
    print("Fehler beim Schreiben der Datei: " + "Opitz GBBB syndrome")

The articles "HIV/AIDS" and "Opitz G/BBB syndrome" have been downloaded and stored as text files with the name "HIVAIDS" and "OPITZ GBBB syndrome" in the defined folder. For the Creation of the linkage graph, two exceptions will be implemented for these articles.