In [1]:
!pip install wikipedia setuptools pywikibot mwparserfromhell pandas numpy scipy nltk tqdm seaborn pyqt5 pyqtwebengine ruamel-yaml lxml datetime

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pywikibot
  Downloading pywikibot-9.5.0-py3-none-any.whl.metadata (17 kB)
Collecting mwparserfromhell
  Downloading mwparserfromhell-0.6.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Collecting pyqt5
  Downloading PyQt5-5.15.11-cp38-abi3-manylinux_2_17_x86_64.whl.metadata (2.1 kB)
Collecting pyqtwebengine
  Downloading PyQtWebEngine-5.15.7-cp38-abi3-manylinux_2_17_x86_64.whl.metadata (1.8 kB)
Collecting ruamel-yaml
  Downloading ruamel.yaml-0.18.6-py3-none-any.whl.metadata (23 kB)
Collecting datetime
  Downloading DateTime-5.5-py3-none-any.whl.metadata (33 kB)
Collecting PyQt5-sip<13,>=12.15 (from pyqt5)
  Downloading PyQt5_sip-12.15.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (421 bytes)
Collecting PyQt5-Qt5<5.16.0,>=5.15.2 (from pyqt5)
  Downloading PyQt5_Qt5-5.15.15-py3-none-manylinux2014_x86_64.whl.m

In [3]:
import wikipedia
import re
import pywikibot
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import seaborn as sns
import matplotlib.pyplot as plt
from urllib.request import urlopen
import json
from datetime import datetime

RuntimeError: No user-config.py found in directory '/content'.

Please check that user-config.py is stored in the correct location.
Directory where user-config.py is searched is determined as follows:

    Return the directory in which user-specific information is stored.

    This is determined in the following order:
     1.  If the script was called with a `-dir:` argument, use the
         directory provided in this argument.
     2.  If the user has a `PYWIKIBOT_DIR` environment variable, use the
         value of it.
     3.  If `user-config` is present in current directory, use the
         current directory.
     4.  If `user-config` is present in `pwb.py` directory, use that
         directory
     5.  Use (and if necessary create) a `'pywikibot'` folder under
         `'Application Data'` or `'AppData\Roaming'` (Windows) or
         `'.pywikibot'` directory (Unix and similar) under the user's
         home directory.

    Set `PYWIKIBOT_NO_USER_CONFIG=1` to disable loading user config file
    (`user-config.py`) or install Pywikibot as a site-package.

    .. versionchanged:: 7.7
       Added the *config_file* parameter.

    :param test_directory: Assume that a user config file exists in this
        directory. Used to test whether placing a user config file in this
        directory will cause it to be selected as the base directory.
    :param config_file: filename of the user config file
    

In [None]:
wikipedia.set_lang("en")  # Point the wikipedia library at the English-language Wikipedia for all later calls

# Creating a manual corpus by providing a list of Wikipedia article titles

In [4]:
def manual_corpus_creation(article_titles):
    """
    Keep only the titles that resolve to a single, existing Wikipedia article.

    Each title is stripped of surrounding whitespace and probed with
    ``wikipedia.summary`` (auto-suggestion disabled). Titles that raise
    ``PageError`` or ``DisambiguationError`` are reported on stdout and
    left out of the result.

    :param article_titles: Iterable of article titles to validate.
    :returns: The stripped titles that passed the check, in input order.
    :rtype: list
    """
    validated_titles = []

    for raw_title in article_titles:
        title = raw_title.strip()
        try:
            # The summary text is discarded; the call only serves as an existence check.
            wikipedia.summary(title, auto_suggest=False)
        except wikipedia.exceptions.PageError:
            print(f"Article '{title}' not found on Wikipedia.")
        except wikipedia.exceptions.DisambiguationError:
            print(f"Article '{title}' is ambiguous. Skipping.")
        else:
            validated_titles.append(title)

    return validated_titles

# Example usage: validate a small hand-picked list of titles.
# `manual_corpus` is the list of accepted titles; it is passed to crea_dataframe further down.
article_titles = ["Python (programming language)", "Artificial intelligence", "Quantum computing"]
manual_corpus = manual_corpus_creation(article_titles)
print(f"Your manually created corpus: {manual_corpus}")

Your manually created corpus: ['Python (programming language)', 'Artificial intelligence', 'Quantum computing']


## Creation of a DataFrame with information on the pages

In [5]:
# Matches one heading line of the plain-text article, e.g. "== History ==" or "=== Early years ===".
# Group 1 is the run of "=" (its length is the heading level); group 2 is the heading title.
_SECTION_HEADING_RE = re.compile(r'^(={2,6})[ \t]*(.+?)[ \t]*\1[ \t]*$', re.MULTILINE)


def _extract_keyword_sections(page_text: str, keyword: str) -> list:
    """Return the bodies of all sections whose heading title contains ``keyword`` (case-sensitive)."""
    headings = list(_SECTION_HEADING_RE.finditer(page_text))
    bodies = []
    covered_until = 0  # end offset of the last captured section

    for position, heading in enumerate(headings):
        # A heading nested inside an already captured section would only duplicate text.
        if heading.start() < covered_until:
            continue
        if keyword not in heading.group(2):
            continue

        level = len(heading.group(1))
        # A section runs until the next heading of the same or a higher level
        # (i.e. with at most as many "=" signs), or to the end of the article.
        section_end = len(page_text)
        for later in headings[position + 1:]:
            if len(later.group(1)) <= level:
                section_end = later.start()
                break

        bodies.append(page_text[heading.end():section_end].strip())
        covered_until = section_end

    return bodies


def crea_dataframe(search_list: list, keyword: str, section_search=False) -> pd.DataFrame:
    """
    Create a DataFrame from Wikipedia articles in the search list.

    Articles that cannot be loaded (missing page, disambiguation page, or any
    other error) are reported on stdout and skipped.

    :param search_list: List containing the name of all the articles.
    :param keyword: String containing the keyword to search within section headings.
        Only used when ``section_search`` is True; the match is case-sensitive.
    :param section_search: If True, keep only the text of the sections (at any heading
        level, including their subsections) whose heading contains the keyword. When no
        heading matches, the text is "Section with the given keyword not found.".
    :type section_search: bool
    :returns: DataFrame with the columns 'Name only', 'page url' and 'text', one row per
        article that could be loaded. The columns are present even when no row is.
    :rtype: pd.DataFrame
    """
    tableau = []  # One dictionary per successfully loaded page

    for title in tqdm(search_list):
        try:
            page = wikipedia.page(title, auto_suggest=False)
            page_text = page.content  # Full plain-text content of the article

            if section_search:
                matching_sections = _extract_keyword_sections(page_text, keyword)
                if matching_sections:
                    page_text = '\n'.join(matching_sections)
                else:
                    page_text = "Section with the given keyword not found."

            tableau.append({'Name only': page.title, 'page url': page.url, 'text': page_text})

        except wikipedia.exceptions.PageError:
            print(f"PageError: {title} not found.")
        except wikipedia.exceptions.DisambiguationError:
            print(f"DisambiguationError: {title} is ambiguous.")
        except Exception as e:
            print(f"Error processing {title}: {e}")

    # Explicit columns keep downstream column lookups working on an empty result.
    return pd.DataFrame(tableau, columns=['Name only', 'page url', 'text'])

# Assuming you have already created manual_corpus using manual_corpus_creation()
# You can then call the function like this:
# NOTE: `keyword` is also read as a global by save_dataframe() when it builds the output file name.
keyword = "Technology"  # Example keyword to search within sections
# With section_search=False the keyword is not used and the full article text is kept.
df = crea_dataframe(manual_corpus, keyword, section_search=False)
df

NameError: name 'pd' is not defined

In [None]:
# prompt: Please explain what the section_search keyword argument means in the chunk of code above. Does it mean that if I put True instead of False, I can search by the keyword in the section names? And if it's False, then this search isn't conducted, right?

Okay, let's break down the `section_search` parameter in your code.

**Explanation**

- **Purpose:** The `section_search` parameter controls whether your function will look for a specific keyword within section names within the Wikipedia page.

- **Behavior:**
   - If `section_search` is set to `True`, the function will try to find sections within the Wikipedia page whose names contain the specified `keyword`.
   - If such a section is found, its content (`page_text`) will be updated to contain only the extracted content of that specific section.
   - If `section_search` is set to `False`, the function will skip this step, and `page_text` will contain the entire content of the Wikipedia article.

**Example**
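- If you set `section_search=True` and your `keyword` is "History", the function will try to find a section title that contains "History" exactly as written, such as "History of..." or "Early History". The match is case-sensitive, so a title like "The history of..." would not match. If it finds such a section, it will collect the content specifically belonging to that section.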
- If you set `section_search=False`, the entire content of the Wikipedia page will be captured in the `page_text`.

**In essence:**

- `section_search=True`: Extract content from sections whose names contain the `keyword`.
- `section_search=False`: Extract the entire Wikipedia page content without considering the `keyword` for section filtering.

**Your understanding is correct:** Setting `section_search` to `True` will enable the search for the `keyword` within the section names and the extraction of the corresponding sections' content. If it's `False`, this search won't be conducted.

I hope this clarifies the role of the `section_search` parameter. Let me know if you have any further questions.

## Adding WikiData page links

In [None]:
def get_wikidata_url(wikipedia_url):
    """
    Return the Wikidata page URL of the item linked to an English Wikipedia article.

    :param wikipedia_url: URL of the article; the title is taken from the part after '/wiki/'.
    :returns: 'https://www.wikidata.org/wiki/<entity id>', or '' when the URL is
        missing/empty or the Wikidata API reports no matching entity.
    :rtype: str
    """
    # Local import keeps this cell self-contained (only urllib.request is imported at the top).
    from urllib.parse import unquote

    if pd.isna(wikipedia_url) or not wikipedia_url:
        return ''

    # Decode the percent-escapes of the URL once; requests re-encodes the value
    # correctly when it builds the query string from `params`.
    title = unquote(wikipedia_url.split('/wiki/')[-1])
    response = requests.get(
        'https://www.wikidata.org/w/api.php',
        params={'action': 'wbgetentities', 'sites': 'enwiki', 'titles': title, 'format': 'json'},
        timeout=30,  # requests waits indefinitely unless a timeout is given
    )
    entities = response.json().get('entities')
    if entities:
        entity_id = next(iter(entities))
        # The API keys an unknown title as '-1'.
        if entity_id != '-1':
            return f'https://www.wikidata.org/wiki/{entity_id}'
    return ''

# Work on a copy so that `df` itself is left without the extra column.
df_WD = df.copy()
# Look up the Wikidata URL for each row from its Wikipedia URL (one get_wikidata_url call per row).
df_WD['wikidata_url'] = df_WD['page url'].apply(get_wikidata_url)

# Display the enriched frame.
df_WD

Unnamed: 0,Name only,page url,text,wikidata_url
0,Python (programming language),https://en.wikipedia.org/wiki/Python_(programm...,"Python is a high-level, general-purpose progra...",https://www.wikidata.org/wiki/Q28865
1,Artificial intelligence,https://en.wikipedia.org/wiki/Artificial_intel...,"Artificial intelligence (AI), in its broadest ...",https://www.wikidata.org/wiki/Q11660
2,Quantum computing,https://en.wikipedia.org/wiki/Quantum_computing,A quantum computer is a computer that exploits...,https://www.wikidata.org/wiki/Q17995793


## Adding the Creation Dates of Wikipedia and Wikidata pages

In [None]:
# Function to get Wikipedia article creation date
def _first_revision_timestamp(api_url, title):
    """
    Return the timestamp of the oldest revision of ``title`` on the given MediaWiki API.

    :param api_url: URL of the wiki's ``api.php`` endpoint.
    :param title: Page title, including any namespace prefix.
    :returns: The '...Z' timestamp parsed into a naive ``datetime``, or None when the
        response contains no revision for the title.
    """
    response = requests.get(
        api_url,
        params={
            'action': 'query',
            'prop': 'revisions',
            'rvlimit': 1,
            'rvdir': 'newer',  # oldest revision first, so the single result is the creation
            'titles': title,
            'format': 'json',
        },
        timeout=30,  # requests waits indefinitely unless a timeout is given
    )
    pages = response.json().get('query', {}).get('pages', {})
    for page in pages.values():
        revisions = page.get('revisions')
        if revisions:
            return datetime.strptime(revisions[0]['timestamp'], '%Y-%m-%dT%H:%M:%SZ')
    return None


def get_wikipedia_creation_date(page_url):
    """
    Return the creation date (first revision) of an English Wikipedia article.

    :param page_url: URL of the article; the title is everything after '/wiki/',
        so titles that themselves contain '/' are kept whole.
    :returns: ``datetime`` of the first revision, or None when the URL is
        missing/empty or no revision is returned.
    """
    # Local import keeps this cell self-contained (only urllib.request is imported at the top).
    from urllib.parse import unquote

    if pd.isna(page_url) or not page_url:
        return None
    page_title = unquote(page_url.split('/wiki/', 1)[-1])
    return _first_revision_timestamp('https://en.wikipedia.org/w/api.php', page_title)


def get_wikidata_creation_date(wikidata_url):
    """
    Return the creation date (first revision) of a Wikidata item.

    :param wikidata_url: URL of the item page; the entity ID is its last path segment.
    :returns: ``datetime`` of the first revision, or None when the URL is
        missing/empty or no revision is returned.
    """
    if pd.isna(wikidata_url) or not wikidata_url:
        return None
    entity_id = wikidata_url.split('/')[-1]
    return _first_revision_timestamp('https://www.wikidata.org/w/api.php', f'Item:{entity_id}')

# Work on a copy so that df_WD is left without the date columns.
df_DOB = df_WD.copy()
# Add new columns for creation dates (each .apply makes one API call per row)
df_DOB['Wikipedia Creation Date'] = df_DOB['page url'].apply(get_wikipedia_creation_date)
df_DOB['Wikidata Creation Date'] = df_DOB['wikidata_url'].apply(get_wikidata_creation_date)

# Display the enriched frame.
df_DOB

Unnamed: 0,Name only,page url,text,wikidata_url,Wikipedia Creation Date,Wikidata Creation Date
0,Python (programming language),https://en.wikipedia.org/wiki/Python_(programm...,"Python is a high-level, general-purpose progra...",https://www.wikidata.org/wiki/Q28865,2001-10-29 18:24:39,2012-11-12 05:29:01
1,Artificial intelligence,https://en.wikipedia.org/wiki/Artificial_intel...,"Artificial intelligence (AI), in its broadest ...",https://www.wikidata.org/wiki/Q11660,2001-10-08 16:55:49,2012-11-04 04:03:34
2,Quantum computing,https://en.wikipedia.org/wiki/Quantum_computing,A quantum computer is a computer that exploits...,https://www.wikidata.org/wiki/Q17995793,2001-10-21 16:10:53,2014-09-06 23:39:30


## Adding the first level of WD properties

In [None]:
# Name of the DataFrame column that holds the Wikidata URL of each article.
# It is used below to select the column passed to fetch_wikidata_properties.
wikidata_url_column = 'wikidata_url'  # Update this if the column name is different

# Function to fetch Wikidata properties
def fetch_wikidata_properties(wikidata_url):
    """
    Fetch the first "instance of", "part of" and "subclass of" value of a Wikidata item.

    :param wikidata_url: URL of the item page; the entity ID is the part after '/wiki/'.
    :returns: Tuple ``(instance_of, part_of, subclass_of)`` of entity IDs taken from the
        first P31, P361 and P279 claim respectively. A position is '' when the item has
        no such claim, and all three are '' when the URL is missing or empty.
    :rtype: tuple
    """
    # Same placeholder type as the "claim absent" case, so the resulting columns stay
    # plain strings and remain hashable for later de-duplication.
    if pd.isna(wikidata_url) or not wikidata_url:
        return '', '', ''

    entity_id = wikidata_url.split('/wiki/')[-1]
    url = f'https://www.wikidata.org/wiki/Special:EntityData/{entity_id}.json'
    response = requests.get(url, timeout=30)  # requests waits indefinitely unless a timeout is given
    claims = response.json()['entities'][entity_id]['claims']

    def first_item_id(property_id):
        """Return the entity ID of the first claim for ``property_id``, or ''."""
        statements = claims.get(property_id) if isinstance(claims, dict) else None
        if not statements:
            return ''
        value = statements[0].get('mainsnak', {}).get('datavalue', {}).get('value', {})
        return value.get('id', '') if isinstance(value, dict) else ''

    return first_item_id('P31'), first_item_id('P361'), first_item_id('P279')

# Work on a copy so that df_WD is left without the property columns.
df_first_level = df_WD.copy()

# Fetch properties for each Wikidata item and add them to the DataFrame.
# Wrapping the returned 3-tuple in a pd.Series makes .apply produce three columns,
# which are assigned to the three new column names in order.
df_first_level[['instance_of', 'part_of', 'subclass_of']] = df_first_level[wikidata_url_column].apply(
    lambda url: pd.Series(fetch_wikidata_properties(url)))

# Display the enriched frame.
df_first_level

Unnamed: 0,Name only,page url,text,wikidata_url,instance_of,part_of,subclass_of
0,Python (programming language),https://en.wikipedia.org/wiki/Python_(programm...,"Python is a high-level, general-purpose progra...",https://www.wikidata.org/wiki/Q28865,Q899523,,
1,Artificial intelligence,https://en.wikipedia.org/wiki/Artificial_intel...,"Artificial intelligence (AI), in its broadest ...",https://www.wikidata.org/wiki/Q11660,Q268592,,Q21198
2,Quantum computing,https://en.wikipedia.org/wiki/Quantum_computing,A quantum computer is a computer that exploits...,https://www.wikidata.org/wiki/Q17995793,Q11862829,,Q12525525


#### Retrieving the whole hierarchy of "subclass of"

In [None]:
# Derive a 'wikidata_id' column from the Wikidata URLs stored in df_WD.
# The URLs are first forced to str so the regex search can run on every cell.
df_WD['wikidata_url'] = df_WD['wikidata_url'].astype(str)


def _first_qid_or_none(url_text):
    """Return the first ``Q<digits>`` token found in ``url_text``, or None when there is none."""
    qid_match = re.search(r'Q\d+', url_text)
    return qid_match.group() if qid_match else None


df_WD['wikidata_id'] = df_WD['wikidata_url'].apply(_first_qid_or_none)


# Function to get the "subclass of" hierarchy for a given Wikidata item ID (iterative approach)
def get_subclass_of_hierarchy(item_id):
    """
    Collect every item reachable from ``item_id`` through "subclass of" (P279) claims.

    The claims are followed transitively with an iterative depth-first search,
    issuing one ``wbgetentities`` request per visited item.

    :param item_id: Wikidata entity ID to start from (e.g. 'Q11660').
    :returns: Entity IDs in the order they were first discovered. The start item is
        never included, even when the claims form a cycle. Empty list when
        ``item_id`` is empty/None or the item has no P279 claim.
    :rtype: list
    """
    if not item_id:
        return []

    hierarchy = []
    # The start item counts as visited, so a cycle can neither list it as its own
    # ancestor nor trigger a second request for it.
    seen = {item_id}
    stack = [item_id]
    while stack:
        current_id = stack.pop()
        response = requests.get(
            "https://www.wikidata.org/w/api.php",
            params={'action': 'wbgetentities', 'ids': current_id, 'format': 'json', 'props': 'claims'},
            timeout=30,  # requests waits indefinitely unless a timeout is given
        ).json()
        if 'entities' not in response or current_id not in response['entities']:
            continue
        claims = response['entities'][current_id].get('claims', {})
        if 'P279' not in claims:  # P279 is "subclass of"
            continue
        for claim in claims['P279']:
            parent_id = claim['mainsnak'].get('datavalue', {}).get('value', {}).get('id')
            # Claims without a concrete item value yield no ID and are skipped.
            if parent_id and parent_id not in seen:
                seen.add(parent_id)
                hierarchy.append(parent_id)
                stack.append(parent_id)
    return hierarchy

# One "subclass of" ancestor list per entry of df_WD['wikidata_id'];
# entries without an ID get an empty list instead of an API lookup.
results = [
    get_subclass_of_hierarchy(qid) if qid else []
    for qid in tqdm(df_WD['wikidata_id'], desc='Processing', unit='item')
]

# Attach the lists as a new column on a copy of df_DOB
df_subclass_of = df_DOB.copy()
df_subclass_of['subclass_of_hierarchy'] = results

df_subclass_of

Processing: 100%|███████████████████████████████| 3/3 [00:50<00:00, 16.86s/item]


Unnamed: 0,Name only,page url,text,wikidata_url,Wikipedia Creation Date,Wikidata Creation Date,subclass_of_hierarchy
0,Python (programming language),https://en.wikipedia.org/wiki/Python_(programm...,"Python is a high-level, general-purpose progra...",https://www.wikidata.org/wiki/Q28865,2001-10-29 18:24:39,2012-11-12 05:29:01,[]
1,Artificial intelligence,https://en.wikipedia.org/wiki/Artificial_intel...,"Artificial intelligence (AI), in its broadest ...",https://www.wikidata.org/wiki/Q11660,2001-10-08 16:55:49,2012-11-04 04:03:34,"[Q21198, Q120208, Q1156402, Q7048977, Q9420, Q..."
2,Quantum computing,https://en.wikipedia.org/wiki/Quantum_computing,A quantum computer is a computer that exploits...,https://www.wikidata.org/wiki/Q17995793,2001-10-21 16:10:53,2014-09-06 23:39:30,"[Q12525525, Q622821, Q104637332, Q3249551, Q45..."


## Retrieving the whole hierarchy of "part of"

In [None]:
# Function to get the "part of" hierarchy for a given Wikidata item ID (iterative approach)
def get_part_of_hierarchy(item_id):
    """
    Collect every item reachable from ``item_id`` through "part of" (P361) claims.

    The claims are followed transitively with an iterative depth-first search,
    issuing one ``wbgetentities`` request per visited item.

    :param item_id: Wikidata entity ID to start from (e.g. 'Q11660').
    :returns: Entity IDs in the order they were first discovered. The start item is
        never included, even when the claims form a cycle. Empty list when
        ``item_id`` is empty/None or the item has no P361 claim.
    :rtype: list
    """
    if not item_id:
        return []

    hierarchy = []
    # The start item counts as visited, so a cycle can neither list it as its own
    # container nor trigger a second request for it.
    seen = {item_id}
    stack = [item_id]
    while stack:
        current_id = stack.pop()
        response = requests.get(
            "https://www.wikidata.org/w/api.php",
            params={'action': 'wbgetentities', 'ids': current_id, 'format': 'json', 'props': 'claims'},
            timeout=30,  # requests waits indefinitely unless a timeout is given
        ).json()
        if 'entities' not in response or current_id not in response['entities']:
            continue
        claims = response['entities'][current_id].get('claims', {})
        if 'P361' not in claims:  # P361 is "part of"
            continue
        for claim in claims['P361']:
            whole_id = claim['mainsnak'].get('datavalue', {}).get('value', {}).get('id')
            # Claims without a concrete item value yield no ID and are skipped.
            if whole_id and whole_id not in seen:
                seen.add(whole_id)
                hierarchy.append(whole_id)
                stack.append(whole_id)
    return hierarchy

# One "part of" ancestor list per entry of df_WD['wikidata_id'];
# entries without an ID get an empty list instead of an API lookup.
results_part_of = [
    get_part_of_hierarchy(qid) if qid else []
    for qid in tqdm(df_WD['wikidata_id'], desc='Processing', unit='item')
]

# Attach the lists as a new column on a copy of df_DOB
df_part_of = df_DOB.copy()
df_part_of['part_of_hierarchy'] = results_part_of

df_part_of

Processing: 100%|███████████████████████████████| 3/3 [00:02<00:00,  1.29item/s]


Unnamed: 0,Name only,page url,text,wikidata_url,Wikipedia Creation Date,Wikidata Creation Date,part_of_hierarchy
0,Python (programming language),https://en.wikipedia.org/wiki/Python_(programm...,"Python is a high-level, general-purpose progra...",https://www.wikidata.org/wiki/Q28865,2001-10-29 18:24:39,2012-11-12 05:29:01,[]
1,Artificial intelligence,https://en.wikipedia.org/wiki/Artificial_intel...,"Artificial intelligence (AI), in its broadest ...",https://www.wikidata.org/wiki/Q11660,2001-10-08 16:55:49,2012-11-04 04:03:34,[]
2,Quantum computing,https://en.wikipedia.org/wiki/Quantum_computing,A quantum computer is a computer that exploits...,https://www.wikidata.org/wiki/Q17995793,2001-10-21 16:10:53,2014-09-06 23:39:30,[]


## Retrieving the whole hierarchy of "instance of"

In [None]:
# Function to get the "instance of" hierarchy for a given Wikidata item ID (iterative approach)
def get_instance_of_hierarchy(item_id):
    """
    Collect every item reachable from ``item_id`` through "instance of" (P31) claims.

    The claims are followed transitively with an iterative depth-first search,
    issuing one ``wbgetentities`` request per visited item.

    :param item_id: Wikidata entity ID to start from (e.g. 'Q28865').
    :returns: Entity IDs in the order they were first discovered. The start item is
        never included, even when the claims form a cycle. Empty list when
        ``item_id`` is empty/None or the item has no P31 claim.
    :rtype: list
    """
    if not item_id:
        return []

    hierarchy = []
    # The start item counts as visited, so a cycle can neither list it as its own
    # class nor trigger a second request for it.
    seen = {item_id}
    stack = [item_id]
    while stack:
        current_id = stack.pop()
        response = requests.get(
            "https://www.wikidata.org/w/api.php",
            params={'action': 'wbgetentities', 'ids': current_id, 'format': 'json', 'props': 'claims'},
            timeout=30,  # requests waits indefinitely unless a timeout is given
        ).json()
        if 'entities' not in response or current_id not in response['entities']:
            continue
        claims = response['entities'][current_id].get('claims', {})
        if 'P31' not in claims:  # P31 is "instance of"
            continue
        for claim in claims['P31']:
            class_id = claim['mainsnak'].get('datavalue', {}).get('value', {}).get('id')
            # Claims without a concrete item value yield no ID and are skipped.
            if class_id and class_id not in seen:
                seen.add(class_id)
                hierarchy.append(class_id)
                stack.append(class_id)
    return hierarchy

# One "instance of" ancestor list per entry of df_WD['wikidata_id'];
# entries without an ID get an empty list instead of an API lookup.
results_instance_of = [
    get_instance_of_hierarchy(qid) if qid else []
    for qid in tqdm(df_WD['wikidata_id'], desc='Processing', unit='item')
]

# Attach the lists as a new column on a copy of df_WD
df_instance_of = df_WD.copy()
df_instance_of['instance_of_hierarchy'] = results_instance_of
df_instance_of

Processing: 100%|███████████████████████████████| 3/3 [00:25<00:00,  8.55s/item]


Unnamed: 0,Name only,page url,text,wikidata_url,wikidata_id,instance_of_hierarchy
0,Python (programming language),https://en.wikipedia.org/wiki/Python_(programm...,"Python is a high-level, general-purpose progra...",https://www.wikidata.org/wiki/Q28865,Q28865,"[Q899523, Q1268980, Q3839507, Q187432, Q127720..."
1,Artificial intelligence,https://en.wikipedia.org/wiki/Artificial_intel...,"Artificial intelligence (AI), in its broadest ...",https://www.wikidata.org/wiki/Q11660,Q11660,"[Q268592, Q112057532, Q123370638, Q125161275, ..."
2,Quantum computing,https://en.wikipedia.org/wiki/Quantum_computing,A quantum computer is a computer that exploits...,https://www.wikidata.org/wiki/Q17995793,Q17995793,"[Q11862829, Q110402867, Q19478619, Q151885, Q1..."


## Retrieving labels of the Wikidata articles

In [None]:
# Function to get a label for a Wikidata ID
def get_wikidata_label(wikidata_id):
    """
    Look up the English label of a Wikidata entity.

    :param wikidata_id: Entity ID used to build the Special:EntityData URL.
    :returns: The English label, or None when the HTTP status is not 200 or the
        response has no English label under that ID.
    """
    entity_url = f"https://www.wikidata.org/wiki/Special:EntityData/{wikidata_id}.json"
    reply = requests.get(entity_url)
    if reply.status_code != 200:
        return None

    payload = reply.json()
    try:
        label = payload['entities'][wikidata_id]['labels']['en']['value']
    except KeyError:
        # Entity, label block or English entry is absent from the payload.
        return None
    return label

# Columns whose Wikidata IDs should be replaced by English labels (change to the necessary columns)
label_columns = ['instance_of', 'part_of', 'subclass_of']
columns_to_process = df_first_level[label_columns]

# Extract unique Wikidata IDs from the selected columns.
# NaN cells and empty strings (the placeholder for "no such claim") are not real IDs,
# so they are excluded and never sent to the API.
unique_wikidata_ids = pd.unique(columns_to_process.values.ravel('K')).tolist()
unique_wikidata_ids = [x for x in unique_wikidata_ids if pd.notna(x) and x != '']

# Get labels for all unique Wikidata IDs (one request per distinct ID)
wikidata_labels = {wid: get_wikidata_label(wid) for wid in unique_wikidata_ids}

# Replace Wikidata IDs with their labels in the selected columns
df_labeled = df_first_level.copy()

for column in label_columns:
    # A failed lookup is stored as None in wikidata_labels; `or wid` keeps the raw ID
    # in that case instead of overwriting it with None.
    df_labeled[column] = df_labeled[column].apply(lambda wid: wikidata_labels.get(wid) or wid)

df_labeled

Unnamed: 0,Name only,page url,text,wikidata_url,instance_of,part_of,subclass_of
0,Python (programming language),https://en.wikipedia.org/wiki/Python_(programm...,"Python is a high-level, general-purpose progra...",https://www.wikidata.org/wiki/Q28865,object-based language,,
1,Artificial intelligence,https://en.wikipedia.org/wiki/Artificial_intel...,"Artificial intelligence (AI), in its broadest ...",https://www.wikidata.org/wiki/Q11660,industry,,computer science
2,Quantum computing,https://en.wikipedia.org/wiki/Quantum_computing,A quantum computer is a computer that exploits...,https://www.wikidata.org/wiki/Q17995793,academic discipline,,computation


In [None]:
def save_dataframe(df, save_as="both"):
    """
    Write a DataFrame to the current working directory as Excel and/or CSV.

    The file name is "WikiData_properties_manual_corpus_<keyword>_<YYYY-MM-DD_HH-MM>"
    plus the extension, where ``keyword`` is the module-level variable of that name
    (it must be defined before this function is called).

    :param df: DataFrame to save; written without its index.
    :param save_as: "excel", "csv" or "both".
    :raises ValueError: If ``save_as`` is none of the supported values, so that a
        misspelled format does not silently write nothing.
    """
    valid_formats = ("excel", "csv", "both")
    if save_as not in valid_formats:
        raise ValueError(f"save_as must be one of {valid_formats}, got {save_as!r}")

    # Timestamp without seconds
    current_time = datetime.now().strftime("%Y-%m-%d_%H-%M")

    # Base file name shared by both output formats
    file_name_base = f"WikiData_properties_manual_corpus_{keyword}_{current_time}"

    # Save as Excel
    if save_as in ("excel", "both"):
        excel_file_name = file_name_base + ".xlsx"
        df.to_excel(excel_file_name, index=False)
        print(f"DataFrame saved as Excel file: {excel_file_name}")

    # Save as CSV
    if save_as in ("csv", "both"):
        csv_file_name = file_name_base + ".csv"
        df.to_csv(csv_file_name, index=False)
        print(f"DataFrame saved as CSV file: {csv_file_name}")

# Example usage:
# Pass the DataFrame you want to save (df, df_part_of, etc. from the cells above) as the first argument.
save_dataframe(df_subclass_of, save_as="excel") # save_as accepts "csv", "excel" or "both"

DataFrame saved as Excel file: WikiData_properties_manual_corpus_Technology_2024-10-17_19-17.xlsx
