In [1]:
import csv

import pandas as pd
import requests

# Function to read and preprocess each glossary CSV file
def read_and_preprocess(file_name, domain):
    """Load the terms of one glossary CSV file and tag them with a domain.

    Only the first field of each row is kept. Rows are parsed with the
    standard-library ``csv`` reader, so a quoted first field may itself
    contain commas and rows may have differing numbers of fields. The file
    is read without treating any row as a header.

    Parameters
    ----------
    file_name : str or path-like
        Path of a local glossary CSV file, decoded as UTF-8 (a leading
        byte-order mark is tolerated).
    domain : str
        Label written to the ``domain`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns ``term`` and ``domain``; one row per non-blank term, in
        file order. An empty file yields an empty frame.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    terms = []
    # newline='' lets the csv reader do its own newline handling, which the
    # csv module requires for quoted fields that span several lines.
    with open(file_name, newline='', encoding='utf-8-sig') as handle:
        for row in csv.reader(handle):
            if not row:
                # Completely blank line.
                continue
            term = row[0].strip()
            if term:
                terms.append(term)

    df = pd.DataFrame({'term': terms})
    df['domain'] = domain
    return df

def _missing_page_details(title, summary):
    """Build the placeholder record used when no page content is available."""
    return {
        "page_id": 'N/A',
        "title": title,
        "summary": summary,
        "content": '',
        "lastrevid": 'N/A',
        "length": 0,
        "fullurl": 'N/A',
        "categories": [],
        "links": [],
    }


def _fetch_first_page(endpoint, params, timeout):
    """Run one API query; return its first page dict, ``{}`` if the response
    lists no pages, or ``None`` if the request itself failed."""
    try:
        response = requests.get(endpoint, params=params, timeout=timeout)
        if response.status_code != 200:
            return None
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network/timeout errors, or a body that is not valid JSON.
        return None

    pages = data.get('query', {}).get('pages', {}) if isinstance(data, dict) else {}
    # A default avoids StopIteration when the response contains no pages.
    return next(iter(pages.values()), {})


# Function to fetch Wikipedia page details
def get_wikipedia_page_details(title, timeout=10):
    """Fetch text and metadata for one English Wikipedia page.

    Two queries are sent to the MediaWiki API: one for the full plain-text
    extract together with page info, categories and links, and one for the
    intro-only extract that is used as the summary. Redirects are followed.
    Only a single response is read per query (continuation is not followed),
    so categories and links are limited to what the API returns in one batch.

    Parameters
    ----------
    title : str
        Page title to look up.
    timeout : float, optional
        Per-request timeout in seconds passed to ``requests.get``.

    Returns
    -------
    dict
        Keys ``page_id``, ``title``, ``summary``, ``content``, ``lastrevid``,
        ``length`` (number of characters in ``content``), ``fullurl``,
        ``categories`` and ``links``. When the request fails, ``summary`` is
        ``'Request failed'``; when the page has no extract (or ``title`` is
        blank / not a string), ``summary`` is ``'Not found'``. In both cases
        the remaining fields hold placeholder values. This function does not
        raise on network errors.
    """
    if not isinstance(title, str) or not title.strip():
        # Nothing sensible to query for; skip the network round trip.
        return _missing_page_details(title, 'Not found')

    endpoint = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "titles": title,
        "format": "json",
        "prop": "extracts|info|categories|links",
        "explaintext": True,
        "inprop": "url",
        "redirects": True,
        "cllimit": "max",
        "pllimit": "max"
    }

    page = _fetch_first_page(endpoint, params, timeout)
    if page is None:
        return _missing_page_details(title, 'Request failed')
    if 'extract' not in page:
        return _missing_page_details(title, 'Not found')

    # The intro-only extract needs its own query; request just the extract
    # instead of downloading categories and links a second time.
    summary_params = {
        "action": "query",
        "titles": title,
        "format": "json",
        "prop": "extracts",
        "explaintext": True,
        "exintro": True,
        "redirects": True,
    }
    # A failed summary lookup degrades to 'Summary not found' rather than
    # discarding the content that was already retrieved.
    summary_page = _fetch_first_page(endpoint, summary_params, timeout) or {}

    content = page['extract']
    return {
        "page_id": page.get('pageid', 'N/A'),
        "title": page.get('title', title),
        "summary": summary_page.get('extract', 'Summary not found'),
        "content": content,
        "lastrevid": page.get('lastrevid', 'N/A'),
        "length": len(content),
        "fullurl": page.get('fullurl', 'N/A'),
        "categories": [cat['title'] for cat in page.get('categories', [])],
        "links": [link['title'] for link in page.get('links', [])]
    }


In [2]:
# Read and preprocess all the glossaries.
# NOTE: the variable names are kept unchanged; be aware that `math` would
# shadow the standard-library module if it were imported in this notebook.
math = read_and_preprocess('glossary_of_areas_of_mathematics.csv', 'math')
cs = read_and_preprocess('glossary_of_computer_science.csv', 'cs')
physics = read_and_preprocess('glossary_of_physics.csv', 'physics')

# Merge all dataframes into one
all_data = pd.concat([math, cs, physics], ignore_index=True)

# Save the combined dataframe to a CSV file
all_data.to_csv('all_glossary.csv', index=False)

# Look up every distinct term only once: each lookup costs HTTP round trips,
# and the same term can appear in more than one glossary.
details_by_term = {}
detail_records = []
for term in all_data['term']:
    if term not in details_by_term:
        details_by_term[term] = get_wikipedia_page_details(term)
    detail_records.append(details_by_term[term])

# One details dict per row, aligned with all_data's index.
wikipedia_details = pd.Series(detail_records, index=all_data.index, dtype=object)
wikipedia_df = pd.DataFrame(detail_records)

# Combine the original dataframe with the new Wikipedia details dataframe.
# Both frames share the same 0..n-1 index, so rows line up positionally.
all_data = pd.concat([all_data, wikipedia_df], axis=1)

# Save the final dataframe with Wikipedia details to a CSV file
all_data.to_csv('all_glossary_with_wikipedia.csv', index=False)