In [3]:

import pandas as pd
import wikipedia
from urllib.parse import urlparse, unquote
import warnings
import re
import requests
from bs4 import BeautifulSoup
import os

In [4]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter with no category or message, so it
# hides all warnings, not just expected ones. Presumably it is here to quiet
# the warning caused by requests.get(..., verify=False) further down — confirm,
# and consider narrowing the filter.
warnings.filterwarnings("ignore")

In [5]:
# Function to extract a clean title from Wikipedia URL
def extract_title(url):
    """Derive a readable article title from a Wikipedia URL.

    Takes the part of the URL path after the last ``/wiki/``, discards
    anything following a ``#``, replaces underscores with spaces and then
    percent-decodes the result.

    :param url: URL string of a Wikipedia article.
    :return: The decoded title. If any step raises, a string starting with
        ``"Error extracting title: "`` followed by the exception text.
    """
    try:
        url_path = urlparse(url).path
        # rpartition yields the whole path in slot 2 when '/wiki/' is absent,
        # which mirrors taking the last piece of a split.
        raw_title = url_path.rpartition("/wiki/")[2].partition("#")[0]
        spaced_title = raw_title.replace("_", " ")
        return unquote(spaced_title)
    except Exception as e:
        return f"Error extracting title: {e}"

# Function to get Wikipedia content
def get_wikipedia_content(url):
    """Fetch the plain-text content of the Wikipedia article behind ``url``.

    The title is taken from the URL with ``extract_title`` and looked up
    through the ``wikipedia`` package.

    :param url: URL of a Wikipedia article.
    :return: The article text as a string, or ``pd.NA`` when the title cannot
        be extracted, the title is ambiguous, the page does not exist, or the
        lookup fails for any other reason. Failures are reported with
        ``print`` instead of being returned, so that an error message can
        never be mistaken for article text by the caller.
    """
    title = extract_title(url)
    # extract_title reports failure with this exact prefix. A plain substring
    # test for "Error" would also reject real articles such as
    # "Error function".
    if title.startswith("Error extracting title:"):
        return pd.NA
    try:
        # The title comes verbatim from the URL, so look it up as-is;
        # with auto_suggest enabled the library may replace it with a search
        # suggestion and return a different article.
        page = wikipedia.page(title, auto_suggest=False)
        return page.content
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Skipping '{title}': ambiguous title ({len(e.options)} candidates)")
        return pd.NA
    except wikipedia.exceptions.PageError:
        return pd.NA
    except Exception as e:
        print(f"Skipping '{title}': {type(e).__name__}: {e}")
        return pd.NA

In [6]:
# Function to remove unwanted sections from the content
def remove_unwanted_sections(content):
    """Strip boilerplate sections from plain-text article content.

    Headings are expected in the ``== Title ==`` / ``=== Title ===`` form,
    each on its own line. When a heading's title is one of the unwanted
    names, that heading and everything below it are dropped, up to (but not
    including) the next heading of the same or a higher level. Nested
    sub-sections of an unwanted section are removed with it; all other
    sections are left untouched.

    :param content: Article text as a string.
    :return: The text without the unwanted sections.
    """
    unwanted_sections = {"See also", "References", "External links", "Citations", "Cited works"}
    # Group 1 is the run of '=' (its length is the heading level); the
    # back-reference forces the closing run to have the same length.
    heading_pattern = re.compile(r"^(={2,})\s*(.*?)\s*\1\s*$")

    kept_lines = []
    skip_level = None  # level of the unwanted section being skipped, if any
    for line in content.splitlines(keepends=True):
        heading = heading_pattern.match(line)
        if heading:
            level = len(heading.group(1))
            # A heading at the same or a higher level ends the skipped section.
            if skip_level is not None and level <= skip_level:
                skip_level = None
            if skip_level is None and heading.group(2) in unwanted_sections:
                skip_level = level
        if skip_level is None:
            kept_lines.append(line)

    return "".join(kept_lines)

In [7]:
# Function to scrape tables and insert into Wikipedia API content
def integrate_wikipedia_tables(url, timeout=30):
    """Return an article's text with its HTML tables spliced in as CSV.

    The text comes from ``get_wikipedia_content`` and is cleaned with
    ``remove_unwanted_sections``. The page at ``url`` is then downloaded and
    every ``<table>`` with class ``wikitable`` or ``infobox`` is converted to
    CSV and inserted after the first occurrence, in the text, of the element
    that precedes the table in the HTML (a heading, paragraph or list).

    :param url: URL of a Wikipedia article.
    :param timeout: Seconds to wait for the HTTP request before giving up.
    :return: The combined text, or ``pd.NA`` when no article text could be
        obtained. If only the HTML download fails, the text is returned
        without tables.
    """
    # Function-scope import keeps this block self-contained.
    from io import StringIO

    # Hand over the URL itself: get_wikipedia_content extracts the title on
    # its own, and feeding it an already-extracted title would run the URL
    # parser a second time and cut the title at any '?' or '#'.
    content = get_wikipedia_content(url)
    if not isinstance(content, str):
        return pd.NA
    # Failure reports of this form are not article text; treat them as missing.
    if content.startswith(("Disambiguation Error:", "Error:")):
        return pd.NA

    content = remove_unwanted_sections(content)

    try:
        # NOTE(review): verify=False turns off TLS certificate checking. It is
        # kept because the original code asked for it explicitly; confirm it
        # is really needed in this environment.
        response = requests.get(url, verify=False, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Could not fetch tables from {url}: {exc}")
        return content

    soup = BeautifulSoup(response.text, "html.parser")
    table_classes = {"class": ["wikitable", "infobox"]}
    anchor_tags = ["ul", "ol", "h1", "h2", "h3", "h4", "p"]
    heading_tags = ("h1", "h2", "h3", "h4")

    for element in soup.find_all("table", table_classes, recursive=True):
        try:
            # Wrap the markup in StringIO: passing literal HTML to read_html
            # is deprecated.
            table_df = pd.read_html(StringIO(str(element)))[0]
        except Exception:
            continue  # Skip tables that can't be parsed
        table_text = table_df.to_csv(index=False)

        # The closest preceding text element decides where the table goes.
        anchor = element.find_previous(anchor_tags)
        if anchor is None:
            continue
        anchor_text = anchor.get_text(" ", strip=True)
        if anchor.name in heading_tags:
            anchor_text = f"== {anchor_text} =="

        # Plain string replacement rather than re.sub: the CSV may contain
        # backslashes, which re.sub would interpret as escapes in the
        # replacement template. An empty anchor_text places the table at the
        # very start of the content; text that is not found leaves the
        # content unchanged.
        content = content.replace(anchor_text, f"{anchor_text}\n\nTable:\n{table_text}", 1)

    return content

# # Example usage
# url = "https://en.wikipedia.org/wiki/List_of_international_presidential_trips_made_by_Donald_Trump#See_also"
# # url = "https://en.wikipedia.org/wiki/Prime_Minister_of_Japan#:~:text=Incumbent%0AShigeru%20Ishiba"
# integrated_content = integrate_wikipedia_tables(url)
# print(integrated_content)  # Print the integrated content for preview

In [8]:
# df = pd.read_excel('Fresh_QA.xlsx', sheet_name='Filtered')
# Load the question set from an Excel workbook in the working directory.
# Later cells read its 'id', 'question', 'needle', 'relevant_source' and
# 'irrelevant_source' columns.
df = pd.read_excel('FreshQADataset_filtered.xlsx')

In [10]:
# Apply functions to dataset
# short_df = df.head(50)
# df["context_relevant_full"] = df["relevant_source"].apply(get_wikipedia_content)
# df["context_irrelevant_full"] = df["irrelevant_source"].apply(get_wikipedia_content)

# Relevant contexts with this many characters or fewer are discarded.
MIN_RELEVANT_CONTEXT_CHARS = 1000

# One article fetch per row and per source column (network-bound).
df["context_relevant_full"] = df["relevant_source"].apply(integrate_wikipedia_tables)
df["context_irrelevant_full"] = df["irrelevant_source"].apply(integrate_wikipedia_tables)

# Built as one non-mutating chain; the final .copy() makes the result an
# independent frame, so columns can be added to it later without writing
# into a filtered selection.
df = (
    df.dropna(subset=['context_relevant_full', 'context_irrelevant_full'])
    .assign(
        context_r_len=lambda frame: frame['context_relevant_full'].map(len),
        context_i_len=lambda frame: frame['context_irrelevant_full'].map(len),
    )
    .loc[lambda frame: frame['context_r_len'] > MIN_RELEVANT_CONTEXT_CHARS]
    .copy()
)


In [None]:
# Duplicate the needle into 'real_needle', attach the same placeholder list of
# misleading statements to every row, then drop rows lacking an irrelevant
# context and renumber the index from 0.
df = (
    df.assign(
        real_needle=df['needle'],
        statements_misleading=[['test1', 'test2', 'test3']] * len(df),
    )
    .dropna(subset=["context_irrelevant_full"], ignore_index=True)
)
df

Unnamed: 0,id,question,relevant_source,needle,irrelevant_source,context_relevant_full,context_irrelevant_full,real_needle,statements_misleading
0,43,How long has Elon Musk been X Corp.'s CEO?,https://en.wikipedia.org/wiki/X_Corp.#:~:text=...,Elon Musk is no longer X Corp.'s CEO.,https://en.wikipedia.org/wiki/OpenAI,"\n\nTable:\n0,1\n,\nCompany type,Subsidiary\nI...","\n\nTable:\n0,1\n,\nCompany type,Private\nIndu...",Elon Musk is no longer X Corp.'s CEO.,"[test1, test2, test3]"
1,44,Where will the FIFA World Cup be hosted this y...,https://en.wikipedia.org/wiki/FIFA_World_Cup#:...,There won't be a FIFA World Cup this year.,https://en.wikipedia.org/wiki/Premier_League,"\n\nTable:\n0,1\n,\nOrganising body,FIFA\nFoun...","\n\nTable:\n0,1\n,\nFounded,20 February 1992; ...",There won't be a FIFA World Cup this year.,"[test1, test2, test3]"
2,92,Alphabet's market capitalization reached its h...,https://en.wikipedia.org/wiki/List_of_public_c...,The all-time highest value of Alphabet was in ...,https://en.wikipedia.org/wiki/Google_DeepMind,The following is a list of publicly traded com...,"\n\nTable:\n0,1\n,\nTrade name,Google DeepMind...",The all-time highest value of Alphabet was in ...,"[test1, test2, test3]"
3,95,Which Republican was elected Speaker of the Ho...,https://en.wikipedia.org/wiki/January_2023_Spe...,No one received a majority of the votes on the...,https://en.wikipedia.org/wiki/October_2023_Spe...,"\n\nTable:\n0,1,2\n,,\n""← 2021 January 3–7, 20...","\n\nTable:\n0,1,2,3\n,,,\n""← January 2023 Octo...",No one received a majority of the votes on the...,"[test1, test2, test3]"
4,96,"In January 2023, the NHC revised the fatality ...",https://en.wikipedia.org/wiki/2005_levee_failu...,"The reported death toll decreased to 1,392",https://en.wikipedia.org/wiki/Cyclone_Gabriell...,"On Monday, August 29, 2005, there were over 50...","\n\nTable:\n0,1\nGabrielle near its peak inten...","The reported death toll decreased to 1,392","[test1, test2, test3]"
5,121,What is the most recent country that President...,https://en.wikipedia.org/wiki/List_of_internat...,President Donald Trump hasn't visited any coun...,https://en.wikipedia.org/wiki/Foreign_policy_o...,This is a list of international presidential t...,U.S. foreign policy during the first presidenc...,President Donald Trump hasn't visited any coun...,"[test1, test2, test3]"
6,122,Who was the winner of The Voice US this year?,https://en.wikipedia.org/wiki/The_Voice_(Ameri...,"This season of The Voice is still ongoing, and...",https://en.wikipedia.org/wiki/The_Voice_%28Ame...,"\n\nTable:\nThe Voice,The Voice.1\n,\nGenre,Re...","\n\nTable:\nThe Voice,The Voice.1\nSeason 25,S...","This season of The Voice is still ongoing, and...","[test1, test2, test3]"
7,123,Who did Michael van Gerwen beat to win this ye...,https://en.wikipedia.org/wiki/PDC_World_Darts_...,Michael van Gerwen lost to Luke Littler in the...,https://en.wikipedia.org/wiki/Phil_Taylor_(dar...,"\n\nTable:\n0,1\nThe stage at the 2016 edition...","\n\nTable:\nPhil Taylor,Phil Taylor.1\nTaylor ...",Michael van Gerwen lost to Luke Littler in the...,"[test1, test2, test3]"
8,155,Who is the most recent player to win both the ...,https://en.wikipedia.org/wiki/PDC_World_Darts_...,Luke Littler,https://en.wikipedia.org/wiki/Michael_Smith_(d...,"\n\nTable:\n0,1\nThe stage at the 2016 edition...","\n\nTable:\nMichael Smith,Michael Smith.1\nSmi...",Luke Littler,"[test1, test2, test3]"
9,156,What was the Weeknd's last studio album titled?,https://en.wikipedia.org/wiki/The_Weeknd#Disco...,Hurry Up Tomorrow,https://en.wikipedia.org/wiki/Kendrick_Lamar,"\n\nTable:\nThe Weeknd,The Weeknd.1\nTesfaye a...","\n\nTable:\nKendrick Lamar,Kendrick Lamar.1\nL...",Hurry Up Tomorrow,"[test1, test2, test3]"


In [None]:
def save_context_as_text(df, id_col='id', context_col='context', output_dir='output_texts', save_col='save_col'):
    """
    Write one UTF-8 text file per row and record the file names in the frame.

    Each file is named ``<id>.txt`` and holds ``str()`` of the row's context
    value. The DataFrame passed in is modified in place: the file names are
    stored in ``save_col``.

    :param df: Pandas DataFrame containing the data.
    :param id_col: Column whose value becomes the file name stem.
    :param context_col: Column whose value is written to the file.
    :param output_dir: Directory for the text files (created if missing).
    :param save_col: Name of the column that receives the file names.
    :return: The same DataFrame, now including ``save_col``.
    """
    os.makedirs(output_dir, exist_ok=True)

    def _write_row(record):
        # Writes a single row's context to disk and returns the bare file name.
        name = f"{record[id_col]}.txt"
        with open(os.path.join(output_dir, name), "w", encoding="utf-8") as handle:
            handle.write(str(record[context_col]))
        return name

    df[save_col] = [_write_row(record) for _, record in df.iterrows()]

    print(f"Saved {len(df)} text files in '{output_dir}' and updated the DataFrame.")

    return df

# Example usage
# df = save_context_as_text(df)


In [None]:
# (text column, target directory, column that receives the file names)
for text_col, target_dir, file_col in (
    ('context_relevant_full', 'sources_relevant', 'context_relevant'),
    ('context_irrelevant_full', 'sources_irrelevant', 'context_irrelevant'),
):
    # save_context_as_text adds file_col to df in place.
    save_context_as_text(df, id_col='id', context_col=text_col, output_dir=target_dir, save_col=file_col)

# Keep only the columns that go into the exported records.
df = df[['id', 'question', 'needle', 'real_needle', 'context_relevant', 'context_irrelevant', 'statements_misleading']]
df

Saved 29 text files in 'sources_relevant' and updated the DataFrame.
Saved 29 text files in 'sources_irrelevant' and updated the DataFrame.


Unnamed: 0,id,question,needle,real_needle,context_relevant,context_irrelevant,statements_misleading
0,43,How long has Elon Musk been X Corp.'s CEO?,Elon Musk is no longer X Corp.'s CEO.,Elon Musk is no longer X Corp.'s CEO.,43.txt,43.txt,"[test1, test2, test3]"
1,44,Where will the FIFA World Cup be hosted this y...,There won't be a FIFA World Cup this year.,There won't be a FIFA World Cup this year.,44.txt,44.txt,"[test1, test2, test3]"
2,92,Alphabet's market capitalization reached its h...,The all-time highest value of Alphabet was in ...,The all-time highest value of Alphabet was in ...,92.txt,92.txt,"[test1, test2, test3]"
3,95,Which Republican was elected Speaker of the Ho...,No one received a majority of the votes on the...,No one received a majority of the votes on the...,95.txt,95.txt,"[test1, test2, test3]"
4,96,"In January 2023, the NHC revised the fatality ...","The reported death toll decreased to 1,392","The reported death toll decreased to 1,392",96.txt,96.txt,"[test1, test2, test3]"
5,121,What is the most recent country that President...,President Donald Trump hasn't visited any coun...,President Donald Trump hasn't visited any coun...,121.txt,121.txt,"[test1, test2, test3]"
6,122,Who was the winner of The Voice US this year?,"This season of The Voice is still ongoing, and...","This season of The Voice is still ongoing, and...",122.txt,122.txt,"[test1, test2, test3]"
7,123,Who did Michael van Gerwen beat to win this ye...,Michael van Gerwen lost to Luke Littler in the...,Michael van Gerwen lost to Luke Littler in the...,123.txt,123.txt,"[test1, test2, test3]"
8,155,Who is the most recent player to win both the ...,Luke Littler,Luke Littler,155.txt,155.txt,"[test1, test2, test3]"
9,156,What was the Weeknd's last studio album titled?,Hurry Up Tomorrow,Hurry Up Tomorrow,156.txt,156.txt,"[test1, test2, test3]"


In [None]:
# Serialise the frame as a list of records, keeping non-ASCII characters as-is.
json_output = df.to_json(orient="records", indent=4, force_ascii=False)

# Save JSON to a file. The encoding is set explicitly because
# force_ascii=False leaves non-ASCII characters in the text, and the platform
# default encoding may be unable to represent them.
with open("context.json", "w", encoding="utf-8") as f:
    f.write(json_output)