# In [None]:
import ast
import json
import os
import re
import unicodedata
from urllib.parse import quote

import bs4
import deepl
import pandas as pd
import requests
from tqdm import tqdm


def _page_details_placeholder(title, summary):
    """Build the record returned when a page could not be retrieved."""
    return {
        "page_id": 'N/A',
        "title": title,
        "summary": summary,
        "content": '',
        "lastrevid": 'N/A',
        "length": 0,
        "fullurl": 'N/A',
        "categories": [],
        "links": [],
    }


def get_wikipedia_page_details(title, timeout=30):
    """Fetch the plain-text extract and metadata of an English Wikipedia page.

    The MediaWiki query API returns categories and links in batches, so the
    request is repeated with the ``continue`` token of each response until the
    API reports that nothing is left.

    Args:
        title: Page title to look up (redirects are followed by the API).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A dict with the keys ``page_id``, ``title``, ``summary`` (the extract
        up to the first blank line), ``content`` (the full extract),
        ``lastrevid``, ``length`` (character count of the extract),
        ``fullurl``, ``categories`` and ``links`` (lists of titles).
        When a request fails the summary is ``'Request failed'``; when the
        response carries no extract it is ``'Not found'``. In both cases the
        remaining fields hold placeholder values.
    """
    endpoint = "https://en.wikipedia.org/w/api.php"
    base_params = {
        "action": "query",
        "titles": title,
        "format": "json",
        "prop": "extracts|info|categories|links",
        "explaintext": True,
        "inprop": "url",
        "redirects": True,
        "cllimit": "max",
        "pllimit": "max"
    }

    page = {}
    categories = []
    links = []
    params = dict(base_params)

    while True:
        try:
            response = requests.get(endpoint, params=params, timeout=timeout)
        except requests.RequestException:
            # Treat network-level failures like a non-200 answer.
            return _page_details_placeholder(title, 'Request failed')
        if response.status_code != 200:
            return _page_details_placeholder(title, 'Request failed')

        data = response.json()
        pages = data.get('query', {}).get('pages', {})
        # A single title is requested, so at most one page entry is expected;
        # the default guards against a response without any pages.
        batch = next(iter(pages.values()), None)
        if batch is not None:
            categories.extend(cat['title'] for cat in batch.get('categories', []))
            links.extend(link['title'] for link in batch.get('links', []))
            # Scalar fields are taken from the first batch that carries them.
            for key, value in batch.items():
                page.setdefault(key, value)

        continuation = data.get('continue')
        if not continuation:
            break
        # Re-issue the original query with the continuation tokens merged in.
        params = {**base_params, **continuation}

    if 'extract' not in page:
        return _page_details_placeholder(title, 'Not found')

    extract = page['extract']
    return {
        "page_id": page.get('pageid', 'N/A'),
        "title": page.get('title', title),
        "summary": extract.split('\n\n')[0],
        "content": extract,
        "lastrevid": page.get('lastrevid', 'N/A'),
        "length": len(extract),
        "fullurl": page.get('fullurl', 'N/A'),
        "categories": categories,
        "links": links,
    }

# Seed pages: their outgoing links become the list of terms to collect
subjects = [
    "Glossary of physics",
    "Glossary of areas of mathematics",
    "Glossary of calculus",
    "Glossary of computer science",
    "Glossary of artificial intelligence",
    "Machine learning",
    "Deep learning",
    "Natural language processing",
]

# Look up every seed page in order and store one row per page as CSV
glossary_data = list(map(get_wikipedia_page_details, subjects))
glossary_df = pd.DataFrame(glossary_data)
glossary_df.to_csv(
    "outputs/intermediate_outputs/list_of_glossaries.csv",
    index=False,
)

# Reload the saved glossary table; its 'links' column holds list literals as text
data = pd.read_csv("outputs/intermediate_outputs/list_of_glossaries.csv")

# Map every linked title to the page that links to it. A title linked from
# several pages keeps the entry of the last page processed.
all_links = {}
for _, glossary_row in data.iterrows():
    for linked_title in ast.literal_eval(glossary_row['links']):
        all_links[linked_title] = {
            'source_title': glossary_row['title'],
            'source_page_id': glossary_row['page_id'],
        }

# One row per linked title, ordered by source page and then by title
all_terms_df = pd.DataFrame.from_dict(all_links, orient='index')
all_terms_df = all_terms_df.reset_index()
all_terms_df = all_terms_df.rename(columns={'index': 'linked_page_title'})
all_terms_df = all_terms_df.sort_values(by=['source_page_id', 'linked_page_title'])
all_terms_df.to_csv('outputs/intermediate_outputs/all_terms.csv', index=False)

# Function to get paragraphs from a Wikipedia page
def get_paragraphs(page_name, timeout=30):
    """Return the cleaned text paragraphs of an English Wikipedia page.

    The page HTML is downloaded from the Wikipedia REST API and every ``<p>``
    element that contains a period and no line break is kept. Numeric
    citation markers such as ``[12]`` are removed and the text is
    NFKD-normalized.

    Args:
        page_name: Title of the page to download.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of paragraph strings; an empty list when the page could not be
        fetched or parsed (the error is printed).
    """
    # The title is one URL path segment: characters such as "/", "?" and "#"
    # must be percent-encoded or the request targets a different resource.
    # The REST API expects underscores in place of spaces.
    encoded_title = quote(str(page_name).replace(' ', '_'), safe='')
    url = f'https://en.wikipedia.org/api/rest_v1/page/html/{encoded_title}'

    # Best-effort by design: one bad page must not abort a long crawl, so any
    # failure is reported and turned into an empty result.
    try:
        r = requests.get(url, timeout=timeout)
        # Surface HTTP errors instead of parsing the error body as a page.
        r.raise_for_status()
        soup = bs4.BeautifulSoup(r.content, 'html.parser')

        paragraphs = [
            re.sub(r'(\[[0-9]+\])', '', unicodedata.normalize('NFKD', tag.text)).strip()
            for tag in soup.find_all('p')
            if '.' in tag.text and '\n' not in tag.text
        ]
        return paragraphs
    except Exception as e:
        print(f"Error fetching page {page_name}: {e}")
        return []

# Function to merge short paragraphs into longer ones based on a length limit
def merge_short_paragraphs(paragraphs, length_limit):
    """Greedily join consecutive paragraphs into chunks of bounded length.

    Paragraphs are appended to the current chunk, separated by a single
    space, for as long as the chunk length plus the paragraph length plus one
    does not exceed ``length_limit``. A paragraph that does not fit starts a
    new chunk; a single paragraph longer than the limit is kept whole as its
    own chunk.

    Args:
        paragraphs: Iterable of paragraph strings, in document order.
        length_limit: Maximum length of a merged chunk, in characters.

    Returns:
        The list of merged chunks, in order. Empty chunks are never emitted.
    """
    chunks = []
    buffer = ""

    for text in paragraphs:
        if buffer and len(buffer) + len(text) + 1 <= length_limit:
            # Still room in the open chunk: extend it.
            buffer = f"{buffer} {text}"
            continue

        # Either nothing is buffered yet or the paragraph does not fit:
        # close the open chunk (if any) and start a new one.
        if buffer:
            chunks.append(buffer)
        buffer = text

    if buffer:
        chunks.append(buffer)

    return chunks

# Process each linked_page_title in the CSV file with a progress bar
length_limit = 1000  # maximum number of characters per merged sample

# Download the paragraphs of every linked page (one request per title)
all_terms_df['paragraphs'] = [
    get_paragraphs(title)
    for title in tqdm(all_terms_df['linked_page_title'], desc="Fetching paragraphs")
]

# Merge short paragraphs into samples of at most length_limit characters
all_terms_df['samples'] = [
    merge_short_paragraphs(paragraphs, length_limit)
    for paragraphs in tqdm(all_terms_df['paragraphs'], desc="Merging paragraphs")
]

# Save the extended DataFrame to a new CSV file
all_terms_df.to_csv('outputs/final_outputs/all_terms_with_paragraphs_and_samples.csv', index=False)

# Save the same table as xlsx
all_terms_df.to_excel('outputs/final_outputs/all_terms_with_paragraphs_and_samples.xlsx', index=False)


# The DeepL auth key is read from the DEEPL_AUTH_KEY environment variable so
# that no secret is stored in the source file.
# NOTE: a key was previously committed on this line; treat it as compromised
# and rotate it in the DeepL account.
auth_key = os.environ.get("DEEPL_AUTH_KEY")
if not auth_key:
    raise RuntimeError(
        "Set the DEEPL_AUTH_KEY environment variable to your DeepL auth key."
    )
translator = deepl.Translator(auth_key)

# Load the Excel file
file_path = 'outputs/final_outputs/sample_2.5_all_terms_cleaned_en.xlsx'
data = pd.read_excel(file_path)

# Convert the string representation of lists in 'samples' column back to actual lists
data['samples'] = data['samples'].apply(ast.literal_eval)

# Function to translate a list of samples
def translate_samples(samples, target_lang="TR"):
    """Translate English samples one by one with the module-level translator.

    Args:
        samples: Iterable of English text samples.
        target_lang: DeepL target language code; defaults to ``"TR"``.

    Returns:
        A list with the translated text of each sample, in input order.
    """
    return [
        translator.translate_text(
            text, source_lang="EN", target_lang=target_lang
        ).text
        for text in samples
    ]

# Translate the samples of every row into Turkish and keep them in a new column
data['translated_samples'] = data['samples'].apply(translate_samples, target_lang="TR")

# Write the table, including the translations, to a new workbook for inspection
output_file_path = 'outputs/final_outputs/translated_samples.xlsx'
data.to_excel(output_file_path, index=False)

# Read the translated workbook back in
file_path = 'outputs/final_outputs/translated_samples.xlsx'
df = pd.read_excel(file_path)

# Both list columns come back from Excel as text; parse them into real lists
for list_column in ('samples', 'translated_samples'):
    df[list_column] = df[list_column].apply(ast.literal_eval)

# Build one JSON record per English sample, pairing it with the translation at
# the same position and numbering the records consecutively from 1
json_data = []
id_counter = 1

for _, term_row in df.iterrows():
    for position, english_text in enumerate(term_row['samples']):
        record = {
            "id": id_counter,
            "term_id": term_row["term_id"],
            "term_name": term_row["linked_page_title"],
            "data": {
                "my_text": f"ENGLISH: {english_text} \n \n TURKISH: {term_row['translated_samples'][position]}"
            }
        }
        json_data.append(record)
        id_counter += 1

# Write the records as UTF-8 JSON, keeping non-ASCII characters readable
json_file_path = 'label_studio/translated_samples.json'
with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(json_data, json_file, ensure_ascii=False, indent=4)