In [None]:
# Import che stavano già nel notebook
import re
import nltk
import tqdm
import time
import json
import spacy
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import concurrent.futures

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from wikidata.client import Client
from matplotlib import pyplot as plt
from nltk.tokenize import word_tokenize
from collections import Counter, defaultdict

In [None]:
import concurrent.futures

def extract_qid(url):
    """Return the last path segment of an entity URL (the QID, e.g. "Q42").

    Surrounding whitespace and any trailing slashes are removed first, so a
    URL ending in "/" still yields the identifier instead of an empty string.

    Args:
        url: Entity URL or bare identifier, as a string.

    Returns:
        The text after the final "/" (the whole cleaned string if it contains
        no "/").
    """
    cleaned = url.strip().rstrip("/")
    return cleaned.split("/")[-1]

def get_wiki_link(qid, lang='en'):
    """Look up the Wikipedia URL recorded for an entity in a given language.

    The entity is fetched through the module-level `client` object (defined
    outside this block; presumably a `wikidata.client.Client` instance), and
    the URL is read from the "<lang>wiki" entry of its `sitelinks` data.

    Args:
        qid: Entity identifier passed to `client.get`.
        lang: Language prefix used to build the sitelink key (default 'en').

    Returns:
        The sitelink URL, or None when the entity has no such sitelink or when
        any error occurs during the lookup (the error is printed).
    """
    try:
        wikidata_entity = client.get(qid, load=True)
        all_sitelinks = wikidata_entity.data.get('sitelinks', {})
        site_entry = all_sitelinks.get(f'{lang}wiki')
        if not site_entry:
            return None
        return site_entry['url']
    except Exception as e:
        # Best-effort lookup: report the failure and let the caller fall back.
        print(f"ERROR retrieving Wikipedia link for {qid}: {e}")
        return None

def get_intro_paragraph(wikipedia_link, min_chars=200, timeout=10):
    """Download a Wikipedia page and return the beginning of its article text.

    The text of the <p> elements inside the main content <div> is concatenated
    (numeric citation markers such as "[12]" are removed) until more than
    `min_chars` characters have been collected.

    Args:
        wikipedia_link: URL of the page to fetch. A falsy value (None, "")
            returns "" immediately without issuing a request.
        min_chars: Stop adding paragraphs once the accumulated text is longer
            than this many characters.
        timeout: Seconds passed to `requests.get` so that a stalled connection
            cannot block the calling thread indefinitely.

    Returns:
        The collected text with surrounding whitespace stripped, or "" when
        the link is missing, the request fails, the content <div> is not
        found, or parsing raises. Failures are printed, not raised.
    """
    if not wikipedia_link:
        # Nothing to fetch; avoid a pointless request and a misleading error.
        return ""

    try:
        response = requests.get(wikipedia_link, allow_redirects=True, timeout=timeout)
        response.raise_for_status()
        # Redirects are followed automatically, so the final response is never
        # itself a redirect; the hops that were followed are in `history`.
        if response.history:
            print(f"WARNING: Redirecting... {response.history[0].status_code}")
        soup = BeautifulSoup(response.content, 'html.parser')

        # NOTE(review): a space-separated `class_` value matches the exact
        # class attribute string, so a page whose content <div> carries a
        # different class string yields "" — confirm for non-'en' wikis.
        content = soup.find('div', class_='mw-content-ltr mw-parser-output')
        if not content:
            return ""

        paragraph_text = ""
        for p in content.find_all('p'):
            text = p.get_text(separator=" ", strip=True)
            text = re.sub(r'\[\d+\]', '', text)  # drop "[n]" citation markers
            paragraph_text += " " + text
            if len(paragraph_text) > min_chars:
                break

        return paragraph_text.strip()

    except requests.exceptions.RequestException as e:
        print(f"ERROR fetching paragraph from {wikipedia_link}: {e}")
        return ""
    except Exception as e:
        print(f"ERROR parsing content from {wikipedia_link}: {e}")
        return ""
    
def _fallback_description(df, item):
    """Return the first `description` whose `item` column equals `item`, or "" if no row matches."""
    matches = df['description'][df['item'] == item].values
    return matches[0] if len(matches) else ""


def process_item(index, item, df, lang):
    """Resolve one item to its Wikipedia intro text, falling back to its description.

    The QID is extracted from `item`, the Wikipedia link for `lang` is looked
    up, and the page's intro text is fetched. If the link is missing, the text
    is empty, or anything raises, the value of `df['description']` for the
    first row whose `df['item']` equals `item` is returned instead ("" when no
    row matches, e.g. for a NaN item).

    Args:
        index: Value returned unchanged as the first tuple element so the
            caller can place the result.
        item: Entity URL (or identifier) to process.
        df: DataFrame with 'item' and 'description' columns.
        lang: Language prefix forwarded to `get_wiki_link`.

    Returns:
        A tuple `(index, text)`.
    """
    qid = 'UNKNOWN'  # reported in the error message if extraction itself fails
    try:
        qid = extract_qid(item)
        link = get_wiki_link(qid, lang)

        # Check the link before fetching: without one there is nothing to download.
        if not link:
            print(f"WARNING: missing Wikipedia link for QID {qid}")
            return index, _fallback_description(df, item)

        paragraph = get_intro_paragraph(link)
        if not paragraph:
            print(f"WARNING: empty or missing content for {link}")
            return index, _fallback_description(df, item)

        return index, paragraph

    except Exception as e:
        print(f"ERROR processing item {item} (QID: {qid}): {e}")
        return index, _fallback_description(df, item)

def text_extraction(df, lang='en', max_workers=10):
    """Run `process_item` for every value of `df['item']` on a thread pool.

    Args:
        df: DataFrame with an 'item' column; it is also forwarded to each
            `process_item` call.
        lang: Language prefix forwarded to `process_item` (default 'en').
        max_workers: Size of the thread pool.

    Returns:
        A list with one entry per row, in row order: each task returns its own
        position together with its text, so completion order does not matter.
    """
    texts = [None] * len(df)
    numbered_items = list(enumerate(df['item']))

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for position, item in numbered_items:
            task = pool.submit(process_item, position, item, df, lang)
            pending[task] = position

        finished = concurrent.futures.as_completed(pending)
        for task in tqdm.tqdm(finished, total=len(pending)):
            position, text = task.result()
            texts[position] = text

    return texts

In [None]:
# Example usage: build one text per row of train_df (train_df is defined outside this section)
train_txt = text_extraction(df=train_df)

In [None]:
# Run after training: for each class, print the 50 words with the largest
# (most positive) model coefficients.
# NOTE(review): assumes word_index maps word -> coefficient column index and
# that columns at or beyond len(word_index) are non-word features — confirm.
# NOTE(review): if best_model is a binary scikit-learn linear model, coef_ has
# a single row and indexing it by class position would fail — confirm.

TOP_WORDS_PER_CLASS = 50

tfidf_size = len(word_index)
reverse_word_index = {v: k for k, v in word_index.items()}

for i, class_name in enumerate(classes):
    print(f"\nClass: {class_name}")
    class_weights = best_model.coef_[i]
    # Feature indices ordered from the largest coefficient to the smallest.
    top_features = np.argsort(class_weights)[::-1]

    count = 0
    for feat_idx in top_features:
        if feat_idx >= tfidf_size:
            # Not one of the word features; skip it without counting.
            continue
        word = reverse_word_index[feat_idx]
        weight = class_weights[feat_idx]
        print(f"{word}: {weight:.4f}")
        count += 1
        if count == TOP_WORDS_PER_CLASS:
            break