In [15]:
import warnings
warnings.filterwarnings("ignore")
import os
import time
import requests
import concurrent.futures
from bs4 import BeautifulSoup
from tqdm import tqdm
from sklearn.cluster import KMeans
from scipy.spatial import distance
import numpy as np
import openai
import pickle
from concurrent.futures import ThreadPoolExecutor
# Read the OpenAI API key from the environment instead of hard-coding it.
# SECURITY: a literal key used to be committed on this line; treat that key as
# compromised and revoke it. Export OPENAI_API_KEY before running this cell.
openai.api_key = os.environ.get("OPENAI_API_KEY")
from urllib.parse import quote

class WikipediaAPI:
    """Small client for the MediaWiki Action API of one Wikipedia language edition.

    Args:
        lang: Language code used as the sub-domain of ``wikipedia.org``.
        timeout: Seconds to wait for each HTTP request before giving up.
    """

    def __init__(self, lang='en', timeout=30):
        self.lang = lang
        self.base_url = f'https://{lang}.wikipedia.org/w/api.php'
        self.timeout = timeout

    def _api_call(self, params):
        """Send one GET request to the API and return the decoded JSON body.

        An ``error`` entry in the response is printed but the response is still
        returned, so callers see whatever the API sent back.

        Raises:
            requests.RequestException: On connection problems, timeouts or a
                non-success HTTP status.
        """
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(self.base_url, params=params, timeout=self.timeout)
        response.raise_for_status()
        data = response.json()
        if 'error' in data:
            print(f"Error: {data['error']['info']}")
        return data

    def get_inlinks(self, title, limit=5000):
        """Return the titles of main-namespace pages that link to ``title``.

        Args:
            title: Title of the page whose backlinks are requested.
            limit: Value sent as ``bllimit`` (requested page size per API call).

        Returns:
            A list of page titles, in the order the API returned them.

        Raises:
            KeyError: If a response lacks ``query``/``backlinks`` (for example
                after an API error has been printed).
        """
        params = {
            'action': 'query',
            'format': 'json',
            'list': 'backlinks',
            'bltitle': title,
            'bllimit': limit,
            'blnamespace': 0,  # Only retrieve links from main namespace
            'continue': ''  # Placeholder for pagination
        }
        inlinks = []
        while True:
            data = self._api_call(params)
            inlinks.extend(page['title'] for page in data['query']['backlinks'])
            if 'continue' not in data:
                break
            # Feed every continuation value back verbatim rather than copying
            # a hard-coded pair of keys.
            params.update(data['continue'])
        return inlinks

    def get_page_content(self, title):
        """Return the rendered HTML of the page ``title`` via ``action=parse``.

        Raises:
            KeyError: If the response lacks ``parse``/``text`` (for example
                after an API error has been printed).
        """
        params = {
            'action': 'parse',
            'page': title,
            'format': 'json',
            'prop': 'text',
            'contentmodel': 'wikitext'
        }
        data = self._api_call(params)
        return data['parse']['text']['*']


class TextExtractor:
    """Finds the paragraph of a Wikipedia page that links to another page.

    Args:
        lang: Language code passed on to ``WikipediaAPI``.
    """

    def __init__(self, lang='en'):
        self.lang = lang
        self.api = WikipediaAPI(lang)

    def normalize_title(self, title):
        """Return ``title`` with spaces turned into underscores and fully percent-encoded."""
        return quote(title.replace(' ', '_'), safe='')

    @staticmethod
    def _href_targets_title(href, link_title):
        """Return True when ``href`` is a ``/wiki/`` link to the page ``link_title``.

        The comparison is done on the decoded href with any ``#section``
        fragment removed. Comparing against a fully percent-encoded title
        instead misses hrefs that keep characters such as parentheses
        unescaped, and misses links that point at a section of the page.
        """
        from urllib.parse import unquote

        prefix = '/wiki/'
        if not href.startswith(prefix):
            return False
        target = href[len(prefix):].split('#', 1)[0]
        return unquote(target) == link_title.replace(' ', '_')

    def get_paragraph_with_link(self, page_title, link_title):
        """Return the first ``<p>`` of ``page_title`` that links to ``link_title``.

        Args:
            page_title: Page whose rendered HTML is searched.
            link_title: Title of the page the link must point to.

        Returns:
            The stripped text of the first matching paragraph, or ``None`` when
            no paragraph contains such a link.
        """
        content = self.api.get_page_content(page_title)
        soup = BeautifulSoup(content, 'html.parser')
        for paragraph in soup.find_all('p'):
            anchors = paragraph.find_all('a', href=True)
            if any(self._href_targets_title(a['href'], link_title) for a in anchors):
                return paragraph.text.strip()
        return None



class OpenAI_API:
    """Wrapper around the ``openai`` module that retries failed calls with backoff.

    Args:
        retries: Maximum number of attempts per call.
        backoff_factor: Base delay in seconds; attempt ``i`` (0-based) waits
            ``backoff_factor * 2 ** i`` before the next try.
    """

    def __init__(self, retries=5, backoff_factor=0.1):
        self.backoff_factor = backoff_factor
        self.retries = retries

    def _api_call(self, method, *args, **kwargs):
        """Call ``method(*args, **kwargs)``, retrying on rate-limit/API/service errors.

        The exception from the final attempt is re-raised after a message is
        printed. Any other exception type propagates immediately.
        """
        final_attempt = self.retries - 1
        for attempt in range(self.retries):
            try:
                return method(*args, **kwargs)
            except (openai.error.RateLimitError, openai.error.APIError, openai.error.ServiceUnavailableError):
                if attempt >= final_attempt:
                    # Out of attempts: report and let the caller see the error.
                    print(f"Failed to call API after {self.retries} attempts.")
                    raise
                # Exponential backoff before the next attempt.
                time.sleep(self.backoff_factor * (2 ** attempt))

    def inference(self, prompt):
        """Return the stripped text of the first completion for ``prompt``."""
        completion_args = dict(
            engine="text-davinci-003",
            prompt=prompt,
            max_tokens=500,
            n=1,
            stop="",
            temperature=0.1,
        )
        response = self._api_call(openai.Completion.create, **completion_args)
        first_choice = response.choices[0]
        return first_choice.text.strip()

    def embed(self, text):
        """Return the ``text-embedding-ada-002`` embedding of ``text``."""
        embedding_args = dict(input=text, model="text-embedding-ada-002")
        response = self._api_call(openai.Embedding.create, **embedding_args)
        first_item = response['data'][0]
        return first_item['embedding']


class HyperlinkAnalysis:
    """Collects, analyses and embeds the paragraphs in which pages link to a target page.

    Progress is checkpointed as one pickle file per list under
    ``<base_dir>/<target_page>/`` so an interrupted run can be resumed.

    Args:
        base_dir: Directory under which the per-page checkpoint folders live.
    """

    # Checkpoint file stems, in the order the public methods return the lists.
    _DATA_TYPES = ('found_inlinks', 'none_inlinks', 'texts', 'analyses', 'embeddings')

    def __init__(self, base_dir='your_base_dir'):
        self.base_dir = base_dir
        self.text_extractor = TextExtractor()
        self.openai_api = OpenAI_API()

    def _get_inlink_data_helper(self, inlink, target_page):
        """Process one inlink page.

        Returns:
            ``None`` when no paragraph of ``inlink`` links to ``target_page``,
            otherwise the tuple ``(inlink, text, analysis, embedding)``.
        """
        text = self.text_extractor.get_paragraph_with_link(inlink, target_page)
        if text is None:
            return None

        analysis = self.hyperlink_analysis(target_page, text, inlink)
        embedding = self.openai_api.embed(analysis)

        return inlink, text, analysis, embedding

    def _load_checkpoint(self, target_page):
        """Return the five saved lists for ``target_page``, or five empty lists.

        The checkpoint is used only if every file loads. A partial load would
        leave the lists inconsistent with each other.
        """
        try:
            return tuple(self._load_data(target_page, name) for name in self._DATA_TYPES)
        except FileNotFoundError:
            pass  # No (complete) checkpoint yet: start from scratch.
        except Exception as e:
            # Best effort, as before: an unreadable checkpoint must not stop the run.
            print(f"Ignoring unreadable saved data for {target_page}: {e}")
        return tuple([] for _ in self._DATA_TYPES)

    def _save_checkpoint(self, target_page, data):
        """Write the five lists in ``data`` to their checkpoint files."""
        for name, values in zip(self._DATA_TYPES, data):
            self._save_data(target_page, name, values)

    def _prepare(self, target_page):
        """Return ``(pending_inlinks, data)`` with already processed inlinks filtered out."""
        inlinks = self.text_extractor.api.get_inlinks(target_page)
        data = self._load_checkpoint(target_page)
        # Inlinks that previously yielded no paragraph count as processed too.
        already_processed = set(data[0]).union(data[1])
        pending = [inlink for inlink in inlinks if inlink not in already_processed]
        return pending, data

    @staticmethod
    def _record_result(data, inlink, result):
        """Append the outcome of one inlink to the matching lists in ``data``."""
        found_inlinks, none_inlinks, texts, analyses, embeddings = data
        if result is None:
            none_inlinks.append(inlink)
            return
        source, text, analysis, embedding = result
        found_inlinks.append(source)
        texts.append(text)
        analyses.append(analysis)
        embeddings.append(embedding)

    def get_inlink_data(self, target_page, batch_size=100):
        """Process all pending inlinks of ``target_page`` using a thread pool.

        A failure while processing one inlink is printed and that inlink is
        left unrecorded, so a later run retries it. Progress is saved every
        ``batch_size`` inlinks and again when the method exits, including when
        it exits because of an exception or interruption.

        Returns:
            ``(found_inlinks, none_inlinks, texts, analyses, embeddings)``.
        """
        inlinks, data = self._prepare(target_page)

        try:
            with ThreadPoolExecutor() as executor:
                futures = [
                    executor.submit(self._get_inlink_data_helper, inlink, target_page)
                    for inlink in inlinks
                ]
                progress = tqdm(zip(inlinks, futures), total=len(inlinks), desc=target_page, unit="page")
                for idx, (inlink, future) in enumerate(progress, 1):
                    try:
                        # result() re-raises the worker's exception here, inside
                        # the per-item handler, instead of aborting the loop.
                        self._record_result(data, inlink, future.result())
                    except Exception as e:
                        print(f"An exception occurred: {e}")

                    # Save on every batch boundary, whatever this item's outcome.
                    if idx % batch_size == 0:
                        self._save_checkpoint(target_page, data)
        finally:
            # Save remaining data that didn't fit into a full batch.
            self._save_checkpoint(target_page, data)

        found_inlinks, none_inlinks, texts, analyses, embeddings = data
        return found_inlinks, none_inlinks, texts, analyses, embeddings

    def get_inlink_data_iter(self, target_page, batch_size=100):
        """Process all pending inlinks of ``target_page`` one at a time, with progress output.

        Unlike ``get_inlink_data``, an exception raised while processing an
        inlink propagates to the caller. The progress made so far is saved
        first.

        Returns:
            ``(found_inlinks, none_inlinks, texts, analyses, embeddings)``.
        """
        inlinks, data = self._prepare(target_page)

        try:
            for idx, inlink in enumerate(inlinks, 1):
                print(f"Processing inlink {idx} out of {len(inlinks)}: {inlink}")

                result = self._get_inlink_data_helper(inlink, target_page)
                if result is None:
                    print(f"Text extraction returned None for inlink: {inlink}")
                self._record_result(data, inlink, result)

                # Save on every batch boundary, whatever this item's outcome.
                if idx % batch_size == 0:
                    print(f"Saving data after processing {idx} inlinks.")
                    self._save_checkpoint(target_page, data)
        finally:
            # Save remaining data that didn't fit into a full batch.
            print("Saving remaining data.")
            self._save_checkpoint(target_page, data)

        found_inlinks, none_inlinks, texts, analyses, embeddings = data
        return found_inlinks, none_inlinks, texts, analyses, embeddings

    def load_saved_data(self, target_page):
        """Return the saved five-list tuple for ``target_page``, or ``None`` if it cannot be loaded."""
        try:
            data = tuple(self._load_data(target_page, name) for name in self._DATA_TYPES)
        except FileNotFoundError:
            print(f"No saved data found for {target_page}. Please check the target page or the saved data.")
            return None
        except Exception as e:
            print(f"An unexpected error occurred while loading saved data for {target_page}: {e}")
            return None

        return data

    def hyperlink_analysis(self, hyperlink, paragraph, page):
        """Ask the completion model how ``hyperlink`` is used in ``paragraph`` of ``page``.

        Returns:
            The model's answer, prefixed with the first numbered heading that
            the prompt ends on.
        """
        prompt = f"""In the context of '{paragraph}' on the Wikipedia page '{page}', the hyperlink '{hyperlink}' appears. The following factors come into consideration:
        1) Extent of '{hyperlink}' usage within this context.
        2) Boundaries and limitations regarding this usage.
        3) Any interplay with other concepts or events within this context.
        4) The relevance and necessity of '{hyperlink}' within this specific context.

        1) Extent of '{hyperlink}' usage within this context can be described as:
        """
        return f"1) Extent of '{hyperlink}' usage within this context can be described as:\n\n" + self.openai_api.inference(prompt)

    def _load_data(self, target_page, data_type):
        """Load and return the pickled list ``data_type`` saved for ``target_page``.

        Raises:
            FileNotFoundError: If the checkpoint file does not exist.
        """
        directory = os.path.join(self.base_dir, target_page)
        file_path = os.path.join(directory, f'{data_type}.pkl')

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"No such file or directory: '{file_path}'")

        # NOTE(security): pickle can execute arbitrary code while loading; only
        # read checkpoint files that this program wrote itself.
        with open(file_path, 'rb') as f:
            return pickle.load(f)

    def _save_data(self, target_page, data_type, data):
        """Pickle ``data`` to ``<base_dir>/<target_page>/<data_type>.pkl``."""
        directory = os.path.join(self.base_dir, target_page)

        # Create target Directory if doesn't exist
        if not os.path.isdir(directory):
            os.makedirs(directory, exist_ok=True)
            print("Directory ", directory, " Created ")

        file_path = os.path.join(directory, f'{data_type}.pkl')

        # Write to a temporary file and swap it in, so an interruption during
        # the write cannot leave a truncated checkpoint behind.
        tmp_path = file_path + '.tmp'
        with open(tmp_path, 'wb') as f:
            pickle.dump(data, f)
        os.replace(tmp_path, file_path)

        

In [16]:
# Run the sequential (non-threaded) pipeline for one target page.
target_page = "Israeli–Palestinian conflict"
hyperlink_analysis = HyperlinkAnalysis()

# Unpack the five parallel lists returned by the pipeline.
found_inlinks, none_inlinks, texts, analyses, embeddings = hyperlink_analysis.get_inlink_data_iter(target_page)
# Stand-alone API client; not used in the visible part of this cell.
wapi = WikipediaAPI()


Processing inlink 1 out of 2397: Asia
Text extraction returned None for inlink: Asia
Processing inlink 2 out of 2397: Bill Clinton


KeyboardInterrupt: 

In [3]:
def test_hyperlink_analysis(page_title, num_inlinks_to_test=5):
    """Smoke-test each class against live services, printing the results.

    Args:
        page_title: Wikipedia page whose inlinks are analysed.
        num_inlinks_to_test: How many inlinks to sample and how many list
            entries to print.
    """
    # Test the WikipediaAPI class
    print("Testing WikipediaAPI class...")
    wikipedia_api = WikipediaAPI()
    inlinks = wikipedia_api.get_inlinks(page_title)
    print(f"Inlinks for {page_title}: {inlinks[:num_inlinks_to_test]}...")  # Print the first num_inlinks_to_test inlinks for brevity
    page_content = wikipedia_api.get_page_content(page_title)
    print(f"First 100 characters of page content for {page_title}: {page_content[:100]}...")  # Print the first 100 characters for brevity
    print("")

    # Test the TextExtractor class
    print("Testing TextExtractor class...")
    text_extractor = TextExtractor()

    # Test multiple inlinks. NOTE: this searches `page_title` for a link to each
    # sampled title, as the printed message says.
    test_paragraph = None  # Last paragraph found; stays None if no sample matches.
    sample = inlinks[:max(num_inlinks_to_test, 0)]
    for position, inlink in enumerate(sample, 1):
        print(f"Testing inlink #{position}: {inlink}")
        paragraph = text_extractor.get_paragraph_with_link(page_title, inlink)
        if paragraph:
            test_paragraph = paragraph
        print(f"Paragraph with link '{inlink}' in '{page_title}': {paragraph}")
        print("")

    # Test the OpenAI_API class
    print("Testing OpenAI_API class...")
    if test_paragraph is None:
        # Nothing to send: previously this path failed with a NameError.
        print("No paragraph found among the tested inlinks; skipping OpenAI_API test.")
    else:
        openai_api = OpenAI_API()
        inference = openai_api.inference(test_paragraph)
        print(f"Inference: {inference}")
        embedding = openai_api.embed(inference)
        print(f"Embedding: {embedding}")
    print("")

    # Test the HyperlinkAnalysis class
    print("Testing HyperlinkAnalysis class...")
    hyperlink_analysis = HyperlinkAnalysis()
    found_inlinks, none_inlinks, texts, analyses, embeddings = hyperlink_analysis.get_inlink_data(page_title)
    print(f"Found inlinks: {found_inlinks[:num_inlinks_to_test]}...")  # Print the first num_inlinks_to_test for brevity
    print(f"Inlinks that returned None: {none_inlinks[:num_inlinks_to_test]}...")  # Print the first num_inlinks_to_test for brevity
    print(f"First text: {texts[0] if texts else None}")
    print(f"First analysis: {analyses[0] if analyses else None}")
    print(f"First embedding: {embeddings[0] if embeddings else None}")

# Run the smoke test; replace the title below with the page you want to analyse.
test_hyperlink_analysis('Israeli–Palestinian conflict', num_inlinks_to_test=5)


Testing WikipediaAPI class...
Inlinks for Israeli–Palestinian conflict: ['Asia', 'Bill Clinton', 'Balfour Declaration', 'December 9', 'Diaspora']...
First 100 characters of page content for Israeli–Palestinian conflict: <div class="mw-parser-output"><style data-mw-deduplicate="TemplateStyles:r1033289096">.mw-parser-out...

Testing TextExtractor class...
Testing inlink #1: Asia
Paragraph with link 'Asia' in 'Israeli–Palestinian conflict': None

Testing inlink #2: Bill Clinton
Paragraph with link 'Bill Clinton' in 'Israeli–Palestinian conflict': In 1993, Israeli officials led by Yitzhak Rabin and Palestinian leaders from the Palestine Liberation Organization led by Yasser Arafat strove to find a peaceful solution through what became known as the Oslo peace process. A crucial milestone in this process was Arafat's letter of recognition of Israel's right to exist. In 1993, the Oslo Accords were finalized as a framework for future Israeli–Palestinian relations. The crux of the Oslo agreemen

Israeli–Palestinian conflict: 100%|██████████████████████████████████████████| 2398/2398 [06:01<00:00,  6.64page/s]

Found inlinks: []...
Inlinks that returned None: ['Asia', 'Bill Clinton', 'Balfour Declaration', 'December 9', 'Diaspora']...
First text: None
First analysis: None
First embedding: None



