In [5]:
import warnings
warnings.filterwarnings("ignore")
import os
import time
import requests
import concurrent.futures
from bs4 import BeautifulSoup
from tqdm import tqdm
from sklearn.cluster import KMeans
from scipy.spatial import distance
import numpy as np
import openai
import pickle
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import quote
# Read the OpenAI API key from the environment rather than embedding it in
# the source: a key committed to a file is readable by anyone with access to
# the file or its history.
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")




class WikipediaAPI:
    """Small client for the MediaWiki Action API of one Wikipedia edition.

    Args:
        lang: Language code used as the Wikipedia subdomain (e.g. ``'en'``).
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block the calling
            thread indefinitely.
    """

    def __init__(self, lang='en', timeout=30):
        self.lang = lang
        self.base_url = f'https://{lang}.wikipedia.org/w/api.php'
        self.timeout = timeout

    @staticmethod
    def _raise_on_api_error(data):
        """Return ``data`` unchanged, or raise if it is an API error payload.

        Raises:
            RuntimeError: If ``data`` contains an ``'error'`` entry. The
                message carries the API's ``info`` text, so the failure is
                not reported later as an unrelated ``KeyError``.
        """
        if 'error' in data:
            error = data['error']
            info = error.get('info', error) if isinstance(error, dict) else error
            raise RuntimeError(f"Error: {info}")
        return data

    def _api_call(self, params):
        """Send a GET request with ``params`` and return the decoded JSON.

        Raises:
            requests.HTTPError: If the server answers with an HTTP error status.
            RuntimeError: If the JSON payload reports an API-level error.
        """
        response = requests.get(self.base_url, params=params, timeout=self.timeout)
        response.raise_for_status()
        return self._raise_on_api_error(response.json())

    def get_inlinks(self, title, limit=5000):
        """Return the titles of main-namespace pages that link to ``title``.

        Args:
            title: Title of the page whose backlinks are requested.
            limit: Value sent as ``bllimit`` (results requested per call).

        Returns:
            A list of page titles, accumulated over all result pages.
        """
        params = {
            'action': 'query',
            'format': 'json',
            'list': 'backlinks',
            'bltitle': title,
            'bllimit': limit,
            'blnamespace': 0,  # Only retrieve links from main namespace
            'continue': ''  # Placeholder for pagination
        }
        inlinks = []
        while True:
            data = self._api_call(params)
            backlinks = data.get('query', {}).get('backlinks', [])
            inlinks.extend(page['title'] for page in backlinks)
            if 'continue' not in data:
                break
            # Echo back every continuation key the server returned instead of
            # copying a fixed pair of names.
            params.update(data['continue'])
        return inlinks

    def get_page_content(self, title):
        """Return the rendered HTML of the page ``title`` via ``action=parse``."""
        params = {
            'action': 'parse',
            'page': title,
            'format': 'json',
            'prop': 'text',
            'contentmodel': 'wikitext'
        }
        data = self._api_call(params)
        return data['parse']['text']['*']


class TextExtractor:
    """Locate the paragraph of one Wikipedia page that links to another page."""

    # Characters that MediaWiki leaves literal when it builds article hrefs
    # (everything else outside the unreserved set is percent-encoded).
    # Encoding them, as quote(..., safe='') does, yields e.g.
    # "Gemini_%28astrology%29", which never equals the real href
    # "/wiki/Gemini_(astrology)".
    # NOTE(review): set taken from MediaWiki's wfUrlencode(); confirm if the
    # target wiki is configured differently.
    _HREF_SAFE_CHARS = ";@$!*(),/~:"

    def __init__(self, lang='en'):
        self.lang = lang
        self.api = WikipediaAPI(lang)

    def normalize_title(self, title):
        """Return ``title`` in the form used after ``/wiki/`` in article hrefs.

        Spaces become underscores and characters such as the en dash in
        "Israeli–Palestinian" are percent-encoded, while punctuation listed in
        ``_HREF_SAFE_CHARS`` (parentheses, colon, slash, comma, ...) is kept.
        """
        return quote(title.replace(' ', '_'), safe=self._HREF_SAFE_CHARS)

    def get_paragraph_with_link(self, page_title, link_title):
        """Return the first ``<p>`` text on ``page_title`` linking to ``link_title``.

        Args:
            page_title: Page whose rendered HTML is searched.
            link_title: Title of the page the link must point to.

        Returns:
            The stripped paragraph text, or ``None`` if no paragraph contains
            a matching link.
        """
        content = self.api.get_page_content(page_title)
        soup = BeautifulSoup(content, 'html.parser')
        link_href = "/wiki/" + self.normalize_title(link_title)
        for paragraph in soup.find_all('p'):
            for anchor in paragraph.find_all('a', href=True):
                # A link to a section of the target ("/wiki/Title#Section")
                # is still a link to the target page, so ignore the fragment.
                if anchor['href'].split('#', 1)[0] == link_href:
                    return paragraph.text.strip()
        return None


class OpenAI_API:
    """Thin wrapper over the OpenAI completion and embedding calls with retries.

    Args:
        retries: Maximum number of attempts per call.
        backoff_factor: Base delay in seconds; the wait before the next
            attempt is ``backoff_factor * 2 ** attempt_index``.
    """

    def __init__(self, retries=5, backoff_factor=0.1):
        self.retries, self.backoff_factor = retries, backoff_factor

    def _api_call(self, method, *args, **kwargs):
        """Call ``method(*args, **kwargs)``, retrying on transient OpenAI errors.

        Rate-limit, API and service-unavailable errors trigger a retry after
        an exponentially growing sleep. When the final attempt fails, a
        message is printed and the exception is re-raised.
        """
        final_attempt = self.retries - 1
        for attempt in range(self.retries):
            try:
                return method(*args, **kwargs)
            except (openai.error.RateLimitError, openai.error.APIError, openai.error.ServiceUnavailableError):
                if attempt >= final_attempt:
                    # Out of attempts: report and let the caller see the error.
                    print(f"Failed to call API after {self.retries} attempts.")
                    raise
                # Exponential backoff before trying again.
                time.sleep(self.backoff_factor * (2 ** attempt))

    def inference(self, prompt):
        """Return the stripped text of the first completion for ``prompt``."""
        completion_request = {
            "engine": "text-davinci-003",
            "prompt": prompt,
            "max_tokens": 500,
            "n": 1,
            "stop": "",
            "temperature": 0.1,
        }
        response = self._api_call(openai.Completion.create, **completion_request)
        first_choice = response.choices[0]
        return first_choice.text.strip()

    def embed(self, text):
        """Return the embedding of ``text`` from ``text-embedding-ada-002``."""
        embedding_request = {
            "input": text,
            "model": "text-embedding-ada-002",
        }
        response = self._api_call(openai.Embedding.create, **embedding_request)
        first_item = response['data'][0]
        return first_item['embedding']

class HyperlinkAnalysis:
    """Collect, analyse and embed the paragraphs that link to a Wikipedia page.

    Results are cached on disk as one pickle file per data type inside
    ``<base_dir>/<target_page>/`` so an interrupted run can be resumed.

    Args:
        base_dir: Directory under which the per-page cache folders are
            created. Defaults to the original placeholder ``'your_base_dir'``.
    """

    # Names of the cached lists, in the order they are returned to callers.
    _DATA_TYPES = ('found_inlinks', 'none_inlinks', 'texts', 'analyses', 'embeddings')

    def __init__(self, base_dir='your_base_dir'):
        self.base_dir = base_dir
        self.text_extractor = TextExtractor()
        self.openai_api = OpenAI_API()

    def _get_inlink_data_helper(self, inlink, target_page):
        """Process one linking page.

        Returns:
            ``(inlink, text, analysis, embedding)``, or ``None`` when no
            paragraph of ``inlink`` contains a link to ``target_page``.
        """
        text = self.text_extractor.get_paragraph_with_link(inlink, target_page)
        if text is None:
            return None

        analysis = self.hyperlink_analysis(target_page, text, inlink)
        embedding = self.openai_api.embed(analysis)

        return inlink, text, analysis, embedding

    def _load_all(self, target_page):
        """Load every cached list for ``target_page``, in ``_DATA_TYPES`` order."""
        return [self._load_data(target_page, data_type) for data_type in self._DATA_TYPES]

    def _save_all(self, target_page, lists):
        """Write ``lists`` (ordered like ``_DATA_TYPES``) to the cache."""
        for data_type, data in zip(self._DATA_TYPES, lists):
            self._save_data(target_page, data_type, data)

    def get_inlink_data(self, target_page, batch_size=100):
        """Gather paragraph, analysis and embedding for each page linking to ``target_page``.

        Pages already present in the on-disk cache are skipped. A page whose
        processing raises is reported and left out of both result lists, so
        it is retried on the next run.

        Args:
            target_page: Title of the page whose inlinks are analysed.
            batch_size: The cache is written after every ``batch_size``
                processed pages (and once more at the end).

        Returns:
            ``(found_inlinks, none_inlinks, texts, analyses, embeddings)``;
            ``none_inlinks`` holds pages in which no linking paragraph was
            found, the other four lists are index-aligned.
        """
        inlinks = self.text_extractor.api.get_inlinks(target_page)
        found_inlinks, none_inlinks, texts, analyses, embeddings = [], [], [], [], []

        # Resume from the cache only if *all* lists load; adopting some of
        # them would leave the index-aligned lists out of step.
        try:
            cached = self._load_all(target_page)
        except FileNotFoundError:
            pass  # Nothing cached yet: start from scratch.
        except Exception as e:
            print(f"Could not load saved data for {target_page}, starting fresh: {e}")
        else:
            found_inlinks, none_inlinks, texts, analyses, embeddings = cached
            # Pages without a matching paragraph count as processed too.
            already_processed = set(found_inlinks).union(none_inlinks)
            inlinks = [inlink for inlink in inlinks if inlink not in already_processed]

        lists = (found_inlinks, none_inlinks, texts, analyses, embeddings)

        with ThreadPoolExecutor() as executor:
            # submit() instead of map(): map() re-raises a worker's exception
            # from the iterator itself, which ends the whole loop; with
            # futures each failure can be handled per page.
            futures = [
                executor.submit(self._get_inlink_data_helper, inlink, target_page)
                for inlink in inlinks
            ]
            progress = tqdm(zip(inlinks, futures), total=len(inlinks), desc=target_page, unit="page")
            for idx, (source_page, future) in enumerate(progress, 1):
                try:
                    result = future.result()
                except Exception as e:
                    print(f"An exception occurred: {e}")
                else:
                    if result is None:
                        none_inlinks.append(source_page)
                    else:
                        inlink, text, analysis, embedding = result
                        found_inlinks.append(inlink)
                        texts.append(text)
                        analyses.append(analysis)
                        embeddings.append(embedding)

                # Checkpoint on every batch boundary, whatever the outcome
                # of the page that happened to land on it.
                if batch_size and idx % batch_size == 0:
                    self._save_all(target_page, lists)

        # Save remaining data that didn't fit into a full batch
        self._save_all(target_page, lists)

        return found_inlinks, none_inlinks, texts, analyses, embeddings

    def load_saved_data(self, target_page):
        """Return the cached result tuple for ``target_page``.

        Returns:
            ``(found_inlinks, none_inlinks, texts, analyses, embeddings)``, or
            ``None`` (after printing a message) if the cache cannot be read.
        """
        try:
            cached = self._load_all(target_page)
        except FileNotFoundError:
            print(f"No saved data found for {target_page}. Please check the target page or the saved data.")
            return None
        except Exception as e:
            print(f"An unexpected error occurred while loading saved data for {target_page}: {e}")
            return None

        return tuple(cached)

    def hyperlink_analysis(self, hyperlink, paragraph, page):
        """Ask the completion model how ``hyperlink`` is used in ``paragraph``.

        Args:
            hyperlink: Title the link points to.
            paragraph: Text of the paragraph containing the link.
            page: Title of the page the paragraph comes from.

        Returns:
            The model's answer, prefixed with the heading of the first point
            that the prompt leaves open for the model to continue.
        """
        prompt = f"""In the context of '{paragraph}' on the Wikipedia page '{page}', the hyperlink '{hyperlink}' appears. The following factors come into consideration:
        1) Extent of '{hyperlink}' usage within this context.
        2) Boundaries and limitations regarding this usage.
        3) Any interplay with other concepts or events within this context.
        4) The relevance and necessity of '{hyperlink}' within this specific context.

        1) Extent of '{hyperlink}' usage within this context can be described as:
        """
        return f"1) Extent of '{hyperlink}' usage within this context can be described as:\n\n" + self.openai_api.inference(prompt)

    def _data_path(self, target_page, data_type):
        """Return ``(directory, file_path)`` of the pickle for one data type."""
        directory = os.path.join(self.base_dir, target_page)
        return directory, os.path.join(directory, f'{data_type}.pkl')

    def _load_data(self, target_page, data_type):
        """Unpickle and return one cached list.

        Raises:
            FileNotFoundError: If the pickle file does not exist.
        """
        _, file_path = self._data_path(target_page, data_type)

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"No such file or directory: '{file_path}'")

        # SECURITY: pickle.load can execute arbitrary code; only point
        # base_dir at cache files written by this class.
        with open(file_path, 'rb') as f:
            return pickle.load(f)

    def _save_data(self, target_page, data_type, data):
        """Pickle ``data`` into the cache, creating the page directory if needed."""
        directory, file_path = self._data_path(target_page, data_type)

        # Create target Directory if doesn't exist
        if not os.path.exists(directory):
            os.makedirs(directory, exist_ok=True)
            print("Directory ", directory, " Created ")

        with open(file_path, 'wb') as f:
            pickle.dump(data, f)

In [7]:
# Page whose incoming links are analysed in the cells below.
target_page = "Gemini (astrology)"
hyperlink_analysis = HyperlinkAnalysis()

# Collect (or resume collecting) the linking paragraphs, their analyses and
# embeddings, then unpack the five parallel result lists.
inlink_data = hyperlink_analysis.get_inlink_data(target_page)
found_inlinks, none_inlinks, texts, analyses, embeddings = inlink_data


Gemini (astrology): 100%|██████████████████████████████████████████████████████| 308/308 [00:27<00:00, 11.22page/s]


In [8]:
# List every page in which a paragraph linking to the target was found,
# one title per line (prints nothing when the list is empty).
if found_inlinks:
    print("\n".join(map(str, found_inlinks)))

In [8]:
num_clusters = 4
num_documents = 7

# Cluster the analysis embeddings.
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(embeddings)
labels = kmeans.labels_

# Group the records by cluster label. Keys are the labels 0..num_clusters-1,
# values are lists of (inlink, text, analysis, embedding) tuples.
clustered_data = {cluster_id: [] for cluster_id in range(num_clusters)}
for position, cluster_id in enumerate(labels):
    record = (found_inlinks[position], texts[position], analyses[position], embeddings[position])
    clustered_data[cluster_id].append(record)

# Within each cluster, order the records by Euclidean distance between their
# embedding (last tuple element) and the cluster centre, nearest first.
for cluster_id in list(clustered_data):
    center = kmeans.cluster_centers_[cluster_id]
    clustered_data[cluster_id] = sorted(
        clustered_data[cluster_id],
        key=lambda record: distance.euclidean(record[-1], center),
    )

# Cluster sizes, largest cluster first.
cluster_sizes = sorted(
    ((cluster_id, len(members)) for cluster_id, members in clustered_data.items()),
    key=lambda entry: entry[1],
    reverse=True,
)

# Render the num_documents records nearest to each cluster centre
# (title and paragraph text only) as one wikitext-like string.
sections = []
for cluster, size in cluster_sizes:
    sections.append(f"==Cluster {cluster} (Number of data points: {size})==\n\n")
    for inlink, text, _analysis, _embedding in clustered_data[cluster][:num_documents]:
        sections.append(f"Title: [[{inlink}]]\nText: {text}\n\n")
cluster_text = "".join(sections)
print(cluster_text)




==Cluster 0 (Number of data points: 510)==

Title: [[The New York Times]]
Text: For its coverage of the Israeli–Palestinian conflict, some (such as Ed Koch) have claimed that the paper is pro-Palestinian, while others (such as As'ad AbuKhalil) have claimed that it is pro-Israel.[265][266] The Israel Lobby and U.S. Foreign Policy, by political science professors John Mearsheimer and Stephen Walt, alleges The New York Times sometimes criticizes Israeli policies but is not even-handed and is generally pro-Israel.[267] In 2009, the Simon Wiesenthal Center criticized the newspaper for printing cartoons regarding the Israeli-Palestinian conflict that were described as "hideously anti-Semitic".[268]

Title: [[2007 Arab League summit]]
Text: The power struggle in Palestine will, consequently, make it impossible to implement any security understandings reached in the past or future between Israel and Abbas. The official statement from the Israeli Foreign Ministry Spokesman said, "Israel is sinc

In [9]:
# Role description for the model.
system_prompt = f"You are a Wikipedia Article Writer creating a dual page for {target_page}, synthesized from text on pages that hyperlink to {target_page}."

# Writing instructions followed by the clustered source paragraphs.
user_prompt = f"Based on the four clusters of {target_page}'s occurrences on Wikipedia, write a Wikipedia-style article with four sections. Use the top {num_documents} nearest paragraphs in each cluster to define the sections. Each section should contain multiple paragraphs to ensure clarity and integrity of information. Avoid lists and instead weave information into a narrative. Ensure the sections reflect different lenses of understanding {target_page}. Include all paragraph titles and use the Wikipedia format for linking ([[wikilink]]) and headers (==header==).\n{cluster_text}"

chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]
completion = openai.ChatCompletion.create(model="gpt-4", messages=chat_messages)

# Text of the first (only) returned choice.
content = completion.choices[0].message['content']
print(content)

==Cluster 0: International Coverage and Perception==
A significant portion of the global perspective on the Israeli–Palestinian conflict is shaped by various platforms, including media outlets such as [[The New York Times]]. There exist differing claims on whether the renowned newspaper's coverage tends to be pro-Israeli or pro-Palestinian[^1^], which underlines the ambiguity surrounding the international understanding of this conflict.

Such debates also spill over into politics, with several crucial international events like the [[2007 Arab League summit]] bearing significant implications for the conflict[^2^]. For instance, Jordan, a key player in [[Jordan–United States relations]], emphasizes the importance for added efforts to resolve the conflict, advocating for a two-state solution[^3^]. 

The conflict attracts notable attention from various public figures too; American entertainer [[Bill Maher]], for instance, expresses his stance on the conflict[^4^], while political leaders l