In [8]:
import concurrent.futures
import os
import pickle
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import unquote

import numpy as np
import openai
import requests
from bs4 import BeautifulSoup
from scipy.spatial import distance
from sklearn.cluster import KMeans
from tqdm import tqdm
# Set up your OpenAI API key.
# SECURITY: the key is read from the OPENAI_API_KEY environment variable so it
# never ends up in the notebook or in version control. The key that used to be
# hardcoded on this line must be treated as leaked and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    print("Warning: OPENAI_API_KEY is not set; OpenAI API calls are expected to fail until it is.")

class WikipediaAPI:
    """Thin client for the MediaWiki Action API of one Wikipedia language edition.

    Args:
        lang: Language code used as the subdomain (``'en'`` -> en.wikipedia.org).
        timeout: Seconds to wait for each HTTP request, so a stalled connection
            cannot block the calling (worker) thread forever.
    """

    def __init__(self, lang='en', timeout=30):
        self.lang = lang
        self.timeout = timeout
        self.base_url = f'https://{lang}.wikipedia.org/w/api.php'

    @staticmethod
    def _describe_error(data):
        """Return the API's error text from a response dict, or a generic fallback."""
        error = data.get('error')
        if isinstance(error, dict):
            return error.get('info', str(error))
        return 'unexpected response format'

    def _api_call(self, params):
        """Send one GET request to the API and return the decoded JSON response.

        API-level errors (an ``'error'`` key in the response) are printed and the
        response is still returned, so callers decide how to react.

        Raises:
            requests.RequestException: on timeouts, connection failures, or a
                non-2xx HTTP status.
        """
        response = requests.get(self.base_url, params=params, timeout=self.timeout)
        response.raise_for_status()
        data = response.json()
        if 'error' in data:
            print(f"Error: {self._describe_error(data)}")
        return data

    def get_inlinks(self, title, limit=5000):
        """Return the titles of all main-namespace pages that link to ``title``.

        Follows the API's continuation tokens until the listing is exhausted.

        Args:
            title: Title of the page whose backlinks are requested.
            limit: Value sent as ``bllimit`` (page size requested per API call).

        Returns:
            List of page titles, in the order the API returned them.

        Raises:
            KeyError: if a response carries no ``'query'`` section (for example
                because the API reported an error).
        """
        params = {
            'action': 'query',
            'format': 'json',
            'list': 'backlinks',
            'bltitle': title,
            'bllimit': limit,
            'blnamespace': 0,  # Only retrieve links from main namespace
            'continue': ''  # Placeholder for pagination
        }
        inlinks = []
        while True:
            data = self._api_call(params)
            if 'query' not in data:
                raise KeyError(f"No backlinks returned for '{title}': {self._describe_error(data)}")
            inlinks.extend(page['title'] for page in data['query']['backlinks'])
            if 'continue' not in data:
                break
            # Feed every continuation token back verbatim for the next request.
            params.update(data['continue'])
        return inlinks

    def get_page_content(self, title):
        """Return the rendered HTML of the page ``title`` via ``action=parse``.

        Raises:
            KeyError: if the response carries no ``'parse'`` section (for example
                because the page does not exist).
        """
        params = {
            'action': 'parse',
            'page': title,
            'format': 'json',
            'prop': 'text',
            # NOTE(review): 'contentmodel' appears to be meant for use with the
            # 'text' parameter - confirm it is needed together with 'page'.
            'contentmodel': 'wikitext'
        }
        data = self._api_call(params)
        if 'parse' not in data:
            raise KeyError(f"No parsed content returned for '{title}': {self._describe_error(data)}")
        return data['parse']['text']['*']


class TextExtractor:
    """Extracts the paragraph of a Wikipedia page in which a given link occurs."""

    def __init__(self, lang='en'):
        self.lang = lang
        self.api = WikipediaAPI(lang)

    @staticmethod
    def _href_to_title(href):
        """Return the page title encoded in an internal ``/wiki/...`` href.

        The ``#section`` fragment is dropped and percent-escapes are decoded, so
        ``/wiki/La_T%C3%A8ne_culture#History`` yields ``La_Tène_culture``.
        Returns None for hrefs that do not start with ``/wiki/``.
        """
        prefix = "/wiki/"
        if not href.startswith(prefix):
            return None
        path = href[len(prefix):].split('#', 1)[0]
        return unquote(path)

    def get_paragraph_with_link(self, page_title, link_title):
        """Return the first ``<p>`` of ``page_title`` that links to ``link_title``.

        Args:
            page_title: Title of the page whose HTML is searched.
            link_title: Title of the link target to look for.

        Returns:
            The stripped paragraph text, or None when no paragraph contains a
            link to ``link_title``.
        """
        content = self.api.get_page_content(page_title)
        soup = BeautifulSoup(content, 'html.parser')
        wanted = link_title.replace(' ', '_')
        for paragraph in soup.find_all('p'):
            for anchor in paragraph.find_all('a', href=True):
                # Compare decoded titles rather than raw hrefs: a raw comparison
                # misses percent-encoded titles and links to a page section.
                if self._href_to_title(anchor['href']) == wanted:
                    return paragraph.text.strip()
        return None


class OpenAI_API:
    """Wrapper around the OpenAI chat-completion and embedding endpoints with retries.

    Args:
        retries: Maximum number of attempts per call; values below 1 are
            treated as 1 so a call is always attempted.
        backoff_factor: Base delay in seconds; the wait before retry ``i``
            (0-based) is ``backoff_factor * 2**i``.
        chat_model: Model name passed to ``openai.ChatCompletion.create``.
        embedding_model: Model name passed to ``openai.Embedding.create``.
    """

    SYSTEM_PROMPT = "You are a Wikipedian, helping to build the Dual Wikipedia. This alternative perspective on Wikipedia focuses on the incoming links to a page and the contexts in which a page is referenced by other pages."

    def __init__(self, retries=5, backoff_factor=0.1,
                 chat_model="gpt-3.5-turbo", embedding_model="text-embedding-ada-002"):
        self.retries = retries
        self.backoff_factor = backoff_factor
        self.chat_model = chat_model
        self.embedding_model = embedding_model

    def _api_call(self, method, *args, **kwargs):
        """Call ``method(*args, **kwargs)``, retrying with exponential backoff.

        Only rate-limit, API and service-unavailable errors are retried; the
        last such error is re-raised once all attempts are used up.
        """
        # Always attempt at least once; otherwise retries <= 0 would skip the
        # loop and silently return None.
        attempts = max(1, self.retries)
        for attempt in range(attempts):
            try:
                return method(*args, **kwargs)
            except (openai.error.RateLimitError, openai.error.APIError, openai.error.ServiceUnavailableError):
                if attempt == attempts - 1:  # last attempt: give up
                    print(f"Failed to call API after {attempts} attempts.")
                    raise
                time.sleep(self.backoff_factor * (2 ** attempt))  # exponential backoff

    def inference(self, prompt):
        """Send ``prompt`` as the user message and return the reply's text content."""
        completion = self._api_call(
            openai.ChatCompletion.create,
            model=self.chat_model,
            messages=[
                {"role": "system", "content": self.SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
        )
        return completion.choices[0].message['content']

    def embed(self, text):
        """Return the embedding of ``text`` taken from the first item of the response."""
        response = self._api_call(
            openai.Embedding.create,
            input=text,
            model=self.embedding_model,
        )
        return response['data'][0]['embedding']


class HyperlinkAnalysis:
    """Collects, analyses and embeds the contexts in which other pages link to a page.

    Progress is checkpointed as one pickle file per data type under
    ``<base_dir>/<target_page>/`` so an interrupted run can be resumed.

    Args:
        base_dir: Directory under which the per-page checkpoint folders live.
    """

    # Names of the five parallel checkpoint files, in the order they are returned.
    _DATA_TYPES = ('found_inlinks', 'none_inlinks', 'texts', 'analyses', 'embeddings')

    def __init__(self, base_dir='your_base_dir'):
        self.base_dir = base_dir  # replace the default with your actual directory
        self.text_extractor = TextExtractor()
        self.openai_api = OpenAI_API()

    def _get_inlink_data_helper(self, inlink, target_page):
        """Process one linking page.

        Returns:
            ``(inlink, text, analysis, embedding)``, or None when no paragraph of
            ``inlink`` contains a link to ``target_page``.
        """
        text = self.text_extractor.get_paragraph_with_link(inlink, target_page)
        if text is None:
            return None

        analysis = self.hyperlink_analysis(target_page, text, inlink)
        embedding = self.openai_api.embed(analysis)

        return inlink, text, analysis, embedding

    def _load_all(self, target_page):
        """Load all five checkpoint lists; raises if any of them cannot be read."""
        return tuple(self._load_data(target_page, name) for name in self._DATA_TYPES)

    def _save_all(self, target_page, *lists):
        """Save the five lists, given in ``_DATA_TYPES`` order."""
        for name, values in zip(self._DATA_TYPES, lists):
            self._save_data(target_page, name, values)

    def get_inlink_data(self, target_page, batch_size=100):
        """Fetch, analyse and embed every page that links to ``target_page``.

        Inlinks already present in a saved checkpoint are skipped. A page whose
        processing raises is reported and left out of both lists, so it is
        retried on the next run instead of aborting the whole job.

        Args:
            target_page: Title of the page whose incoming links are analysed.
            batch_size: Checkpoint to disk after every ``batch_size`` processed
                inlinks (0 disables the intermediate checkpoints).

        Returns:
            ``(found_inlinks, none_inlinks, texts, analyses, embeddings)``.
            ``found_inlinks``, ``texts``, ``analyses`` and ``embeddings`` are
            parallel lists; ``none_inlinks`` holds the pages in which no linking
            paragraph was found.
        """
        inlinks = self.text_extractor.api.get_inlinks(target_page)
        found_inlinks, none_inlinks, texts, analyses, embeddings = [], [], [], [], []

        # Resume from a checkpoint only if all five files load; adopting a
        # partial load would leave the parallel lists out of step.
        try:
            saved = self._load_all(target_page)
        except FileNotFoundError:
            saved = None  # no checkpoint yet: start from scratch
        except Exception as e:
            print(f"Ignoring unreadable saved data for {target_page}: {e}")
            saved = None
        if saved is not None:
            found_inlinks, none_inlinks, texts, analyses, embeddings = saved
            # Inlinks that yielded no paragraph count as processed, too.
            already_processed = set(found_inlinks).union(none_inlinks)
            inlinks = [inlink for inlink in inlinks if inlink not in already_processed]

        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(self._get_inlink_data_helper, inlink, target_page)
                for inlink in inlinks
            ]
            progress = tqdm(zip(inlinks, futures), total=len(inlinks), desc=target_page, unit="page")
            # Results are consumed in submission order, so the output lists keep
            # the order of ``inlinks``.
            for idx, (source_page, future) in enumerate(progress, 1):
                try:
                    # A worker's exception is re-raised here, inside the try, so
                    # one failing page cannot abort the remaining ones.
                    result = future.result()
                except Exception as e:
                    print(f"An exception occurred while processing '{source_page}': {e}")
                else:
                    if result is None:
                        none_inlinks.append(source_page)
                    else:
                        inlink, text, analysis, embedding = result
                        found_inlinks.append(inlink)
                        texts.append(text)
                        analyses.append(analysis)
                        embeddings.append(embedding)

                # Checkpoint after every full batch, whatever the last outcome was.
                if batch_size and idx % batch_size == 0:
                    self._save_all(target_page, found_inlinks, none_inlinks, texts, analyses, embeddings)

        # Save whatever did not fill a complete batch.
        self._save_all(target_page, found_inlinks, none_inlinks, texts, analyses, embeddings)

        return found_inlinks, none_inlinks, texts, analyses, embeddings

    def load_saved_data(self, target_page):
        """Return the saved five-tuple for ``target_page``, or None if it cannot be loaded."""
        try:
            return self._load_all(target_page)
        except FileNotFoundError:
            print(f"No saved data found for {target_page}. Please check the target page or the saved data.")
            return None
        except Exception as e:
            print(f"An unexpected error occurred while loading saved data for {target_page}: {e}")
            return None

    def hyperlink_analysis(self, hyperlink, paragraph, page):
        """Ask the chat model to analyse how ``hyperlink`` is used in ``paragraph`` of ``page``.

        The prompt ends with the opening of point 1, and the same opening line
        is prepended to the model's reply in the returned string.
        """
        prompt = f"""In the context of '{paragraph}' on the Wikipedia page '{page}', the hyperlink '{hyperlink}' appears. The following factors come into consideration:
        1) Extent of '{hyperlink}' usage within this context.
        2) Boundaries and limitations regarding this usage.
        3) Any interplay with other concepts or events within this context.
        4) The relevance and necessity of '{hyperlink}' within this specific context.

        1) Extent of '{hyperlink}' usage within this context can be described as:
        """
        return f"1) Extent of '{hyperlink}' usage within this context can be described as:\n\n" + self.openai_api.inference(prompt)

    def _load_data(self, target_page, data_type):
        """Unpickle ``<base_dir>/<target_page>/<data_type>.pkl``.

        Raises:
            FileNotFoundError: if the file does not exist.
        """
        file_path = os.path.join(self.base_dir, target_page, f'{data_type}.pkl')

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"No such file or directory: '{file_path}'")

        # SECURITY: pickle.load can execute arbitrary code; only load checkpoint
        # files that this notebook wrote itself.
        with open(file_path, 'rb') as f:
            return pickle.load(f)

    def _save_data(self, target_page, data_type, data):
        """Pickle ``data`` to ``<base_dir>/<target_page>/<data_type>.pkl``, creating the folder if needed."""
        directory = os.path.join(self.base_dir, target_page)

        # Create target directory if it doesn't exist
        if not os.path.exists(directory):
            os.makedirs(directory, exist_ok=True)
            print("Directory ", directory, " Created ")

        file_path = os.path.join(directory, f'{data_type}.pkl')

        with open(file_path, 'wb') as f:
            pickle.dump(data, f)
# Module-level analysis instance; the later cells call get_inlink_data and
# openai_api.inference through it.
hyperlink_analysis = HyperlinkAnalysis()            

In [9]:
# Wikipedia page whose incoming links are analysed; replace with any page title.
target_page = "Afterlife"

# Collect the per-inlink data for the target page, then split the returned
# five-tuple into the names used by the cells below.
inlink_data = hyperlink_analysis.get_inlink_data(target_page)
found_inlinks, none_inlinks, texts, analyses, embeddings = inlink_data


Afterlife: 0page [00:00, ?page/s]


Albert Einstein

Ancient Egypt

Agnosticism

Argument from morality

Albert Brooks

Black Death

The Church of Jesus Christ of Latter-day Saints

Christian eschatology

Christian views on marriage

Christianity and Judaism

Death

Eschatology

Edward Elgar

Ancient Egyptian religion

Funeral

Fantasy film

Gnosticism

Ghost

Gerald Gardner

Giordano Bruno

Hades

Horror film

Hinduism

Heaven

History of painting

Islamic eschatology

Jorge Luis Borges

Limbo

La Tène culture

Life

Lord's Prayer

Mind

Mars in fiction

Meaning of life

Martin Gardner

Marvel Universe

Osiris

Paleolithic

Plato

Priest

Persephone

Renaissance

Reincarnation

Homosexuality and religion

Resurrection of Jesus

Resurrection

Religious pluralism

Shamanism

Soul

Salvation

Thomas Jefferson

Problem of evil

Temple of Set

Divine Comedy

Veneration of the dead

Cosmos

Rastafari

Religion and sexuality

Life After Life (Moody book)

Quantum suicide and immortality

Hugo Chávez

Sumer

Eva Perón

Eleusini

In [11]:
num_clusters = 2

# n_init is pinned so the clustering does not depend on the default of the
# installed scikit-learn version.
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10).fit(embeddings)

labels = kmeans.labels_

# Split the original data according to the clusters.
# Each entry is (inlink, None, text, analysis, embedding). The second slot is
# a placeholder kept only so the tuple layout used by later cells is unchanged:
# none_inlinks is NOT parallel to found_inlinks/texts/analyses/embeddings, so
# indexing it with the embedding index was wrong and could raise IndexError.
clustered_data = {i: [] for i in range(num_clusters)}
for i, label in enumerate(labels):
    clustered_data[label].append((found_inlinks[i], None, texts[i], analyses[i], embeddings[i]))

# Now, clustered_data is a dictionary where keys are cluster labels (0 to num_clusters-1)
# and values are lists of tuples belonging to each cluster.

# Order every cluster by Euclidean distance of the embedding (last tuple item)
# to the cluster centre, closest first.
for label, data in clustered_data.items():
    center = kmeans.cluster_centers_[label]
    clustered_data[label] = sorted(data, key=lambda d: distance.euclidean(d[-1], center))





In [26]:
# Show the inlink title and paragraph text of the first 5 entries of each cluster
for cluster, cluster_data in clustered_data.items():
    header = f"\nCluster {cluster}:\n-------------------------"
    print(header)
    for data in cluster_data[:5]:
        # Tuple layout: index 0 is the inlink title, index 2 the paragraph text
        inlink, text = data[0], data[2]
        print(f"Inlink: {inlink}\nText: {text}\n\n")





Cluster 0:
-------------------------
Inlink: Girl Meets Ghost
Text: Girl Meets Ghost is a children's novel series launched in 2013 by author Lauren Barnholdt about a tween girl who can see and talk with ghosts as she helps them move on to the afterlife, though what happens when ghosts "move on" is unclear.[1]


Inlink: The Last Descent
Text: Some time later, John suddenly awakens, and to his surprise, has the strength to push himself out of the hole. He calls for Aaron and Susie and finds the pulley ropes. He notices that the injuries he sustained have disappeared. As he finds his way back to the entrance, he finds it deserted and realizes he has died and is now a spirit in the afterlife. Saddened, John goes back to the cave.


Inlink: The Plattner Story
Text: "The Plattner Story" is a short story by English writer H. G. Wells, first published in 1896 in The New Review. It was included in The Plattner Story and Others, a collection of short stories by Wells first published in 1897, an

In [23]:
target_page = "Afterlife"

# Produce one model-written summary per cluster
for cluster, cluster_data in clustered_data.items():
    # Index 2 of each stored tuple is the paragraph text; the first 10 entries
    # of the cluster serve as its representative sample
    representative_sample = [data[2] for data in cluster_data[:10]]

    # Build the prompt sent to the chat model behind openai_api.inference
    prompt = f"The following texts represent a particular cluster of incoming links to {target_page}: {representative_sample}\n"

    summary = hyperlink_analysis.openai_api.inference(prompt)

    print(f"Summary for Cluster {cluster}:\n{summary}\n")


Summary for Cluster 0:
These texts understand Afterlife as a concept or realm that exists beyond physical life and is often associated with the spirit or soul after death. It is depicted as a place where ghosts or spirits go, where they may find peace or continue their existence in some form. The understanding of Afterlife varies across different texts, including children's novels, short stories, TV series, and philosophical inquiries, highlighting its diverse interpretations and representations.

Summary for Cluster 1:
These texts understand the concept of afterlife as a belief in life after death, where the continuation of the soul, spirit, or mind is believed to occur. They also associate the afterlife with concepts such as heaven, the spirit world, and the veneration of the dead. The afterlife is seen as a place where beings such as gods, angels, spirits, saints, or ancestors originate or reside, and it is often connected to religious consolations, mystical awakening, and communal 