In [12]:
import requests
from bs4 import BeautifulSoup
import openai
import IPython.core.getipython
import time
import os
import concurrent.futures
from tqdm import tqdm


# Set up your OpenAI API key
openai.api_key = "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"
def get_wikipedia_inlinks(title, lang='en', limit=1500):
    base_url = f'https://{lang}.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'list': 'backlinks',
        'bltitle': title,
        'bllimit': limit,
        'blnamespace': 0,  # Only retrieve links from main namespace
        'continue': ''  # Placeholder for pagination
    }
    inlinks = []
    while True:
        response = requests.get(base_url, params=params)
        data = response.json()
        if 'error' in data:
            print(f"Error: {data['error']['info']}")
            break
        inlink_pages = data['query']['backlinks']
        for page in inlink_pages:
            inlinks.append(page['title'])
        if 'continue' not in data:
            break
        params['continue'] = data['continue']['continue']
        params['blcontinue'] = data['continue']['blcontinue']
    return inlinks

def get_wikipedia_page_content(title, lang='en'):
    base_url = f'https://{lang}.wikipedia.org/w/api.php'
    params = {
        'action': 'parse',
        'page': title,
        'format': 'json',
        'prop': 'text',
        'contentmodel': 'wikitext'
    }
    response = requests.get(base_url, params=params)
    data = response.json()
    if 'error' in data:
        print(f"Error: {data['error']['info']}")
        return None
    html_content = data['parse']['text']['*']
    return html_content

def get_paragraph_with_link(page_title, link_title, lang='en'):
    content = get_wikipedia_page_content(page_title, lang)
    soup = BeautifulSoup(content, 'html.parser')
    
    #elements = soup.find_all(['p', 'li', 'dl'])
    elements = soup.find_all('p')
    
    #element_names = {'p': 'Paragraph', 'li': 'List item', 'dl': 'Description list'}
    
    link_href = "/wiki/" + link_title.replace(' ', '_')
    
    for element in elements:
        a_tags = element.find_all('a', href=True)
        if any(a['href'] == link_href for a in a_tags):
            context_info = "Unknown"
            preceding_header = element.find_previous(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
            if preceding_header:
                context_info = preceding_header.text.replace('[edit]', '')  # Remove the '[edit]' portion

            # Handle paragraphs separately from list items and description lists
            if element.name == 'p':
                #return f"\nElement type: {element_names[element.name]}, \nContext: {context_info}, \nText: \n{element.text.strip()}"
                return f"\n{element.text.strip()}\n"
    
            else:
                fallback_context = None
                current_context = element
                while current_context is not None:
                    if current_context.name not in ['li', 'dl']:
                        fallback_context = current_context
                        break
                    current_context = current_context.find_parent()

                return f"\nElement type: {element_names[element.name]}, \nContext: {context_info}, \nText: \n{element.text.strip()}"
    
    return None



def inference(prompt, retries=5, backoff_factor=0.1):
    for i in range(retries):
        try:
            # Your API call here
            response = openai.Completion.create(
                engine="text-davinci-003",
                prompt=prompt,
                max_tokens=500,
                n=1,
                stop = "",
                temperature=0.1,
            )
            return response.choices[0].text.strip()
        except (openai.error.RateLimitError, openai.error.APIError) as e:
            if i < retries - 1:  # if it's not the last retry attempt
                sleep_time = backoff_factor * (2 ** i)  # exponential backoff
                time.sleep(sleep_time)
            else:  # if it's the last retry attempt
                print(f"Failed to generate inference after {retries} attempts.")
                raise

def embed(prompt, retries=5, backoff_factor=0.1):
    for i in range(retries):
        try:
            response = openai.Embedding.create(
                input=text,
                model="text-embedding-ada-002"
            )
            return response['data'][0]['embedding']
        except (openai.error.RateLimitError, openai.error.APIError) as e:
            if i < retries - 1:  # if it's not the last retry attempt
                sleep_time = backoff_factor * (2 ** i)  # exponential backoff
                time.sleep(sleep_time)
            else:  # if it's the last retry attempt
                print(f"Failed to generate embedding after {retries} attempts.")
                raise               
                

def hyperlink_analysis(hyperlink, paragraph, page):
    prompt = f"In the context of '{paragraph}' on the Wikipedia page '{page}', the hyperlink '{hyperlink}' appears. The following factors come into consideration:\n\n1) Extent of '{hyperlink}' usage within this context.\n2) Boundaries and limitations regarding this usage.\n3) Any interplay with other concepts or events within this context.\n4) The relevance and necessity of '{hyperlink}' within this specific context.\n\n1) Extent of '{hyperlink}' usage within this context can be described as:\n\n"

    analysis = inference(prompt)

    return f"1) Extent of '{hyperlink}' usage within this context can be described as:\n\n" + analysis


def save_paragraphs_to_file(paragraphs, gpt_paragraphs, file_name):
    # Get the current working directory
    current_directory = os.getcwd()

    # Construct the relative file path
    file_path = os.path.join(current_directory, 'file.txt')
    
    # Define a unique delimiter to separate the paragraphs
    delimiter = "===PARAGRAPH==="

    # Open the file in write mode
    with open(file_path, 'w') as file:
        # Write the paragraphs to the file with the delimiter
        file.write(delimiter.join(paragraphs))

        # Add a separator between paragraphs and GPT analysis
        file.write('\n\n--- GPT Analysis ---\n\n')

        # Write the GPT paragraphs to the file with the delimiter
        file.write(delimiter.join(gpt_paragraphs))

    print(f"Paragraphs saved as: {file_name}")



In [2]:
target_page = "Wiki"

# Get all pages that link to the target page.
inlinks = get_wikipedia_inlinks(target_page)
found_inlinks = []

print(f"Found {len(inlinks)} pages linking to {target_page}.")

info = []

def process_inlink(inlink):
    paragraph = get_paragraph_with_link(inlink, target_page)
    if paragraph:
        analysis = hyperlink_analysis(target_page, paragraph, inlink)
        return (inlink, paragraph, analysis)
    return None

with concurrent.futures.ThreadPoolExecutor() as executor:
    results = list(tqdm(executor.map(process_inlink, inlinks), total=len(inlinks), desc="Processing", unit="page"))

paragraphs = []
gpt_paragraphs = []
for result in results:
    if result is not None:
        inlink, paragraph, analysis = result
        found_inlinks.append(inlink)
        paragraphs.append(f"\nPage: {inlink}\n{paragraph}")
        gpt_paragraphs.append(f"Analysis:\n {analysis}\n")

print(f"Found {len(found_inlinks)} paragraphs linking to {target_page}.\n") 

for paragraph, gpt_paragraph in zip(paragraphs, gpt_paragraphs):
    print(f"{paragraph}\n{gpt_paragraph}")

# Assuming the save_paragraphs_to_file function can accept a list of strings.
save_paragraphs_to_file(paragraphs, gpt_paragraphs, 'Color Theory.txt')




Found 1707 pages linking to Wiki.


Processing: 100%|████████████████████████████████████████████████████████████████| 1707/1707 [22:22<00:00,  1.27page/s]


Found 677 paragraphs linking to Wiki.


Page: Camel case

Camel case is used in some wiki markup languages for terms that should be automatically linked to other wiki pages. This convention was originally used in Ward Cunningham's original wiki software, WikiWikiWeb,[36] and can be activated in most other wikis. Some wiki engines such as TiddlyWiki, Trac and PmWiki make use of it in the default settings, but usually also provide a configuration mechanism or plugin to disable it. Wikipedia formerly used camel case linking as well, but switched to explicit link markup using square brackets[37] and many other wiki sites have done the same. MediaWiki, for example, does not support camel case for linking. Some wikis that do not use camel case linking may still use the camel case as a naming convention, such as AboutUs.

Analysis:
 1) Extent of 'Wiki' usage within this context can be described as:

The term 'Wiki' is used multiple times throughout the context. It is used to refer to the orig

UnicodeEncodeError: 'charmap' codec can't encode character '\u02c8' in position 28627: character maps to <undefined>

In [None]:
import time
for paragraph, gpt_paragraph in zip(paragraphs, gpt_paragraphs):
    time.sleep(0.01)
    print(f"{paragraph}\n{gpt_paragraph}")

# Assuming the save_paragraphs_to_file function can accept a list of strings.
#save_paragraphs_to_file(paragraphs, gpt_paragraphs, 'Color Theory.txt')

In [4]:
from tqdm import tqdm

texts = []
embeddings = []
for gptparagraph, paragraph in tqdm(zip(gpt_paragraphs, paragraphs)):
    text = gptparagraph + "\n" + paragraph
    texts.append(text)
    embedding = embed(text)
    embeddings.append(embedding)
    
import fastcluster
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
Z = fastcluster.linkage(embeddings, method='ward')
plt.figure(figsize=(len(embeddings), 7))
dendrogram(Z)
plt.show()


677it [01:41,  6.67it/s]


ValueError: Image size of 67700x700 pixels is too large. It must be less than 2^16 in each direction.

<Figure size 67700x700 with 1 Axes>

In [16]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

import numpy as np
num_clusters = 5

# Fit the KMeans model to your data
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(embeddings)

# Now `kmeans.cluster_centers_` contains the centroids of the clusters
cluster_centroids = kmeans.cluster_centers_

# And `kmeans.labels_` contains the cluster number for each document
clusters = kmeans.labels_

# Get the representative document (the one closest to the centroid) for each cluster
representative_docs = []

for i in range(num_clusters):
    # Find the points in this cluster
    idx = np.where(clusters == i)[0]
    cluster_points = np.array(embeddings)[idx]
    
    # Find the 9 documents closest to the centroid
    closest_indices = cdist([cluster_centroids[i]], cluster_points).argsort()[0][:5]
    representatives = [paragraphs[idx[closest_idx]] for closest_idx in closest_indices]
    representative_docs.append(representatives)

# Now `representative_docs` contains the most representative documents for each cluster

combined_summaries = ""
combined_texts = ""
for i, docs in enumerate(representative_docs, start=1):
    combined_text = f"Cluster {i} representative documents:\n"
    for j, doc in enumerate(docs, start=1):
        combined_text += f"Document {j}: \n{doc}\n"

    prompt = f"The following documents are from cluster {i} on the topic {target_page}:\n{combined_text}\nThe topic which this cluster may address is:"

    combined_summaries += f"{inference(prompt)}\n"
    combined_texts += "\n" + combined_text
    
print(combined_summaries)    
    




The Use of Wikis for Collaboration and E-Participation.
The Use of Wikis in Gaming and Creative Storytelling.
The Use of Wikis in Online Encyclopedias.
The Use of Wiki Software for Collaborative Editing and Social Networking.
The use of wikis in web development and software engineering.



In [14]:
prompt = f"The following text summarizes different clusters on the topic {target_page}:\n{combined_summaries}\nThe key differences between the {num_clusters} clusters can be described as:\nCluster 1:"

# Inference for new prompt
print("Cluster 1: " + inference(prompt))

Cluster 1: Wiki-based collaboration, which focuses on the use of wiki software to facilitate collaboration between users.
Cluster 2: Wiki-based content management, which focuses on the use of wiki software to manage content.
Cluster 3: Wiki-based knowledge management, which focuses on the use of wiki software to store and share knowledge.
Cluster 4: Wiki-based education, which focuses on the use of wiki software to support teaching and learning.
Cluster 5: Wiki-based research, which focuses on the use of wiki software to support research activities.


In [11]:
print(combined_texts)


Cluster 1 representative documents:
Document 1: 

Page: List of wikis

This article contains a list of notable wikis, which are websites that use wiki software, allowing users to collaboratively edit content and view old versions of the content. These websites use several different wiki software packages.

Document 2: 

Page: Digital collaboration

Wikis are websites which allow collaborative modification of its content and structure directly from the web browser. In a typical wiki, text is written using a simplified markup language (known as "wiki markup"), and often edited with the help of a rich-text editor. A wiki is run using wiki software, otherwise known as a wiki engine. There are dozens of different wiki engines in use, both standalone and part of other software, such as bug tracking systems. Some wiki engines are open source, whereas others are proprietary.

Document 3: 

Page: Wikimapia

Wikimapia is a geographic online encyclopedia project. The project implements an intera

In [7]:
# Let's consider the first cluster (index 0)
first_cluster_idx = np.where(clusters == 0)[0]
first_cluster_embeddings = np.array(embeddings)[first_cluster_idx]

# Decide on the number of sub-clusters
num_sub_clusters = 3

# Fit the KMeans model to the first cluster data
kmeans_sub = KMeans(n_clusters=num_sub_clusters)
kmeans_sub.fit(first_cluster_embeddings)

# Now `kmeans_sub.cluster_centers_` contains the centroids of the sub-clusters
sub_cluster_centroids = kmeans_sub.cluster_centers_

# And `kmeans_sub.labels_` contains the sub-cluster number for each document in the first cluster
sub_clusters = kmeans_sub.labels_

# Get the representative document (the one closest to the centroid) for each sub-cluster
sub_representative_docs = []

for i in range(num_sub_clusters):
    # Find the points in this sub-cluster
    sub_idx = np.where(sub_clusters == i)[0]
    sub_cluster_points = first_cluster_embeddings[sub_idx]
    
    # Find the 9 documents closest to the centroid
    sub_closest_indices = cdist([sub_cluster_centroids[i]], sub_cluster_points).argsort()[0][:5]
    sub_representatives = [paragraphs[first_cluster_idx[sub_closest_idx]] for sub_closest_idx in sub_closest_indices]
    sub_representative_docs.append(sub_representatives)

# Now `sub_representative_docs` contains the most representative documents for each sub-cluster

sub_combined_summaries = ""

for i, docs in enumerate(sub_representative_docs, start=1):
    combined_text = f"Cluster {i} representative documents:\n"
    for j, doc in enumerate(docs, start=1):
        combined_text += f"Document {j}: \n{doc}\n"

    prompt = f"The following documents are from cluster {i} on the topic {target_page}:\n{combined_text}\nThe summary for this cluster is:"

    sub_combined_summaries += f"cluster {i}: {inference(prompt)}\n"
    print(f"{combined_text}\n")
    
print(sub_combined_summaries)    



Cluster 1 representative documents:
Document 1: 

Page: Cocaine

The pharmacodynamics of cocaine involve the complex relationships of neurotransmitters (inhibiting monoamine uptake in rats with ratios of about: serotonin:dopamine = 2:3, serotonin:norepinephrine = 2:5).[106][16] The most extensively studied effect of cocaine on the central nervous system is the blockade of the dopamine transporter protein. Dopamine neurotransmitter released during neural signaling is normally recycled via the transporter; i.e., the transporter binds the transmitter and pumps it out of the synaptic cleft back into the presynaptic neuron, where it is taken up into storage vesicles. Cocaine binds tightly at the dopamine transporter forming a complex that blocks the transporter's function. The dopamine transporter can no longer perform its reuptake function, and thus dopamine accumulates in the synaptic cleft. The increased concentration of dopamine in the synapse activates post-synaptic dopamine receptors,

In [9]:
prompt = f"The following text summarizes different sub-clusters on the topic {target_page} where the super-cluster focuses on the pharmacodynamics of various drugs, such as ethylphenidate, viloxazine, acamprosate, Adderall, and neurotrophin mimetics. It examines the different pharmacodynamic profiles of these drugs, such as their selectivity to the dopamine transporter, affinity for serotonin receptors, action as an NMDA receptor antagonist, and modulation of the action of neurotrophin receptors.:\n{sub_combined_summaries}\nThese are descriptions which differentiate the 3 sub_clusters\nCluster 1:"

# Inference for new prompt
print("Cluster 1: " + inference(prompt))

Cluster 1: This cluster focuses on the pharmacodynamics of various substances, such as cocaine, sleep medications, and hypnotics. It examines how these substances interact with neurotransmitters, such as blocking the dopamine transporter protein or increasing dopamine concentrations in the synaptic cleft. It also looks at the similarities between benzodiazepines and nonbenzodiazepines, as well as the effects of exercise on sleep.

Cluster 2: This cluster focuses on the pharmacodynamics of various drugs, such as cocaine, chloral hydrate, hypnotics, fenfluramine/phentermine, and sleep medications. It examines the complex relationships between neurotransmitters and the drugs, as well as the effects of nonbenzodiazepines, exercise, and white noise on sleep.

Cluster 3: This cluster focuses on the pharmacodynamics of various drugs, such as viloxazine, Adderall, MEAI, glycine receptor, and cocaine. It examines the complex relationships between neurotransmitters and the drugs, as well as the 