In [23]:
import requests
from bs4 import BeautifulSoup
import openai
import IPython.core.getipython
import time
import os
import concurrent.futures
from tqdm import tqdm


# Set up your OpenAI API key
openai.api_key = "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"
def get_wikipedia_inlinks(title, lang='en', limit=1500):
    base_url = f'https://{lang}.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'list': 'backlinks',
        'bltitle': title,
        'bllimit': limit,
        'blnamespace': 0,  # Only retrieve links from main namespace
        'continue': ''  # Placeholder for pagination
    }
    inlinks = []
    while True:
        response = requests.get(base_url, params=params)
        data = response.json()
        if 'error' in data:
            print(f"Error: {data['error']['info']}")
            break
        inlink_pages = data['query']['backlinks']
        for page in inlink_pages:
            inlinks.append(page['title'])
        if 'continue' not in data:
            break
        params['continue'] = data['continue']['continue']
        params['blcontinue'] = data['continue']['blcontinue']
    return inlinks

def get_wikipedia_page_content(title, lang='en'):
    base_url = f'https://{lang}.wikipedia.org/w/api.php'
    params = {
        'action': 'parse',
        'page': title,
        'format': 'json',
        'prop': 'text',
        'contentmodel': 'wikitext'
    }
    response = requests.get(base_url, params=params)
    data = response.json()
    if 'error' in data:
        print(f"Error: {data['error']['info']}")
        return None
    html_content = data['parse']['text']['*']
    return html_content

def get_paragraph_with_link(page_title, link_title, lang='en'):
    content = get_wikipedia_page_content(page_title, lang)
    soup = BeautifulSoup(content, 'html.parser')
    
    #elements = soup.find_all(['p', 'li', 'dl'])
    elements = soup.find_all('p')
    
    #element_names = {'p': 'Paragraph', 'li': 'List item', 'dl': 'Description list'}
    
    link_href = "/wiki/" + link_title.replace(' ', '_')
    
    for element in elements:
        a_tags = element.find_all('a', href=True)
        if any(a['href'] == link_href for a in a_tags):
            context_info = "Unknown"
            preceding_header = element.find_previous(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
            if preceding_header:
                context_info = preceding_header.text.replace('[edit]', '')  # Remove the '[edit]' portion

            # Handle paragraphs separately from list items and description lists
            if element.name == 'p':
                #return f"\nElement type: {element_names[element.name]}, \nContext: {context_info}, \nText: \n{element.text.strip()}"
                return f"\n{element.text.strip()}\n"
    
            else:
                fallback_context = None
                current_context = element
                while current_context is not None:
                    if current_context.name not in ['li', 'dl']:
                        fallback_context = current_context
                        break
                    current_context = current_context.find_parent()

                return f"\nElement type: {element_names[element.name]}, \nContext: {context_info}, \nText: \n{element.text.strip()}"
    
    return None



def inference(prompt, retries=5, backoff_factor=0.1):
    for i in range(retries):
        try:
            # Your API call here
            response = openai.Completion.create(
                engine="text-davinci-003",
                prompt=prompt,
                max_tokens=500,
                n=1,
                stop = "",
                temperature=0.1,
            )
            return response.choices[0].text.strip()
        except (openai.error.RateLimitError, openai.error.APIError) as e:
            if i < retries - 1:  # if it's not the last retry attempt
                sleep_time = backoff_factor * (2 ** i)  # exponential backoff
                time.sleep(sleep_time)
            else:  # if it's the last retry attempt
                print(f"Failed to generate inference after {retries} attempts.")
                raise

def embed(text, retries=5, backoff_factor=0.1):
    for i in range(retries):
        try:
            response = openai.Embedding.create(
                input=text,
                model="text-embedding-ada-002"
            )
            return response['data'][0]['embedding']
        except (openai.error.RateLimitError, openai.error.APIError) as e:
            if i < retries - 1:  # if it's not the last retry attempt
                sleep_time = backoff_factor * (2 ** i)  # exponential backoff
                time.sleep(sleep_time)
            else:  # if it's the last retry attempt
                print(f"Failed to generate embedding after {retries} attempts.")
                raise               
                

def hyperlink_analysis(hyperlink, paragraph, page):
    prompt = f"In the context of '{paragraph}' on the Wikipedia page '{page}', the hyperlink '{hyperlink}' appears. The following factors come into consideration:\n\n1) Extent of '{hyperlink}' usage within this context.\n2) Boundaries and limitations regarding this usage.\n3) Any interplay with other concepts or events within this context.\n4) The relevance and necessity of '{hyperlink}' within this specific context.\n\n1) Extent of '{hyperlink}' usage within this context can be described as:\n\n"

    analysis = inference(prompt)

    return f"1) Extent of '{hyperlink}' usage within this context can be described as:\n\n" + analysis


def understand(target_page):
    inlinks = get_wikipedia_inlinks(target_page)
    found_inlinks = []

    print(f"Found {len(inlinks)} pages linking to {target_page}.")

    info = []

    def process_inlink(inlink):
        paragraph = get_paragraph_with_link(inlink, target_page)
        if paragraph:
            analysis = hyperlink_analysis(target_page, paragraph, inlink)
            return (inlink, paragraph, analysis)
        return None

    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(process_inlink, inlinks), total=len(inlinks), desc="Analysis", unit="page"))

    paragraphs = []
    gpt_paragraphs = []
    for result in results:
        if result is not None:
            inlink, paragraph, analysis = result
            found_inlinks.append(inlink)
            paragraphs.append(f"\nPage: {inlink}\n{paragraph}")
            gpt_paragraphs.append(f"Analysis:\n {analysis}\n")
    texts = []
    embeddings = []
    for gptparagraph, paragraph in tqdm(zip(gpt_paragraphs, paragraphs), total=len(gpt_paragraphs), desc="Embedding", unit="page"):
        text = gptparagraph + "\n" + paragraph
        texts.append(text)
        embedding = embed(text)
        embeddings.append(embedding)
    return found_inlinks, texts, embeddings    

In [None]:
found_inlinks, texts, embeddings = understand("Sustainability")


Found 3069 pages linking to Sustainability.


Analysis:   2%|█▌                                                                  | 68/3069 [01:21<40:44,  1.23page/s]

In [6]:
print(len(found_inlinks))

1976


In [7]:
target_page = "Sustainability"

In [11]:
paragraphs = texts

In [12]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import numpy as np
num_clusters = 1
num_documents = 10
# Fit the KMeans model to your data
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(embeddings)

# Now `kmeans.cluster_centers_` contains the centroids of the clusters
cluster_centroids = kmeans.cluster_centers_

# And `kmeans.labels_` contains the cluster number for each document
clusters = kmeans.labels_

# Get the representative document (the one closest to the centroid) for each cluster
representative_docs = []

for i in range(num_clusters):
    # Find the points in this cluster
    idx = np.where(clusters == i)[0]
    cluster_points = np.array(embeddings)[idx]
    
    # Find the 9 documents closest to the centroid
    closest_indices = cdist([cluster_centroids[i]], cluster_points).argsort()[0][:num_documents]
    representatives = [paragraphs[idx[closest_idx]] for closest_idx in closest_indices]
    representative_docs.append(representatives)

# Now `representative_docs` contains the most representative documents for each cluster

combined_summaries = ""
combined_texts = ""
for i, docs in enumerate(representative_docs, start=1):
    combined_text = f"Cluster {i} representative documents:\n"
    for j, doc in enumerate(docs, start=1):
        combined_text += f"Document {j}: \n{doc}\n"

    prompt = f"The following documents are from cluster {i} on the topic {target_page}:\n{combined_text}\nThis cluster of documents discusses {target_page} in the context of"

    combined_summaries += f"This cluster of documents discusses {target_page} in the context of {inference(prompt)}\n\n"
    combined_texts += "\n" + combined_text
    
print(combined_summaries)      
    
prompt = f"These {num_documents} paragraphs were extracted from clustering all occurences of {target_page} on wikipedia. These {num_documents} paragraphs represent the core of {target_page}:\n{combined_texts}\nThe following represents a textual reprsentation for the core of {target_page}:\nAt the core of {target_page}"
print(f"At the core of {target_page} {inference(prompt)}")




This cluster of documents discusses Sustainability in the context of environmentalism, socio-political movements, urban ecology, natural capitalism, and global food crises. The documents provide information on the definition of sustainability, its implications, and how it can be applied to various contexts.


At the core of Sustainability is the socio-political movement that is concerned about humankind's effects on the Earth. This movement advocates for sustainable management of resources and stewardship of the environment through changes in public policy and individual behavior. To achieve this, campaigns to move toward sustainable energy and resource consumption have been implemented, such as LEED certification of buildings, Energy Star certified appliances, and zero emission vehicles. Additionally, techniques such as carbon recapture may be used to sequester carbon compounds produced in urban centers. To further this goal, the concept of natural capitalism has been proposed, which 

In [14]:
print(prompt)

These 10 paragraphs were extracted from clustering all occurences of Sustainability on wikipedia. These 10 paragraphs represent the core of Sustainability:

Cluster 1 representative documents:
Document 1: 
Analysis:
 


Page: Eco commerce

This sustainability-related article is a stub. You can help Wikipedia by expanding it.

Document 2: 
Analysis:
 


Page: Jim Merkel

This sustainability-related article is a stub. You can help Wikipedia by expanding it.

Document 3: 
Analysis:
 


Page: Earth in culture

Over the past two centuries a growing environmental movement has emerged that is concerned about humankind's effects on the Earth. The key issues of this socio-political movement are the conservation of natural resources, elimination of pollution, and the usage of land.[45] Although diverse in interests and goals, environmentalists as a group tend to advocate sustainable management of resources and stewardship of the environment through changes in public policy and individual behavio

In [32]:
from tqdm import tqdm

texts = []
embeddings = []
for gptparagraph, paragraph in tqdm(zip(gpt_paragraphs, paragraphs)):
    text = gptparagraph
    texts.append(text)
    embedding = embed(text)
    embeddings.append(embedding)
    
import fastcluster
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
Z = fastcluster.linkage(embeddings, method='ward')
plt.figure(figsize=(len(embeddings), 7))
dendrogram(Z)
plt.show()


3711it [08:15,  7.50it/s]


ValueError: Image size of 371100x700 pixels is too large. It must be less than 2^16 in each direction.

<Figure size 371100x700 with 1 Axes>

In [19]:
num_clusters = 1
num_documents = 10
# Fit the KMeans model to your data
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(embeddings)

# Now `kmeans.cluster_centers_` contains the centroids of the clusters
cluster_centroids = kmeans.cluster_centers_

# And `kmeans.labels_` contains the cluster number for each document
clusters = kmeans.labels_

# Get the representative document (the one closest to the centroid) for each cluster
representative_docs = []

for i in range(num_clusters):
    # Find the points in this cluster
    idx = np.where(clusters == i)[0]
    cluster_points = np.array(embeddings)[idx]
    
    # Find the 9 documents closest to the centroid
    closest_indices = cdist([cluster_centroids[i]], cluster_points).argsort()[0][:num_documents]
    representatives = [paragraphs[idx[closest_idx]] for closest_idx in closest_indices]
    representative_docs.append(representatives)

# Now `representative_docs` contains the most representative documents for each cluster

combined_centroid = []
for i, docs in enumerate(representative_docs, start=1):
    combined_text = f"Cluster {i} representative documents:\n"
    for j, doc in enumerate(docs, start=1):
        combined_text += f"Document {j}: \n{doc}\n"
    combined_centroid.append(combined_text)
    prompt = f"A dataset is generated from embedding all occurence of {target_page} on wikipedia. These {num_documents} are the nearest occurances to a cluster centroid:\n{combined_texts}\nThe following represents a textual reprsentation for this cluster based on these occurences:\nBased on these occurences {target_page} can be seen as"
    print(f"Based on these occurences {target_page} can be seen as {inference(prompt)}")
    
     
    
#prompt = f"A dataset is generated from embedding all occurence of {target_page} on wikipedia. These {num_documents} are the nearest occurances to a cluster centroid:\n{combined_texts}\nThe following represents a textual reprsentation for this cluster based on these occurences:\nBased on these occurences {target_page} can be seen as"
#print(f"Based on these occurences {target_page} can be seen as {inference(prompt)}")



Based on these occurences Sustainability can be seen as a socio-political movement that is concerned with the conservation of natural resources, elimination of pollution, and the usage of land. It is also a concept that is encapsulated by the Brundtland Commission's definition of development that meets the needs and aspirations of the present without compromising the ability of future generations to meet their own needs. It is also a concept that is related to eco commerce, Jim Merkel, Earth in culture, urban ecology, Alan Durning, complexity, problem solving, and sustainable societies, Mootral, State of the World (book series), and Applied Sustainability.


In [17]:
print(prompt)

A dataset is generated from embedding all occurence of Sustainability on wikipedia. These 10 are the nearest occurances to a cluster centroid:

Cluster 1 representative documents:
Document 1: 
Analysis:
 


Page: Eco commerce

This sustainability-related article is a stub. You can help Wikipedia by expanding it.

Document 2: 
Analysis:
 


Page: Jim Merkel

This sustainability-related article is a stub. You can help Wikipedia by expanding it.

Document 3: 
Analysis:
 


Page: Earth in culture

Over the past two centuries a growing environmental movement has emerged that is concerned about humankind's effects on the Earth. The key issues of this socio-political movement are the conservation of natural resources, elimination of pollution, and the usage of land.[45] Although diverse in interests and goals, environmentalists as a group tend to advocate sustainable management of resources and stewardship of the environment through changes in public policy and individual behavior.[46] Of par

In [21]:
num_clusters = 2
num_documents = 10
# Fit the KMeans model to your data
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(embeddings)

# Now `kmeans.cluster_centers_` contains the centroids of the clusters
cluster_centroids = kmeans.cluster_centers_

# And `kmeans.labels_` contains the cluster number for each document
clusters = kmeans.labels_

# Get the representative document (the one closest to the centroid) for each cluster
representative_docs = []

for i in range(num_clusters):
    # Find the points in this cluster
    idx = np.where(clusters == i)[0]
    cluster_points = np.array(embeddings)[idx]
    
    # Find the 9 documents closest to the centroid
    closest_indices = cdist([cluster_centroids[i]], cluster_points).argsort()[0][:num_documents]
    representatives = [paragraphs[idx[closest_idx]] for closest_idx in closest_indices]
    representative_docs.append(representatives)

# Now `representative_docs` contains the most representative documents for each cluster

combined_centroid = []
for i, docs in enumerate(representative_docs, start=1):
    combined_text = f"Cluster {i} representative documents:\n"
    for j, doc in enumerate(docs, start=1):
        combined_text += f"Document {j}: \n{doc}\n"
    combined_centroid.append(combined_text)
    
    prompt = f"A dataset is generated from embedding all occurence of {target_page} on wikipedia. These {num_documents} are the nearest occurances to a cluster centroid:\n{combined_texts}\nThe following represents a textual reprsentation for this cluster based on these occurences:\nBased on these occurences {target_page} can be seen as"
    print(prompt + "\n")
    print(f"Based on these occurences {target_page} can be seen as {inference(prompt)}")
    



A dataset is generated from embedding all occurence of Sustainability on wikipedia. These 10 are the nearest occurances to a cluster centroid:

Cluster 1 representative documents:
Document 1: 
Analysis:
 


Page: Eco commerce

This sustainability-related article is a stub. You can help Wikipedia by expanding it.

Document 2: 
Analysis:
 


Page: Jim Merkel

This sustainability-related article is a stub. You can help Wikipedia by expanding it.

Document 3: 
Analysis:
 


Page: Earth in culture

Over the past two centuries a growing environmental movement has emerged that is concerned about humankind's effects on the Earth. The key issues of this socio-political movement are the conservation of natural resources, elimination of pollution, and the usage of land.[45] Although diverse in interests and goals, environmentalists as a group tend to advocate sustainable management of resources and stewardship of the environment through changes in public policy and individual behavior.[46] Of par

Based on these occurences Sustainability can be seen as a socio-political movement that is concerned with the conservation of natural resources, elimination of pollution, and the usage of land. It is also a concept that is encapsulated by the Brundtland Commission's definition of development that meets the needs and aspirations of the present without compromising the ability of future generations to meet their own needs. It is also a concept that is related to eco commerce, Jim Merkel, Earth in culture, urban ecology, Alan Durning, complexity, problem solving, and sustainable societies, Mootral, State of the World (book series), and applied sustainability.


In [36]:
print(combined_texts)


Cluster 1 representative documents:
Document 1: 

Page: Christian Reformed Church in North America

The CRC is opposed to abortion except in cases when the "life of the mother is genuinely threatened" by her pregnancy. The church "affirms the unique value of all human life" from the "moment of conception". Believers are called upon to show "compassion" to those experiencing unwanted pregnancies, even while they speak out against the "atrocity" of abortion. In 2010, the Synod adopted a recommendation "to instruct the Office of Social Justice and Hunger Action (OSJ) to boldly advocate for the church's position against abortion, and to help equip churches to promote the sanctity of human life" (Acts of Synod 2010, p. 883)."[14]

Document 2: 

Page: United States anti-abortion movement

The United States anti-abortion movement (also called the pro-life movement or right-to-life movement) contains elements opposing induced abortion on both moral and religious grounds and supports its legal

In [38]:
num_clusters = 3

# Fit the KMeans model to your data
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(embeddings)

# Now `kmeans.cluster_centers_` contains the centroids of the clusters
cluster_centroids = kmeans.cluster_centers_

# And `kmeans.labels_` contains the cluster number for each document
clusters = kmeans.labels_

# Get the representative document (the one closest to the centroid) for each cluster
representative_docs = []

for i in range(num_clusters):
    # Find the points in this cluster
    idx = np.where(clusters == i)[0]
    cluster_points = np.array(embeddings)[idx]
    
    # Find the 9 documents closest to the centroid
    closest_indices = cdist([cluster_centroids[i]], cluster_points).argsort()[0][:5]
    representatives = [paragraphs[idx[closest_idx]] for closest_idx in closest_indices]
    representative_docs.append(representatives)

# Now `representative_docs` contains the most representative documents for each cluster

combined_summaries = ""
combined_texts = ""
for i, docs in enumerate(representative_docs, start=1):
    combined_text = f"Cluster {i} representative documents:\n"
    for j, doc in enumerate(docs, start=1):
        combined_text += f"Document {j}: \n{doc}\n"

    prompt = f"The following documents are from cluster {i} on the topic {target_page}:\n{combined_text}\nThis cluster of documents discusses {target_page} in the context of"

    combined_summaries += f"This cluster of documents discusses {target_page} in the context of {inference(prompt)}\n\n"
    combined_texts += "\n" + combined_text
    
print(combined_summaries)    



This cluster of documents discusses Abortion in the context of various stories and scenarios, including a Ludacris song, the play and movie For Colored Girls, the biography of Tim Flannery, the movie Knocked Up, and the character Adrian "Fletch" Fletcher from the TV show Casualty. The documents discuss the various characters' reactions to unplanned pregnancies, the consequences of not being able to afford an abortion, and the emotional impact of terminating a pregnancy.

This cluster of documents discusses Abortion in the context of various organizations and political parties. The documents discuss the stances of different organizations and parties on the issue of Abortion, such as the Minnesota Family Council, the Louisiana Family Forum, the Constitution Party, and Jody Hice. They also discuss the positions of individual politicians, such as David Ramsay, on the issue.

This cluster of documents discusses Abortion in the context of its legality, availability, and religious and moral i

In [39]:
prompt = f"These 3 clusters represent sub-categories of {target_page}:\n{combined_summaries}\nThese 3 clusters can be differentiated as:\nCluster 1:"
print(f"Cluster 1: {inference(prompt)}")

Cluster 1: Abortion in Popular Culture
Cluster 2: Abortion in Politics
Cluster 3: Abortion in Law and Religion


In [29]:
print(combined_texts)


Cluster 1 representative documents:
Document 1: 

Page: Person (disambiguation)

A person is a being, such as a human, that has certain capacities or attributes constituting personhood.

Document 2: 

Page: Personalism

Personalism is an intellectual stance that emphasizes the importance of human persons. Personalism exists in many different versions, and this makes it somewhat difficult to define as a philosophical and theological movement.[1] Friedrich Schleiermacher first used the term personalism (German: Personalismus) in print in 1799.[2] One can trace the concept back to earlier thinkers in various parts of the world.[3]

Document 3: 

Page: Subject (philosophy)

A subject is a being who has a unique consciousness and/or unique personal experiences, or an entity that has a relationship with another entity that exists outside itself (called an "object").

Document 4: 

Page: Nontrinitarianism

Nontrinitarianism is a form of Christianity that rejects the mainstream Christian theo

In [40]:
num_clusters = 4

# Fit the KMeans model to your data
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(embeddings)

# Now `kmeans.cluster_centers_` contains the centroids of the clusters
cluster_centroids = kmeans.cluster_centers_

# And `kmeans.labels_` contains the cluster number for each document
clusters = kmeans.labels_

# Get the representative document (the one closest to the centroid) for each cluster
representative_docs = []

for i in range(num_clusters):
    # Find the points in this cluster
    idx = np.where(clusters == i)[0]
    cluster_points = np.array(embeddings)[idx]
    
    # Find the 9 documents closest to the centroid
    closest_indices = cdist([cluster_centroids[i]], cluster_points).argsort()[0][:5]
    representatives = [paragraphs[idx[closest_idx]] for closest_idx in closest_indices]
    representative_docs.append(representatives)

# Now `representative_docs` contains the most representative documents for each cluster

combined_summaries = ""
combined_texts = ""
for i, docs in enumerate(representative_docs, start=1):
    combined_text = f"Cluster {i} representative documents:\n"
    for j, doc in enumerate(docs, start=1):
        combined_text += f"Document {j}: \n{doc}\n"

    prompt = f"The following documents are from cluster {i} on the topic {target_page}:\n{combined_text}\nThis cluster of documents discusses {target_page} in the context of"

    combined_summaries += f"This cluster of documents discusses {target_page} in the context of {inference(prompt)}\n\n"
    combined_texts += "\n" + combined_text
    
print(combined_summaries)    



This cluster of documents discusses Abortion in the context of social conservative advocacy, reproductive rights, and church attitudes. It also touches on related topics such as birth control, contraception, and euthanasia.

This cluster of documents discusses Abortion in the context of politics and policy. The documents discuss the stances of various politicians on the issue of abortion, as well as the positions of organizations such as the Minnesota Family Council and the Constitution Party. They also discuss specific pieces of legislation related to abortion, such as the Abortion Pain Bill and the Child Interstate Abortion Notification Act.

This cluster of documents discusses Abortion in the context of relationships, family, and personal decisions. It covers topics such as teenage pregnancy, the decision to have an abortion, and the emotional and social consequences of such a decision. It also touches on religious views on abortion and the controversy surrounding the topic.

This c

In [41]:
prompt = f"These {num_clusters} clusters represent sub-categories of {target_page}:\n{combined_summaries}\nThese {num_clusters} clusters can be differentiated as:\nCluster 1:"
print(f"Cluster 1: {inference(prompt)}")

Cluster 1: Social Conservative Advocacy, Reproductive Rights, and Church Attitudes
Cluster 2: Politics and Policy
Cluster 3: Relationships, Family, and Personal Decisions
Cluster 4: Different Countries and Regions


In [42]:
num_clusters = 5

# Fit the KMeans model to your data
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(embeddings)

# Now `kmeans.cluster_centers_` contains the centroids of the clusters
cluster_centroids = kmeans.cluster_centers_

# And `kmeans.labels_` contains the cluster number for each document
clusters = kmeans.labels_

# Get the representative document (the one closest to the centroid) for each cluster
representative_docs = []

for i in range(num_clusters):
    # Find the points in this cluster
    idx = np.where(clusters == i)[0]
    cluster_points = np.array(embeddings)[idx]
    
    # Find the 9 documents closest to the centroid
    closest_indices = cdist([cluster_centroids[i]], cluster_points).argsort()[0][:5]
    representatives = [paragraphs[idx[closest_idx]] for closest_idx in closest_indices]
    representative_docs.append(representatives)

# Now `representative_docs` contains the most representative documents for each cluster

combined_summaries = ""
combined_texts = ""
for i, docs in enumerate(representative_docs, start=1):
    combined_text = f"Cluster {i} representative documents:\n"
    for j, doc in enumerate(docs, start=1):
        combined_text += f"Document {j}: \n{doc}\n"

    prompt = f"The following documents are from cluster {i} on the topic {target_page}:\n{combined_text}\nThe common theme which this cluster addresses is"

    combined_summaries += f"{inference(prompt)}\n\n"
    combined_texts += "\n" + combined_text
    
print(combined_summaries) 
prompt = f"These {num_clusters} clusters represent sub-categories of {target_page}:\n{combined_summaries}\nThese {num_clusters} clusters can be differentiated as:\nCluster 1:"
print(f"Cluster 1: {inference(prompt)}")



the use of contraception and abortion in different cultures and contexts. The documents discuss the use of contraception and abortion in Mexico, Thailand, the Catholic Church, preterm birth, and the medieval Islamic world. They also discuss the risks associated with these practices, as well as the beliefs and attitudes of different cultures towards them.

the consequences of abortion, particularly in terms of the emotional and psychological impact it can have on those involved. The documents discuss the topics of teenage pregnancy, guilt, family pressure, and the need to make difficult decisions. They also explore the idea of running away from home to avoid the consequences of an abortion, as well as the potential for regret and the difficulty of moving on from the experience.

the social and political implications of abortion. All of the documents discuss the issue of abortion in some way, either by discussing the stance of certain organizations or individuals on the issue, or by disc