In [13]:
import concurrent.futures
import os
import pickle
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import unquote

import numpy as np
import openai
import requests
from bs4 import BeautifulSoup
from scipy.spatial import distance
from sklearn.cluster import KMeans
from tqdm import tqdm
# Read the OpenAI API key from the environment instead of committing it to source.
# SECURITY: a literal key used to be hard-coded on this line; treat that key as
# leaked and revoke it, then export OPENAI_API_KEY before running the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY")

class WikipediaAPI:
    """Small client for the MediaWiki Action API of one Wikipedia language edition.

    Attributes:
        lang: Language code used as the subdomain (``'en'`` by default).
        base_url: URL of the wiki's ``api.php`` endpoint.
        timeout: Seconds to wait on each HTTP request before it is abandoned.
    """

    def __init__(self, lang='en', timeout=30):
        """Store the language, derive the endpoint URL and remember the timeout."""
        self.lang = lang
        self.base_url = f'https://{lang}.wikipedia.org/w/api.php'
        self.timeout = timeout

    def _api_call(self, params):
        """Send one GET request to the API and return the decoded JSON body.

        An ``error`` entry in the response is printed but the response is still
        returned, so callers see a missing payload key rather than an exception
        raised here.

        Raises:
            requests.HTTPError: If the server answers with an HTTP error status.
            requests.Timeout: If no response arrives within ``self.timeout``.
        """
        # A timeout keeps a stalled connection from blocking a worker thread forever.
        response = requests.get(self.base_url, params=params, timeout=self.timeout)
        # Fail loudly on 4xx/5xx instead of trying to JSON-decode an error page.
        response.raise_for_status()
        data = response.json()
        if 'error' in data:
            print(f"Error: {data['error']['info']}")
        return data

    def get_inlinks(self, title, limit=5000):
        """Return the titles of all main-namespace pages that link to ``title``.

        Args:
            title: Title of the page whose backlinks are requested.
            limit: Value sent as ``bllimit`` (page size per request).

        Returns:
            A list of page titles, accumulated across all result pages.

        Raises:
            KeyError: If a response carries no ``query``/``backlinks`` payload,
                which is what happens after the API reports an error.
        """
        params = {
            'action': 'query',
            'format': 'json',
            'list': 'backlinks',
            'bltitle': title,
            'bllimit': limit,
            'blnamespace': 0,  # Only retrieve links from main namespace
            'continue': ''  # Placeholder for pagination
        }
        inlinks = []
        while True:
            data = self._api_call(params)
            inlinks.extend(page['title'] for page in data['query']['backlinks'])
            if 'continue' not in data:
                break
            # Merge every continuation value the server returned into the next
            # request, rather than copying selected keys by hand.
            params.update(data['continue'])
        return inlinks

    def get_page_content(self, title):
        """Return the rendered HTML of the page ``title``.

        Raises:
            KeyError: If the response has no ``parse`` payload (for example when
                the API reported an error for this title).
        """
        params = {
            'action': 'parse',
            'page': title,
            'format': 'json',
            'prop': 'text',
            'contentmodel': 'wikitext'
        }
        data = self._api_call(params)
        return data['parse']['text']['*']


class TextExtractor:
    """Locate the paragraph of a Wikipedia article that links to another page.

    Attributes:
        lang: Language code passed on to the API client.
        api: ``WikipediaAPI`` instance used to download rendered page HTML.
    """

    def __init__(self, lang='en'):
        """Create the extractor together with its API client for ``lang``."""
        self.lang = lang
        self.api = WikipediaAPI(lang)

    @staticmethod
    def _href_matches(href, link_title):
        """Return True when ``href`` is an internal link to the page ``link_title``.

        The comparison ignores a trailing ``#section`` fragment and decodes
        percent-escapes first, so an href such as
        ``/wiki/Israeli%E2%80%93Palestinian_conflict#History`` matches the title
        ``Israeli–Palestinian conflict``. A plain string comparison misses both.
        """
        path = unquote(href.split('#', 1)[0])
        return path == "/wiki/" + link_title.replace(' ', '_')

    def get_paragraph_with_link(self, page_title, link_title):
        """Return the first ``<p>`` of ``page_title`` that links to ``link_title``.

        Args:
            page_title: Title of the article to search.
            link_title: Title of the page the paragraph must link to.

        Returns:
            The stripped text of the first matching paragraph, or ``None`` when
            no paragraph contains such a link.
        """
        content = self.api.get_page_content(page_title)
        soup = BeautifulSoup(content, 'html.parser')
        for paragraph in soup.find_all('p'):
            anchors = paragraph.find_all('a', href=True)
            if any(self._href_matches(a['href'], link_title) for a in anchors):
                return paragraph.text.strip()
        return None


class OpenAI_API:
    """Thin wrapper around the OpenAI client that retries transient failures.

    Attributes:
        retries: Maximum number of attempts per call (at least one is always made).
        backoff_factor: Base delay in seconds; attempt ``i`` (0-based) waits
            ``backoff_factor * 2 ** i`` before the next try.
    """

    def __init__(self, retries=5, backoff_factor=0.1):
        """Store the retry budget and the base back-off delay."""
        self.retries = retries
        self.backoff_factor = backoff_factor

    def _api_call(self, method, *args, **kwargs):
        """Call ``method(*args, **kwargs)``, retrying on transient OpenAI errors.

        Rate-limit, API and service-unavailable errors are retried with
        exponential back-off; any other exception propagates immediately.

        Returns:
            Whatever ``method`` returns.

        Raises:
            The last transient OpenAI error once all attempts are used up.
        """
        # Always try at least once: with retries <= 0 the loop would otherwise be
        # skipped and the call would silently return None.
        attempts = max(1, self.retries)
        for attempt in range(attempts):
            try:
                return method(*args, **kwargs)
            except (openai.error.RateLimitError, openai.error.APIError, openai.error.ServiceUnavailableError):
                if attempt == attempts - 1:  # out of attempts: report and re-raise
                    print(f"Failed to call API after {attempts} attempts.")
                    raise
                time.sleep(self.backoff_factor * (2 ** attempt))  # exponential backoff

    def inference(self, prompt):
        """Return the stripped text of a single completion for ``prompt``."""
        response = self._api_call(
            openai.Completion.create,
            engine="text-davinci-003",
            prompt=prompt,
            max_tokens=500,
            n=1,
            stop="",
            temperature=0.1,
        )
        return response.choices[0].text.strip()

    def embed(self, text):
        """Return the embedding vector of ``text`` from ``text-embedding-ada-002``."""
        response = self._api_call(
            openai.Embedding.create,
            input=text,
            model="text-embedding-ada-002",
        )
        return response['data'][0]['embedding']


class HyperlinkAnalysis:
    """Collect, analyse and embed the paragraphs that link to a Wikipedia page.

    For every page linking to a target page, the paragraph containing the link
    is extracted, a model-written analysis of that usage is generated and the
    analysis is embedded. Progress is checkpointed as pickle files under
    ``<base_dir>/<target_page>/`` so that an interrupted run can resume.

    Attributes:
        base_dir: Directory under which per-page checkpoint folders are created.
        text_extractor: ``TextExtractor`` used to fetch linking paragraphs.
        openai_api: ``OpenAI_API`` used for completions and embeddings.
    """

    # Names of the checkpoint files, in the order get_inlink_data returns them.
    _DATA_TYPES = ('found_inlinks', 'none_inlinks', 'texts', 'analyses', 'embeddings')

    def __init__(self, base_dir='your_base_dir'):
        """Create the helpers; ``base_dir`` is where checkpoints are stored."""
        self.base_dir = base_dir
        self.text_extractor = TextExtractor()
        self.openai_api = OpenAI_API()

    def _get_inlink_data_helper(self, inlink, target_page):
        """Process one linking page.

        Returns:
            ``(inlink, text, analysis, embedding)``, or ``None`` when no
            paragraph of ``inlink`` links to ``target_page``.
        """
        text = self.text_extractor.get_paragraph_with_link(inlink, target_page)
        if text is None:
            return None

        analysis = self.hyperlink_analysis(target_page, text, inlink)
        embedding = self.openai_api.embed(analysis)

        return inlink, text, analysis, embedding

    def get_inlink_data(self, target_page, batch_size=100):
        """Process every page linking to ``target_page``, resuming from a checkpoint.

        Pages already present in a saved checkpoint are skipped. A page whose
        processing raises is reported and left unrecorded, so a later run tries
        it again; it no longer aborts the whole run.

        Args:
            target_page: Title of the page whose backlinks are analysed.
            batch_size: Save a checkpoint after every ``batch_size`` processed
                pages. Values that are not positive disable intermediate saves.

        Returns:
            ``(found_inlinks, none_inlinks, texts, analyses, embeddings)`` where
            ``found_inlinks``, ``texts``, ``analyses`` and ``embeddings`` are
            parallel lists and ``none_inlinks`` lists pages with no matching
            paragraph.
        """
        inlinks = self.text_extractor.api.get_inlinks(target_page)
        found_inlinks, none_inlinks, texts, analyses, embeddings = self._load_checkpoint(target_page)

        # Pages without a matching paragraph count as processed too.
        already_processed = set(found_inlinks).union(none_inlinks)
        inlinks = [inlink for inlink in inlinks if inlink not in already_processed]

        def save_checkpoint():
            collected = (found_inlinks, none_inlinks, texts, analyses, embeddings)
            for data_type, data in zip(self._DATA_TYPES, collected):
                self._save_data(target_page, data_type, data)

        with ThreadPoolExecutor() as executor:
            # submit() instead of map(): each future's exception can then be
            # handled on its own instead of escaping from the loop iterator.
            futures = [
                executor.submit(self._get_inlink_data_helper, inlink, target_page)
                for inlink in inlinks
            ]
            progress = tqdm(zip(inlinks, futures), total=len(inlinks), desc=target_page, unit="page")
            for idx, (inlink, future) in enumerate(progress, 1):
                try:
                    result = future.result()
                except Exception as e:
                    print(f"An exception occurred: {e}")
                else:
                    if result is None:
                        none_inlinks.append(inlink)  # Store the inlink that returned None
                    else:
                        found, text, analysis, embedding = result
                        found_inlinks.append(found)
                        texts.append(text)
                        analyses.append(analysis)
                        embeddings.append(embedding)

                # Save after every batch, whatever the outcome of this page was.
                if batch_size > 0 and idx % batch_size == 0:
                    save_checkpoint()

        # Save remaining data that didn't fit into a full batch
        save_checkpoint()

        return found_inlinks, none_inlinks, texts, analyses, embeddings

    def _load_checkpoint(self, target_page):
        """Return the saved lists for ``target_page`` or five empty lists.

        All five files are loaded before anything is used, so a missing or
        unreadable file can never leave the parallel lists half-populated.
        """
        try:
            found_inlinks, none_inlinks, texts, analyses, embeddings = (
                self._load_data(target_page, data_type) for data_type in self._DATA_TYPES
            )
        except FileNotFoundError:
            # No (complete) checkpoint yet: start from scratch.
            return [], [], [], [], []
        except Exception as e:
            print(f"Ignoring unreadable saved data for {target_page}: {e}")
            return [], [], [], [], []

        # The four parallel lists only ever grow together, so if a save was
        # interrupted between files their common prefix is still aligned.
        complete = min(len(found_inlinks), len(texts), len(analyses), len(embeddings))
        return (
            list(found_inlinks[:complete]),
            list(none_inlinks),
            list(texts[:complete]),
            list(analyses[:complete]),
            list(embeddings[:complete]),
        )

    def load_saved_data(self, target_page):
        """Load the checkpoint of ``target_page`` without contacting any API.

        Returns:
            ``(found_inlinks, none_inlinks, texts, analyses, embeddings)``, or
            ``None`` (after printing a message) when the data cannot be loaded.
        """
        try:
            found_inlinks, none_inlinks, texts, analyses, embeddings = (
                self._load_data(target_page, data_type) for data_type in self._DATA_TYPES
            )
        except FileNotFoundError:
            print(f"No saved data found for {target_page}. Please check the target page or the saved data.")
            return None
        except Exception as e:
            print(f"An unexpected error occurred while loading saved data for {target_page}: {e}")
            return None

        return found_inlinks, none_inlinks, texts, analyses, embeddings

    def hyperlink_analysis(self, hyperlink, paragraph, page):
        """Ask the model how ``hyperlink`` is used in ``paragraph`` of ``page``.

        Returns:
            The heading of the first question followed by the model's completion.
        """
        prompt = f"""In the context of '{paragraph}' on the Wikipedia page '{page}', the hyperlink '{hyperlink}' appears. The following factors come into consideration:
        1) Extent of '{hyperlink}' usage within this context.
        2) Boundaries and limitations regarding this usage.
        3) Any interplay with other concepts or events within this context.
        4) The relevance and necessity of '{hyperlink}' within this specific context.

        1) Extent of '{hyperlink}' usage within this context can be described as:
        """
        return f"1) Extent of '{hyperlink}' usage within this context can be described as:\n\n" + self.openai_api.inference(prompt)

    def _data_path(self, target_page, data_type):
        """Return ``(directory, file_path)`` of one checkpoint file."""
        directory = os.path.join(self.base_dir, target_page)
        return directory, os.path.join(directory, f'{data_type}.pkl')

    def _load_data(self, target_page, data_type):
        """Unpickle and return one checkpoint file.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        _, file_path = self._data_path(target_page, data_type)

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"No such file or directory: '{file_path}'")

        # SECURITY: pickle executes code on load; only point base_dir at files
        # this program wrote itself.
        with open(file_path, 'rb') as f:
            return pickle.load(f)

    def _save_data(self, target_page, data_type, data):
        """Pickle ``data`` into the checkpoint file for ``data_type``."""
        directory, file_path = self._data_path(target_page, data_type)

        # Create target Directory if doesn't exist
        if not os.path.exists(directory):
            os.makedirs(directory, exist_ok=True)
            print("Directory ", directory, " Created ")

        # Write to a temporary file and swap it in, so an interrupted write
        # cannot leave a truncated checkpoint behind.
        tmp_path = file_path + '.tmp'
        with open(tmp_path, 'wb') as f:
            pickle.dump(data, f)
        os.replace(tmp_path, file_path)

In [None]:
# Page whose incoming links are collected and analysed by this cell.
target_page = "Israeli–Palestinian conflict"
hyperlink_analysis = HyperlinkAnalysis()

# Runs the whole pipeline for target_page and unpacks the five lists it returns.
found_inlinks, none_inlinks, texts, analyses, embeddings = hyperlink_analysis.get_inlink_data(target_page)


Israeli–Palestinian conflict:   1%|▎                                           | 15/2397 [00:11<31:21,  1.27page/s]


In [4]:
# Print each linking page followed by its extracted paragraph and its analysis.
for found_inlink, text, analysis in zip(found_inlinks, texts, analyses):
    time.sleep(0.00001)  # 10-microsecond pause per item; NOTE(review): purpose not evident here
    print(found_inlink, "\n", text, "\n", analysis, "\n", "\n")

Assistive technology 
 Among these aspects, acceptability and ethical considerations are particularly relevant to those technologies that are extremely invasive (such as cortical or auditory brainstem implants), or replace the human caregiver and human interaction, or collect and use data on cloud-based services or interconnected devices (e.g., companion robots, smart nursing and health-monitoring technologies), raising privacy issues and requiring connectivity, or raise safety concerns, such as autonomous wheelchairs. 
 1) Extent of 'Safety' usage within this context can be described as:

'Safety' is used to refer to the potential risks associated with the use of certain technologies, such as autonomous wheelchairs, companion robots, smart nursing and health-monitoring technologies, and cortical or auditory brainstem implants. It is used to emphasize the importance of considering the safety of users when using these technologies. 
        
        2) Boundaries and limitations regardi

Defensive programming 
 Defensive programming is a form of defensive design intended to develop programs that are capable of detecting potential security abnormalities and make predetermined responses.[1] It ensures the continuing function of a piece of software under unforeseen circumstances. Defensive programming practices are often used where high availability, safety, or security is needed. 
 1) Extent of 'Safety' usage within this context can be described as:

Safety is used to refer to the ability of a program to detect potential security abnormalities and make predetermined responses. It is used to ensure the continuing function of a piece of software under unforeseen circumstances.

        2) Boundaries and limitations regarding this usage can be described as:
        
        Defensive programming practices are limited to ensuring the continuing function of a piece of software under unforeseen circumstances. It does not guarantee the safety of the software in all cases, as it

Berenstain Bears 
 The Berenstain Bears is a children's literature franchise created by the late Stan and Jan Berenstain and continued by their son, Mike Berenstain, who assumed partial authorship in 2002, and full authorship in 2012 following his mother's passing. The books feature a family of anthropomorphic grizzly bears who generally learn a moral or safety-related lesson in the course of each story. 
 1) Extent of 'Safety' usage within this context can be described as:

The usage of 'Safety' within this context is quite extensive. It is mentioned in the context of the books featuring a family of anthropomorphic grizzly bears who generally learn a moral or safety-related lesson in the course of each story. This implies that safety is an important part of the stories and is a recurring theme throughout the franchise. 

        2) Boundaries and limitations regarding this usage can be described as:
        
        The boundaries and limitations regarding the usage of 'Safety' within

Regulation of sport 
 Formula One motor racing is an example of strict and changing regulation, where the regulating body appears to control rather than to simply define the sport. There have been major changes in the rules of F1 recently, almost on an annual basis, and more are planned. Sometimes this is done for safety reasons, sometimes to make the racing more interesting as a spectator sport, and sometimes to promote competition through involvement of smaller teams. Some changes make overtaking more probable for example or reduce the probability of an overwhelming technical advantage by any one team. Although heavily regulated, most people agree that the sport has thereby greatly benefitted, not least through dramatic leaps in safety. 
 1) Extent of 'Safety' usage within this context can be described as:

The usage of 'Safety' within this context is quite extensive. It is mentioned multiple times throughout the text, and is used to explain why certain changes have been made to the 

Kill switch 
 A kill switch, also known more formally as an emergency brake, emergency stop (E-stop), emergency off (EMO), or emergency power off (EPO), is a safety mechanism used to shut off machinery in an emergency, when it cannot be shut down in the usual manner.  Unlike a normal shut-down switch or shut-down procedure, which shuts down all systems in order and turns off the machine without damage, a kill switch is designed and configured to abort the operation as quickly as possible (even if it damages the equipment) and to be operated simply and quickly (so that even a panicked operator with impaired executive functions or a bystander can activate it). Kill switches are usually designed to be noticeable, even to an untrained operator or a bystander. 
 1) Extent of 'Safety' usage within this context can be described as:

The usage of 'Safety' within this context is extensive, as it is the primary purpose of the kill switch. The kill switch is designed to be used in an emergency si

International Consumer Research & Testing 
 Safety has always been a topic of concern to consumer organisations. ICRT members have played a key role in promoting higher safety standards for consumer goods. 
 1) Extent of 'Safety' usage within this context can be described as:

The term 'Safety' is used in a broad sense to refer to the overall safety of consumer goods. It is used to refer to the standards of safety that consumer organisations strive to promote and maintain.

        2) Boundaries and limitations regarding this usage can be described as:

The term 'Safety' is used to refer to the safety of consumer goods, and does not refer to any other aspects of safety, such as workplace safety or safety in public spaces.

        3) Any interplay with other concepts or events within this context can be described as:

The term 'Safety' is used in conjunction with other concepts, such as 'consumer organisations' and 'higher safety standards', to emphasise the importance of safety in the

Aviation Safety Reporting System 
 The Aviation Safety Reporting System, or ASRS, is the US Federal Aviation Administration's (FAA) voluntary confidential reporting system that allows pilots, air traffic controllers, cabin crew, dispatchers, maintenance technicians, ground operations, and UAS operators and drone flyers to confidentially report near misses or close call events in the interest of improving aviation safety. The ASRS collects, analyzes, and responds to voluntarily submitted aviation safety incident reports in order to reduce the likelihood of aviation accidents.[1] The ASRS was designed and is operated by NASA, who is seen as a neutral third-party due to its lack of enforcement authority and relations with airlines. The confidential and independent nature of the ASRS is key to its long-term success in identifying numerous latent system hazards in the National Airspace System (NAS). The FAA extends limited immunity to individual aviation workers for reporting safety events 

Safety standards 
 Safety standards are standards designed to ensure the safety of products, activities and processes, etc. They may be advisory or compulsory and are normally laid down by an advisory or regulatory body that may be either voluntary or statutory. In October 2021, a fire raging through multiple floors of a diplated apartment block in Kaoshiung highlighted the lax standards in Taiwan.[1] China has recently experienced trouble with some of the post listed associations. 
 1) Extent of 'Safety' usage within this context can be described as:

The term 'Safety' is used in this context to refer to the standards designed to ensure the safety of products, activities and processes. It is used to emphasize the importance of these standards and to highlight the consequences of not adhering to them.

        2) Boundaries and limitations regarding this usage:
        
        The usage of 'Safety' in this context is limited to the standards designed to ensure the safety of products, 

Afghanistan Scout Association 
 With the 1973 overthrow by pro-Soviet Mohammed Daoud Khan, the Scout association became part of the Ministry of the Interior, took over police tasks and became a part of the Afghan police. Scouting went downhill, as during the years of unrest and war, approximately ten million left the country and looked for refuge abroad. At the beginning of the unrest, the privileged social classes left the country, so that such organizations could not exist anymore in the countryside.[citation needed] 
 1) Extent of 'Safety' usage within this context can be described as:

The usage of 'Safety' within this context is limited to the discussion of the decline of the Afghan Scout Association due to the unrest and war in the country. It is used to describe the lack of safety that caused the privileged social classes to flee the country, leading to the decline of the organization. 

        2) Boundaries and limitations regarding this usage:
        
        The usage of 'S

Government Cut 
 Operation of the cut falls upon three government agencies. The Port of Miami is responsible for navigation, while the United States Coast Guard is responsible for safety and security, and the U.S. Army Corps of Engineers for the channel itself, including dredging. 
 1) Extent of 'Safety' usage within this context can be described as:

The term 'Safety' is used to refer to the responsibility of the United States Coast Guard in regards to the Government Cut. The Coast Guard is responsible for ensuring the safety and security of the cut, which includes monitoring and enforcing regulations, as well as responding to any potential threats or emergencies. 

        2) Boundaries and limitations regarding this usage can be described as:
        
        The term 'Safety' is limited to the responsibility of the United States Coast Guard in regards to the Government Cut. The Coast Guard is responsible for ensuring the safety and security of the cut, but is not responsible for an

Insect wing 
 Natural selection has played an enormous role in refining the wings, control and sensory systems, and anything else that affects aerodynamics or kinematics. One noteworthy trait is wing twist. Most insect wings are twisted, as are helicopter blades, with a higher angle of attack at the base. The twist generally is between 10 and 20 degrees. In addition to this twist, the wing surfaces are not necessarily flat or featureless; most larger insects have wing membranes distorted and angled between the veins in such a way that the cross-section of the wings approximates an airfoil. Thus, the wing's basic shape already is capable of generating a small amount of lift at zero angle of attack. Most insects control their wings by adjusting tilt, stiffness, and flapping frequency of the wings with tiny muscles in the thorax (below). Some insects evolved other wing features that are not advantageous for flight, but play a role in something else, such as mating or protection.[29] 
 1) 

Consumer fireworks 
 There are several ways by which fireworks can be ignited. The most basic of these is simply flame from a match, lighter or other devices that emits flames. Another way to light fireworks is by using a device called a punk. A punk is a long, thin piece of wood covered in a substance that burns very slowly, producing only heat, with no flame. A port fire is a smoldering compound as a powder compressed in a stiff paper tube. The most complicated method used to ignite consumer fireworks is to use electronic ignition. This is the preferred method of many professional pyrotechnicians worldwide because of the vast improvement in operator safety. There are a few electronic ignition (often called "e-fire") systems that use readily available materials. 
 1) Extent of 'Safety' usage within this context can be described as:

The usage of 'Safety' within this context is quite extensive. It is mentioned in the context of the most complicated method used to ignite consumer firewo

Kinsale Road Roundabout 
 As of 2006, traffic counts showed that there were approximately 100,000 vehicles using the roundabout daily.[1] The straight-through movement between the east and west sides of the South Ring Road was the highest recorded traffic movement at the roundabout. As such, it was deemed that there was a need for grade separation.[citation needed] A number of options were evaluated in terms of environmental impact, safety, constructibility, cost, aesthetics, disruption to traffic during construction and land take. These included two flyover and two underpass options. Ultimately it was considered that a flyover carrying the N40 South Ring Road (east-west over the existing roundabout) was the most suitable solution.[citation needed] Construction on this flyover started in mid-2005 and finished in August 2006.[2] 
 1) Extent of 'Safety' usage within this context can be described as:

Safety is mentioned in the context of the Kinsale Road Roundabout in order to explain wh

System safety 
 The system safety concept calls for a risk management strategy based on identification, analysis of hazards and application of remedial controls using a systems-based approach.[1]  This is different from traditional safety strategies which rely on control of conditions and causes of an accident based either on the epidemiological analysis or as a result of investigation of individual past accidents.[2] The concept of system safety is useful in demonstrating adequacy of technologies when difficulties are faced with probabilistic risk analysis.[3] The underlying principle is one of synergy: a whole is more than sum of its parts. Systems-based approach to safety requires the application of scientific, technical and managerial skills to hazard identification, hazard analysis, and elimination, control, or management of hazards throughout the life-cycle of a system, program, project or an activity or a product.[1]  "Hazop" is one of several techniques available for identifica

History of the petroleum industry in Canada (natural gas) 
 Part of a series on Canada's petroleum industry, this entry focuses on the second of these two functions of gas processing - removing impurities from the gas stream - rather than recovering natural gas liquids, described elsewhere. Of course, most large plants perform both functions, and plants have no other ultimate purpose than to quickly, safely and profitably turn raw gas into products to be safely shipped (mostly by pipeline) to market. The discussion covers gas processing as an engineering feat, critical developments in exploration and development and the fundamentals of the marketplace. 
 1) Extent of 'Safety' usage within this context can be described as:

The term 'Safety' is used in the context of the Wikipedia page 'History of the petroleum industry in Canada (natural gas)' to refer to the importance of quickly, safely and profitably turning raw gas into products to be safely shipped (mostly by pipeline) to market. 

Symbolic behavior 
 The use of gimmickry, using superficial pleasantness to cover up dishonest activities or intentions, providing misleading or incorrect advice regarding safety, or providing untrue explanations for behaviors are means used by unethical organizations, managers, or coworkers in order to obtain some advantage (Harris & Nelson 2008, p. 240). 
 1) Extent of 'Safety' usage within this context can be described as:

'Safety' is used to describe the unethical activities or intentions of organizations, managers, or coworkers in order to gain some advantage. It is used to emphasize the importance of being aware of the potential risks associated with such activities and to ensure that the safety of those involved is not compromised.

        2) Boundaries and limitations regarding this usage can be described as:
        
        'Safety' should not be used to justify unethical activities or intentions. It should be used to emphasize the importance of being aware of the potential

SeeMore's Playhouse 
 SeeMore's Playhouse is an American children's television series using puppets to teach preschoolers about health and safety concepts. 
 1) Extent of 'Safety' usage within this context can be described as:

'Safety' is a key concept within the context of SeeMore's Playhouse. It is used to teach preschoolers about health and safety concepts, and is a major focus of the show. The show uses puppets to help illustrate the importance of safety and to help children understand the concepts. 

        2) Boundaries and limitations regarding this usage can be described as:
        
        The show focuses on teaching preschoolers about health and safety concepts, and does not delve into more complex topics. The show also does not provide specific advice on how to stay safe, but rather focuses on teaching children the importance of safety. 

        3) Any interplay with other concepts or events within this context can be described as:
        
        The show also focuses

Certified California Municipal Treasurer 
 The municipal treasurer does not have the public’s permission to put public funds principal at risk, no matter how great the potential yield.  Securities options, therefore, are limited by statute to the safest portion of the investment continuum.  Public Treasury statutes mandate a priority of safety (first), liquidity (secondarily) and yield (last).[8]  Since there is an inverse relationship between risk and rate of return, the potential return is limited by the risk that public funds are permitted to take. 
 1) Extent of 'Safety' usage within this context can be described as:

The term 'Safety' is used to refer to the priority of safety that public treasury statutes mandate. It is used to emphasize the importance of safety when it comes to investing public funds and to explain the inverse relationship between risk and rate of return. 

        2) Boundaries and limitations regarding this usage:
        
        The boundaries and limitation

Functional safety 
 Functional safety is the part of the overall safety of a system or piece of equipment that depends on automatic protection operating correctly in response to its inputs or failure in a predictable manner (fail-safe). The automatic protection system should be designed to properly handle likely human errors, systematic errors, hardware failures and operational/environmental stress. 
 1) Extent of 'Safety' usage within this context can be described as:

The term 'Safety' is used to refer to the overall safety of a system or piece of equipment that depends on automatic protection operating correctly in response to its inputs or failure in a predictable manner (fail-safe). The automatic protection system should be designed to properly handle likely human errors, systematic errors, hardware failures and operational/environmental stress.

        2) Boundaries and limitations regarding this usage can be described as:
        
        The term 'Safety' is used to refer to t

Advance for Medical Laboratory Professionals 
 The biweekly trade journal Advance for Medical Laboratory Professionals started in 1991. During its time in circulation, it served an audience of bench technologists, chief technologists, cytotechnologists, generalists, histotechnologists, laboratory directors/managers, laboratory section heads, medical laboratory scientists, medical laboratory technicians, blood specialists, educators and others in the medical laboratory field. Special issues of the trade journal included the education issue, National Medical Laboratory Professionals Week issue, industry outlook issue, new graduate issue and the annual safety issue.[1] 
 1) Extent of 'Safety' usage within this context can be described as:

The usage of 'Safety' within this context is limited to the annual safety issue of the trade journal. It is mentioned in the context of the special issues of the journal, and is not used in any other context. 

        2) Boundaries and limitations rega

The HALO Corporation 
 The HALO Corporation is a San Diego-based company that provides safety, security and disaster relief.[1] 
 1) Extent of 'Safety' usage within this context can be described as:

The HALO Corporation provides safety, security and disaster relief, which implies that safety is a major component of the company's services. The company is dedicated to providing safety to its clients, and this is reflected in its mission statement. The company also provides training and education on safety topics, and has developed a range of safety products and services.

        2) Boundaries and limitations regarding this usage can be described as:
        
        The HALO Corporation provides safety services, but it is important to note that these services are limited to the scope of the company's mission. The company does not provide safety services outside of its mission, and it is important to understand the boundaries of the services it provides. Additionally, the company does n

Marine technology 
 Marine technology is defined by WEGEMT (a European association of 40 universities in 17 countries) as "technologies for the safe use, exploitation, protection of, and intervention in, the marine environment."  In this regard, according to WEGEMT, the technologies involved in marine technology are the following:[1] naval architecture, marine engineering, ship design, ship building and ship operations; oil and gas exploration, exploitation, and production; hydrodynamics, navigation, sea surface and sub-surface support, underwater technology and engineering; marine resources (including both renewable and non-renewable marine resources); transport logistics and economics; inland, coastal, short sea and deep sea shipping; protection of the marine environment; leisure and safety.[1] 
 1) Extent of 'Safety' usage within this context can be described as:

Safety is used in this context to refer to the protection of the marine environment, as well as the leisure and safety o

Protective gear in sports 
 Personal protective equipment serves an integral role in maintaining the safety of an athlete participating in a sport. The usage and development of protective gear in sports has evolved through time, and continues to advance over time. Many sports league or professional sports mandate the provision and usage of protective gear for athletes in the sport. Usage of protective gear is also mandated in college athletics and occasionally in amateur sports.[1] 
 1) Extent of 'Safety' usage within this context can be described as:

The usage of safety is integral to the context of protective gear in sports. It is mandated by many sports leagues and professional sports, as well as college athletics and occasionally in amateur sports. The usage of protective gear is intended to ensure the safety of athletes participating in the sport. 

        2) Boundaries and limitations regarding this usage can be described as:
        
        The boundaries and limitations of s

Soteria (mythology) 
 In Greek mythology, Soteria (Greek: Σωτηρία) was the goddess or spirit (daimon) of safety and salvation, deliverance, and preservation from harm (not to be mistaken for Eleos). Soteria was also an epithet of the goddesses Persephone and Hecate, meaning deliverance and safety.[1] 
 1) Extent of 'Safety' usage within this context can be described as:

'Safety' is used throughout the Wikipedia page to describe the goddess or spirit of Soteria, as well as an epithet of the goddesses Persephone and Hecate. It is used to describe the concept of deliverance and preservation from harm. 

        2) Boundaries and limitations regarding this usage can be described as:
        
        'Safety' is used to describe the concept of deliverance and preservation from harm, but it is not used to describe any other concepts or events within this context. 

        3) Any interplay with other concepts or events within this context can be described as:
        
        'Safety' is us

Health Intervention and Technology Assessment Program 
 In 1981, Thailand’s National List of Essential Medicines was created. Subsequently, in 1983, the Subcommittee for the Development of the National List of Essential Medicines who works in collaboration with the Food and Drug Administration as its secretariat. In the latter years, the function of the subcommittee became the maintenance of an optimal list of medicines, wherein the criteria for selection were cost, safety, efficacy and effectiveness of drugs. In each of these criteria, the subcommittee’s [7] was to consider scientific evidence to determine which medicines are to be included in the list. Twenty-eight specialist working groups undertake the task of determining what should be on the list as well as informing price negotiations between manufacturers and the NLEM. 
 1) Extent of 'Safety' usage within this context can be described as:

The concept of 'Safety' is used throughout the context to refer to the criteria for selec

Permanent time observation in the United States 
 Permanent standard time is considered by circadian health researchers and safety experts worldwide to be the best option for health, safety, schools, and economy, including the American Academy of Sleep Medicine, National Sleep Foundation, American College of Chest Physicians, National Safety Council, American College of Occupational and Environmental Medicine, Canadian Sleep Society, World Sleep Society, Society for Research on Biological Rhythms, and several state sleep societies.[11][12][4][13][14][15][16][17][18] Permanent standard time is supported by advocates for school children, including the National PTA, National Education Association, American Federation of Teachers, National School Boards Association, and Start School Later. They cite both the health benefits of circadian alignment, and the safety advantages regarding morning commutes.[11][19][20][21] 
 1) Extent of 'Safety' usage within this context can be described as:

Sa

Dakota Access Pipeline 
 In 2014, conservation groups raised concerns about safety, and the impacts on air, water, wildlife and farming, because of the risk of the pipeline disruption.[118] Groups such as Greenpeace, the Science & Environmental Health Network,[119] and in 2016 a group of more than 160 scientists spoke out against the pipeline.[120][121][122] 
 1) Extent of 'Safety' usage within this context can be described as:

The term 'Safety' is used to refer to the potential risks posed by the Dakota Access Pipeline, such as the potential for disruption to air, water, wildlife, and farming. It is also used to refer to the concerns raised by conservation groups in 2014, as well as the group of more than 160 scientists who spoke out against the pipeline in 2016. 

        2) Boundaries and limitations regarding this usage can be described as:
        
        The term 'Safety' is used to refer to the potential risks posed by the Dakota Access Pipeline, and does not refer to any othe

In [5]:
# Report how many entries found_inlinks contains.
print(len(found_inlinks))

348


In [8]:
# Partition the embeddings into a fixed number of K-means clusters.
num_clusters = 4
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(embeddings)
labels = kmeans.labels_

# Group every record -- (inlink, text, analysis, embedding) -- under the
# cluster label assigned to its embedding. Keys run from 0 to num_clusters-1.
clustered_data = {cluster_id: [] for cluster_id in range(num_clusters)}
for position, cluster_id in enumerate(labels):
    record = (
        found_inlinks[position],
        texts[position],
        analyses[position],
        embeddings[position],
    )
    clustered_data[cluster_id].append(record)

# Within each cluster, order the records by the Euclidean distance between
# their embedding (last tuple element) and the cluster center, closest first.
for cluster_id in range(num_clusters):
    center = kmeans.cluster_centers_[cluster_id]
    clustered_data[cluster_id] = sorted(
        clustered_data[cluster_id],
        key=lambda record: distance.euclidean(record[-1], center),
    )

# Show the inlink and text of the 5 records closest to each cluster center.
for cluster, cluster_data in clustered_data.items():
    print(f"\nCluster {cluster}:\n-------------------------")
    for inlink, text, analysis, _ in cluster_data[:5]:
        print(f"Inlink: {inlink}\nText: {text}\n\n")




Cluster 0:
-------------------------
Inlink: Regulatory focus theory
Text: Another focus is the prevention-focus based on safety and responsibilities, also known as non-losses. This focus emphasizes security and safety by following the guidelines and the rules.[6]


Inlink: Danger
Text: Danger is a lack of safety and may refer to:


Inlink: Want
Text: Examples of wants that people would like to have is financial monitoring, saving time, higher paying job, more comfort, healthier diet, physical fitness, spirituality, friendship, companionship and safety.


Inlink: Health
Text: In addition to safety risks, many jobs also present risks of disease, illness and other long-term health problems. Among the most common occupational diseases are various forms of pneumoconiosis, including silicosis and coal worker's pneumoconiosis (black lung disease). Asthma is another respiratory illness that many workers are vulnerable to. Workers may also be vulnerable to skin diseases, including eczema, der

In [None]:
# dict.items() returns a view that cannot be indexed in Python 3, so take the
# first (label, records) pair through an iterator instead of `.items()[0]`.
cluster, cluster_data = next(iter(clustered_data.items()))

TODO:
- Iterative clustering
- GPT-4: cluster lens analysis
- Further website work

In [9]:
from sklearn.cluster import KMeans
from scipy.spatial import distance

def perform_clustering(data, num_clusters):
    """Cluster records by their embeddings and order each cluster by centrality.

    Args:
        data: List of tuples whose last element is the embedding vector; the
            other elements are carried along unchanged.
        num_clusters: Number of clusters requested.

    Returns:
        A dict mapping every label in ``range(num_clusters)`` to the list of
        records assigned to it, sorted by ascending Euclidean distance from
        the record's embedding to the cluster center. When ``data`` holds
        fewer records than ``num_clusters``, only ``len(data)`` clusters are
        fitted and the remaining labels map to empty lists.
    """
    clustered_data = {label: [] for label in range(num_clusters)}

    # Nothing to fit: return the empty buckets instead of letting KMeans fail.
    if len(data) == 0:
        return clustered_data

    # KMeans rejects n_clusters > n_samples, which happens when a small
    # cluster is re-clustered; never ask for more clusters than records.
    effective_clusters = min(num_clusters, len(data))

    # Only the embeddings (last tuple element) take part in the fit.
    embeddings = [record[-1] for record in data]
    kmeans = KMeans(n_clusters=effective_clusters, random_state=42).fit(embeddings)

    # Split the original records according to their assigned label.
    for record, label in zip(data, kmeans.labels_):
        clustered_data[label].append(record)

    # Order each fitted cluster so the records nearest its center come first.
    for label in range(effective_clusters):
        center = kmeans.cluster_centers_[label]
        clustered_data[label] = sorted(
            clustered_data[label],
            key=lambda record: distance.euclidean(record[-1], center),
        )

    return clustered_data

# Bundle the parallel lists into one record per inlink:
# (inlink title, text, analysis, embedding).
data = list(zip(found_inlinks, texts, analyses, embeddings))

# First level of clustering
first_level_clustered_data = perform_clustering(data, num_clusters=4)

# Second level of clustering: every first-level cluster is clustered again.
second_level_clustered_data = {}
for label, cluster_data in first_level_clustered_data.items():
    # The first-level label is used as a prefix, giving keys such as "0-3".
    new_clusters = perform_clustering(cluster_data, num_clusters=4)
    for new_label, new_cluster_data in new_clusters.items():
        second_level_clustered_data[f"{label}-{new_label}"] = new_cluster_data

# Print the inlink and text of the first 5 records in each sub-cluster.
for cluster, cluster_data in second_level_clustered_data.items():
    print(f"\nSub-cluster {cluster}:\n-------------------------")
    # Unpack directly in the loop header rather than into a variable named
    # `data`, which would overwrite the full dataset defined above.
    for inlink, text, analysis, _ in cluster_data[:5]:
        print(f"Inlink: {inlink}\nText: {text}\n\n")





Sub-cluster 0-0:
-------------------------
Inlink: Junior safety patrol
Text: Early, the role of students in the junior safety patrol was to "teach safety and role model it." Student members were taught to "direct children, not traffic," as they had authority over the students as they crossed streets, but did not have any authority over vehicular traffic on the streets.[3]


Inlink: About Safety
Text: About Safety is a children's educational television program which originated in 1972. It was produced by the Mississippi Authority for Educational Television. In the 3 to 6 minute shorts, marionettes, most notably Clyde Frog, taught children about safety and first aid. Mischievous Clyde has a distinctive, high-pitched voice and would get himself into various troubles. The show illustrated dangers, ranging from traffic, guns, and tornadoes. About Safety ran for 47 episodes and the shows were quite successful. MAETV syndicated them throughout the United States.[1]


Inlink: The Clyde Frog 

In [11]:
def get_sub_cluster(clustered_data, indices, num_clusters=4):
    """Drill down into a cluster hierarchy and re-cluster the selected records.

    Args:
        clustered_data: Dict mapping cluster keys to lists of records, as
            returned by ``perform_clustering`` (int keys) or built by hand
            (e.g. string keys such as ``"0-1"``).
        indices: Path of cluster keys to follow, one per level. The special
            value ``[-1]`` means "pool every record of ``clustered_data``
            and cluster the whole set".
        num_clusters: Number of clusters requested at every clustering step.

    Returns:
        The dict produced by ``perform_clustering`` for the records of the
        sub-cluster reached by following ``indices``.

    Raises:
        ValueError: If ``indices`` is empty or a key on the path is missing.
    """
    # Handle the special case where indices contains a single -1
    if indices == [-1]:
        flat_data = [item for sublist in clustered_data.values() for item in sublist]
        return perform_clustering(flat_data, num_clusters=num_clusters)

    # Ensure that the indices list is not empty
    if not indices:
        raise ValueError("Indices list cannot be empty")

    current_level = clustered_data
    selected = None
    last_position = len(indices) - 1

    # enumerate() gives the true position on the path; looking the value up
    # with indices.index() breaks for repeated indices and for values whose
    # type was changed before the lookup.
    for position, index in enumerate(indices):
        # Keys may be ints (perform_clustering output) or strings (hand-built
        # dicts), so try the index as given before its string form.
        for key in (index, str(index)):
            if key in current_level:
                selected = current_level[key]
                break
        else:
            raise ValueError(f"Sub-cluster {index} not found")

        # Intermediate levels are clustered so the next index has a dict of
        # sub-clusters to select from.
        if position != last_position:
            current_level = perform_clustering(selected, num_clusters=num_clusters)

    # Perform the final clustering on the selected records and return it
    return perform_clustering(selected, num_clusters=num_clusters)


# Re-cluster the complete dataset: passing [-1] makes get_sub_cluster pool the
# records of every sub-cluster before clustering them again.
initial_cluster = get_sub_cluster(second_level_clustered_data, [-1])

# Show the inlink and text of the first 5 records in each resulting cluster.
for cluster, cluster_data in initial_cluster.items():
    print(f"\nSub-cluster {cluster}:\n-------------------------")
    for inlink, text, analysis, _ in cluster_data[:5]:
        print(f"Inlink: {inlink}\nText: {text}\n\n")




Sub-cluster 0:
-------------------------
Inlink: Candle
Text: International markets have developed a range of standards and regulations to ensure compliance, while maintaining and improving safety, including:


Inlink: Scenario (computing)
Text: Negative scenarios or misuse cases may be written to indicate likely threats which should be countered to ensure that systems have sufficient security, safety, and reliability. These help to discover non-functional requirements.[5]


Inlink: Boiler explosion
Text: On land-based boilers, explosions of the pressure systems happened regularly in stationary steam boilers in the Victorian era, but are now very rare because of the various protections provided, and because of regular inspections compelled by governmental and industry requirements.


Inlink: Laboratory
Text: In many laboratories, hazards are present. Laboratory hazards might include poisons; infectious agents; flammable, explosive, or radioactive materials; moving machinery; extreme t