## Subtask A: Entity and Relationship Extraction


In [129]:
#pip install transformers
#pip install spacy
#pip install nltk
#pip install torch
#pip install requests beautifulsoup4
#pip install yahooquery
#pip install scikit-learn
#pip install gensim

In [130]:
#!python -m spacy download en_core_web_sm

In [131]:
import sqlite3
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import spacy 
import nltk
import requests 
import torch
import pandas as pd
import re

from collections import Counter, defaultdict
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim.models import LdaModel
pd.set_option("display.max_rows", 200)

In [132]:
dbpath = 'data/ecmdatabase.db'
con = sqlite3.connect(f"file:{dbpath}?mode=ro", uri=True)
with con:
    result = con.execute("SELECT * from companies WHERE stock_symbol = 'TSLA';")
    records = result.fetchall()
    company_name = records[0][1]
    tsla_item1 = records[0][2]

In [133]:
tsla_item1



In [134]:
tsla_item1 = tsla_item1.replace('\n', '')
tsla_item1



## Data Exploration - Text Analysis

### Frequency Analysis

In [135]:
## FREQUENCY ANALYSIS
# load spacy model
nlp = spacy.load('en_core_web_sm')
nlp.disable_pipe("parser")
nlp.enable_pipe("senter")
# load data
doc = nlp(tsla_item1)
words = [token.text for token in doc if not token.is_stop and not token.is_punct]

print(Counter(words).most_common(20))

[('energy', 75), ('vehicles', 68), ('systems', 39), ('vehicle', 34), ('products', 33), ('solar', 33), ('storage', 32), ('customers', 32), ('Tesla', 32), ('U.S.', 25), ('electric', 24), ('including', 24), ('certain', 23), ('battery', 20), ('driving', 18), ('new', 17), ('offer', 16), ('self', 16), ('markets', 16), ('Energy', 16)]


From the frequency analysis, we see that Tesla was mentioned a total of 32 times.

### TF-IDF

In [136]:
# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Step 1: Load and preprocess the text
text_data = [tsla_item1]

# Tokenization using nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def preprocess(text):
    # Tokenization and lowercasing
    tokens = word_tokenize(text.lower())
    # Remove stopwords and non-alphabetic characters
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

# Preprocess the data
processed_text = [preprocess(text) for text in text_data]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ongai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ongai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [137]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=10)  # Limit to top 10 features for brevity
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_text)
tfidf_keywords = tfidf_vectorizer.get_feature_names_out()

print("Top TF-IDF Keywords:")
print(tfidf_keywords)

Top TF-IDF Keywords:
['also' 'customers' 'energy' 'products' 'solar' 'storage' 'systems'
 'tesla' 'vehicle' 'vehicles']


### LDA

In [138]:
# Prepare data for LDA
tokenized_texts = [preprocess(text).split() for text in text_data]
dictionary = corpora.Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]

# Train LDA model
lda_model = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=15)

# Display topics
print("LDA Topics:")
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")


LDA Topics:
Topic 0: 0.001*"energy" + 0.001*"vehicles" + 0.001*"solar" + 0.001*"storage" + 0.001*"products" + 0.001*"also" + 0.001*"systems" + 0.001*"vehicle" + 0.001*"customers" + 0.001*"electric"
Topic 1: 0.020*"energy" + 0.015*"vehicles" + 0.010*"also" + 0.010*"solar" + 0.009*"storage" + 0.009*"systems" + 0.008*"vehicle" + 0.008*"products" + 0.007*"customers" + 0.007*"tesla"


### Pos tagging

In [139]:
# POS Tagging using spaCy
doc = nlp(tsla_item1)

# Extract POS tags
pos_tags = [(token.text, token.pos_) for token in doc]

print("Part-of-Speech Tags:")
print(pos_tags)


Part-of-Speech Tags:


## Named Entity Recognition

### NLTK

In [None]:
## using nltk to identify mentions of Tesla
'''
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
''' 
nltk_name=[]
nltk_label=[]
for sent in nltk.sent_tokenize(tsla_item1):
  for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
     if hasattr(chunk, 'label'):
      entity_name = ' '.join(c[0] for c in chunk)
      entity_label = chunk.label()
      nltk_name.append(entity_name)
      nltk_label.append(entity_label)
      if entity_name == 'Tesla':
        print(entity_label, entity_name)

We see that the word Tesla was recognised as an entity 19 times. However, it was only correctly tagged as an organisation 5 times, resulting in a precision of only 26.3% and a recall of 59%. In fact, NLTK more frequently classifies Tesla as a geographical location (GPE)! We wish to find a NER model that can perform better at accurately linking entities. We try SpaCy next.

### Spacy

SpaCy recognises 26 high-level entity classes, including:
- PERSON:      People, including fictional.
- NORP:        Nationalities or religious or political groups.
- FAC:         Buildings, airports, highways, bridges, etc.
- ORG:         Companies, agencies, institutions, etc.
- GPE:         Countries, cities, states.
- LOC:         Non-GPE locations, mountain ranges, bodies of water.
- PRODUCT:     Objects, vehicles, foods, etc. (Not services.)
- DATE:        Absolute or relative dates or periods.
- TIME:        Times smaller than a day.

In [21]:
## Using spacy

# load spacy model
nlp = spacy.load('en_core_web_sm')
nlp.disable_pipe("parser")
nlp.enable_pipe("senter")
# load data
doc = nlp(tsla_item1)
Spacy_name=[]
Spacy_label=[]
# collect unique labels
unique_labels = {}
for ent in doc.ents:
    if ent.label_ not in unique_labels.keys():
        unique_labels[ent.label_] = []
    unique_labels[ent.label_].append((ent.text, ent.start_char, ent.end_char))
    Spacy_name.append(ent.text)
    Spacy_label.append(ent.label_)
    if ent.text == 'Tesla':
        print(ent.text, ent.label_)


Tesla ORG
Tesla ORG
Tesla ORG
Tesla NORP
Tesla ORG
Tesla ORG
Tesla NORP
Tesla ORG
Tesla ORG
Tesla ORG
Tesla ORG
Tesla ORG
Tesla ORG
Tesla ORG
Tesla ORG
Tesla ORG
Tesla ORG
Tesla ORG
Tesla ORG
Tesla ORG


SpaCy performs much better than NLTK, recognising Tesla as an entity 20 times, and correctly identifying it as an organisation 18 times, for a precision of 90% and recall of 62.5%. 

## NER Evaluation

We generalise the process of identifying the most frequently assigned label for a given entity below.

In [39]:
def count(name, label):
    name_counts = {}
    for n, l in zip(name, label):
        if n in name_counts:
            name_counts[n] += 1
        else:
            name_counts[n] = 1
    return name_counts

def transfer_dic(name, label):
    name_label_counts = {}
    for n, l in zip(name, label):
        key = (n, l) 
        if key in name_label_counts:
            name_label_counts[key] += 1
        else:
            name_label_counts[key] = 1
    return name_label_counts

# output first the total appearing counts and the respective counts of each label
def check_name(name, label, target_name):
    if target_name not in name:
        return "no target_name"
    name_counts=count(name, label)
    name_label_counts=transfer_dic(name, label)
    print("total counts "+ str(name_counts[target_name]))
    for i in name_label_counts:
        if i[0]==target_name:
            print((i[1],name_label_counts[i]))

# output a dictionary containg the key of name and the label with highest appearing ratio. {name:(label,ratio)}
def get_NER(name, label,entity = None):
    res={}
    name_counts=count(name, label)
    name_label_counts=transfer_dic(name, label)
    for i in list(set(name)):
        for j in name_label_counts:
            if j[0]==i:
                ratio=name_label_counts[j]/name_counts[i]
                if j[0] in res:
                    if ratio>res[j[0]][1]:
                        res[j[0]]=(j[1],ratio)
                else:
                    res[j[0]]=(j[1],ratio)
    if entity != None:
        return res.get(entity,'not found')
    
    return res

def highest_label(name, label):
    res={}
    NER=get_NER(name, label)
    for i in NER:
        if NER[i][0] in res:
            res[NER[i][0]].append(i)
        else:
            res[NER[i][0]]=[i,]
    return res



print(get_NER(Spacy_name,Spacy_label,'Tesla'))
print(get_NER(nltk_name,nltk_label,'Tesla'))


('ORG', 0.9)
('GPE', 0.3684210526315789)


In [13]:
unique_labels.keys()

dict_keys(['ORG', 'PERSON', 'CARDINAL', 'MONEY', 'PRODUCT', 'NORP', 'DATE', 'ORDINAL', 'TIME', 'LOC', 'GPE', 'FAC', 'EVENT', 'LAW', 'PERCENT', 'WORK_OF_ART'])

In [None]:
unique_labels['ORG']

In [None]:
unique_labels['PRODUCT'] #products

In [16]:
unique_labels['LOC'] # locations

[('North America', 16709, 16722),
 ('Europe', 16724, 16730),
 ('Asia', 16735, 16739),
 ('Northern California', 17880, 17899),
 ('Europe', 32867, 32873),
 ('Europe', 41224, 41230)]

In [17]:
unique_labels['FAC'] # facilities / factories

[('Gigafactory Texas', 17963, 17980),
 ('Fremont Factory', 42324, 42339),
 ('Gigafactory Texas', 42361, 42378)]

In [18]:
unique_labels['EVENT'] # events

[('this Annual Report on Form 10-K', 20814, 20845),
 ('this Annual Report on Form 10-K. Energy Storage System Incentives',
  21988,
  22053),
 ('this Annual Report on Form 10-K. Pursuant', 23163, 23204),
 ('this Annual Report on Form 10-K.', 45826, 45858)]

In [55]:
from spacy import displacy
#displacy.render(doc, style="ent")

### ReFinED transformer

<a href = "https://github.com/amazon-science/ReFinED"> ReFinED </a> uses a Transformer model to perform mention detection, entity typing, and entity disambiguation for all mentions in a document in a single forward pass. The model is trained on a dataset we generated dataset using Wikipedia hyperlinks, which consists of over 150M entity mentions. The model uses entity descriptions and fine-grained entity types to perform linking. Therefore, new entities can be added to the system without retraining.



In [None]:
pip install git+https://github.com/ardentaegis17/ReFinED.git

In [61]:
from refined.inference.processor import Refined


refined = Refined.from_pretrained(model_name='wikipedia_model_with_numbers',
                                  entity_set="wikipedia")

spans = refined.process_text(tsla_item1)

print(spans)
print(len(spans))

i = 0 

for span in spans:
    if isinstance(span.predicted_entity,type(spans[3].predicted_entity)): #the fourth entity identified is 'Model 3', which is correctly linked to the Wikipedia article for Tesla Model 3.
        i += 1

print(i)

  checkpoint = torch.load(io.BytesIO(f.read()), map_location="cpu")
  with autocast():


[['ITEM 1', Entity not linked to a knowledge base, None], ['two', None, 'CARDINAL'], ['five', None, 'CARDINAL'], ['Model 3', Entity(wikidata_entity_id=Q23663332, wikipedia_entity_title=Tesla Model 3), None], ['Cybertruck', Entity(wikidata_entity_id=Q66311587, wikipedia_entity_title=Tesla Cybertruck), None], ['Model 3', Entity(wikidata_entity_id=Q23663332, wikipedia_entity_title=Tesla Model 3), None], ['four-door', None, 'CARDINAL'], ['Model', Entity(wikidata_entity_id=Q4610556, wikipedia_entity_title=Model (person)), None], ['Model 3', Entity(wikidata_entity_id=Q23663332, wikipedia_entity_title=Tesla Model 3), None], ['seven', None, 'CARDINAL'], ['Model S', Entity(wikidata_entity_id=Q1463050, wikipedia_entity_title=Tesla Model S), None], ['four-door', None, 'CARDINAL'], ['Model X', Entity(wikidata_entity_id=Q1634161, wikipedia_entity_title=Tesla Model X), None], ['seven', None, 'CARDINAL'], ['Model S', Entity(wikidata_entity_id=Q1463050, wikipedia_entity_title=Tesla Model S), None], ['

We see that the ReFinED transformer does an excellent job at entity resolution and disambiguation, successfully linking 197 of 252 entities to Wikipedia articles. We explore its capabilities further in the ERD.ipynb notebook. However, it lacks the ability to differentiate between important entities. Relationship extraction is also challenging, as it requires querying the Wikidata API using the entity id and parsing through voluminious JSON output. We thus turn to our final NER model.

## Diffbot API
<a href = "https://www.diffbot.com/"> Diffbot </a> is a developer of machine learning and computer vision algorithms and public APIs for extracting data from web pages / web scraping to create a knowledge base. In September 2020 the company released a Natural Language Processing API for automatically building Knowledge Graphs from text.

Because entities can be of different types and not all are equally important in the
context of the natural language text being analyzed, it is quite common for NER
processors to return the following in addition to a list of entities:

 **type**
- Is it a person? Is it a location? Is it an organization? The set of categories will
depend on the specific model used. 
- Diffbot API distinguishes between <a href= https://demo.nl.diffbot.com/schema/#types> 69 different entity types. </a>
 
**salience**
- The relative importance in the text analyzed or, in other words, the entity’s
relevance. 
- Is the entity central to the text (higher score/salience), or is it just
mentioned tangentially (lower score/salience)?

In [62]:
from getpass import getpass

TOKEN = getpass('Enter token: ')

In [63]:
FIELDS = "entities,facts"
HOST = "nl.diffbot.com"

In [64]:
import json

def get_request(payload):
  res = requests.post("https://{}/v1/?fields={}&token={}".format(HOST, FIELDS, TOKEN), json=payload)
  ret = None
  try:
    ret = res.json()
  except:
    print("Bad response: " + res.text)
    print(res.status_code)
    print(res.headers)
  return ret

In [65]:
res = get_request({
    "content": tsla_item1,
    "lang": "en",
    "format": "plain text with title",
})

print (res)

{'entities': [{'name': 'Tesla Semi', 'diffbotUri': 'https://diffbot.com/entity/EF4g4ohJUPUeLKh-N-rvUyA', 'confidence': 0.9569246, 'salience': 0.8978014, 'isCustom': False, 'allUris': ['http://www.wikidata.org/entity/Q40008974'], 'allTypes': [{'name': 'skill', 'diffbotUri': 'https://diffbot.com/entity/EvfbHngnSNVOh7ZBM5XTywQ'}, {'name': 'product', 'diffbotUri': 'https://diffbot.com/entity/EgSPUye7QPcyQoPylO8biMQ'}, {'name': 'tool', 'diffbotUri': 'https://diffbot.com/entity/EKvoYuTx4P9WT35YUouc0ug'}, {'name': 'vehicle', 'diffbotUri': 'https://diffbot.com/entity/E4vBDPVu3OTq90vd1GhX5mw', 'dbpediaUri': 'http://dbpedia.org/ontology/MeanOfTransportation'}], 'mentions': [{'text': 'Tesla', 'beginOffset': 2922, 'endOffset': 2927, 'confidence': 0.9569246}, {'text': 'Tesla', 'beginOffset': 5361, 'endOffset': 5366, 'confidence': 0.9569246}]}, {'name': 'automotive battery', 'diffbotUri': 'https://diffbot.com/entity/EF6RvyvRnNsW6oAQQjtPb7g', 'confidence': 0.97791916, 'salience': 0.77393895, 'isCusto

### View Entities and Entity Types

In [66]:
for ent in res["entities"]:
    if ent["salience"] > 0.5:
        print("Entity Name: " + ent['name'])
        print("Salience: " + str(ent['salience']))
        print("Entity Types:")
        print([ent_type["name"] for ent_type in ent['allTypes']])
        print()

Entity Name: Tesla Semi
Salience: 0.8978014
Entity Types:
['skill', 'product', 'tool', 'vehicle']

Entity Name: automotive battery
Salience: 0.77393895
Entity Types:
['skill', 'product', 'tool']

Entity Name: Tesla Model X
Salience: 0.6964315
Entity Types:
['skill', 'product', 'tool', 'vehicle']

Entity Name: solar energy
Salience: 0.6094218
Entity Types:
[]

Entity Name: Tesla Autopilot
Salience: 0.5553389
Entity Types:
['skill', 'field of work', 'technology']

Entity Name: artificial intelligence
Salience: 0.5336387
Entity Types:
['skill', 'field of work', 'technology']

Entity Name: electric vehicle
Salience: 0.5048789
Entity Types:
['skill', 'product', 'tool', 'vehicle']



Filtering the output to retain only significant entities with a salience score greater than 0.5, we see that 7 entities were extracted from the SEC 10-K filing data. All entities are indeed relevant to Tesla as an automobile electric vehicle company with significant investments in solar energy and artificial intelligence.

We also see that each entity is linked to a variety of entity types. For our knowledge graph, we are primarily interested in **company**, **product**, **industry / field of work**, and **location / country** entities. 

In [109]:
def extract_entites(res):
    ents = pd.DataFrame.from_dict(res["entities"])
    if not ents.empty:
        salient_ents = ents[ents["salience"] > 0.5]
        salient_ents["Labels"] = None
        for i, row in salient_ents.iterrows():
            if len(row['allTypes']) != 0:
                names = [ent_type["name"] for ent_type in row['allTypes']]
                if "organization" in names:
                    salient_ents.loc[i,"Labels"] = 'company'
                elif ("field of work" in names)  or ("industry" in names) or ("industry" in row['name']):
                    salient_ents.loc[i,'Labels'] = 'industry'
                elif "country" in names:
                    salient_ents.loc[i,'Labels'] = 'country'
                elif "location" in names:
                    salient_ents.loc[i,'Labels'] = 'location'
                elif "product" in names:
                    salient_ents.loc[i,'Labels'] = 'product'
                else:
                    salient_ents.loc[i,'Labels'] = row['allTypes'][0]['name']
                

        fin_ents = salient_ents[['name','salience','Labels']]
        return fin_ents
    return ents

In [None]:
tsla_entities = extract_entites(res)
tsla_entities.head()

## View Relationships and Relationship Types

In [108]:
def extract_relationships(res):
    rels =  pd.DataFrame.from_dict(res["facts"])
    if not rels.empty:
        for i, row in rels.iterrows():
            rels.loc[i,"entity"] = row["entity"]["name"]
            rels.loc[i,"property"] = row["property"]["name"]
            rels.loc[i,"value"] = row["value"]["name"]
            if row["evidence"] != []:
                rels.loc[i,"evidence"] = row["evidence"][0].get("passage",None)
        fin_rels = rels[['entity','property','value','evidence']]
        return fin_rels
    return rels

In [None]:
extract_relationships(res)

Diffbot's NLP API is unable to extract any relationships from Tesla's SEC 10-K filing data. We turn to Tesla's wikipedia article as a supplementary data source.

### Supplementary Source: Wikipedia Article

In [77]:
tesla_wiki = "Tesla, Inc. (/ˈtɛslə/ TESS-lə or /ˈtɛzlə/ TEZ-lə[a]) is an American multinational automotive and clean energy company. Headquartered in Austin, Texas, it designs, manufactures and sells battery electric vehicles (BEVs), stationary battery energy storage devices from home to grid-scale, solar panels and solar shingles, and related products and services. \
    Tesla was incorporated in July 2003 by Martin Eberhard and Marc Tarpenning as Tesla Motors. Its name is a tribute to inventor and electrical engineer Nikola Tesla. In February 2004, Elon Musk joined as Tesla's largest shareholder; in 2008, he was named chief executive officer. In 2008, the company began production of its first car model, the Roadster sports car, followed by the Model S sedan in 2012, the Model X SUV in 2015, the Model 3 sedan in 2017, the Model Y crossover in 2020, the Tesla Semi truck in 2022 and the Cybertruck pickup truck in 2023. The Model 3 is the all-time best-selling plug-in electric car worldwide, and in June 2021 became the first electric car to sell 1 million units globally.[6] In 2023, the Model Y was the best-selling vehicle, of any kind, globally.[7][8][3] \
        Tesla is one of the world's most valuable companies in terms of market capitalization. In October 2021, Tesla temporarily became a trillion-dollar company, the seventh U.S. company to do so. In 2023, the company led the battery electric vehicle market, with 19.9% share. Also in 2023, the company was ranked 69th in the Forbes Global 2000.[9] As of March 2024, it is the world's most valuable automaker. Tesla has been the subject of lawsuits, government scrutiny, and journalistic criticism, stemming from allegations of multiple cases of whistleblower retaliation, worker rights violations such as sexual harassment and anti-union activities, safety defects leadings to dozens of recalls, the lack of a public relations department, and controversial statements from Musk including overpromising on the company's driving assist technology and product release timelines."

In [78]:
res = get_request({
    "content": tesla_wiki,
    "lang": "en",
    "format": "plain text with title",
})

In [162]:
tsla_wiki_entities = extract_entites(res)
tsla_wiki_entities.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  salient_ents["Labels"] = None


Unnamed: 0,name,salience,Labels
0,Marc Tarpenning,0.988777,person
1,Martin Eberhard,0.98803,person
2,Tesla,0.975387,company
3,battery electric vehicle,0.834153,product
4,Elon Musk,0.75881,person


In [163]:
extract_relationships(res).head()

Unnamed: 0,entity,property,value,evidence
0,Tesla,product type,battery electric vehicles,"Headquartered in Austin, Texas, it designs, ma..."
1,Elon Musk,position held,chief executive officer,"In February 2004, Elon Musk joined as Tesla's ..."
2,Tesla,chief executive officer,Elon Musk,"In February 2004, Elon Musk joined as Tesla's ..."
3,Tesla,chief executive officer,Elon Musk,"In February 2004, Elon Musk joined as Tesla's ..."
4,Elon Musk,employee or member of,Tesla,"In February 2004, Elon Musk joined as Tesla's ..."


Diffbot can successfully extract entity relationships from Tesla's Wikipedia article!

### Creating Entity-Relationship Extraction Pipelines

We now construct E-R extraction pipelines, and run them to gain insights into Apple Inc, with stock ticker code AAPL.

In [154]:
from googlesearch import search
from yahooquery import Ticker

def get_company_ticker(self):

    searchval = 'yahoo finance '+self
    link = []
    #limits to the first link
    for url in search(searchval, tld='es', lang='es', stop=1):
        link.append(url)

    link = str(link[0])
    link=link.split("/")
    if link[-1]=='':
        ticker=link[-2]
    else:
        x=link[-1].split('=')
        ticker=x[-1]

    return(ticker)

def get_company_name(ticker):
    try:
        ticker_info = Ticker(ticker)
        company_name = ticker_info.quote_type[ticker]['longName']
        print(f"Found Company: {company_name}")
        return company_name
    except Exception as e:
        print(f"Error fetching company name for ticker {ticker}: {e}")
        return ticker

get_company_name("AAPL")


Found Company: Apple Inc.


'Apple Inc.'

In [102]:
import requests
from bs4 import BeautifulSoup

def get_wikipedia_article(company_name_or_ticker):
    search_url = f"https://en.wikipedia.org/wiki/{company_name_or_ticker}"
    
    try:
        response = requests.get(search_url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        title = soup.find('h1', {'id': 'firstHeading'}).text

        content_div = soup.find('div', {'id': 'mw-content-text'})

        paragraphs = content_div.find_all('p')

        full_article_text = '\n\n'.join([p.text.strip() for p in paragraphs if p.text.strip()])

        #print(f"Title: {title}")
        #print(f"Full Article:\n{full_article_text}")
        return f"{title}" + " " + f"{full_article_text}"

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the Wikipedia article: {e}")
        return None

# company_name_or_ticker = input("Enter the company name or ticker code: ").replace(' ', '_')
# get_wikipedia_article(company_name_or_ticker)




In [103]:
def wikipedia_ner_rel_pipeline(ticker):
    company_name = get_company_name(ticker)
    article = get_wikipedia_article(company_name)
    if article is None:
        return None
    res = get_request({
    "content": article,
    "lang": "en",
    "format": "plain text with title",
    })
    ents, rels = None, None
    ents = extract_entites(res)
    rels = extract_relationships(res)
    pd.options.display.max_columns = None
    pd.set_option('display.width', 3000)
    
    return (ents, rels)

def sec_10k_ner_rel_pipeline(ticker):
    dbpath = 'data/ecmdatabase.db'
    con = sqlite3.connect(f"file:{dbpath}?mode=ro", uri=True)
    with con:
        result = con.execute(f"SELECT * from companies WHERE stock_symbol = '{ticker}';")
        records = result.fetchall()
        if records == []:
            print(f"no records of company ticker {ticker} found in database.")
            return (None,None)
        company_name = records[0][1]
        item1 = records[0][2].replace('\n', '')
        item7 = records[0][3].replace('\n', '')
    item1_res = get_request({
    "content": item1,
    "lang": "en",
    "format": "plain text",
    })
    item7_res = get_request({
    "content": item7,
    "lang": "en",
    "format": "plain text",
    })
    item1_ents, item1_rels = extract_entites(item1_res), extract_relationships(item1_res)
    item7_ents, item7_rels = extract_entites(item7_res), extract_relationships(item7_res)

    ents = pd.concat([item1_ents,item7_ents],axis = 0)
    rels = pd.concat([item1_rels,item7_rels],axis = 0)

    return (ents,rels)

    

    


In [125]:
ents_rels = wikipedia_ner_rel_pipeline("AEP")


Found Company: American Electric Power Company, Inc.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  salient_ents["Labels"] = None


In [173]:
aep_ents = ents_rels[0]
aep_ents.head()

Unnamed: 0,name,salience,Labels
0,American Electric Power,0.737737,company
1,Algonquin Power,0.430244,company
2,AEP Texas Central,0.400038,company
95,American Electric Power,0.651433,company
150,American Electric Power Company,0.651433,company


In [175]:
aep_rels = ents_rels[1]
aep_rels.head()

Unnamed: 0,entity,property,value,evidence
0,Mary Fallin,position held,Governor,"In April 2014, Oklahoma Governor Mary Fallin s..."
1,Appalachian Power Company,organization locations,Kingsport,AEP considers Appalachian Power to be the oper...
2,Kingsport Power Company,parent organization,American Electric Power Company,"Until the 21st century, AEP's operations in Te..."
3,American Electric Power,acquired by,American Electric Power Company,AEP Texas was formed from a merger of various ...
4,American Electric Power,headquarters,Ashland,Kentucky Power headquarters is in Ashland and ...


In [143]:
(aapl_sec_ents, aapl_sec_rels) = sec_10k_ner_rel_pipeline('AAPL')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  salient_ents["Labels"] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  salient_ents["Labels"] = None


In [168]:
aapl_sec_ents.head() # more entities extracted compared to wikipedia articles.

Unnamed: 0,name,salience,Labels
0,Apple,0.902315,company
1,MacBook Air,0.895517,product
2,MacBook Pro,0.867708,product
3,Apple TV,0.840443,product
4,AirPods Max,0.831631,


In [113]:
aapl_sec_rels #can't extract relationships from 10-K forms.

In [149]:
(aapl_wiki_ents, aapl_wiki_rels) = wikipedia_ner_rel_pipeline("AAPL")

Found Company: Apple Inc.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  salient_ents["Labels"] = None


In [167]:
aapl_wiki_ents.head() # fewer entities extracted

Unnamed: 0,name,salience,Labels
0,Jef Raskin,0.957273,person
1,Ronald Wayne,0.945415,person
2,Apple,0.924846,company
3,Steve Jobs,0.899881,person
4,Jean-Louis Gassée,0.882259,person


In [164]:
aapl_wiki_rels.head()

Unnamed: 0,entity,property,value,evidence
0,Lens Technology,customers,Apple,"Apple announced on August 16, 2016, that Lens ..."
1,Apple,competitors,Microsoft,"[119] In May 2010, Apple's market cap exceeded..."
2,Apple,product type,mobile handset,"[115] By October 2008, Apple was the third-lar..."
3,Apple Computer Company,founding date,1976-04-01,"Apple Computer Company was founded on April 1,..."
4,World Wide Fund for Nature,partnership,Apple,"On April 14, 2016, Apple and the World Wide Fu..."


In [154]:
aapl_wiki_rels["property"].unique()

array(['customers', 'competitors', 'product type', 'founding date',
       'partnership', 'work relationship', 'organization locations',
       'headquarters', 'number of employees', 'employee or member of',
       'position held', 'acquired by', 'founded by', 'date of death',
       'suppliers', 'political affiliation', 'contributed to', 'brands',
       'industry', 'parent organization', 'yearly revenue', 'subsidiary',
       'cause of death', 'skilled at', 'interested in',
       'chief executive officer', 'all names', 'gender'], dtype=object)

In [172]:
aapl_wiki_rels[aapl_wiki_rels["property"] == "product type"]

Unnamed: 0,entity,property,value,evidence
2,Apple,product type,mobile handset,"[115] By October 2008, Apple was the third-lar..."
6,Lens Technology,product type,glass,"Apple announced on August 16, 2016, that Lens ..."
138,Apple Inc.,product type,solar energy,"Apple Energy, LLC is a wholly-owned subsidiary..."
147,"Apple Energy, LLC",product type,solar energy,"Apple Energy, LLC is a wholly-owned subsidiary..."
160,Emagic,product type,computer,The purchase of Emagic made Apple the first co...
165,Apple,product type,iPad accessories,"[1] Apple sells several iPad accessories, incl..."


In [119]:
aapl_wiki_rels[aapl_wiki_rels["property"] == "organization locations"]

Unnamed: 0,entity,property,value,evidence
44,Apple,organization locations,Ireland,"Apple's headquarters for Europe, the Middle Ea..."
47,Apple,organization locations,Cork,[274] Apple's international sales and distribu...
62,Apple,organization locations,Austin,"Apple has two campuses near Austin, Texas: a 2..."
70,Apple,organization locations,Texas,"Apple has two campuses near Austin, Texas: a 2..."
83,Apple,organization locations,Cork,"Apple's headquarters for Europe, the Middle Ea..."
88,Apple,organization locations,California,"Apple Computer, Inc. was incorporated in Cuper..."
125,Apple,organization locations,Cupertino,Apple Inc. is an American multinational corpor...
158,Zhengzhou Technology Park,organization locations,Zhengzhou,[386] Zhengzhou Technology Park alone employs ...


In [120]:
aapl_wiki_rels[aapl_wiki_rels["property"] == "headquarters"]

Unnamed: 0,entity,property,value,evidence
89,Apple,headquarters,California,Apple Inc. is an American multinational corpor...
126,Apple,headquarters,Cupertino,Apple Inc. is an American multinational corpor...
159,Zhengzhou Technology Park,headquarters,Zhengzhou,[386] Zhengzhou Technology Park alone employs ...


In [121]:
aapl_wiki_rels[aapl_wiki_rels["property"] == "industry"]

Unnamed: 0,entity,property,value,evidence
6,Apple,industry,accounting,"In the late 1980s, Apple was a pioneer of an a..."
131,Apple,industry,technology,Apple is the largest technology company by rev...
169,Apple,industry,information technology,Apple is one of the Big Five American informat...


In [160]:
aapl_wiki_rels[aapl_wiki_rels["property"] == "subsidiary"]

Unnamed: 0,entity,property,value,evidence
24,Verizon,subsidiary,Flurry Analytics,A study by Verizon subsidiary Flurry Analytics...
117,Apple Inc.,subsidiary,"Apple Energy, LLC","Apple Energy, LLC is a wholly-owned subsidiary..."


In [123]:
aapl_wiki_rels[aapl_wiki_rels["property"] == "suppliers"]

Unnamed: 0,entity,property,value,evidence
41,Apple,suppliers,NeXTSTEP,"[69] Only weeks away from bankruptcy,[70] Appl..."
66,Apple,suppliers,Lens Technology,"Apple announced on August 16, 2016, that Lens ..."
135,Apple,suppliers,Lens Technology,"Apple announced on August 16, 2016, that Lens ..."


In [124]:
aapl_wiki_rels[aapl_wiki_rels["property"] == "competitors"]

Unnamed: 0,entity,property,value,evidence
3,Be Inc.,competitors,Gassée,[52] Gassée left the company later that year t...
11,Gassée,competitors,Be Inc.,[52] Gassée left the company later that year t...
111,Microsoft,competitors,Apple,"[119] In May 2010, Apple's market cap exceeded..."
127,Apple,competitors,Microsoft,"[119] In May 2010, Apple's market cap exceeded..."


## Create JSON Node and Relationship Objects

To faciliate knowledge graph construction, we collate the entities and relationships extracted for all 40 companies into a JSON object.

In [165]:
def create_json_schema():
    json_schema = {
        "nodes":{
            "Company":[
                # company nodes here
            ],
            "Country":[
                # country nodes here
            ],
            "Industry":[
                # industry nodes here
            ],
            "Region":[
                # region nodes here
            ],

            "Product":[
                # product nodes here
            ]
        },
        
        "relationships":{
            "PARTNERS_WITH":[
                # (COMPANY cid1)-[:PARTNERS_WITH]->(COMPANY cid2)
            ],
            "COMPETES_WITH":[
                # (COMPANY cid1)-[:COMPETES_WITH]->(COMPANY cid2)
            ],
            "SUBSIDIARY_OF":[
                # (COMPANY cid1)-[:SUBSIDIARY_OF]->(COMPANY cid2)
            ],

            "HEADQUARTERS_IN":[
                # (COMPANY cid)-[:HEADQUARTERS_IN]->(COUNTRY ctyid)
            ],

            "OPERATES_IN_COUNTRY":[
                # (COMPANY cid)-[:OPERATES_IN_COUNTRY]->(COUNTRY ctyid)
            ],

            "IS_INVOLVED_IN":[
                # (COMPANY cid)-[:IS_INVOLVED_IN]->(INDUSTRY iid)
            ],

            "IS_IN":[
                # (COUNTRY ctyid)-[:IS_IN]->(REGION rid)
            ],

            "OPERATES_IN_REGION":[
                # (COMPANY cid)-[:OPERATES_IN_REGION]->(REGION rid)
            ],

            "PRODUCES":[
                # (COMPANY cid)-[:PRODUCES]->(PRODUCT)
            ]
    }}


    return json_schema



In [155]:
def create_company_node(name,ticker_code = None,founded_year = None):
    c_node = {}

    c_node["name"] = name

    if ticker_code is None:
        c_node["ticker_code"] = get_company_ticker(name)
    else:
        c_node["ticker_code"] = ticker_code
    c_node["founded_year"] = founded_year

    return c_node

In [None]:
#pip install pycountry
#pip install pycountry-convert
#pip install locationtagger

In [206]:
import pycountry_convert as pc
import random

def city_to_country(city):
    response = requests.request("GET", f"https://www.geonames.org/search.html?q={city}&country=")
    country_raw = re.findall("/countries.*\\.html", response.text)[0].strip(".html").split("/")[-1]
    country = country_raw.replace('-',' ').title()
    return country

def country_to_continent(country_name):
    country_alpha2 = pc.country_name_to_country_alpha2(country_name)
    country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
    country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
    return country_continent_name

def create_country_node(name,iso3=None,iso2=None, population = None, gdp = None, corporate_tax_rate = None, is_city = False):
    cnty_node = {}
    
    if is_city:
        cnty_node["source_city"] = name
        name = city_to_country(name)
    cnty_node["name"] = name
    try:
        cnty_node["iso2"] = pc.country_name_to_country_alpha2(name)
        cnty_node["iso3"] = pc.country_name_to_country_alpha3(name)
    except KeyError:
        print(f"{cnty_node["name"]} could not be mapped to iso code")
    
    cnty_node["population"] = random.randint(5,1400) #in millions
    cnty_node["gdp"] = random.randint(1,20000) #in billions
    cnty_node["corporate_tax_rate"] = random.randint(10,50)

    return cnty_node

def create_region_node(cnty_node):
    reg_node = {}
    reg_node["name"] = pc.country_alpha2_to_continent_code(cnty_node["iso2"])
    reg_node["m49"] = None
    return reg_node

In [196]:
def create_industry_node(name,SIC_code = None, industry_group = None, subindustry_desc = None, primary_activity= None):
    ind_node = {}
    ind_node["name"] = name
    ind_node["SIC_code"] = SIC_code
    ind_node["industry_group"] = industry_group
    ind_node["subindustry_desc"] = subindustry_desc
    ind_node["primary_activity"] = primary_activity
    return ind_node

def create_product_node(name):
    pdt_node = {}
    pdt_node["name"] = name
    return pdt_node

In [199]:
def create_hq_rel(c_node, cnty_node):
    hq_rel = {}
    hq_rel["company_name"] = c_node["name"]
    hq_rel["country_name"] = cnty_node["name"]

    return hq_rel

def create_operates_in_country_rel(c_node, cnty_node):
    oic_rel = {}
    oic_rel["company_name"] = c_node["name"]
    oic_rel["country_name"] = cnty_node["name"]
    oic_rel["net sales"] = random.randint(-30000000,30000000)
    oic_rel["headcount"] = random.randint(1,10000)

    return oic_rel

def create_operates_in_region_rel(c_node, reg_node):
    oir_rel = {}
    oir_rel["company_name"] = c_node["name"]
    oir_rel["region_name"] = reg_node["name"]
    oir_rel["net sales"] = random.randint(-30000000,30000000)
    oir_rel["headcount"] = random.randint(100,1000000)

    return oir_rel

def create_is_in_rel(cnty_node, reg_node):
    is_in_rel = {}
    is_in_rel["country_name"] = cnty_node["name"]
    is_in_rel["region_name"] = reg_node["name"]


#partners, competitors, subsidiaries 
def create_company_company_rel(c_node1, c_node2, type = None):
    c_c_rel = {}
    c_c_rel["company_name_1"] = c_node1["name"]
    c_c_rel["company_name_2"] = c_node2["name"]
    c_c_rel["type"] = type

    return c_c_rel

def create_in_industry_rel(c_node, ind_node):
    c_ind_rel = {}
    c_ind_rel["company_name"] = c_node["name"]
    c_ind_rel["industry_name"] = ind_node["name"]
    return c_ind_rel

def create_produces_rel(c_node, pdt_node):
    c_pdt_rel = {}
    c_pdt_rel["company_name"] = c_node["name"]
    c_pdt_rel["product_name"] = pdt_node["name"]

    return c_pdt_rel

In [217]:
def generate_json_schema(json):
    dbpath = 'data/ecmdatabase.db'
    con = sqlite3.connect(f"file:{dbpath}?mode=ro", uri=True)
    def fill_entities(ents, c_name):
        if ents is None:
            return None
        
        for _,ent in ents.iterrows():
            if ent["Labels"] =='company' and ent["name"] != c_name:
                ticker = get_company_ticker(ent["name"])
                json["nodes"]["Company"].append(create_company_node(ent["name"],ticker))
            elif ent["Labels"] == 'industry':
                json["nodes"]["Industry"].append(create_industry_node(ent["name"]))
            elif ent["Labels"] == 'country':
                json["nodes"]["Country"].append(create_country_node(ent["name"]))
            elif ent["Labels"] == 'location':
                json["nodes"]["Country"].append(create_country_node(ent["name"], is_city = True))
            elif ent["Labels"] == 'product':
                json["nodes"]["Product"].append(create_product_node(ent["name"]))

        for cnty_node in json["nodes"]["Country"]:
            reg_node = create_region_node(cnty_node)
            json["nodes"]["Region"].append(reg_node)
            json["relationships"]["IS_IN"].append(create_is_in_rel(cnty_node,reg_node))
    
    def fill_relationships(rels):
        if rels is None:
            return None
        
        for _, rel in rels.iterrows():
            if rel["property"] == "headquarters":
                c_node = create_company_node(rel["entity"])
                hq_node = create_country_node(rel["value"], is_city = True)
                hq_rel = create_hq_rel(c_node, hq_node)
                json["relationships"]["HEADQUARTERS_IN"].append(hq_rel)
            elif rel["property"] == "organization locations":
                c_node = create_company_node(rel["entity"])
                loc_node = create_country_node(rel["value"],is_city = True)
                loc_rel = create_operates_in_country_rel(c_node, loc_node)
                json["relationships"]["OPERATES_IN_COUNTRY"].append(loc_rel)
            elif rel["property"] == "industry":
                c_node = create_company_node(rel["entity"])
                ind_node = create_industry_node(rel["value"])
                works_rel = create_in_industry_rel(c_node, ind_node)
                json["relationships"]["IS_INVOLVED_IN"].append(works_rel)
            elif rel["property"] == "product type":
                c_node = create_company_node(rel["entity"])
                pdt_node = create_product_node(rel["value"])
                produces_rel = create_produces_rel(c_node,pdt_node)
                json["relationships"]["PRODUCES"].append(produces_rel)

            elif rel["property"] == "competitors":
                c_node_1 = create_company_node(rel["entity"])
                c_node_2 = create_company_node(rel["value"])
                cc_rel = create_company_company_rel(c_node_1,c_node_2)
                json["relationships"]["COMPETES_WITH"].append(cc_rel)
            elif rel["property"] == "suppliers":
                c_node_1 = create_company_node(rel["entity"])
                c_node_2 = create_company_node(rel["value"])
                cc_rel = create_company_company_rel(c_node_1,c_node_2, "suppliers")
                json["relationships"]["PARTNERS_WITH"].append(cc_rel)
            elif rel["property"] == "subsidiary":
                c_node_1 = create_company_node(rel["entity"])
                c_node_2 = create_company_node(rel["value"])
                cc_rel = create_company_company_rel(c_node_2, c_node_1, "subsidiary")
                json["relationships"]["SUBSIDIARY_OF"].append(cc_rel)
    with con:
        result = con.execute("SELECT name, stock_symbol from companies;")
        records = result.fetchall()
        for record in records:
            company_name = record[0]
            stock_code = record[1]

            print(f'Processing {company_name} with stock code {stock_code}')
            
            c_node = create_company_node(company_name,stock_code)
            json["nodes"]["Company"].append(c_node)

            sec_10k_ents, sec_10k_rels = sec_10k_ner_rel_pipeline(stock_code)
            wiki_ents, wiki_rels = wikipedia_ner_rel_pipeline(stock_code)

            fill_entities(sec_10k_ents,company_name)
            fill_entities(wiki_ents,company_name)
            fill_relationships(sec_10k_rels)
            fill_relationships(wiki_rels)
    
    return json


In [218]:
empty_schema = create_json_schema()
print(empty_schema)

{'nodes': {'Company': [], 'Country': [], 'Industry': [], 'Region': [], 'Product': []}, 'relationships': {'PARTNERS_WITH': [], 'COMPETES_WITH': [], 'SUBSIDIARY_OF': [], 'HEADQUARTERS_IN': [], 'OPERATES_IN_COUNTRY': [], 'IS_INVOLVED_IN': [], 'IS_IN': [], 'OPERATES_IN_REGION': [], 'PRODUCES': []}}


In [219]:
import warnings
warnings.filterwarnings('ignore')

kg_json = generate_json_schema(empty_schema)

Processing Apple Inc. with stock code AAPL
Found Company: Apple Inc.
Processing ADOBE INC. with stock code ADBE
Found Company: Adobe Inc.
Processing ANALOG DEVICES INC with stock code ADI
Found Company: Analog Devices, Inc.
Processing AUTOMATIC DATA PROCESSING INC with stock code ADP
Found Company: Automatic Data Processing, Inc.
United Kingdo could not be mapped to iso code
Processing APPLIED MATERIALS INC /DE with stock code AMAT
Found Company: Applied Materials, Inc.
Processing ADVANCED MICRO DEVICES INC with stock code AMD
Found Company: Advanced Micro Devices, Inc.


IndexError: list index out of range

In [220]:
empty_schema

{'nodes': {'Company': [{'name': 'Apple Inc.',
    'ticker_code': 'AAPL',
    'founded_year': None},
   {'name': 'Apple', 'ticker_code': 'AAPL', 'founded_year': None},
   {'name': 'App Store', 'ticker_code': 'id328412701', 'founded_year': None},
   {'name': 'Apple Arcade',
    'ticker_code': 'apple-launches-game-subscription-apple-arcade-181647109--finance.html',
    'founded_year': None},
   {'name': 'AirPods', 'ticker_code': 'AAPL', 'founded_year': None},
   {'name': 'Apple', 'ticker_code': 'AAPL', 'founded_year': None},
   {'name': 'MLS Season Pass',
    'ticker_code': 'apple-tv-cuts-price-mls-161110203.html',
    'founded_year': None},
   {'name': 'Major League Soccer (MLS)',
    'ticker_code': 'MLFB',
    'founded_year': None},
   {'name': 'Apple', 'ticker_code': 'AAPL', 'founded_year': None},
   {'name': 'Sculley', 'ticker_code': 'SRL', 'founded_year': None},
   {'name': 'Sculley', 'ticker_code': 'SRL', 'founded_year': None},
   {'name': 'ADOBE INC.', 'ticker_code': 'ADBE', 'found

In [222]:
with open('nasdaq_kg_schema.json', 'w') as f:
    json.dump(empty_schema, f)

## Spacy Node and Relationship Objects

In [9]:
# Load spaCy English language model
nlp = spacy.load("en_core_web_sm")

doc = nlp(tsla_item1)

# Extract entities and relations
nodes = defaultdict(set)  # Use a set to avoid duplicates
edges = []

# Define a function to identify non-company keywords
def is_non_company(entity_text):
    # Keywords or patterns that indicate the entity is not a company
    non_company_keywords = [
        'program', 'act', 'regulation', 'department', 'agency', 'council',
        'commission', 'service', 'policy', 'initiative', 'standard', 
        'incentive', 'college', 'school', 'university', 'authority', 'board',
        'order', 'capital', 'vehicle', 'development', 'internship', 'apprenticeship',
        'system', 'training', 'product', 'directive', 'committee', 'resource',
        'partnership', 'technology', 'platform'
    ]
    
    # If the entity contains any of these keywords, it is not a company
    return any(re.search(r'\b' + keyword + r'\b', entity_text.lower()) for keyword in non_company_keywords)

# Improved function to categorize and extract entities
def categorize_entities(entity):
    if entity.label_ == 'ORG':
        # Check if it's not a company
        if is_non_company(entity.text):
            return  # Exclude if it's not a company
        nodes['Company'].add(entity.text)
    elif entity.label_ == 'PRODUCT':
        nodes['Product'].add(entity.text)
    elif entity.label_ == 'GPE':
        nodes['Country'].add(entity.text)
    elif entity.label_ in ['NORP', 'INDUSTRY']:
        nodes['Industry'].add(entity.text)

# Extract named entities
for ent in doc.ents:
    categorize_entities(ent)

# Synonyms to match different verbs for edge detection
target_phrases = {
    'SUPPLIES': ["provide products", "manufactures products", "delivers goods", "offers items", "distributes products"],
    'LOCATED_IN': ["headquartered in", "based in", "located in", "situated in"],
    'RIVALS_WITH': ["competes with", "is a competitor of", "challenges", "rivals"],
    'ALLIES_WITH': ["partners with", "collaborates with", "is allied with", "cooperates with"]
}

# Extract relationships using semantic similarity
for sent in doc.sents:
    sent_vector = sent.vector  # Get the vector of the current sentence
    for relationship, phrases in target_phrases.items():
        for phrase in phrases:
            phrase_vector = nlp(phrase).vector  # Get the vector for the target phrase
            similarity = sent.similarity(nlp(phrase))  # Compute similarity
            if similarity > 0.7:  # Threshold for determining a match
                # Check if entities in the sentence can be matched to the known nodes
                for entity in sent.ents:
                    if entity.label_ == "ORG" and entity.text in nodes["Company"]:
                        target_entity = None
                        if relationship == "SUPPLIES":
                            target_entity = [ent.text for ent in sent.ents if ent.label_ == "PRODUCT"]
                        elif relationship == "LOCATED_IN":
                            target_entity = [ent.text for ent in sent.ents if ent.label_ == "GPE"]
                        elif relationship in ["RIVALS_WITH", "ALLIES_WITH"]:
                            target_entity = [ent.text for ent in sent.ents if ent.label_ == "ORG" and ent.text != entity.text]
                        
                        if target_entity:
                            for target in target_entity:
                                edges.append((entity.text, relationship, target))



# Convert nodes to list to remove duplicates and maintain order
nodes = {k: list(v) for k, v in nodes.items()}

# Output nodes and edges
print("Nodes:")
for node_type, node_list in nodes.items():
    print(f"{node_type}: {node_list}")

print("\nEdges:")
for edge in edges:
    print(f"{edge[0]} -[{edge[1]}]-> {edge[2]}")


  similarity = sent.similarity(nlp(phrase))  # Compute similarity


Nodes:
Company: ['FSD Computer', 'Gigafactory Nevada', 'Automotive Purchase Financing and Leases', 'Powertrain Our', 'Model S', 'Energy Generation and Storage Energy Storage Systems', 'NHTSA', 'Megapack', 'AI', 'the Organization for Economic Co-operation and', 'Tesla Superchargers', 'Autopilot', 'Internships &#8211', 'the National Highway Traffic Safety Administration', 'ECE', 'ESG', 'Tesla', 'SAE International', 'Solar Energy Offerings', 'the Internal Revenue Code', 'Energy Generation and Storage Energy Storage Products', 'Control systems', 'FMVSS', 'Automobile Manufacturer', 'Energy Generation and Storage Energy Storage Products Powerwall', 'European Defence Ministries', 'ECE markets &#8221;)', 'Our Products and Services Automotive', 'FSD', 'ir.tesla.com', 'NACS', 'Technician Trainee', 'the &#8220;EPA&#8221', 'Battery Safety and Testing Our', ': &#8226', 'IRC', 'SEC', 'Full Self-Driving (&', 'Take Charge', 'START', 'Significant Accounting Policies', 'Solar Energy Systems', 'Financial