## Subtask A: Entity and Relationship Extraction


In [None]:
#pip install transformers
#pip install spacy
#pip install nltk
#pip install torch
#pip install requests beautifulsoup4
#pip install yahooquery
#pip install scikit-learn
#pip install gensim

In [None]:
#!python -m spacy download en_core_web_sm

In [2]:
import sqlite3
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import spacy 
import nltk
import requests 
import torch
import pandas as pd

from collections import Counter
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim.models import LdaModel
pd.set_option("display.max_rows", 200)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dbpath = 'data/ecmdatabase.db'
con = sqlite3.connect(f"file:{dbpath}?mode=ro", uri=True)
with con:
    result = con.execute("SELECT * from companies WHERE stock_symbol = 'TSLA';")
    records = result.fetchall()
    company_name = records[0][1]
    tsla_item1 = records[0][2]

In [4]:
tsla_item1



In [12]:
tsla_item1 = tsla_item1.replace('\n', '')
tsla_item1



## Data Exploration - Text Analysis

### Frequency Analysis

In [None]:
## FREQUENCY ANALYSIS
# load spacy model
nlp = spacy.load('en_core_web_sm')
nlp.disable_pipe("parser")
nlp.enable_pipe("senter")
# load data
doc = nlp(tsla_item1)
words = [token.text for token in doc if not token.is_stop and not token.is_punct]

print(Counter(words).most_common(20))

### TF-IDF

In [23]:
# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Step 1: Load and preprocess the text
text_data = [tsla_item1]

# Tokenization using nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def preprocess(text):
    # Tokenization and lowercasing
    tokens = word_tokenize(text.lower())
    # Remove stopwords and non-alphabetic characters
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

# Preprocess the data
processed_text = [preprocess(text) for text in text_data]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ongai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ongai\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [None]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=10)  # Limit to top 10 features for brevity
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_text)
tfidf_keywords = tfidf_vectorizer.get_feature_names_out()

print("Top TF-IDF Keywords:")
print(tfidf_keywords)

### LDA

In [None]:
# Prepare data for LDA
tokenized_texts = [preprocess(text).split() for text in text_data]
dictionary = corpora.Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]

# Train LDA model
lda_model = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=15)

# Display topics
print("LDA Topics:")
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")


### Pos tagging

In [None]:
# POS Tagging using spaCy
doc = nlp(tsla_item1)

# Extract POS tags
pos_tags = [(token.text, token.pos_) for token in doc]

print("Part-of-Speech Tags:")
print(pos_tags)


## Named Entity Recognition

### NLTK

In [None]:
## using nltk
'''
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
''' 
# Step Two: Load Data
print(len(tsla_item1))

# Step Three: Tokenise, find parts of speech and chunk words 
nltk_name=[]
nltk_label=[]
for sent in nltk.sent_tokenize(tsla_item1):
  for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
     if hasattr(chunk, 'label'):
      entity_name = ' '.join(c[0] for c in chunk)
      entity_label = chunk.label()
      nltk_name.append(entity_name)
      nltk_label.append(entity_label)
      print(entity_label, entity_name)

### Spacy

In [130]:
## Using spacy

# load spacy model
nlp = spacy.load('en_core_web_sm')
nlp.disable_pipe("parser")
nlp.enable_pipe("senter")
# load data
doc = nlp(tsla_item1)
Spacy_name=[]
Spacy_label=[]
# collect unique labels
unique_labels = {}
for ent in doc.ents:
    if ent.label_ not in unique_labels.keys():
        unique_labels[ent.label_] = []
    unique_labels[ent.label_].append((ent.text, ent.start_char, ent.end_char))
    Spacy_name.append(ent.text)
    Spacy_label.append(ent.label_)


- PERSON:      People, including fictional.
- NORP:        Nationalities or religious or political groups.
- FAC:         Buildings, airports, highways, bridges, etc.
- ORG:         Companies, agencies, institutions, etc.
- GPE:         Countries, cities, states.
- LOC:         Non-GPE locations, mountain ranges, bodies of water.
- PRODUCT:     Objects, vehicles, foods, etc. (Not services.)
- EVENT:       Named hurricanes, battles, wars, sports events, etc.
- WORK_OF_ART: Titles of books, songs, etc.
- LAW:         Named documents made into laws.
- LANGUAGE:    Any named language.
- DATE:        Absolute or relative dates or periods.
- TIME:        Times smaller than a day.
- PERCENT:     Percentage, including ”%“.
- MONEY:       Monetary values, including unit.
- QUANTITY:    Measurements, as of weight or distance.
- ORDINAL:     “first”, “second”, etc.
- CARDINAL:    Numerals that do not fall under another type.

In [None]:
unique_labels.keys()

In [None]:
unique_labels['ORG']

In [None]:
unique_labels['PRODUCT'] #products

In [None]:
unique_labels['LOC'] # locations

In [None]:
unique_labels['FAC'] # facilities / factories

In [None]:
unique_labels['EVENT'] # events

In [55]:
from spacy import displacy
#displacy.render(doc, style="ent")

### Hugging Face Transformers

Because entities can be of different types and not all are equally important in the
context of the natural language text being analyzed, it is quite common for NER
processors to return the following in addition to a list of entities:

 **type**
- Is it a person? Is it a location? Is it an organization? The set of categories will
depend on the specific model used. 
- The bert-base-NER distinguishes four types
of entities: location (LOC), organization (ORG), person (PER), and miscellaneous
(MISC).
 
**salience**
- The relative importance in the text analyzed or, in other words, the entity’s
relevance. 
- Is the entity central to the text (higher score/salience), or is it just
mentioned tangentially (lower score/salience)?

In [None]:
## Using transformers
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
ner_pipe = pipeline("ner", model = model, tokenizer = tokenizer)
for ent in ner_pipe(tsla_item1):
    print(ent)

## Diffbot API
https://www.diffbot.com/


In [7]:
from getpass import getpass

TOKEN = getpass('Enter token: ')

In [8]:
FIELDS = "entities,facts"
HOST = "nl.diffbot.com"

In [9]:
import json

def get_request(payload):
  res = requests.post("https://{}/v1/?fields={}&token={}".format(HOST, FIELDS, TOKEN), json=payload)
  ret = None
  try:
    ret = res.json()
  except:
    print("Bad response: " + res.text)
    print(res.status_code)
    print(res.headers)
  return ret

In [28]:
res = get_request({
    "content": tsla_item1,
    "lang": "en",
    "format": "plain text with title",
})

print (res)

{'entities': [{'name': 'Tesla Semi', 'diffbotUri': 'https://diffbot.com/entity/EF4g4ohJUPUeLKh-N-rvUyA', 'confidence': 0.9569246, 'salience': 0.8978014, 'isCustom': False, 'allUris': ['http://www.wikidata.org/entity/Q40008974'], 'allTypes': [{'name': 'skill', 'diffbotUri': 'https://diffbot.com/entity/EvfbHngnSNVOh7ZBM5XTywQ'}, {'name': 'product', 'diffbotUri': 'https://diffbot.com/entity/EgSPUye7QPcyQoPylO8biMQ'}, {'name': 'tool', 'diffbotUri': 'https://diffbot.com/entity/EKvoYuTx4P9WT35YUouc0ug'}, {'name': 'vehicle', 'diffbotUri': 'https://diffbot.com/entity/E4vBDPVu3OTq90vd1GhX5mw', 'dbpediaUri': 'http://dbpedia.org/ontology/MeanOfTransportation'}], 'mentions': [{'text': 'Tesla', 'beginOffset': 2922, 'endOffset': 2927, 'confidence': 0.9569246}, {'text': 'Tesla', 'beginOffset': 5361, 'endOffset': 5366, 'confidence': 0.9569246}]}, {'name': 'automotive battery', 'diffbotUri': 'https://diffbot.com/entity/EF6RvyvRnNsW6oAQQjtPb7g', 'confidence': 0.97791916, 'salience': 0.77393895, 'isCusto

### View Entities and Entity Types

In [29]:
for ent in res["entities"]:
    if ent["salience"] > 0.5:
        print("Entity Name: " + ent['name'])
        print("Salience: " + str(ent['salience']))
        print("Entity Types:")
        print([ent_type["name"] for ent_type in ent['allTypes']])
        print()

Entity Name: Tesla Semi
Salience: 0.8978014
Entity Types:
['skill', 'product', 'tool', 'vehicle']

Entity Name: automotive battery
Salience: 0.77393895
Entity Types:
['skill', 'product', 'tool']

Entity Name: Tesla Model X
Salience: 0.6964315
Entity Types:
['skill', 'product', 'tool', 'vehicle']

Entity Name: solar energy
Salience: 0.6094218
Entity Types:
[]

Entity Name: Tesla Autopilot
Salience: 0.5553389
Entity Types:
['skill', 'field of work', 'technology']

Entity Name: artificial intelligence
Salience: 0.5336387
Entity Types:
['skill', 'field of work', 'technology']

Entity Name: electric vehicle
Salience: 0.5048789
Entity Types:
['skill', 'product', 'tool', 'vehicle']



In [108]:
def extract_entites(res):
    ents = pd.DataFrame.from_dict(res["entities"])
    if not ents.empty:
        salient_ents = ents[ents["salience"] > 0.4]
        salient_ents["Labels"] = None
        for i, row in salient_ents.iterrows():
            if len(row['allTypes']) != 0:
                names = [ent_type["name"] for ent_type in row['allTypes']]
                if "organization" in names:
                    salient_ents.loc[i,"Labels"] = 'company'
                elif ("field of work" in names)  or ("industry" in names):
                    salient_ents.loc[i,'Labels'] = 'industry'
                elif "country" in names:
                    salient_ents.loc[i,'Labels'] = 'country'
                elif "product" in names:
                    salient_ents.loc[i,'Labels'] = 'product'
                else:
                    salient_ents.loc[i,'Labels'] = row['allTypes'][0]['name']
                

        fin_ents = salient_ents[['name','salience','Labels']]
        return fin_ents
    return ents

In [None]:
tsla_entities = extract_entites(res)
tsla_entities

## View Relationships and Relationship Types

In [123]:
def extract_relationships(res):
    rels =  pd.DataFrame.from_dict(res["facts"])
    if not rels.empty:
        for i, row in rels.iterrows():
            rels.loc[i,"entity"] = row["entity"]["name"]
            rels.loc[i,"property"] = row["property"]["name"]
            rels.loc[i,"value"] = row["value"]["name"]
            if row["evidence"] != []:
                rels.loc[i,"evidence"] = row["evidence"][0].get("passage",None)
        fin_rels = rels[['entity','property','value','evidence']]
        return fin_rels
    return rels

### Supplementary Source: Wikipedia Article

In [94]:
tesla_wiki = "Tesla, Inc. (/ˈtɛslə/ TESS-lə or /ˈtɛzlə/ TEZ-lə[a]) is an American multinational automotive and clean energy company. Headquartered in Austin, Texas, it designs, manufactures and sells battery electric vehicles (BEVs), stationary battery energy storage devices from home to grid-scale, solar panels and solar shingles, and related products and services. \
    Tesla was incorporated in July 2003 by Martin Eberhard and Marc Tarpenning as Tesla Motors. Its name is a tribute to inventor and electrical engineer Nikola Tesla. In February 2004, Elon Musk joined as Tesla's largest shareholder; in 2008, he was named chief executive officer. In 2008, the company began production of its first car model, the Roadster sports car, followed by the Model S sedan in 2012, the Model X SUV in 2015, the Model 3 sedan in 2017, the Model Y crossover in 2020, the Tesla Semi truck in 2022 and the Cybertruck pickup truck in 2023. The Model 3 is the all-time best-selling plug-in electric car worldwide, and in June 2021 became the first electric car to sell 1 million units globally.[6] In 2023, the Model Y was the best-selling vehicle, of any kind, globally.[7][8][3] \
        Tesla is one of the world's most valuable companies in terms of market capitalization. In October 2021, Tesla temporarily became a trillion-dollar company, the seventh U.S. company to do so. In 2023, the company led the battery electric vehicle market, with 19.9% share. Also in 2023, the company was ranked 69th in the Forbes Global 2000.[9] As of March 2024, it is the world's most valuable automaker. Tesla has been the subject of lawsuits, government scrutiny, and journalistic criticism, stemming from allegations of multiple cases of whistleblower retaliation, worker rights violations such as sexual harassment and anti-union activities, safety defects leadings to dozens of recalls, the lack of a public relations department, and controversial statements from Musk including overpromising on the company's driving assist technology and product release timelines."

In [95]:
res = get_request({
    "content": tesla_wiki,
    "lang": "en",
    "format": "plain text with title",
})

In [162]:
tsla_wiki_entities = extract_entites(res)
tsla_wiki_entities.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  salient_ents["Labels"] = None


Unnamed: 0,name,salience,Labels
0,Marc Tarpenning,0.988777,person
1,Martin Eberhard,0.98803,person
2,Tesla,0.975387,company
3,battery electric vehicle,0.834153,product
4,Elon Musk,0.75881,person


In [163]:
extract_relationships(res).head()

Unnamed: 0,entity,property,value,evidence
0,Tesla,product type,battery electric vehicles,"Headquartered in Austin, Texas, it designs, ma..."
1,Elon Musk,position held,chief executive officer,"In February 2004, Elon Musk joined as Tesla's ..."
2,Tesla,chief executive officer,Elon Musk,"In February 2004, Elon Musk joined as Tesla's ..."
3,Tesla,chief executive officer,Elon Musk,"In February 2004, Elon Musk joined as Tesla's ..."
4,Elon Musk,employee or member of,Tesla,"In February 2004, Elon Musk joined as Tesla's ..."


### Creating Entity-Relationship Extraction Pipelines

In [102]:
from yahooquery import Ticker

def get_company_name(ticker):
    try:
        ticker_info = Ticker(ticker)
        company_name = ticker_info.quote_type[ticker]['longName']
        print(f"Found Company: {company_name}")
        return company_name
    except Exception as e:
        print(f"Error fetching company name for ticker {ticker}: {e}")
        return ticker

get_company_name("AEP")


Found Company: American Electric Power Company, Inc.


'American Electric Power Company, Inc.'

In [101]:
import requests
from bs4 import BeautifulSoup

def get_wikipedia_article(company_name_or_ticker):
    search_url = f"https://en.wikipedia.org/wiki/{company_name_or_ticker}"
    
    try:
        response = requests.get(search_url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        title = soup.find('h1', {'id': 'firstHeading'}).text

        content_div = soup.find('div', {'id': 'mw-content-text'})

        paragraphs = content_div.find_all('p')

        full_article_text = '\n\n'.join([p.text.strip() for p in paragraphs if p.text.strip()])

        #print(f"Title: {title}")
        #print(f"Full Article:\n{full_article_text}")
        return f"{title}" + " " + f"{full_article_text}"

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the Wikipedia article: {e}")
        return None

# company_name_or_ticker = input("Enter the company name or ticker code: ").replace(' ', '_')
# get_wikipedia_article(company_name_or_ticker)




In [141]:
def wikipedia_ner_rel_pipeline(ticker):
    company_name = get_company_name(ticker)
    article = get_wikipedia_article(company_name)
    if article is None:
        return None
    res = get_request({
    "content": article,
    "lang": "en",
    "format": "plain text with title",
    })
    ents, rels = None, None
    ents = extract_entites(res)
    rels = extract_relationships(res)
    pd.options.display.max_columns = None
    pd.set_option('display.width', 3000)
    
    return (ents, rels)

def sec_10k_ner_rel_pipeline(ticker):
    dbpath = 'data/ecmdatabase.db'
    con = sqlite3.connect(f"file:{dbpath}?mode=ro", uri=True)
    with con:
        result = con.execute(f"SELECT * from companies WHERE stock_symbol = '{ticker}';")
        records = result.fetchall()
        if records == []:
            print(f"no records of company ticker {ticker} found in database.")
            return (None,None)
        company_name = records[0][1]
        item1 = records[0][2].replace('\n', '')
        item7 = records[0][3].replace('\n', '')
    item1_res = get_request({
    "content": item1,
    "lang": "en",
    "format": "plain text",
    })
    item7_res = get_request({
    "content": item7,
    "lang": "en",
    "format": "plain text",
    })
    item1_ents, item1_rels = extract_entites(item1_res), extract_relationships(item1_res)
    item7_ents, item7_rels = extract_entites(item7_res), extract_relationships(item7_res)

    ents = pd.concat([item1_ents,item7_ents],axis = 0)
    rels = pd.concat([item1_rels,item7_rels],axis = 0)

    return (ents,rels)

    

    


In [125]:
ents_rels = wikipedia_ner_rel_pipeline("AEP")


Found Company: American Electric Power Company, Inc.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  salient_ents["Labels"] = None


In [173]:
aep_ents = ents_rels[0]
aep_ents.head()

Unnamed: 0,name,salience,Labels
0,American Electric Power,0.737737,company
1,Algonquin Power,0.430244,company
2,AEP Texas Central,0.400038,company
95,American Electric Power,0.651433,company
150,American Electric Power Company,0.651433,company


In [175]:
aep_rels = ents_rels[1]
aep_rels.head()

Unnamed: 0,entity,property,value,evidence
0,Mary Fallin,position held,Governor,"In April 2014, Oklahoma Governor Mary Fallin s..."
1,Appalachian Power Company,organization locations,Kingsport,AEP considers Appalachian Power to be the oper...
2,Kingsport Power Company,parent organization,American Electric Power Company,"Until the 21st century, AEP's operations in Te..."
3,American Electric Power,acquired by,American Electric Power Company,AEP Texas was formed from a merger of various ...
4,American Electric Power,headquarters,Ashland,Kentucky Power headquarters is in Ashland and ...


In [143]:
(aapl_sec_ents, aapl_sec_rels) = sec_10k_ner_rel_pipeline('AAPL')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  salient_ents["Labels"] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  salient_ents["Labels"] = None


In [168]:
aapl_sec_ents.head() # more entities extracted compared to wikipedia articles.

Unnamed: 0,name,salience,Labels
0,Apple,0.902315,company
1,MacBook Air,0.895517,product
2,MacBook Pro,0.867708,product
3,Apple TV,0.840443,product
4,AirPods Max,0.831631,


In [147]:
aapl_sec_rels #can't extract relationships from 10-K forms.

In [149]:
(aapl_wiki_ents, aapl_wiki_rels) = wikipedia_ner_rel_pipeline("AAPL")

Found Company: Apple Inc.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  salient_ents["Labels"] = None


In [167]:
aapl_wiki_ents.head() # fewer entities extracted

Unnamed: 0,name,salience,Labels
0,Jef Raskin,0.957273,person
1,Ronald Wayne,0.945415,person
2,Apple,0.924846,company
3,Steve Jobs,0.899881,person
4,Jean-Louis Gassée,0.882259,person


In [164]:
aapl_wiki_rels.head()

Unnamed: 0,entity,property,value,evidence
0,Lens Technology,customers,Apple,"Apple announced on August 16, 2016, that Lens ..."
1,Apple,competitors,Microsoft,"[119] In May 2010, Apple's market cap exceeded..."
2,Apple,product type,mobile handset,"[115] By October 2008, Apple was the third-lar..."
3,Apple Computer Company,founding date,1976-04-01,"Apple Computer Company was founded on April 1,..."
4,World Wide Fund for Nature,partnership,Apple,"On April 14, 2016, Apple and the World Wide Fu..."


In [154]:
aapl_wiki_rels["property"].unique()

array(['customers', 'competitors', 'product type', 'founding date',
       'partnership', 'work relationship', 'organization locations',
       'headquarters', 'number of employees', 'employee or member of',
       'position held', 'acquired by', 'founded by', 'date of death',
       'suppliers', 'political affiliation', 'contributed to', 'brands',
       'industry', 'parent organization', 'yearly revenue', 'subsidiary',
       'cause of death', 'skilled at', 'interested in',
       'chief executive officer', 'all names', 'gender'], dtype=object)

In [172]:
aapl_wiki_rels[aapl_wiki_rels["property"] == "product type"]

Unnamed: 0,entity,property,value,evidence
2,Apple,product type,mobile handset,"[115] By October 2008, Apple was the third-lar..."
6,Lens Technology,product type,glass,"Apple announced on August 16, 2016, that Lens ..."
138,Apple Inc.,product type,solar energy,"Apple Energy, LLC is a wholly-owned subsidiary..."
147,"Apple Energy, LLC",product type,solar energy,"Apple Energy, LLC is a wholly-owned subsidiary..."
160,Emagic,product type,computer,The purchase of Emagic made Apple the first co...
165,Apple,product type,iPad accessories,"[1] Apple sells several iPad accessories, incl..."


In [170]:
aapl_wiki_rels[aapl_wiki_rels["property"] == "organization locations"]

Unnamed: 0,entity,property,value,evidence
7,Zhengzhou Technology Park,organization locations,Zhengzhou,[384] Zhengzhou Technology Park alone employs ...
26,Apple,organization locations,Cupertino,Apple Inc. is an American multinational corpor...
29,Apple,organization locations,Cork,[274] Apple's international sales and distribu...
126,Apple,organization locations,Austin,"Apple has two campuses near Austin, Texas: a 2..."
170,Apple,organization locations,Texas,"Apple has two campuses near Austin, Texas: a 2..."
172,Apple,organization locations,California,Apple Inc. is an American multinational corpor...


In [152]:
aapl_wiki_rels[aapl_wiki_rels["property"] == "headquarters"]

Unnamed: 0,entity,property,value,evidence
8,Zhengzhou Technology Park,headquarters,Zhengzhou,[384] Zhengzhou Technology Park alone employs ...
27,Apple,headquarters,Cupertino,Apple Inc. is an American multinational corpor...
173,Apple,headquarters,California,Apple Inc. is an American multinational corpor...


In [153]:
aapl_wiki_rels[aapl_wiki_rels["property"] == "industry"]

Unnamed: 0,entity,property,value,evidence
99,Apple,industry,accounting,"In the late 1980s, Apple was a pioneer of an a..."
111,Apple,industry,technology,Apple Inc. is an American multinational corpor...


In [155]:
aapl_wiki_rels[aapl_wiki_rels["property"] == "yearly revenue"]

Unnamed: 0,entity,property,value,evidence
108,Apple,yearly revenue,"383,290,000,000 USD",Apple is the largest technology company by rev...


In [165]:
aapl_wiki_rels[aapl_wiki_rels["property"] == "suppliers"]

Unnamed: 0,entity,property,value,evidence
37,Apple,suppliers,Lens Technology,"Apple announced on August 16, 2016, that Lens ..."
133,Apple,suppliers,NeXTSTEP,"[69] Only weeks away from bankruptcy,[70] Appl..."
156,Apple,suppliers,Lens Technology,"Apple announced on August 16, 2016, that Lens ..."


In [166]:
aapl_wiki_rels[aapl_wiki_rels["property"] == "competitors"]

Unnamed: 0,entity,property,value,evidence
1,Apple,competitors,Microsoft,"[119] In May 2010, Apple's market cap exceeded..."
47,Microsoft,competitors,Apple,"[119] In May 2010, Apple's market cap exceeded..."
50,VisiCalc,competitors,Atari,[26] VisiCalc created a business market for th...
51,VisiCalc,competitors,CBM,[26] VisiCalc created a business market for th...
88,Atari,competitors,VisiCalc,[26] VisiCalc created a business market for th...
92,Gassée,competitors,Be Inc.,[52] Gassée left the company later that year t...
100,CBM,competitors,VisiCalc,[26] VisiCalc created a business market for th...
161,Be Inc.,competitors,Gassée,[52] Gassée left the company later that year t...


## Create JSON Node and Relationship Objects

In [4]:
def create_json_schema():
    json_export = {
        "nodes":{
            "Company":[
                # company nodes here
            ],
            "Country":[
                # country nodes here
            ],
            "Industry":[
                # industry nodes here
            ],
            "Region":[
                # region nodes here
            ]},
        "relationships":{
            "PARTNERS_WITH":[
                # (COMPANY cid1)-[:PARTNERS_WITH]->(COMPANY cid2)
            ],
            "COMPETES_WITH":[
                # (COMPANY cid1)-[:COMPETES_WITH]->(COMPANY cid2)
            ],
            "SUBSIDIARY_OF":[
                # (COMPANY cid1)-[:SUBSIDIARY_OF]->(COMPANY cid2)
            ],

            "HEADQUARTERS_IN":[
                # (COMPANY cid)-[:HEADQUARTERS_IN]->(COUNTRY ctyid)
            ],

            "OPERATES_IN_COUNTRY":[
                # (COMPANY cid)-[:OPERATES_IN_COUNTRY]->(COUNTRY ctyid)
            ],

            "IS_INVOLVED_IN":[
                # (COMPANY cid)-[:IS_INVOLVED_IN]->(INDUSTRY iid)
            ],

            "IS_IN":[
                # (COUNTRY ctyid)-[:IS_IN]->(REGION rid)
            ],

            "OPERATES_IN_REGION":[
                # (COMPANY cid)-[:OPERATES_IN_REGION]->(REGION rid)
            ]
    }}


    return json_export



In [None]:
def create_company_node(name,ticker_code = None,founded_year = None):
    c_node = {}

    c_node["name"] = name
    c_node["ticker_code"] = ticker_code
    c_node["founded_year"] = founded_year

    return c_node

In [None]:
dbpath = 'data/ecmdatabase.db'
con = sqlite3.connect(f"file:{dbpath}?mode=ro", uri=True)
with con:
    result = con.execute("SELECT name, stock_code from companies;")
    records = result.fetchall()
    for record in records:
        company_name = records[0][1]
        tsla_item1 = records[0][2]

## Spacy Node and Relationship Objects

In [None]:


# Load spaCy English language model
nlp = spacy.load("en_core_web_sm")

doc = nlp(tsla_item1)

# Extract entities and relations
nodes = defaultdict(set)  # Use a set to avoid duplicates
edges = []

# Define a function to identify non-company keywords
def is_non_company(entity_text):
    # Keywords or patterns that indicate the entity is not a company
    non_company_keywords = [
        'program', 'act', 'regulation', 'department', 'agency', 'council',
        'commission', 'service', 'policy', 'initiative', 'standard', 
        'incentive', 'college', 'school', 'university', 'authority', 'board',
        'order', 'capital', 'vehicle', 'development', 'internship', 'apprenticeship',
        'system', 'training', 'product', 'directive', 'committee', 'resource',
        'partnership', 'technology', 'platform'
    ]
    
    # If the entity contains any of these keywords, it is not a company
    return any(re.search(r'\b' + keyword + r'\b', entity_text.lower()) for keyword in non_company_keywords)

# Improved function to categorize and extract entities
def categorize_entities(entity):
    if entity.label_ == 'ORG':
        # Check if it's not a company
        if is_non_company(entity.text):
            return  # Exclude if it's not a company
        nodes['Company'].add(entity.text)
    elif entity.label_ == 'PRODUCT':
        nodes['Product'].add(entity.text)
    elif entity.label_ == 'GPE':
        nodes['Country'].add(entity.text)
    elif entity.label_ in ['NORP', 'INDUSTRY']:
        nodes['Industry'].add(entity.text)

# Extract named entities
for ent in doc.ents:
    categorize_entities(ent)

# Synonyms to match different verbs for edge detection
target_phrases = {
    'SUPPLIES': ["provide products", "manufactures products", "delivers goods", "offers items", "distributes products"],
    'LOCATED_IN': ["headquartered in", "based in", "located in", "situated in"],
    'RIVALS_WITH': ["competes with", "is a competitor of", "challenges", "rivals"],
    'ALLIES_WITH': ["partners with", "collaborates with", "is allied with", "cooperates with"]
}

# Extract relationships using semantic similarity
for sent in doc.sents:
    sent_vector = sent.vector  # Get the vector of the current sentence
    for relationship, phrases in target_phrases.items():
        for phrase in phrases:
            phrase_vector = nlp(phrase).vector  # Get the vector for the target phrase
            similarity = sent.similarity(nlp(phrase))  # Compute similarity
            if similarity > 0.7:  # Threshold for determining a match
                # Check if entities in the sentence can be matched to the known nodes
                for entity in sent.ents:
                    if entity.label_ == "ORG" and entity.text in nodes["Company"]:
                        target_entity = None
                        if relationship == "SUPPLIES":
                            target_entity = [ent.text for ent in sent.ents if ent.label_ == "PRODUCT"]
                        elif relationship == "LOCATED_IN":
                            target_entity = [ent.text for ent in sent.ents if ent.label_ == "GPE"]
                        elif relationship in ["RIVALS_WITH", "ALLIES_WITH"]:
                            target_entity = [ent.text for ent in sent.ents if ent.label_ == "ORG" and ent.text != entity.text]
                        
                        if target_entity:
                            for target in target_entity:
                                edges.append((entity.text, relationship, target))



# Convert nodes to list to remove duplicates and maintain order
nodes = {k: list(v) for k, v in nodes.items()}

# Output nodes and edges
print("Nodes:")
for node_type, node_list in nodes.items():
    print(f"{node_type}: {node_list}")

print("\nEdges:")
for edge in edges:
    print(f"{edge[0]} -[{edge[1]}]-> {edge[2]}")


In [None]:
for ent in doc.ents:
    print(ent.text, ent.label_)

## NER model evaluation


In [131]:
def count(name, label):
    name_counts = {}
    for n, l in zip(name, label):
        if n in name_counts:
            name_counts[n] += 1
        else:
            name_counts[n] = 1
    return name_counts

def transfer_dic(name, label):
    name_label_counts = {}
    for n, l in zip(name, label):
        key = (n, l) 
        if key in name_label_counts:
            name_label_counts[key] += 1
        else:
            name_label_counts[key] = 1
    return name_label_counts

# output first the total appearing counts and the respective counts of each label
def check_name(name, label, target_name):
    if target_name not in name:
        return "no target_name"
    name_counts=count(name, label)
    name_label_counts=transfer_dic(name, label)
    print("total counts "+ str(name_counts[target_name]))
    for i in name_label_counts:
        if i[0]==target_name:
            print((i[1],name_label_counts[i]))

# output a dictionary containg the key of name and the label with highest appearing ratio. {name:(label,ratio)}
def get_NER(name, label):
    res={}
    name_counts=count(name, label)
    name_label_counts=transfer_dic(name, label)
    for i in list(set(name)):
        for j in name_label_counts:
            if j[0]==i:
                ratio=name_label_counts[j]/name_counts[i]
                if j[0] in res:
                    if ratio>res[j[0]][1]:
                        res[j[0]]=(j[1],ratio)
                else:
                    res[j[0]]=(j[1],ratio)
    return res

def highest_label(name, label):
    res={}
    NER=get_NER(name, label)
    for i in NER:
        if NER[i][0] in res:
            res[NER[i][0]].append(i)
        else:
            res[NER[i][0]]=[i,]
    return res



print(get_NER(Spacy_name,Spacy_label))
print(get_NER(nltk_name,nltk_label))





{'Vehicle': ('PERSON', 1.0), 'GWh': ('ORGANIZATION', 1.0), 'Services Automotive': ('ORGANIZATION', 1.0), 'Marketing Historically': ('PERSON', 1.0), 'Tesla Semi': ('ORGANIZATION', 1.0), 'Model S': ('PERSON', 1.0), 'Solar Roof': ('PERSON', 1.0), 'Design': ('GPE', 1.0), 'Cybertruck': ('ORGANIZATION', 0.6666666666666666), 'FSD': ('ORGANIZATION', 1.0), 'Model': ('PERSON', 1.0), 'Automotive Battery': ('ORGANIZATION', 1.0), 'Powerwall': ('GPE', 0.5), 'Artificial': ('ORGANIZATION', 1.0), 'BUSINESS': ('ORGANIZATION', 1.0), 'Mobile Service': ('PERSON', 1.0), 'Energy': ('PERSON', 1.0), 'Megapack': ('GPE', 1.0), 'Storage Energy Storage': ('PERSON', 1.0), 'Generation': ('ORGANIZATION', 1.0), 'Energy Offerings': ('PERSON', 1.0), 'Infotainment': ('ORGANIZATION', 1.0), 'Model X': ('PERSON', 1.0), 'Solar': ('PERSON', 1.0), 'Y': ('GPE', 1.0), 'FSD Computer': ('ORGANIZATION', 1.0), 'SUV': ('ORGANIZATION', 1.0), 'PPA': ('ORGANIZATION', 1.0), 'Energy Systems': ('ORGANIZATION', 1.0), 'Supercharger': ('GPE',