## Subtask A: Entity and Relationship Extraction


In [10]:
#pip install transformers
#pip install spacy
#pip install nltk
#pip install torch
#pip install requests beautifulsoup4
#pip install yahooquery

SyntaxError: invalid syntax (449124395.py, line 6)

In [7]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ----------------------------------- --- 11.8/12.8 MB 73.5 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 61.8 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import sqlite3
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import spacy 
import nltk
import requests 
import torch
import pandas as pd
from bs4 import BeautifulSoup
pd.set_option("display.max_rows", 200)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dbpath = 'data/ecmdatabase.db'
con = sqlite3.connect(f"file:{dbpath}?mode=ro", uri=True)
with con:
    result = con.execute("SELECT item1 from companies WHERE stock_symbol = 'TSLA';")
    tsla_item1 = result.fetchall()[0][0]

In [4]:
tsla_item1



In [5]:
tsla_item1 = tsla_item1.replace('\n', '')
tsla_item1



## Data Exploration - Text Analysis

### Frequency Analysis

In [14]:
## FREQUENCY ANALYSIS
from collections import Counter
# load spacy model
nlp = spacy.load('en_core_web_sm')
nlp.disable_pipe("parser")
nlp.enable_pipe("senter")
# load data
doc = nlp(tsla_item1)
words = [token.text for token in doc if not token.is_stop and not token.is_punct]

print(Counter(words).most_common(20))

[('energy', 75), ('vehicles', 68), ('systems', 39), ('vehicle', 34), ('products', 33), ('solar', 33), ('storage', 32), ('customers', 32), ('Tesla', 32), ('U.S.', 25), ('electric', 24), ('including', 24), ('certain', 23), ('battery', 20), ('driving', 18), ('new', 17), ('offer', 16), ('self', 16), ('markets', 16), ('Energy', 16)]


### TF-IDF

In [41]:
import nltk
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim.models import LdaModel
from collections import Counter

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Step 1: Load and preprocess the text
text_data = [tsla_item1]

# Tokenization using nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def preprocess(text):
    # Tokenization and lowercasing
    tokens = word_tokenize(text.lower())
    # Remove stopwords and non-alphabetic characters
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

# Preprocess the data
processed_text = [preprocess(text) for text in text_data]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [42]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=10)  # Limit to top 10 features for brevity
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_text)
tfidf_keywords = tfidf_vectorizer.get_feature_names_out()

print("Top TF-IDF Keywords:")
print(tfidf_keywords)

Top TF-IDF Keywords:
['also' 'customers' 'energy' 'products' 'solar' 'storage' 'systems'
 'tesla' 'vehicle' 'vehicles']


### LDA

In [43]:
# Prepare data for LDA
tokenized_texts = [preprocess(text).split() for text in text_data]
dictionary = corpora.Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]

# Train LDA model
lda_model = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=15)

# Display topics
print("LDA Topics:")
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")


LDA Topics:
Topic 0: 0.001*"vehicles" + 0.001*"energy" + 0.001*"solar" + 0.001*"vehicle" + 0.001*"storage" + 0.001*"systems" + 0.001*"also" + 0.001*"certain" + 0.001*"customers" + 0.001*"products"
Topic 1: 0.020*"energy" + 0.015*"vehicles" + 0.010*"also" + 0.010*"solar" + 0.009*"storage" + 0.009*"systems" + 0.008*"vehicle" + 0.008*"products" + 0.007*"customers" + 0.007*"tesla"


### Pos tagging

In [44]:
# POS Tagging using spaCy
doc = nlp(tsla_item1)

# Extract POS tags
pos_tags = [(token.text, token.pos_) for token in doc]

print("Part-of-Speech Tags:")
print(pos_tags)


Part-of-Speech Tags:


## Named Entity Recognition

### NLTK

In [15]:
## using nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
 
# Step Two: Load Data
print(len(tsla_item1))

# Step Three: Tokenise, find parts of speech and chunk words 

for sent in nltk.sent_tokenize(tsla_item1):
  for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
     if hasattr(chunk, 'label'):
        print(chunk.label(), ' '.join(c[0] for c in chunk))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker_tab.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


45859
ORGANIZATION BUSINESS
PERSON Mobile Service
GPE Supercharger
ORGANIZATION Services Automotive
GPE Y
GPE S
ORGANIZATION Cybertruck
PERSON Model
ORGANIZATION SUV
PERSON Model
PERSON Model X
ORGANIZATION SUV
PERSON Model
PERSON Model X
ORGANIZATION SUV
ORGANIZATION Cybertruck
ORGANIZATION Tesla Semi
ORGANIZATION FSD
PERSON Energy
ORGANIZATION Generation
PERSON Storage Energy Storage
GPE Powerwall
GPE Megapack
ORGANIZATION GWh
PERSON Solar
PERSON Energy Offerings
ORGANIZATION PPA
PERSON Technology
ORGANIZATION Automotive Battery
GPE Powertrain
PERSON Model S
PERSON Model X
PERSON Cybertruck
ORGANIZATION Tesla Semi
PERSON Vehicle
ORGANIZATION Control
ORGANIZATION Infotainment
GSP Control
ORGANIZATION Artificial
ORGANIZATION FSD Computer
ORGANIZATION FSD
GPE Optimus
PERSON Energy
ORGANIZATION Generation
PERSON Storage Energy Storage
PERSON Solar
ORGANIZATION Energy Systems
PERSON Solar Roof
ORGANIZATION Powerwall
GPE Design
PERSON Energy
ORGANIZATION Generation
PERSON Solar Roof
PERSON

### Spacy

In [16]:
## Using spacy

# load spacy model
nlp = spacy.load('en_core_web_sm')
nlp.disable_pipe("parser")
nlp.enable_pipe("senter")
# load data
doc = nlp(tsla_item1)

# collect unique labels
unique_labels = {}
for ent in doc.ents:
    if ent.label_ not in unique_labels.keys():
        unique_labels[ent.label_] = []
    unique_labels[ent.label_].append((ent.text, ent.start_char, ent.end_char))


- PERSON:      People, including fictional.
- NORP:        Nationalities or religious or political groups.
- FAC:         Buildings, airports, highways, bridges, etc.
- ORG:         Companies, agencies, institutions, etc.
- GPE:         Countries, cities, states.
- LOC:         Non-GPE locations, mountain ranges, bodies of water.
- PRODUCT:     Objects, vehicles, foods, etc. (Not services.)
- EVENT:       Named hurricanes, battles, wars, sports events, etc.
- WORK_OF_ART: Titles of books, songs, etc.
- LAW:         Named documents made into laws.
- LANGUAGE:    Any named language.
- DATE:        Absolute or relative dates or periods.
- TIME:        Times smaller than a day.
- PERCENT:     Percentage, including ”%“.
- MONEY:       Monetary values, including unit.
- QUANTITY:    Measurements, as of weight or distance.
- ORDINAL:     “first”, “second”, etc.
- CARDINAL:    Numerals that do not fall under another type.

In [17]:
unique_labels.keys()

dict_keys(['ORG', 'PERSON', 'CARDINAL', 'MONEY', 'PRODUCT', 'NORP', 'DATE', 'ORDINAL', 'TIME', 'LOC', 'GPE', 'FAC', 'EVENT', 'LAW', 'PERCENT', 'WORK_OF_ART'])

In [18]:
unique_labels['ORG']

[('Mobile Service', 380, 394),
 ('Our Products and Services Automotive', 1942, 1978),
 ('Full Self-Driving (&', 3066, 3086),
 ('Energy Generation and Storage Energy Storage Products Powerwall',
  3162,
  3225),
 ('Solar Energy Offerings', 3925, 3947),
 ('Powertrain Our', 4680, 4694),
 ('Model S', 5321, 5328),
 ('Vehicle Control', 5785, 5800),
 ('Control systems', 5942, 5957),
 ('Self-Driving Development and Artificial Intelligence We', 6250, 6305),
 ('FSD Computer', 6449, 6461),
 ('Autopilot', 6804, 6813),
 ('Tesla', 7312, 7317),
 ('AI', 7652, 7654),
 ('Energy Generation and Storage Energy Storage Products', 7663, 7716),
 ('Solar Energy Systems', 8235, 8255),
 ('Energy Generation and Storage', 9642, 9671),
 ('Megapack', 9945, 9953),
 ('Automotive Direct Sales Our', 10536, 10563),
 ('Tesla', 11052, 11057),
 ('Tesla', 11508, 11513),
 ('Tesla', 11585, 11590),
 ('Tesla', 11737, 11742),
 ('Tesla Superchargers', 11857, 11876),
 ('the North American Charging Standard', 12677, 12713),
 ('NACS'

In [19]:
unique_labels['PRODUCT'] #products

[('the Model 3', 2045, 2056),
 ('Model 3', 2082, 2089),
 ('Model 3', 2277, 2284),
 ('Model S', 2437, 2444),
 ('Solar Roof', 4355, 4365),
 ('Solar Roof', 8275, 8285),
 ('Solar Roof', 10158, 10168),
 ('Solar Roof', 16095, 16105),
 ('Solar Roof', 17592, 17602),
 ('Model 3', 32041, 32048),
 ('Model Y', 32053, 32060)]

In [20]:
unique_labels['LOC'] # locations

[('North America', 16709, 16722),
 ('Europe', 16724, 16730),
 ('Asia', 16735, 16739),
 ('Northern California', 17880, 17899),
 ('Europe', 32867, 32873),
 ('Europe', 41224, 41230)]

In [21]:
unique_labels['FAC'] # facilities / factories

[('Gigafactory Texas', 17963, 17980),
 ('Fremont Factory', 42324, 42339),
 ('Gigafactory Texas', 42361, 42378)]

In [22]:
unique_labels['EVENT'] # events

[('this Annual Report on Form 10-K', 20814, 20845),
 ('this Annual Report on Form 10-K. Energy Storage System Incentives',
  21988,
  22053),
 ('this Annual Report on Form 10-K. Pursuant', 23163, 23204),
 ('this Annual Report on Form 10-K.', 45826, 45858)]

In [26]:
from spacy import displacy
#displacy.render(doc, style="ent")

### Hugging Face Transformers

Because entities can be of different types and not all are equally important in the
context of the natural language text being analyzed, it is quite common for NER
processors to return the following in addition to a list of entities:

 **type**
- Is it a person? Is it a location? Is it an organization? The set of categories will
depend on the specific model used. 
- The bert-base-NER distinguishes four types
of entities: location (LOC), organization (ORG), person (PER), and miscellaneous
(MISC).
 
**salience**
- The relative importance in the text analyzed or, in other words, the entity’s
relevance. 
- Is the entity central to the text (higher score/salience), or is it just
mentioned tangentially (lower score/salience)?

In [24]:
## Using transformers
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
ner_pipe = pipeline("ner", model = model, tokenizer = tokenizer)
for ent in ner_pipe(tsla_item1):
    print(ent)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'entity': 'I-ORG', 'score': 0.5477541, 'index': 74, 'word': 'Service', 'start': 387, 'end': 394}
{'entity': 'B-MISC', 'score': 0.520529, 'index': 84, 'word': 'Des', 'start': 434, 'end': 437}
{'entity': 'I-MISC', 'score': 0.62265056, 'index': 87, 'word': 'Chargers', 'start': 446, 'end': 454}
{'entity': 'B-ORG', 'score': 0.7542444, 'index': 350, 'word': 'Products', 'start': 1946, 'end': 1954}
{'entity': 'I-ORG', 'score': 0.9869141, 'index': 351, 'word': 'and', 'start': 1955, 'end': 1958}
{'entity': 'I-ORG', 'score': 0.9261168, 'index': 352, 'word': 'Services', 'start': 1959, 'end': 1967}
{'entity': 'I-ORG', 'score': 0.8898363, 'index': 353, 'word': 'Auto', 'start': 1968, 'end': 1972}
{'entity': 'I-ORG', 'score': 0.60575587, 'index': 354, 'word': '##mot', 'start': 1972, 'end': 1975}
{'entity': 'B-MISC', 'score': 0.59626424, 'index': 369, 'word': 'Model', 'start': 2049, 'end': 2054}
{'entity': 'I-MISC', 'score': 0.9495684, 'index': 370, 'word': '3', 'start': 2055, 'end': 2056}


## Diffbot API
https://www.diffbot.com/


In [21]:
from getpass import getpass

TOKEN = getpass('Enter token: ')

In [13]:
FIELDS = "entities,facts"
HOST = "nl.diffbot.com"

In [14]:
import json

def get_request(payload):
  res = requests.post("https://{}/v1/?fields={}&token={}".format(HOST, FIELDS, TOKEN), json=payload)
  ret = None
  try:
    ret = res.json()
  except:
    print("Bad response: " + res.text)
    print(res.status_code)
    print(res.headers)
  return ret

In [13]:
res = get_request({
    "content": tsla_item1,
    "lang": "en",
    "format": "plain text with title",
})

print (res)

{'entities': [{'name': 'Tesla Semi', 'diffbotUri': 'https://diffbot.com/entity/EF4g4ohJUPUeLKh-N-rvUyA', 'confidence': 0.9569246, 'salience': 0.8978014, 'isCustom': False, 'allUris': ['http://www.wikidata.org/entity/Q40008974'], 'allTypes': [{'name': 'skill', 'diffbotUri': 'https://diffbot.com/entity/EvfbHngnSNVOh7ZBM5XTywQ'}, {'name': 'product', 'diffbotUri': 'https://diffbot.com/entity/EgSPUye7QPcyQoPylO8biMQ'}, {'name': 'tool', 'diffbotUri': 'https://diffbot.com/entity/EKvoYuTx4P9WT35YUouc0ug'}, {'name': 'vehicle', 'diffbotUri': 'https://diffbot.com/entity/E4vBDPVu3OTq90vd1GhX5mw', 'dbpediaUri': 'http://dbpedia.org/ontology/MeanOfTransportation'}], 'mentions': [{'text': 'Tesla', 'beginOffset': 2922, 'endOffset': 2927, 'confidence': 0.9569246}, {'text': 'Tesla', 'beginOffset': 5361, 'endOffset': 5366, 'confidence': 0.9569246}]}, {'name': 'automotive battery', 'diffbotUri': 'https://diffbot.com/entity/EF6RvyvRnNsW6oAQQjtPb7g', 'confidence': 0.97791916, 'salience': 0.77393895, 'isCusto

### View Entities and Entity Types

In [18]:
def extract_entities(res):
    for ent in res["entities"]:
        if ent["salience"] > 0.5:
            print("Entity Name: " + ent['name'])
            print("Salience: " + str(ent['salience']))
            print("Entity Types:")
            print([ent_type["name"] for ent_type in ent['allTypes']])
            print()

In [15]:
for ent in res["entities"]:
    if ent["salience"] > 0.5:
        print("Entity Name: " + ent['name'])
        print("Salience: " + str(ent['salience']))
        print("Entity Types:")
        print([ent_type["name"] for ent_type in ent['allTypes']])
        print()

Entity Name: Tesla Semi
Salience: 0.8978014
Entity Types:
['skill', 'product', 'tool', 'vehicle']

Entity Name: automotive battery
Salience: 0.77393895
Entity Types:
['skill', 'product', 'tool']

Entity Name: Tesla Model X
Salience: 0.6964315
Entity Types:
['skill', 'product', 'tool', 'vehicle']

Entity Name: solar energy
Salience: 0.6094218
Entity Types:
[]

Entity Name: Tesla Autopilot
Salience: 0.5553389
Entity Types:
['skill', 'field of work', 'technology']

Entity Name: artificial intelligence
Salience: 0.5336387
Entity Types:
['skill', 'field of work', 'technology']

Entity Name: electric vehicle
Salience: 0.5048789
Entity Types:
['skill', 'product', 'tool', 'vehicle']



In [16]:
if "facts" in res:
    df = pd.DataFrame.from_dict(res["facts"])
    pd.options.display.max_columns = None
    pd.set_option('display.width', 1000)
    print(df.head())

Empty DataFrame
Columns: []
Index: []


### Supplementary Source: Wikipedia Article

In [15]:
tesla_wiki = "Tesla, Inc. (/ˈtɛslə/ TESS-lə or /ˈtɛzlə/ TEZ-lə[a]) is an American multinational automotive and clean energy company. Headquartered in Austin, Texas, it designs, manufactures and sells battery electric vehicles (BEVs), stationary battery energy storage devices from home to grid-scale, solar panels and solar shingles, and related products and services. \
    Tesla was incorporated in July 2003 by Martin Eberhard and Marc Tarpenning as Tesla Motors. Its name is a tribute to inventor and electrical engineer Nikola Tesla. In February 2004, Elon Musk joined as Tesla's largest shareholder; in 2008, he was named chief executive officer. In 2008, the company began production of its first car model, the Roadster sports car, followed by the Model S sedan in 2012, the Model X SUV in 2015, the Model 3 sedan in 2017, the Model Y crossover in 2020, the Tesla Semi truck in 2022 and the Cybertruck pickup truck in 2023. The Model 3 is the all-time best-selling plug-in electric car worldwide, and in June 2021 became the first electric car to sell 1 million units globally.[6] In 2023, the Model Y was the best-selling vehicle, of any kind, globally.[7][8][3] \
        Tesla is one of the world's most valuable companies in terms of market capitalization. In October 2021, Tesla temporarily became a trillion-dollar company, the seventh U.S. company to do so. In 2023, the company led the battery electric vehicle market, with 19.9% share. Also in 2023, the company was ranked 69th in the Forbes Global 2000.[9] As of March 2024, it is the world's most valuable automaker. Tesla has been the subject of lawsuits, government scrutiny, and journalistic criticism, stemming from allegations of multiple cases of whistleblower retaliation, worker rights violations such as sexual harassment and anti-union activities, safety defects leadings to dozens of recalls, the lack of a public relations department, and controversial statements from Musk including overpromising on the company's driving assist technology and product release timelines."

In [22]:
res = get_request({
    "content": tesla_wiki,
    "lang": "en",
    "format": "plain text with title",
})

In [23]:
extract_entities(res)

Entity Name: Marc Tarpenning
Salience: 0.9887775
Entity Types:
['person']

Entity Name: Martin Eberhard
Salience: 0.9880303
Entity Types:
['person']

Entity Name: Tesla
Salience: 0.97538745
Entity Types:
['organization']

Entity Name: battery electric vehicle
Salience: 0.8341532
Entity Types:
['skill', 'product', 'tool', 'vehicle']

Entity Name: Elon Musk
Salience: 0.75881004
Entity Types:
['person']

Entity Name: plug-in electric vehicle
Salience: 0.7428088
Entity Types:
['skill', 'product', 'tool', 'vehicle']

Entity Name: Tesla Model Y
Salience: 0.7421379
Entity Types:
['skill', 'product', 'tool', 'vehicle']

Entity Name: pickup truck
Salience: 0.741191
Entity Types:
['skill', 'product', 'tool', 'vehicle']

Entity Name: Tesla Model X
Salience: 0.7204369
Entity Types:
['skill', 'product', 'tool', 'vehicle']

Entity Name: market capitalisation
Salience: 0.71955776
Entity Types:
[]

Entity Name: multinational corporation
Salience: 0.7153199
Entity Types:
[]

Entity Name: Tesla Model S


In [24]:
if "facts" in res:
    df = pd.DataFrame.from_dict(res["facts"])
    pd.options.display.max_columns = None
    pd.set_option('display.width', 3000)
    print(df.head())

                                       humanReadable                                             entity                                           property                                              value                                         qualifiers  confidence                                           evidence
0    [Martin Eberhard] employee or member of [Tesla]  {'name': 'Martin Eberhard', 'diffbotUri': 'htt...  {'name': 'employee or member of', 'diffbotUri'...  {'name': 'Tesla', 'diffbotUri': 'https://diffb...  [{'property': {'name': 'is current'}, 'value':...    0.916384  [{'passage': 'Tesla was incorporated in July 2...
1        [Tesla] chief executive officer [Elon Musk]  {'name': 'Tesla', 'diffbotUri': 'https://diffb...  {'name': 'chief executive officer', 'diffbotUr...  {'name': 'Elon Musk', 'diffbotUri': 'https://d...  [{'property': {'name': 'is not current'}, 'val...    0.943741  [{'passage': 'In February 2004, Elon Musk join...
2        [Tesla] chief executive officer

### Creating a Entity-Relationship Extraction Pipeline for Wikipedia Articles

In [25]:
from yahooquery import Ticker

def get_company_name(ticker):
    try:
        ticker_info = Ticker(ticker)
        company_name = ticker_info.quote_type[ticker]['longName']
        print(f"Found Company: {company_name}")
        return company_name
    except Exception as e:
        print(f"Error fetching company name for ticker {ticker}: {e}")
        return ticker

get_company_name("AEP")


Found Company: American Electric Power Company, Inc.


'American Electric Power Company, Inc.'

In [30]:
import requests
from bs4 import BeautifulSoup

def get_wikipedia_article(company_name_or_ticker):
    search_url = f"https://en.wikipedia.org/wiki/{company_name_or_ticker}"
    
    try:
        response = requests.get(search_url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        title = soup.find('h1', {'id': 'firstHeading'}).text

        content_div = soup.find('div', {'id': 'mw-content-text'})

        paragraphs = content_div.find_all('p')

        full_article_text = '\n\n'.join([p.text.strip() for p in paragraphs if p.text.strip()])

        #print(f"Title: {title}")
        #print(f"Full Article:\n{full_article_text}")
        return f"{title}" + " " + f"{full_article_text}"

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the Wikipedia article: {e}")

# company_name_or_ticker = input("Enter the company name or ticker code: ").replace(' ', '_')
# get_wikipedia_article(company_name_or_ticker)




In [35]:
def wikipedia_ner_rel_pipeline(ticker):
    company_name = get_company_name(ticker)
    article = get_wikipedia_article(company_name)
    res = get_request({
    "content": article,
    "lang": "en",
    "format": "plain text with title",
    })
    ents, rels = None, None
    if "entities" in res:
        ents = pd.DataFrame.from_dict(res["entities"])
    if "facts" in res:
        rels = pd.DataFrame.from_dict(res["facts"])
    pd.options.display.max_columns = None
    pd.set_option('display.width', 3000)
    
    return (ents, rels)

    

    


In [36]:
ents_rels = wikipedia_ner_rel_pipeline("AEP")


Found Company: American Electric Power Company, Inc.


In [37]:
print(ents_rels[0])
print(ents_rels[1])



                          name                                         diffbotUri  confidence  salience  isCustom                                            allUris                                           allTypes                                           mentions                                           location
0      American Electric Power  https://diffbot.com/entity/EXCVF5smZMtKtxEe8vJ...    0.999950  0.737737     False           [http://www.wikidata.org/entity/Q464092]  [{'name': 'organization', 'diffbotUri': 'https...  [{'text': 'American Electric Power Company, In...  {'latitude': 39.98072, 'longitude': -82.98559,...
1              Algonquin Power  https://diffbot.com/entity/EFnq_KqcwP0m4Aw8JWu...    0.999970  0.430244     False          [http://www.wikidata.org/entity/Q4724337]  [{'name': 'organization', 'diffbotUri': 'https...  [{'text': 'Liberty Utilities', 'beginOffset': ...  {'latitude': 43.409233, 'longitude': -79.65146...
2            AEP Texas Central  https://diffbo

## testing

In [6]:
import spacy
from collections import defaultdict
from nltk.corpus import wordnet
import re

# Load spaCy English language model
nlp = spacy.load("en_core_web_sm")

doc = nlp(tsla_item1)

# Extract entities and relations
nodes = defaultdict(set)  # Use a set to avoid duplicates
edges = []

# Define a function to identify non-company keywords
def is_non_company(entity_text):
    # Keywords or patterns that indicate the entity is not a company
    non_company_keywords = [
        'program', 'act', 'regulation', 'department', 'agency', 'council',
        'commission', 'service', 'policy', 'initiative', 'standard', 
        'incentive', 'college', 'school', 'university', 'authority', 'board',
        'order', 'capital', 'vehicle', 'development', 'internship', 'apprenticeship',
        'system', 'training', 'product', 'directive', 'committee', 'resource',
        'partnership', 'technology', 'platform'
    ]
    
    # If the entity contains any of these keywords, it is not a company
    return any(re.search(r'\b' + keyword + r'\b', entity_text.lower()) for keyword in non_company_keywords)

# Improved function to categorize and extract entities
def categorize_entities(entity):
    if entity.label_ == 'ORG':
        # Check if it's not a company
        if is_non_company(entity.text):
            return  # Exclude if it's not a company
        nodes['Company'].add(entity.text)
    elif entity.label_ == 'PRODUCT':
        nodes['Product'].add(entity.text)
    elif entity.label_ == 'GPE':
        nodes['Country'].add(entity.text)
    elif entity.label_ in ['NORP', 'INDUSTRY']:
        nodes['Industry'].add(entity.text)

# Extract named entities
for ent in doc.ents:
    categorize_entities(ent)

# Synonyms to match different verbs for edge detection
target_phrases = {
    'SUPPLIES': ["provide products", "manufactures products", "delivers goods", "offers items", "distributes products"],
    'LOCATED_IN': ["headquartered in", "based in", "located in", "situated in"],
    'RIVALS_WITH': ["competes with", "is a competitor of", "challenges", "rivals"],
    'ALLIES_WITH': ["partners with", "collaborates with", "is allied with", "cooperates with"]
}

# Extract relationships using semantic similarity
for sent in doc.sents:
    sent_vector = sent.vector  # Get the vector of the current sentence
    for relationship, phrases in target_phrases.items():
        for phrase in phrases:
            phrase_vector = nlp(phrase).vector  # Get the vector for the target phrase
            similarity = sent.similarity(nlp(phrase))  # Compute similarity
            if similarity > 0.7:  # Threshold for determining a match
                # Check if entities in the sentence can be matched to the known nodes
                for entity in sent.ents:
                    if entity.label_ == "ORG" and entity.text in nodes["Company"]:
                        target_entity = None
                        if relationship == "SUPPLIES":
                            target_entity = [ent.text for ent in sent.ents if ent.label_ == "PRODUCT"]
                        elif relationship == "LOCATED_IN":
                            target_entity = [ent.text for ent in sent.ents if ent.label_ == "GPE"]
                        elif relationship in ["RIVALS_WITH", "ALLIES_WITH"]:
                            target_entity = [ent.text for ent in sent.ents if ent.label_ == "ORG" and ent.text != entity.text]
                        
                        if target_entity:
                            for target in target_entity:
                                edges.append((entity.text, relationship, target))



# Convert nodes to list to remove duplicates and maintain order
nodes = {k: list(v) for k, v in nodes.items()}

# Output nodes and edges
print("Nodes:")
for node_type, node_list in nodes.items():
    print(f"{node_type}: {node_list}")

print("\nEdges:")
for edge in edges:
    print(f"{edge[0]} -[{edge[1]}]-> {edge[2]}")


  similarity = sent.similarity(nlp(phrase))  # Compute similarity


Nodes:
Company: ['ECE markets &#8221;)', 'Energy Generation and Storage Energy Storage Products', 'ir.tesla.com', 'ESG', 'Energy Generation and Storage Energy Storage Products Powerwall', 'Technician Trainees', 'European Defence Ministries', 'Automotive Regulatory Credits', 'FSD Computer', 'Internships &#8211', 'Our Products and Services Automotive', 'Control systems', 'Tesla', 'the &#8220;EPA&#8221', 'SEC', 'Megapack', 'Solar Energy Systems', 'AI', 'Full Self-Driving (&', 'Self-Driving Vehicles', 'SAE International', 'the National Highway Traffic Safety Administration', 'Autopilot', 'ECE', 'Battery Safety and Testing Our', ': &#8226', 'Powertrain Our', 'the U.S.', 'the Internal Revenue Code', 'FMVSS', 'Governmental Programs', 'Automotive Direct Sales Our', 'START', 'Model S', 'Automotive Purchase Financing and Leases', 'NACS', 'Solar Energy Offerings', 'Tesla Superchargers', 'the Organization for Economic Co-operation and', 'IRC', 'Significant Accounting Policies', 'FSD', 'Financial S