# Named-entity recognition with spaCy
Below were the experiment to obtain the company names from tweets using NER, then to be mapped to the stock symbols using a dictionary.

The results with spaCy was not good enough to be used in the project, as the models were not specifically trained on financial data. 

The alternative was to use transformers or LLMs, which were not used in the project due to the time constraint and complexity to fine-tune the models with hand-labeled data.

In [1]:
import spacy
from spacy import displacy
import pandas as pd
from collections import Counter

In [2]:
nlp = spacy.load('en_core_web_trf')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
BLACKLIST = ['ev', 'covid', 'etf', 'nyse', 'sec', 'spac', 'fda',
             'fed', 'treasury', 'eu', 'cnbc', 'faq', 'company']

def get_orgs(text):
    # process the text with our SpaCy model to get named entities
    doc = nlp(text)
    # initialize list to store identified organizations
    org_list = []
    for entity in doc.ents:
        # here we modify the original code to check that entity text is not equal to one of our 'blacklisted' organizations
        # (we also add .lower() to lowercase the text, this allows us to match both 'nyse' and 'NYSE' with just 'nyse')
        if entity.label_ == 'ORG' and entity.text.lower() not in BLACKLIST:
            org_list.append(entity.text)
    # if organization is identified more than once it will appear multiple times in list
    # we use set() to remove duplicates then convert back to list
    org_list = list(set(org_list))
    return org_list

In [4]:
df = pd.read_csv('../data/jimcramer_tweets_2022-03-20-2023-04-30.csv')
old_length = len(df)
df = df.dropna(subset=['content'])
old_length - len(df)

4

In [5]:
import re

def clean_content(text):
    if not isinstance(text, str):
        print(text)
        return text
    # Remove links
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    # Remove hashtags
    text = re.sub(r"\#\w+", "", text)
    # Remove mentions
    text = re.sub(r"\@\w+", "", text)
    # Remove emojis and other non-ascii characters
    text = text.encode("ascii", "ignore").decode("ascii")
    # Remove newline and tab characters
    text = text.replace("\n", " ").replace("\t", " ")
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text


df["content"] = df["content"].apply(clean_content)


In [6]:
df['organizations'] = df['content'].apply(get_orgs)
df.head()

Unnamed: 0,time,content,comments,retweets,quotes,hearts,organizations
0,2023-04-22 15:00:00,"Houston, TX! Come spend the afternoon with Lis...",87,9,2,234,[]
1,2023-04-21 23:00:00,"I heard Houston loves Mezcal, so Lisa and I ar...",65,10,1,154,[]
2,2023-04-21 17:00:00,"Houston, TX! I'm heading to the Bayou City thi...",39,4,3,94,[]
3,2023-04-21 10:56:00,Problematic for Club Name Ford.. hope pinned o...,56,8,5,83,[Ford]
4,2023-04-21 08:00:00,As a veteran of these wars i totally agree,33,2,1,48,[]


In [21]:
# get rows containing "morgan" in content column
df[df['content'].str.contains("Schwab")]

Unnamed: 0,time,content,comments,retweets,quotes,hearts,organizations
28,2023-04-18 07:49:00,Schwab's conference call was amazing. Just rel...,50,16,4,185,[MS]
29,2023-04-17 15:12:00,Watch Schwab. The bears need to break this one...,135,33,28,401,[]
32,2023-04-17 13:17:00,Does Schwab control this market today? Roblox??,59,10,3,157,[]
71,2023-04-05 09:45:00,What's new besides First Republic and Schwab? ...,177,16,4,294,[First Republic and]
200,2023-03-14 17:35:00,"NIce insider buy at Schwab... 50,000 shares by...",606,100,93,1552,[]
205,2023-03-14 09:42:00,Schwab's defense seemed logical and i think th...,69,13,4,137,[Citigroup]


In [7]:
# merge organizations column into one big list
orgs = df['organizations'].to_list()
orgs = [org for sublist in orgs for org in sublist]
orgs[:10]
org_freq = Counter(orgs)
org_freq.most_common(10)

[('Apple', 34),
 ('Nvidia', 30),
 ('Disney', 26),
 ('Gamestop', 25),
 ('AMD', 21),
 ('Bed Bath', 20),
 ('Amazon', 18),
 ('CNBC Investing Club', 14),
 ('Twitter', 14),
 ('AMC', 14)]

In [22]:
# merge organizations column into one big list
orgs = df['organizations'].to_list()
orgs = [org for sublist in orgs for org in sublist]
orgs[:10]


['TX',
 'Smith Street',
 'the Lone Star State',
 'TX',
 'Fsforo',
 'Ford',
 'CSX',
 'ATT',
 'IBM',
 'Cramericans & Mezcal']

In [23]:
# create dictionary of organization mention frequency
org_freq = Counter(orgs)
org_freq.most_common(10)


[('Apple', 33),
 ('AMD', 25),
 ('Disney', 23),
 ('Amazon', 18),
 ('semis', 18),
 ('Investing Club', 15),
 ('CNBC Investing Club', 15),
 ('AMC', 14),
 ('BBBY', 14),
 ('Micron', 13)]