<a href="https://colab.research.google.com/github/Codehackerone/nlp-with-transformers/blob/main/Named_Entity_Recognition(NER).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP with Spacy

In [None]:
!pip install spacy

In [1]:
import spacy

In [2]:
!python -m spacy download en_core_web_md

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.3.0/en_core_web_md-3.3.0-py3-none-any.whl (33.5 MB)
[K     |████████████████████████████████| 33.5 MB 1.3 MB/s 
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [3]:
nlp = spacy.load('en_core_web_md')

Model names:
`[lang]_[type]_[genre]_[size]`

In [5]:
txt = "very big text... .. .. . ..  .dsfsdfd ..sf.sd.f.s. f.sdf."

In [6]:
doc = nlp(txt)

In [7]:
type(doc)

spacy.tokens.doc.Doc

In [9]:
from spacy import displacy

In [10]:
displacy.render(doc, style='ent')



'<div class="entities" style="line-height: 2.5; direction: ltr">very big text... .. .. . ..  .dsfsdfd ..sf.sd.f.s. f.sdf.</div>'

# Extracting Entities

In [11]:
spacy.explain('ORG')

'Companies, agencies, institutions, etc.'

In [12]:
spacy.explain('GPE')

'Countries, cities, states'

In [14]:
type(doc.ents)

tuple

In [15]:
doc.ents

()

In [16]:
doc = nlp("Apple reached an all-time high stock price of 143 dollars this January.")
org_list = []
for entity in doc.ents:
    if entity.label_ == 'ORG':
        org_list.append(entity.text)

In [17]:
org_list

['Apple']

# Getting Data from Reddit API

In [31]:
client_id = secret = password = ''
with open('reddit.txt', 'r') as t:
  txt = t.read().split()
  client_id = txt[0]
  secret = txt[1]
  password = txt[2]

In [28]:
import requests

In [36]:
auth = requests.auth.HTTPBasicAuth(client_id, secret)

In [37]:
data = {
    'grant_type':'password',
    'username':'Euphoric_Bar7252',
    'password':password
}

In [40]:
headers = {
    'User-Agent':'TextClasifier/0.0.1'
}

In [42]:
res = requests.post('https://www.reddit.com/api/v1/access_token', auth=auth, data=data,headers=headers)

In [43]:
res

<Response [200]>

In [44]:
token = res.json()['access_token']

In [48]:
headers['Authorization'] = f'bearer {token}'

In [49]:
headers

{'Authorization': 'bearer 1975611431860-xEPw_TzcTpf12symsD7IcdOYT6cLjA',
 'User-Agent': 'TextClasifier/0.0.1'}

In [50]:
requests.get('https://oauth.reddit.com/api/v1/me', headers=headers)

<Response [200]>

In [51]:
api = 'https://oauth.reddit.com'

In [56]:
res = requests.get(f'{api}/r/investing/new', headers=headers, params={'limit':'100'})

In [None]:
res.json()

In [None]:
res.json()['data']['children'][0]['data']

In [62]:
import pandas as pd

In [65]:
class Reddit:
    def __init__(self, client_id, secret_token, username, password):
        # first create authentication object
        auth = requests.auth.HTTPBasicAuth(client_id, secret_token)
        # build login dictionary
        login = {'grant_type': 'password',
                 'username': username,
                 'password': password}
        # setup header info (incl description of API)
        headers = {'User-Agent': 'MyBot/0.0.1'}
        # send request for OAuth token
        res = requests.post(f'https://www.reddit.com/api/v1/access_token',
                            auth=auth, data=login, headers=headers)
        # pull auth bearer token from response
        token = res.json()['access_token']
        # add authorization to headers dictionary
        headers['Authorization'] = f'bearer {token}'
        # add headers dict to internal attributes
        self.headers = headers
        # and api
        self.api = 'https://oauth.reddit.com'

    def get_new(self, subreddit, iters):
        # initialize dataframe to store data
        df = pd.DataFrame()
        # initialize parameters dictionary
        params = {'limit': 100}
        # iterate through several times to make sure we get all the data available
        for i in range(iters):
            # make request
            res = requests.get(f'{self.api}/r/{subreddit}/new',
                               headers=self.headers,
                               params=params)
            # check that we returned something (if not we reached end)
            if len(res.json()['data']['children']) == 0:
                print('No more found')
                return df
            # iterate through each thread recieved
            for thread in res.json()['data']['children']:
                # add info to dataframe
                df = df.append({
                    'id': thread['data']['name'],
                    'created_utc': int(thread['data']['created_utc']),
                    'subreddit': thread['data']['subreddit'],
                    'title': thread['data']['title'],
                    'selftext': thread['data']['selftext'],
                    'upvote_ratio': thread['data']['upvote_ratio'],
                    'ups': thread['data']['ups'],
                    'downs': thread['data']['downs'],
                    'score': thread['data']['score']
                }, ignore_index=True)
            # get earliest ID
            earliest = df['id'].iloc[len(df)-1]
            # add earliest ID to params
            params['after'] = earliest
        return df

In [66]:
SUB = 'investing'


In [67]:
reddit = Reddit(client_id, secret, 'Euphoric_Bar7252', password)

In [68]:
data = data = reddit.get_new(SUB, 20)

No more found


In [69]:
data = data.replace({'|': ''}, regex=True)

In [71]:
data.to_csv(f'./reddit_{SUB}.csv', sep='|', index=False)

# Extracting ORGs from reddit data

In [72]:
df = pd.read_csv('reddit_investing.csv', sep='|')
df

Unnamed: 0,id,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score
0,t3_vy9g0x,1.657735e+09,investing,Event-Driven Investing Resources,I’m trying to find a good resource to see what...,1.00,1.0,0.0,1.0
1,t3_vy8vxc,1.657733e+09,investing,Questions about Money Market accounts and IRAs,So obviously there's been a lot of doom and gl...,1.00,1.0,0.0,1.0
2,t3_vy7unu,1.657731e+09,investing,Changing SP500 ETFs to euro hedged version,I've been investing for quite a long while as ...,1.00,2.0,0.0,2.0
3,t3_vy7lhr,1.657730e+09,investing,Can I borrow against my concentrated single st...,I own more than half of the market cap of one ...,0.60,1.0,0.0,1.0
4,t3_vy6cp2,1.657727e+09,investing,Hindenburg Research backs Twitter and bets aga...,Hindenburg Research backs Twitter and bets aga...,0.82,7.0,0.0,7.0
...,...,...,...,...,...,...,...,...,...
888,t3_um7a92,1.652148e+09,investing,is a recession being priced into the market now?,markets are down nearly 25% ytd and we're not ...,0.84,201.0,0.0,201.0
889,t3_um73zk,1.652147e+09,investing,Why would younger investors use a dividend sto...,This specifically pertains to younger investor...,0.65,4.0,0.0,4.0
890,t3_um50q0,1.652141e+09,investing,"To the people who have stable coin staked, or ...",I’ve seen a lot of people here discuss stable ...,0.65,12.0,0.0,12.0
891,t3_um4inc,1.652139e+09,investing,Bitcoin tumbles more than 50% below its all-ti...,"At a price of just below $31,000, bitcoin is m...",0.86,1616.0,0.0,1616.0


In [73]:
def get_orgs(text):
  doc = nlp(text)
  org_list= []

  for entity in doc.ents:
    if entity.label_ == 'ORG':
      org_list.append(entity.text)

  org_list = list(set(org_list))

  return org_list

In [74]:
df['Organisations'] = df['selftext'].apply(get_orgs)

In [75]:
df.head(10)

Unnamed: 0,id,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score,Organisations
0,t3_vy9g0x,1657735000.0,investing,Event-Driven Investing Resources,I’m trying to find a good resource to see what...,1.0,1.0,0.0,1.0,[]
1,t3_vy8vxc,1657733000.0,investing,Questions about Money Market accounts and IRAs,So obviously there's been a lot of doom and gl...,1.0,1.0,0.0,1.0,[]
2,t3_vy7unu,1657731000.0,investing,Changing SP500 ETFs to euro hedged version,I've been investing for quite a long while as ...,1.0,2.0,0.0,2.0,[sp500 ETFs]
3,t3_vy7lhr,1657730000.0,investing,Can I borrow against my concentrated single st...,I own more than half of the market cap of one ...,0.6,1.0,0.0,1.0,[LTV]
4,t3_vy6cp2,1657727000.0,investing,Hindenburg Research backs Twitter and bets aga...,Hindenburg Research backs Twitter and bets aga...,0.82,7.0,0.0,7.0,"[the Financial Times, Hindenburg Research]"
5,t3_vy5tqy,1657726000.0,investing,Bombardier preferred shares,[BBD.PR.B](https://money.tmx.com/en/quote/BBD....,0.75,2.0,0.0,2.0,"[BBD.PR.D](https://money.tmx.com, BBD.PR.B](ht..."
6,t3_vy2w32,1657718000.0,investing,How to start investing in Korean Stocks?,"Hey everyone, there's a specific Korean stock ...",0.83,4.0,0.0,4.0,"[ETF, NAVER]"
7,t3_vy4fe0,1657722000.0,investing,Bank of Canada increases policy interest rate ...,[BoC](https://www.bankofcanada.ca/2022/07/fad-...,0.96,62.0,0.0,62.0,"[the Bank Rate, The Governing Council, the Gov..."
8,t3_vy2dld,1657716000.0,investing,Dividend aristocrat that suffered the biggest ...,Hello all\n\nDo you have any company in mind t...,0.73,5.0,0.0,5.0,[]
9,t3_vy2am5,1657716000.0,investing,Question about pacific gas &amp; electric from...,"So I remember being told ""family"" bought me an...",0.83,4.0,0.0,4.0,[]


# Getting Entity Frequency

In [76]:
from collections import Counter

In [None]:
orgs = df['Organisations'].to_list()
orgs

In [80]:
orgs = [org for sublist in orgs for org in sublist]

In [81]:
orgs

['sp500 ETFs',
 'LTV',
 'the Financial Times',
 'Hindenburg Research',
 'BBD.PR.D](https://money.tmx.com',
 'BBD.PR.B](https://money.tmx.com',
 '😬',
 'announced](https://bombardier.com',
 'ETF',
 'NAVER',
 'the Bank Rate',
 'The Governing Council',
 'the Governing Council',
 'The Bank of Canada',
 'Bank',
 'MPR',
 'Dow Jones',
 'https://www.cnbc.com/2022/07/13/inflation-rose-9point1percent-in-june-even-more-than-expected-as-price-pressures-intensify.html](https://www.cnbc.com/2022/07/13/inflation-rose-9point1percent-in-june-even-more-than-expected-as-price-pressures-intensify.html',
 'the Bureau of Labor  Statistics',
 'SCHJ',
 'SEC',
 'ESG',
 'Refinitive ESG',
 'the Frankfurt School of Finance and Management',
 'Refinitiv',
 'MIT Sloan School of Management',
 'MIT',
 'Thomson Reuters',
 'LLC',
 'the British Stock Exchange',
 'ADR',
 'ETF',
 'FLJP',
 'FLJH',
 'The European Central Bank',
 'Bank of France',
 'https://www.reuters.com/markets/europe/ecb-digital-euros-could-be-capped-frenc

In [82]:
org_freq = Counter(orgs)

In [83]:
org_freq.most_common(10)

[('ETF', 47),
 ('Fed', 38),
 ('VOO', 21),
 ('VTI', 20),
 ('Amazon', 16),
 ('the Federal Reserve', 15),
 ('DCA', 14),
 ('Fidelity', 13),
 ('SEC', 12),
 ('Reuters', 12)]

# Entity Blacklist

In [84]:
blacklist = ['ETF', 'SEC' 'Fed']

In [86]:
def get_orgs(text):
  doc = nlp(text)
  org_list= []

  for entity in doc.ents:
    if entity.label_ == 'ORG' and entity.text.lower() not in blacklist:
      org_list.append(entity.text)

  org_list = list(set(org_list))

  return org_list

In [87]:
df['Organisations'] = df['selftext'].apply(get_orgs)

In [88]:
df.to_csv('reddit_investing_ner.csv', sep='|', index=False)

# NER with Sentiment

In [90]:
!pip install flair

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flair
  Downloading flair-0.11.3-py3-none-any.whl (401 kB)
[K     |████████████████████████████████| 401 kB 14.2 MB/s 
Collecting janome
  Downloading Janome-0.4.2-py2.py3-none-any.whl (19.7 MB)
[K     |████████████████████████████████| 19.7 MB 376 kB/s 
Collecting transformers>=4.0.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 66.0 MB/s 
[?25hCollecting konoha<5.0.0,>=4.0.0
  Downloading konoha-4.6.5-py3-none-any.whl (20 kB)
Collecting bpemb>=0.3.2
  Downloading bpemb-0.3.3-py3-none-any.whl (19 kB)
Collecting wikipedia-api
  Downloading Wikipedia-API-0.5.4.tar.gz (18 kB)
Collecting segtok>=1.5.7
  Downloading segtok-1.5.11-py3-none-any.whl (24 kB)
Collecting sqlitedict>=1.6.0
  Downloading sqlitedict-2.0.0.tar.gz (46 kB)
[K     |████████████████████████████████| 46 kB 4.9 MB/s 
Collecting deprecated>=1.2.4
 

In [91]:
import flair

In [92]:
model = flair.models.TextClassifier.load('en-sentiment')

2022-07-13 18:13:10,936 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert_4.pt not found in cache, downloading to /tmp/tmpytbrx9ee


100%|██████████| 265512723/265512723 [00:07<00:00, 37218230.89B/s]

2022-07-13 18:13:18,177 copying /tmp/tmpytbrx9ee to cache at /root/.flair/models/sentiment-en-mix-distillbert_4.pt





2022-07-13 18:13:18,822 removing temp file /tmp/tmpytbrx9ee
2022-07-13 18:13:18,888 loading file /root/.flair/models/sentiment-en-mix-distillbert_4.pt


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [107]:
def get_sentiment(text):
  sentence = flair.data.Sentence(text)
  model.predict(sentence)
  sentiment = sentence.labels[0]
  return str(sentiment).split('→')[1]

In [94]:
df = pd.read_csv('reddit_investing_ner.csv', sep='|')
df.head()

Unnamed: 0,id,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score,Organisations
0,t3_vy9g0x,1657735000.0,investing,Event-Driven Investing Resources,I’m trying to find a good resource to see what...,1.0,1.0,0.0,1.0,[]
1,t3_vy8vxc,1657733000.0,investing,Questions about Money Market accounts and IRAs,So obviously there's been a lot of doom and gl...,1.0,1.0,0.0,1.0,[]
2,t3_vy7unu,1657731000.0,investing,Changing SP500 ETFs to euro hedged version,I've been investing for quite a long while as ...,1.0,2.0,0.0,2.0,['sp500 ETFs']
3,t3_vy7lhr,1657730000.0,investing,Can I borrow against my concentrated single st...,I own more than half of the market cap of one ...,0.6,1.0,0.0,1.0,['LTV']
4,t3_vy6cp2,1657727000.0,investing,Hindenburg Research backs Twitter and bets aga...,Hindenburg Research backs Twitter and bets aga...,0.82,7.0,0.0,7.0,"['the Financial Times', 'Hindenburg Research']"


In [108]:
df['Sentiment'] = df['selftext'].apply(get_sentiment)
df.head()

Unnamed: 0,id,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score,Organisations,Sentiment
0,t3_vy9g0x,1657735000.0,investing,Event-Driven Investing Resources,I’m trying to find a good resource to see what...,1.0,1.0,0.0,1.0,[],NEGATIVE (0.9321)
1,t3_vy8vxc,1657733000.0,investing,Questions about Money Market accounts and IRAs,So obviously there's been a lot of doom and gl...,1.0,1.0,0.0,1.0,[],POSITIVE (0.9927)
2,t3_vy7unu,1657731000.0,investing,Changing SP500 ETFs to euro hedged version,I've been investing for quite a long while as ...,1.0,2.0,0.0,2.0,['sp500 ETFs'],NEGATIVE (0.9999)
3,t3_vy7lhr,1657730000.0,investing,Can I borrow against my concentrated single st...,I own more than half of the market cap of one ...,0.6,1.0,0.0,1.0,['LTV'],NEGATIVE (0.9821)
4,t3_vy6cp2,1657727000.0,investing,Hindenburg Research backs Twitter and bets aga...,Hindenburg Research backs Twitter and bets aga...,0.82,7.0,0.0,7.0,"['the Financial Times', 'Hindenburg Research']",POSITIVE (0.6587)


In [110]:
import ast

In [111]:
df['Organisations'] = df['Organisations'].apply(lambda x:ast.literal_eval(x))

In [112]:
sentiment = {}

In [132]:
str(df['Sentiment'].to_list()[0].split('(')[0]).strip(), str(df['Sentiment'].to_list()[0].split('(')[1]).split(')')[0]

('NEGATIVE', '0.9321')

In [None]:
for i, row in df.iterrows():
  direction = str(row['Sentiment'].split('(')[0]).strip()
  score = str(row['Sentiment'].split('(')[1]).split(')')[0]
  for org in row['Organisations']:
    if org not in sentiment.keys():
        sentiment[org] = {'POSITIVE': [], 'NEGATIVE': []}
    sentiment[org][direction].append(score)

# NER and transformers (Huggingface)

In [141]:
import spacy
from spacy import displacy

In [142]:
nlp = spacy.load('en_core_web_md')

In [143]:
doc = nlp("Apple reached an all-time high stock price of 143 dollars this January.")

In [144]:
displacy.render(doc, style='ent')


'<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Apple\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n reached an all-time high stock price of \n<mark class="entity" style="background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    143 dollars\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">MONEY</span>\n</mark>\n \n<mark class="entity" style="background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    this January\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-le

In [146]:
!python -m spacy download en_core_web_trf

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-trf==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.3.0/en_core_web_trf-3.3.0-py3-none-any.whl (460.3 MB)
[K     |████████████████████████████████| 460.3 MB 25 kB/s 
Collecting spacy-transformers<1.2.0,>=1.1.2
  Downloading spacy_transformers-1.1.7-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.9 MB/s 
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 21.8 MB/s 
Installing collected packages: spacy-alignments, spacy-transformers, en-core-web-trf
Successfully installed en-core-web-trf-3.3.0 spacy-alignments-0.8.5 spacy-transformers-1.1.7
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core

In [None]:
trf = spacy.load('en_core_web_trf')

In [None]:
doc = trf("Apple reached an all-time high stock price of 143 dollars this January.")

In [None]:
displacy.render(doc, style='ent')