In [1]:
import spacy
from spacy import displacy

In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
nlp = spacy.load('en_core_web_sm')

In [6]:
txt = ("Given the recent downturn in stocks especially in tech which is likely to persist as yields keep going up, "
       "I thought it would be prudent to share the risks of investing in ARK ETFs, written up very nicely by "
       "[The Bear Cave](https://thebearcave.substack.com/p/special-edition-will-ark-invest-blow). The risks comes "
       "primarily from ARK's illiquid and very large holdings in small cap companies. ARK is forced to sell its "
       "holdings whenever its liquid ETF gets hit with outflows as is especially the case in market downturns. "
       "This could force very painful liquidations at unfavorable prices and the ensuing crash goes into a "
       "positive feedback loop leading into a death spiral enticing even more outflows and predatory shorts.")

In [7]:
doc = nlp(txt)

displacy.render(doc, style='ent')
# displacy.serve(doc, style='ent') if not running in a notebook

Immediately we're able to produce not perfect, but pretty good NER. We are using the en_core_web_sm model - en referring to English and sm small.

The model is accurately identifying ARK as an organization. It does also classify ETF (exchange traded fund) as an organization, which is not the case (an ETF is a grouping of securities on the markets), but it's easy to see why this is being classified as one. The other tag we can see is WORK_OF_ART, it isn't inherently clear what exactly this means, so we can get more information using spacy.explain:

In [8]:
spacy.explain('WORK_OF_ART')

'Titles of books, songs, etc.'

In [12]:
spacy.explain('ORG') #organisation

'Companies, agencies, institutions, etc.'

And we can see that this description fits well to the tagged item, which refers to an article (although not quite a book).

We have a visual output from our tagged text, but this won't be particularly useful programatically. What we need is a way to extract the relevant tags (the organizations) from our text. To do that we can use doc.ents which will return a list of all identified entities.

Each item in this entity list contains two attributes that we are interested in, label_ and text:

In [9]:
for entity in doc.ents:
    print(f"{entity.label_}: {entity.text}")

GPE: ARK
ORG: The Bear Cave](https://thebearcave.substack.com/p
ORG: ARK
ORG: ARK
ORG: ETF


Now, we need to filter out any entities that are not ORG entities, and append those remaining ORGs to an organization list:

In [10]:
# initialize our list
org_list = []

for entity in doc.ents:
    # if label_ is ORG, we append text, otherwise ignore
    if entity.label_ == 'ORG':
        org_list.append(entity.text)

org_list

['The Bear Cave](https://thebearcave.substack.com/p', 'ARK', 'ARK', 'ETF']

In [11]:
# I don't need to see 'ARK' three times, so I use set() to remove duplicates, and then convert back to list
org_list = list(set(org_list))

org_list

['ARK', 'The Bear Cave](https://thebearcave.substack.com/p', 'ETF']

In [13]:
txt = "Apple is looking at buying U.K. startup for $1 billion"

Getting Reddit Data
There are two options for extracting data from Reddit:

The requests library, which will allow us to interface directly with the Reddit API.

The PRAW library, which is a wrapper library that adds an extra layer of abstraction in accessing the Reddit API.

Here we will cover the first option, using the requests library to interface directly with the API.

The final extraction script will look like this:

In [14]:
import requests
import pandas as pd


class Reddit:
    def __init__(self, client_id, secret_token, username, password):
        # first create authentication object
        auth = requests.auth.HTTPBasicAuth(client_id, secret_token)
        # build login dictionary
        login = {'grant_type': 'password',
                 'username': username,
                 'password': password}
        # setup header info (incl description of API)
        headers = {'User-Agent': 'MyBot/0.0.1'}
        # send request for OAuth token
        res = requests.post(f'https://www.reddit.com/api/v1/access_token',
                            auth=auth, data=login, headers=headers)
        # pull auth bearer token from response
        token = res.json()['access_token']
        # add authorization to headers dictionary
        headers['Authorization'] = f'bearer {token}'
        # add headers dict to internal attributes
        self.headers = headers
        # and api
        self.api = 'https://oauth.reddit.com'

    def get_new(self, subreddit, iters):
        # initialize dataframe to store data
        df = pd.DataFrame()
        # initialize parameters dictionary
        params = {'limit': 100}
        # iterate through several times to make sure we get all the data available
        for i in range(iters):
            # make request
            res = requests.get(f'{self.api}/r/{subreddit}/new',
                               headers=self.headers,
                               params=params)
            # check that we returned something (if not we reached end)
            if len(res.json()['data']['children']) == 0:
                print('No more found')
                return df
            # iterate through each thread recieved
            for thread in res.json()['data']['children']:
                # add info to dataframe
                df = df.append({
                    'id': thread['data']['name'],
                    'created_utc': int(thread['data']['created_utc']),
                    'subreddit': thread['data']['subreddit'],
                    'title': thread['data']['title'],
                    'selftext': thread['data']['selftext'],
                    'upvote_ratio': thread['data']['upvote_ratio'],
                    'ups': thread['data']['ups'],
                    'downs': thread['data']['downs'],
                    'score': thread['data']['score']
                }, ignore_index=True)
            # get earliest ID
            earliest = df['id'].iloc[len(df)-1]
            # add earliest ID to params
            params['after'] = earliest
        return df


In [15]:
SUB = 'investing'



In [16]:
CLIENT_ID = '<CLIENT_ID>'
SECRET_TOKEN = '<SECRET_TOKEN>'

In [17]:
USER = '<USER>'
PWD = '<PASSWORD>'

In [18]:
reddit = Reddit(CLIENT_ID, SECRET_TOKEN, USER, PWD)


KeyError: 'access_token'

In [None]:
data = reddit.get_new(SUB, 20)


No more found

In [None]:
data = data.replace({'|': ''}, regex=True)
data.to_csv(f'./data/reddit_{SUB}.csv', sep='|', index=False)