In [1]:
import numpy as np
import pandas as pd

import requests
import time
import pickle
import demoji
from tqdm.notebook import tqdm_notebook

In [2]:
#Downloads database of emojis
demoji.download_codes()

Downloading emoji data ...
... OK (Got response in 1.89 seconds)
Writing emoji data to /home/batyrkhan/.demoji/codes.json ...
... OK


In [7]:
r = requests.get(
    'https://www.reddit.com/r/wallstreetbets/.json',
    headers={'user-agent': 'Mozilla/5.0'}
)

In [16]:
data = r.json()['data']

In [28]:
data = data['children']

In [9]:
r.headers

{'Connection': 'keep-alive', 'Content-Length': '142520', 'Content-Type': 'application/json; charset=UTF-8', 'x-ua-compatible': 'IE=edge', 'x-frame-options': 'SAMEORIGIN', 'x-content-type-options': 'nosniff', 'x-xss-protection': '1; mode=block', 'access-control-allow-origin': '*', 'access-control-expose-headers': 'X-Moose', 'Content-Encoding': 'gzip', 'cache-control': 'max-age=0, must-revalidate', 'X-Moose': 'majestic', 'Accept-Ranges': 'bytes', 'Date': 'Wed, 17 Mar 2021 10:11:18 GMT', 'Via': '1.1 varnish', 'Vary': 'accept-encoding', 'Set-Cookie': 'loid=0000000000aypvx3cb.2.1615975876857.Z0FBQUFBQmdVZFhGSTUxaEE2Wmd4RDRlbHg0UG1Jcy1lMlkzdEh0YTdkRjZYcl9LQVd4STFpb1ZNTTktcnlEcU1lZkF5c0JWb2FmWjVmMDdNWjNuNHMyeUlSNlVFbllicWgtWjhqSFNMbFg5QXNGam1XV1lJd1Bwa0VrVElEZVJyVkMwbEJXS2ZOS1g; Domain=reddit.com; Max-Age=63071999; Path=/; expires=Fri, 17-Mar-2023 10:11:17 GMT; secure; SameSite=None; Secure, session_tracker=uhQi8BJJT8efZqJqcx.0.1615975876857.Z0FBQUFBQmdVZFhGZElPMFFzQmZPanFUREZUY2ZIQVFxQS1mUmJ

In [35]:
#Finds current name of the pinned discussion


fresh = 'https://www.reddit.com' + r.json()['data']['children'][0]['data']['permalink'] + '.json'
fresh

'https://www.reddit.com/r/wallstreetbets/comments/m6wyk2/daily_discussion_thread_for_march_17_2021/.json'

In [36]:
def get_time(utc):
    """
    Converts utc-type time into datetime
    utc - float/int for universal time (seconds since 1970)
    """
    return pd.to_datetime(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(int(utc))))

def get_posts(fresh):
    """
    Gets posts from the pinned topic and provides the following info:
    time - time of post creation
    ups - number of upvotes of the post
    text - text of the post w/o emojis
    """
    
    r = requests.get(
        fresh, 
        headers={'user-agent': 'Mozilla/5.0'}
    )

    posts = []

    for stuff in r.json()[1]['data']['children']:
        try:
            posts.append( {
                'time': get_time(stuff['data']['created_utc']), 
                'ups': stuff['data']['ups'], 
                'text': demoji.replace_with_desc(stuff['data']['body'])
            } )
        except:
            continue
            
    return posts

In [37]:
PUNC = ['.',',','?','!','@','#','$','%','^','&','*','(',')','-','+','_','=','[',']','{','}','|','/',':',"'",
       '`','~','<','>']

def is_ticker(text):
    """
    Checks if the text looks like a ticker. Not 100% reliable
    """
    
    if (text.upper() == text) or (text[0] == '$'):
        
        text = ''.join([t for t in list(text) if t not in PUNC])
        
        if text in ['I', '', 'WSB']:
            return False
        else:
            try: 
                float(text)
                return False
            except:
                try: 
                    float(text.split('$')[1])
                    return False
                except:
                    return True
    else:
        return False
    
    pass
    
print(is_ticker('GME'))
print(is_ticker('AMC?'))
print(is_ticker('$nok'))
print(is_ticker('25.2'))
print(is_ticker('I'))
print(is_ticker('any'))
print(is_ticker('$22.1'))
print(is_ticker('11000'))

def clean_ticker(tick):
    """
    removes junk from ticker word, i.e. "AMC," -> "AMC"
    """
    
    return ''.join([t for t in list(tick) if t not in PUNC])

clean_ticker('AMC')

True
True
True
False
False
False
False
False


'AMC'

In [38]:
def most_quoted(top=5):
    """
    Checks posts for tickers and creates a pd.Series with number of times a ticker was mentioned.
    top - number of most quoted tickers.
    """
    
    posts = get_posts(fresh)

    tickers = []

    for p in posts:

        uniques = []

        for w in p['text'].split(" "):
            if is_ticker(w) and not (clean_ticker(w) in uniques):
                uniques.append(clean_ticker(w))

        tickers = tickers + uniques
    print(posts)
    return pd.Series(tickers, name=posts[0]['time']).value_counts().head(top), posts[-1]['time']

In [39]:
def time_name(dt):
    dt = str(pd.to_datetime(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(dt))))
    return dt.replace(":","_").replace("-","_")

time_name(time.time())

'2021_03_18 14_42_53'

In [41]:
fulldf = pd.DataFrame()

for _ in tqdm_notebook(range(5)):
    tickers, timestamp = most_quoted(top=10)
#     print(timestamp)
    fulldf = fulldf.join(tickers, how='outer', rsuffix='_')
    display(fulldf)
    display(len(fulldf.index.unique()) == len(fulldf.index))
    time.sleep(1+np.random.randint(0, 1))
    
fulldf.columns=pd.Series(fulldf.columns).astype(str)
fulldf['tot'] = fulldf.sum(axis=1).sort_values(ascending=False)

fname = 'wsb files/tick_freq ' + time_name(time.time()) + '.parquet'
fulldf.to_parquet(fname)

  0%|          | 0/5 [00:00<?, ?it/s]

[{'time': Timestamp('2021-03-18 09:25:23'), 'ups': 2, 'text': 'lol why do i have nio 70c and why is it green'}, {'time': Timestamp('2021-03-18 08:13:55'), 'ups': 2, 'text': 'why doesn’t BB just rebrand?'}, {'time': Timestamp('2021-03-18 07:56:09'), 'ups': 2, 'text': 'Holy moly'}, {'time': Timestamp('2021-03-18 06:35:31'), 'ups': 2, 'text': 'My hedgefucks my position is 42069 shares at 3.14'}, {'time': Timestamp('2021-03-18 05:55:07'), 'ups': 5, 'text': '# DOW futures just hit the 92-year resistance line - 33100!'}, {'time': Timestamp('2021-03-18 05:32:28'), 'ups': -3, 'text': "Let's gooooooooooo  PLTR! UWMC! FSR! AMC! PLUG (sure why not, picked up this morn)! and of course GME\n\n&amp;#x200B;\n\nbeen bag holding weed stocks since that spike... is it time to just get rid of em? only 2.5k total invested with all of the above."}, {'time': Timestamp('2021-03-18 05:26:31'), 'ups': 1, 'text': 'I love you movie monke’s'}, {'time': Timestamp('2021-03-18 05:14:35'), 'ups': -7, 'text': "As a com




Unnamed: 0,2021-03-18 09:25:23
AMC,8
UWMC,4
GME,4
BB,3
PLTR,2
SNDL,2
PLUG,2
DOW,1
THE,1
US,1


True

[{'time': Timestamp('2021-03-18 09:25:23'), 'ups': 2, 'text': 'lol why do i have nio 70c and why is it green'}, {'time': Timestamp('2021-03-18 08:13:55'), 'ups': 2, 'text': 'why doesn’t BB just rebrand?'}, {'time': Timestamp('2021-03-18 07:56:09'), 'ups': 2, 'text': 'Holy moly'}, {'time': Timestamp('2021-03-18 06:35:31'), 'ups': 2, 'text': 'My hedgefucks my position is 42069 shares at 3.14'}, {'time': Timestamp('2021-03-18 05:55:07'), 'ups': 4, 'text': '# DOW futures just hit the 92-year resistance line - 33100!'}, {'time': Timestamp('2021-03-18 05:32:28'), 'ups': -3, 'text': "Let's gooooooooooo  PLTR! UWMC! FSR! AMC! PLUG (sure why not, picked up this morn)! and of course GME\n\n&amp;#x200B;\n\nbeen bag holding weed stocks since that spike... is it time to just get rid of em? only 2.5k total invested with all of the above."}, {'time': Timestamp('2021-03-18 05:26:31'), 'ups': 1, 'text': 'I love you movie monke’s'}, {'time': Timestamp('2021-03-18 05:14:35'), 'ups': -7, 'text': "As a com




Unnamed: 0,2021-03-18 09:25:23,2021-03-18 09:25:23_
AMC,8,8
UWMC,4,4
GME,4,4
BB,3,3
PLTR,2,2
SNDL,2,2
PLUG,2,2
DOW,1,1
THE,1,1
US,1,1


True

[{'time': Timestamp('2021-03-18 09:25:23'), 'ups': 2, 'text': 'lol why do i have nio 70c and why is it green'}, {'time': Timestamp('2021-03-18 08:13:55'), 'ups': 2, 'text': 'why doesn’t BB just rebrand?'}, {'time': Timestamp('2021-03-18 07:56:09'), 'ups': 2, 'text': 'Holy moly'}, {'time': Timestamp('2021-03-18 06:35:31'), 'ups': 2, 'text': 'My hedgefucks my position is 42069 shares at 3.14'}, {'time': Timestamp('2021-03-18 05:55:07'), 'ups': 5, 'text': '# DOW futures just hit the 92-year resistance line - 33100!'}, {'time': Timestamp('2021-03-18 05:32:28'), 'ups': -2, 'text': "Let's gooooooooooo  PLTR! UWMC! FSR! AMC! PLUG (sure why not, picked up this morn)! and of course GME\n\n&amp;#x200B;\n\nbeen bag holding weed stocks since that spike... is it time to just get rid of em? only 2.5k total invested with all of the above."}, {'time': Timestamp('2021-03-18 05:26:31'), 'ups': 1, 'text': 'I love you movie monke’s'}, {'time': Timestamp('2021-03-18 05:14:35'), 'ups': -8, 'text': "As a com




Unnamed: 0,2021-03-18 09:25:23,2021-03-18 09:25:23_,2021-03-18 09:25:23.1
AMC,8,8,8
UWMC,4,4,4
GME,4,4,4
BB,3,3,3
PLTR,2,2,2
SNDL,2,2,2
PLUG,2,2,2
DOW,1,1,1
THE,1,1,1
US,1,1,1


True

[{'time': Timestamp('2021-03-18 09:25:23'), 'ups': 2, 'text': 'lol why do i have nio 70c and why is it green'}, {'time': Timestamp('2021-03-18 08:13:55'), 'ups': 2, 'text': 'why doesn’t BB just rebrand?'}, {'time': Timestamp('2021-03-18 07:56:09'), 'ups': 2, 'text': 'Holy moly'}, {'time': Timestamp('2021-03-18 06:35:31'), 'ups': 2, 'text': 'My hedgefucks my position is 42069 shares at 3.14'}, {'time': Timestamp('2021-03-18 05:55:07'), 'ups': 5, 'text': '# DOW futures just hit the 92-year resistance line - 33100!'}, {'time': Timestamp('2021-03-18 05:32:28'), 'ups': -3, 'text': "Let's gooooooooooo  PLTR! UWMC! FSR! AMC! PLUG (sure why not, picked up this morn)! and of course GME\n\n&amp;#x200B;\n\nbeen bag holding weed stocks since that spike... is it time to just get rid of em? only 2.5k total invested with all of the above."}, {'time': Timestamp('2021-03-18 05:26:31'), 'ups': 1, 'text': 'I love you movie monke’s'}, {'time': Timestamp('2021-03-18 05:14:35'), 'ups': -7, 'text': "As a com




Unnamed: 0,2021-03-18 09:25:23,2021-03-18 09:25:23_,2021-03-18 09:25:23.1,2021-03-18 09:25:23_.1
AMC,8,8,8,8
UWMC,4,4,4,4
GME,4,4,4,4
BB,3,3,3,3
PLTR,2,2,2,2
SNDL,2,2,2,2
PLUG,2,2,2,2
DOW,1,1,1,1
THE,1,1,1,1
US,1,1,1,1


True

[{'time': Timestamp('2021-03-18 09:25:23'), 'ups': 2, 'text': 'lol why do i have nio 70c and why is it green'}, {'time': Timestamp('2021-03-18 08:13:55'), 'ups': 2, 'text': 'why doesn’t BB just rebrand?'}, {'time': Timestamp('2021-03-18 07:56:09'), 'ups': 2, 'text': 'Holy moly'}, {'time': Timestamp('2021-03-18 06:35:31'), 'ups': 2, 'text': 'My hedgefucks my position is 42069 shares at 3.14'}, {'time': Timestamp('2021-03-18 05:55:07'), 'ups': 4, 'text': '# DOW futures just hit the 92-year resistance line - 33100!'}, {'time': Timestamp('2021-03-18 05:32:28'), 'ups': -5, 'text': "Let's gooooooooooo  PLTR! UWMC! FSR! AMC! PLUG (sure why not, picked up this morn)! and of course GME\n\n&amp;#x200B;\n\nbeen bag holding weed stocks since that spike... is it time to just get rid of em? only 2.5k total invested with all of the above."}, {'time': Timestamp('2021-03-18 05:26:31'), 'ups': 1, 'text': 'I love you movie monke’s'}, {'time': Timestamp('2021-03-18 05:14:35'), 'ups': -6, 'text': "As a com




Unnamed: 0,2021-03-18 09:25:23,2021-03-18 09:25:23_,2021-03-18 09:25:23.1,2021-03-18 09:25:23_.1,2021-03-18 09:25:23.2
AMC,8,8,8,8,8
UWMC,4,4,4,4,4
GME,4,4,4,4,4
BB,3,3,3,3,3
PLTR,2,2,2,2,2
SNDL,2,2,2,2,2
PLUG,2,2,2,2,2
DOW,1,1,1,1,1
THE,1,1,1,1,1
US,1,1,1,1,1


True

ValueError: Duplicate column names found: ['2021-03-18 09:25:23', '2021-03-18 09:25:23_', '2021-03-18 09:25:23', '2021-03-18 09:25:23_', '2021-03-18 09:25:23', 'tot']

In [None]:
fulldf

In [14]:
#!/usr/bin/env python3

import pytz
import requests
import time
import sys

import pandas as pd
pd.set_option('display.max_columns', None)

from datetime import datetime
from string import punctuation


sys.path[0] = '../../'
import config
from QuantumCapital import constants
from QuantumCapital.DBManager import DBManager

import demoji
demoji.download_codes()

DB_USERNAME = config.DB_USERNAME
DB_PASS = config.DB_PASS


def clean_ticker(tick):
    r = ''.join([t for t in list(tick) if t not in punctuation])
    return demoji.replace(r, '')


def is_ticker_like(text):
    """
    Checks if the text looks like a ticker. Not 100% reliable
    """
    if (text.upper() == text) or (text[0] == '$'):

        text = ''.join([t for t in list(text) if t not in punctuation])

        if text in ['I', '', 'WSB']:
            return False
        else:
            try:
                float(text)
                return False
            except:
                try:
                    float(text.split('$')[1])
                    return False
                except:
                    return True
    else:
        return False


def find_ticker_like(text):
    for candidate in text.split(' '):
        if is_ticker_like(candidate):
            r = clean_ticker(candidate)
            if 0 < len(r) <= 10:
                return r
    return None


def get_posts():
    fresh = ''
    while str(datetime.today().year) not in fresh:
        r = requests.get(
            'https://www.reddit.com/r/wallstreetbets/.json',
            headers={'user-agent': 'Mozilla/5.0'})
        fresh = 'https://www.reddit.com' + r.json()['data']['children'][0]['data']['permalink'] + '.json'
    r = requests.get(fresh, headers={'user-agent': 'Mozilla/5.0'})
    posts = r.json()[1]['data']['children']
    posts_new = []
    for post in posts:
        if 'body' not in post['data']:
            continue
        dt = datetime.fromtimestamp(post['data']['created_utc'], pytz.UTC)
        print(dt)
        print(dt.tzinfo)
        break
        dt = datetime(dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second, 0, pytz.timezone('Asia/Almaty'))
        posts_new.append({'dt': dt, 'id': post['data']['id'], 'ticker_like': find_ticker_like(post['data']['body']), 'text': post['data']['body']})
    return posts_new


def get_last_record_id(dbm: DBManager):
    cursor = dbm.select_cursor(f'select id from {constants.WSB_DISCUSSION_THREAD_TABLE} order by dt desc limit 1')
    try:
        id_num = cursor.fetchone()[0]
    except TypeError as error:
        id_num = None
    finally:
        cursor.close()
    return id_num


def cut_posts(last_id, posts):
    i = 0
    post = posts[i]
    while i < len(posts) and post['id'] != last_id:
        i += 1
        if i < len(posts):
            post = posts[i]
    return posts[:i]


dbm = DBManager(DB_USERNAME, DB_PASS, 'ODS')
last_id = get_last_record_id(dbm)

# while True:
#     posts = get_posts()
#     posts = cut_posts(last_id, posts)
#     if posts:
#         dbm.insert_df_simple(pd.DataFrame(posts[::-1]), constants.WSB_DISCUSSION_THREAD_TABLE)
#         dbm.commit()
#         last_id = posts[0]['id']
#     time.sleep(200)

Downloading emoji data ...
... OK (Got response in 0.81 seconds)
Writing emoji data to /home/batyrkhan/.demoji/codes.json ...
... OK


In [15]:
get_posts()

2021-03-23 11:33:17+00:00
UTC


[]