Scrape Reddit Data.

In [None]:
import praw
import json

In [None]:
# Load the credentials.
# A context manager closes the credentials file deterministically instead of
# leaving the handle open until garbage collection.
with open('../Datasets/credentials.json') as creds_file:
    creds = json.load(creds_file)

reddit = praw.Reddit(client_id=creds['client_id'],
                    client_secret=creds['client_secret'],
                    user_agent=creds['user_agent'])
subreddit = reddit.subreddit("jokes")
posts = []

# Scrape top and hot subreddit sections.
# Both listings are handled identically, so iterate over them in one loop
# ("top" first, then "hot", preserving the original order of `posts`).
# limit=None asks PRAW for as many submissions as the listing will yield.
# NOTE(review): a submission present in both listings is appended twice;
# this matches the original behaviour -- confirm whether duplicates are wanted.
for listing in (subreddit.top, subreddit.hot):
    for submission in listing(limit=None):
        # One sample = title and body joined by a single space.
        text = submission.title + ' ' + submission.selftext
        posts.append(text)

In [None]:
# Persist the scraped posts as {"text": [...]}.
# The `with` block guarantees the file is flushed and closed before any later
# cell tries to read it back; a bare open(...) handle may stay open (and
# partially buffered) until it is garbage collected.
# NOTE(review): `subreddit` is the PRAW Subreddit object here; the file name
# relies on its string form being the subreddit name -- confirm.
with open(f'../Datasets/reddit-jokes-{subreddit}.json', 'w') as posts_file:
    json.dump({'text': posts}, posts_file)

Load the data from JSON

In [1]:
import json
import numpy as np

In [2]:
# Name of the subreddit whose scraped dump is loaded; also used in output file names below.
subreddit = "jokes"
# Read the list of post strings stored under the "text" key.
# The context manager closes the file handle as soon as parsing is done.
with open(f'../Datasets/reddit-jokes-{subreddit}.json') as posts_file:
    posts = json.load(posts_file)['text']

In [3]:
# Summary statistics of the corpus.
# Per-post sizes are computed once and reused by every statistic below.
n_posts = len(posts)
char_counts = [len(post) for post in posts]
word_counts = [len(post.split()) for post in posts]  # whitespace-delimited words

print(f"Number of Samples: {n_posts}")
print(f"Average # of characters of submission: {sum(char_counts) / n_posts}")
print(f"Average # of words of submission: {sum(word_counts) / n_posts}")
print(f"Max length of submissions: {max(char_counts)}")
print(f"Min length of submissions: {min(char_counts)}")
print(f"Max # of words in submissions: {max(word_counts)}")
print(f"Min # of words in submissions: {min(word_counts)}")

Number of Samples: 1889
Average # of characters of submission: 347.065643197459
Average # of words of submission: 64.07146638433034
Max length of submissions: 9147
Min length of submissions: 17
Max # of words in submissions: 1648
Min # of words in submissions: 2


Build the tokenizer

    - <s> is a special token reserved for filling up space (i.e. padding).

In [9]:
from tokenizers import ByteLevelBPETokenizer
import os

In [5]:
# Byte-level BPE tokenizer; lowercase=True normalises input to lower case.
tokenizer = ByteLevelBPETokenizer(lowercase=True)

# The trainer is fed from a file, so dump the whole corpus (one post per
# newline-separated chunk) to a temporary text file.
text = '\n'.join(posts)
# Write with an explicit UTF-8 encoding so non-ASCII posts do not depend on the
# platform default, and close the file before the trainer reads it.
with open('../Datasets/temp.txt', 'w', encoding='utf-8') as corpus_file:
    corpus_file.write(text)

# '<s>' is the filler token described in the notebook text above.
special_tokens = ['<s>']

In [10]:
# Fit the BPE vocabulary on the temporary corpus file, then delete that file.
corpus_path = '../Datasets/temp.txt'
training_options = {
    'vocab_size': 40000,        # upper bound on the learned vocabulary size
    'min_frequency': 3,         # frequency threshold passed to the trainer
    'show_progress': False,     # keep the cell output quiet
    'special_tokens': special_tokens,
}
tokenizer.train(corpus_path, **training_options)

# The corpus file was only needed for training.
os.remove(corpus_path)

In [11]:
# Sanity check: tokenize a short phrase and display the resulting tokens.
sample_encoding = tokenizer.encode("Hello World")
sample_encoding.tokens

['hello', 'Ġworld']

In [12]:
# Serialise the trained tokenizer next to the dataset, pretty-printed.
tokenizer_path = f'../Datasets/tokenizer-{subreddit}.json'
tokenizer.save(tokenizer_path, pretty=True)