In [6]:
import praw
from psaw import PushshiftAPI
import datetime as dt
from dotenv import load_dotenv
import os
import re

In [7]:
# Read the Reddit API credentials from the environment (after load_dotenv() runs).
load_dotenv()
client_id = os.environ.get("CLIENTID")
client_secret = os.environ.get("CLIENTSECRET")
user_agent = "AskHistGen:v1.0 (by u/AverageAngryPeasant)"

# Build the PRAW client and hand it to the Pushshift wrapper used for searching.
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
)
api = PushshiftAPI(reddit)

# Search r/AskHistorians for submissions after 2021-01-01 (naive datetime, so
# the timestamp is interpreted in the machine's local timezone).
# NOTE(review): no `limit` is passed, so the whole result set is materialised.
start_epoch = int(dt.datetime(2021, 1, 1).timestamp())
submission_search = api.search_submissions(
    after=start_epoch,
    subreddit='askhistorians',
    filter=['url', 'author', 'title', 'subreddit'],
)
data = list(submission_search)

Version 7.0.0 of praw is outdated. Version 7.1.3 was released 1 day ago.


KeyboardInterrupt: 

In [None]:
# Substrings that identify a comment as moderator boilerplate (removal notices,
# "your question is fine" approvals) or as deleted content. Any comment whose
# body contains one of these is excluded from the answers. Matching is a plain,
# case-sensitive substring test.
removed = [
    # Past- and present-tense removal notices, for both "question" and "submission".
    "removed your question",
    "removed your submission",
    "remove your question",
    "remove your submission",
    "question has been removed",
    "submission has been removed",
    "question was removed",
    "submission was removed",
    "Sorry, we don't allow",
    # Moderator approvals / standards reminders.
    "question is fine",
    "submission is fine",
    "meets our standard",
    "meet our standard",
    # Placeholder body Reddit shows for removed comments.
    "[removed]",
]

In [None]:
# Walk the fetched submissions and collect long, non-boilerplate comments as
# answers. `questions` gets one title per submission that contributed at least
# one answer; `answers` gets every kept comment body, so the two lists are NOT
# index-aligned.
questions, answers = [], []
for submission in data:
    # Skip submissions whose flair is set but does not mention "question";
    # submissions with no flair at all are kept.
    flair = submission.link_flair_text
    if flair and "question" not in flair.lower():
        continue
    # Skip threads posted by AutoModerator itself.
    if submission.author and submission.author.name == "AutoModerator":
        continue

    comments = list(submission.comments)
    # A trailing MoreComments entry is a placeholder for comments that were not
    # loaded; keep expanding it until the list no longer ends with one.
    # `comments` is checked first because an expansion can leave the list
    # empty, in which case indexing [-1] would raise IndexError.
    while comments and isinstance(comments[-1], praw.models.reddit.more.MoreComments):
        # list() so the concatenation works whatever sequence type is returned.
        comments = comments[:-1] + list(comments[-1].comments())

    # Bodies of every comment not written by AutoModerator
    # (comments with no author are kept).
    bodies = [
        c.body
        for c in comments
        if not c.author or c.author.name != "AutoModerator"
    ]
    # Keep non-empty bodies longer than 500 characters that contain none of
    # the boilerplate phrases in `removed`.
    kept = [
        body
        for body in bodies
        if body and len(body) > 500 and not any(phrase in body for phrase in removed)
    ]
    if kept:
        answers += kept
        questions.append(submission.title)

In [None]:
def _clean_answer(text):
    """Normalise one scraped answer into plain single-spaced text.

    Steps, in order:
      1. Drop anything starting with "(http" or "[http" up to the next whitespace.
      2. Drop parenthesised digit runs such as "(12)" (also matches "()").
      3. Keep only characters whose code point is strictly between 0 and 255.
      4. Replace the literal four-character text backslash-n-backslash-n with a
         space and remove the "&#x200B; " entity text.
      5. Delete the characters [, ], backslash, * and >.
      6. Collapse every run of whitespace (including real newlines) to one space.

    Args:
        text: raw comment body.

    Returns:
        The cleaned string.
    """
    text = re.sub(r'\(http\S+', '', text)
    text = re.sub(r'\[http\S+', '', text)
    # Raw string: "\(" in a normal string literal is an invalid escape sequence.
    text = re.sub(r"\([0-9]*\)", "", text)
    text = ''.join(c for c in text if 0 < ord(c) < 255)
    text = text.replace("\\n\\n", " ").replace("&#x200B; ", "")
    text = text.translate(str.maketrans("", "", "[]\\*>"))
    return " ".join(text.split())


# Clean every answer in place (the list object itself is kept).
answers[:] = [_clean_answer(ans) for ans in answers]

In [None]:
# Save the scraped text to disk, one entry per line.
with open("questions.txt", "w") as f:
    f.writelines(item + "\n" for item in questions)

with open("answers.txt", "w") as f:
    f.writelines(item + "\n" for item in answers)

In [9]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing
import numpy as np

RuntimeError: Physical devices cannot be modified after being initialized

In [None]:
# Reload the answers from disk, one per line, trimming surrounding whitespace.
with open("answers.txt") as f:
    answers = [line.strip() for line in f]

In [None]:
# Preprocessing: map characters to integer IDs, and build the inverse mapping.

# Every distinct character that occurs in any answer, in sorted order.
vocab = sorted({ch for answer_text in answers for ch in answer_text})
print(vocab)
print(f'{len(vocab)} unique characters')

# Forward lookup: character tokens -> integer IDs (original note: padding = 0).
# NOTE(review): which IDs the layer reserves for special tokens depends on the
# StringLookup defaults of the installed TensorFlow version; confirm.
ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))

# Inverse lookup, built from the forward layer's own vocabulary so both layers
# use the same ID <-> character table.
chars_from_ids = preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(),
    invert=True,
)

def text_from_ids(ids):
    """Convert integer IDs back to text.

    Looks each ID up with `chars_from_ids`, then joins the resulting
    characters along the last axis into string tensor(s).
    """
    chars = chars_from_ids(ids)
    return tf.strings.reduce_join(chars, axis=-1)

In [None]:
# Separate all of our answers into sequence chunks

# From each chunk, the input is everything but the last element and the target is everything but the first (the input shifted forward by one)
def split_input_target(sequence):
    """Split a sequence into an (input, target) pair offset by one position.

    Args:
        sequence: any sliceable sequence (e.g. a string, list or tensor).

    Returns:
        A tuple ``(sequence[:-1], sequence[1:])``: the input omits the last
        element and the target omits the first.
    """
    return sequence[:-1], sequence[1:]

seq_length = 128
datasets = []
for ans in answers:
    # Characters -> IDs -> a stream of single IDs -> chunks of seq_length + 1
    # IDs (a trailing partial chunk is dropped) -> (input, target) pairs.
    # Chunking is done per answer, so no chunk spans two answers.
    answer_pairs = (
        tf.data.Dataset
        .from_tensor_slices(ids_from_chars(tf.strings.unicode_split(ans, 'UTF-8')))
        .batch(seq_length + 1, drop_remainder=True)
        .map(split_input_target)
    )
    datasets.append(answer_pairs)

In [None]:
# Chain the per-answer datasets into one dataset of (input, target) pairs.
# The loop variable is deliberately NOT called `data`: that name holds the
# scraped submissions list used by the scraping cell, and rebinding it here
# would break that cell on a re-run.
dataset = datasets[0]
for answer_dataset in datasets[1:]:
    dataset = dataset.concatenate(answer_dataset)

# Decode the first (input, target) pair back to text as a sanity check.
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

# Number of sequence pairs before batching.
print(len(dataset))
BATCH_SIZE = 64
BUFFER_SIZE = 10000  # shuffle buffer size, in elements
# Shuffle, group into fixed-size batches (dropping a short final batch), and prefetch.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)
# Number of batches after batching.
print(len(dataset))

In [None]:
# Building the model

# Hyperparameters for the character-level model.
# Number of distinct characters in the corpus. Note that the model instance is
# constructed with len(ids_from_chars.get_vocabulary()) rather than this value.
vocab_size = len(vocab)
# Passed to the Embedding layer as its output dimension.
embedding_dim = 256
# Passed to the GRU layer as its number of units.
rnn_units = 1024

class MyModel(tf.keras.Model):
  """Character-level sequence model: Embedding -> GRU -> Dense.

  The Dense layer has `vocab_size` outputs and no activation, so the model
  emits one score per vocabulary entry for every timestep of the input.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the layers.

    Args:
      vocab_size: number of distinct token IDs; sizes both the embedding
        table and the output layer.
      embedding_dim: size of each embedding vector.
      rnn_units: number of units in the GRU.
    """
    # The base initialiser takes no positional arguments; the instance must
    # not be passed again explicitly.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences: emit an output for every timestep.
    # return_state: also return the final hidden state so callers can carry it over.
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of ID sequences.

    Args:
      inputs: tensor of token IDs (assumed (batch_size, sequence_length),
        matching the notebook's shape check; TODO confirm).
      states: optional initial GRU state; when None the GRU's own initial
        state for this input is used.
      return_state: when True, also return the GRU's final state.
      training: forwarded to every layer.

    Returns:
      The Dense layer's output, or a tuple ``(output, states)`` when
      `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

# The vocabulary size must match the `StringLookup` layers, so it is taken
# from the lookup layer itself rather than from the raw `vocab` list.
lookup_vocab = ids_from_chars.get_vocabulary()
model = MyModel(
    vocab_size=len(lookup_vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
)

In [None]:
# Compare the lookup layer's vocabulary size with the raw character count.
lookup_size = len(ids_from_chars.get_vocabulary())
print(lookup_size, len(vocab))

# Decode one element of the prepared dataset back to text.
for input_example, target_example in dataset.take(1):
    for label, example in (("Input :", input_example), ("Target:", target_example)):
        print(label, text_from_ids(example).numpy())


In [None]:
# Run one batch through the untrained model to check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    print(input_example_batch)
    # Kept at module level: the sampling cell reads example_batch_predictions.
    example_batch_predictions = model(input_example_batch)
    output_shape = example_batch_predictions.shape
    print(output_shape, "# (batch_size, sequence_length, vocab_size)")

In [None]:
# Print the model summary (the model was already called on one batch in the preceding cell).
model.summary()

In [None]:
# Sample one token ID per timestep from the model's output for the first
# example in the batch, then drop the trailing num_samples axis.
first_example_logits = example_batch_predictions[0]
sampled_indices = tf.squeeze(
    tf.random.categorical(first_example_logits, num_samples=1),
    axis=-1,
).numpy()
sampled_indices