In [1]:
# Run once to install the dependencies (you may have to restart the notebook kernel afterwards).
# !pip install tensorflow-gpu==1.15 --user # if you do not have a GPU, remove the "-gpu" suffix
# !pip install gpt-2-simple --user

In [2]:
import pandas as pd
import numpy as np
import json
import os
import requests

In [3]:
# Set to False if you are NOT loading a pretrained model from local disk.
is_local = True
if not is_local:
    # Use the gpt_2_simple package as installed in the environment.
    import gpt_2_simple as gpt2
else:
    import sys

    # Put the local gpt-2-simple-0.7 source directory first on the module
    # search path so that its gpt_2 module can be imported directly.
    gpt2_source_dir = os.path.abspath('../../gpt-2-simple-0.7/gpt_2_simple')
    sys.path.insert(0, gpt2_source_dir)
    import gpt_2 as gpt2

    local_model_name = 'model-100'
    local_checkpoint_dir = "../../local_checkpoints"  # directory where local models are stored

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [4]:
# Check that the GPU is recognized; training is significantly faster on a GPU.

# from tensorflow.python.client import device_lib
# print(device_lib.list_local_devices())

In [5]:
# Download the base model unless a models/<model_name> directory already exists.
model_name = "124M"
base_model_path = os.path.join("models", model_name)
base_model_present = os.path.isdir(base_model_path)
if not base_model_present:
    print(f"Downloading {model_name} model...")
    gpt2.download_gpt2(model_name=model_name)

In [6]:
# Relative path of the folder containing the input text files.
dir_path = "../data/"

# One group per corpus: (result file, sample file, model/run name).
dem_file_name, dem_sample_name, dem_model_name = (
    "democrats_result.txt",
    "democrats_sample.txt",
    "dem",
)

rep_file_name, rep_sample_name, rep_model_name = (
    "republican_result.txt",
    "republican_sample.txt",
    "rep",
)

both_file_name, both_sample_name, both_model_name = (
    "both_result.txt",
    "both_sample.txt",
    "both",
)

In [7]:
# reading and writing sample files for each side

# with open(dir_path+dem_file_name,'r') as demf:
#     dem_data = demf.readlines(500000)
# with open(dir_path+dem_sample_name,'w+') as dem_write:
#     dem_write.writelines(dem_data)

# with open(dir_path+rep_file_name,'r') as repf:
#     rep_data = repf.readlines(500000)
# with open(dir_path+rep_sample_name,'w+') as rep_write:
#     rep_write.writelines(rep_data)

# dem_data.extend(rep_data)
# both_data = dem_data


# with open(dir_path+both_file_name,'r') as bothf:
#     both_data = bothf.readlines(100000)
# with open(dir_path+both_sample_name,'w+') as both_write:
#     both_write.writelines(both_data)

In [8]:
# Select which corpus to use: run name, training text file, and the file
# that generated text is written to.
train_name = both_model_name
train_fp = f"{dir_path}{both_sample_name}"  # text file to train the model on
results_fp = f"../results/{train_name}_generated.txt"

In [9]:
# Slowest cell in the notebook. It can only be run once per session:
# restart the notebook before running it again.
sess = gpt2.start_tf_sess()
if not is_local:
    # Train on the selected text file.
    finetune_options = dict(
        model_name=model_name,
        run_name=train_name,   # model name, so we can load different models locally
        restore_from='fresh',  # do not resume from a previously trained model
        steps=100,             # maximum number of training steps
        print_every=20,        # only print every 20 training steps
    )
    gpt2.finetune(sess, train_fp, **finetune_options)
else:
    # Load the locally stored checkpoint for this run name instead of training.
    gpt2.load_gpt2(
        sess,
        run_name=train_name,
        checkpoint_dir=local_checkpoint_dir,
        checkpoint=local_model_name,
    )

Loading checkpoint ../../local_checkpoints/both/model-100
INFO:tensorflow:Restoring parameters from ../../local_checkpoints/both/model-100


In [10]:
# Prompt the model responds to: a post/comment taken from the political
# discussion subreddits. Adjacent literals are joined into one single-line string.
pre = (
    "After witnessing the heinous amount of police brutality in the US against POC, "
    "my partner and I have created some protest posters in response. We do not care about "
    "credit, just getting the message out there, so feel free to distribute however way you "
    "want, or make better versions yourself if you think it can be improved!"
)

In [None]:
# Generate responses to the prompt and show them in the notebook output.
gpt2.generate(
    sess,
    prefix=pre,      # prompt
    nsamples=5,      # number of generated responses
    # Requested size of each response.
    # NOTE(review): upstream gpt-2-simple documents `length` as a token count,
    # not a word count; confirm this (and whether the prompt is counted) for the local copy.
    length=400,
    temperature=.8,  # uniqueness of the output (usually ranges from .5 to 2)
)

In [11]:
# Generate responses to the prompt and write them to results_fp.
gpt2.generate_to_file(
    sess,
    destination_path=results_fp,
    prefix=pre,      # prompt
    nsamples=5,      # number of generated responses
    # Requested size of each response.
    # NOTE(review): upstream gpt-2-simple documents `length` as a token count,
    # not a word count; confirm this (and whether the prompt is counted) for the local copy.
    length=400,
    temperature=.8,  # uniqueness of the output (usually ranges from .5 to 2)
)