In [0]:
%tensorflow_version 1.x
import os
import tarfile
import shutil

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
!pip install bert-tensorflow
!pip install -q gpt-2-simple
import gpt_2_simple as gpt2
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

tf.logging.set_verbosity(tf.logging.ERROR)

OUTPUT_DIR = 'tmp'

gpt2.mount_gdrive()

gpt2.copy_file_from_gdrive("bert_gan_real.csv")

real = pd.read_csv('bert_gan_real.csv')
fakes = []
for i in ["100", "200", "400", "600", "800"]:
  gpt2.copy_file_from_gdrive(f"bert_gan_fake{i}.csv")
  fakes.append(pd.read_csv(f"bert_gan_fake{i}.csv"))
fake = pd.concat(fakes)
fake['reply']=fake['reply'].astype(str)
fake['real']=0
df = pd.concat([real,fake])[['comment','reply','real']].dropna()


INPUT_COLUMN = 'comment'
DATA_COLUMN = 'reply'
LABEL_COLUMN = 'real'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]

train, test = train_test_split(df, test_size=0.1)

# Use the InputExample class from BERT's run_classifier code to create examples from the data
train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                   text_a = x[INPUT_COLUMN], 
                                                                   text_b = x[DATA_COLUMN], 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None, 
                                                                   text_a = x[INPUT_COLUMN], 
                                                                   text_b = x[DATA_COLUMN], 
                                                                   label = x[LABEL_COLUMN]), axis = 1)

# This is a path to an uncased (all lowercase) version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
  """Get the vocab file and casing info from the Hub module."""
  with tf.Graph().as_default():
    bert_module = hub.Module(BERT_MODEL_HUB)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.Session() as sess:
      vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                            tokenization_info["do_lower_case"]])
      
  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module()

# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 64
# Convert our train and test features to InputFeatures that BERT understands.
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
  """Creates a classification model."""

  bert_module = hub.Module(
      BERT_MODEL_HUB,
      trainable=True)
  bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
  bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

  # Use "pooled_output" for classification tasks on an entire sentence.
  # Use "sequence_outputs" for token-level output.
  output_layer = bert_outputs["pooled_output"]

  hidden_size = output_layer.shape[-1].value

  # Create our own layer to tune for politeness data.
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):

    # Dropout helps prevent overfitting
    output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # Convert labels into one-hot encoding
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))
    # If we're predicting, we want predicted labels and the probabiltiies.
    if is_predicting:
      return (predicted_labels, log_probs)

    # If we're train/eval, compute loss between predicted and actual label
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs)


# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
  """Returns `model_fn` closure for TPUEstimator."""
  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)
    
    # TRAIN and EVAL
    if not is_predicting:

      (loss, predicted_labels, log_probs) = create_model(
        is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

      train_op = bert.optimization.create_optimizer(
          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)

      # Calculate evaluation metrics. 
      def metric_fn(label_ids, predicted_labels):
        accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
        f1_score = tf.contrib.metrics.f1_score(
            label_ids,
            predicted_labels)
        auc = tf.metrics.auc(
            label_ids,
            predicted_labels)
        recall = tf.metrics.recall(
            label_ids,
            predicted_labels)
        precision = tf.metrics.precision(
            label_ids,
            predicted_labels) 
        true_pos = tf.metrics.true_positives(
            label_ids,
            predicted_labels)
        true_neg = tf.metrics.true_negatives(
            label_ids,
            predicted_labels)   
        false_pos = tf.metrics.false_positives(
            label_ids,
            predicted_labels)  
        false_neg = tf.metrics.false_negatives(
            label_ids,
            predicted_labels)
        return {
            "eval_accuracy": accuracy,
            "f1_score": f1_score,
            "auc": auc,
            "precision": precision,
            "recall": recall,
            "true_positives": true_pos,
            "true_negatives": true_neg,
            "false_positives": false_pos,
            "false_negatives": false_neg
        }

      eval_metrics = metric_fn(label_ids, predicted_labels)

      if mode == tf.estimator.ModeKeys.TRAIN:
        return tf.estimator.EstimatorSpec(mode=mode,
          loss=loss,
          train_op=train_op)
      else:
          return tf.estimator.EstimatorSpec(mode=mode,
            loss=loss,
            eval_metric_ops=eval_metrics)
    else:
      (predicted_labels, log_probs) = create_model(
        is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

      predictions = {
          'probabilities': log_probs,
          'labels': predicted_labels
      }
      return tf.estimator.EstimatorSpec(mode, predictions=predictions)

  # Return the actual model function in the closure
  return model_fn

# Compute train and warmup steps from batch size
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where hte learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

# Compute # train and warmup steps from batch size
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

# Specify outpit directory and number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

model_fn = model_fn_builder(
  num_labels=len(label_list),
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps)

estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})

# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

test_input_fn = run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

def getPrediction(in_sentence_pairs):
  labels = ["Fake", "Real"]
  input_examples = [run_classifier.InputExample(guid="", text_a = x[0], text_b = x[1], label = 0) for x in in_sentence_pairs] # here, "" is just a dummy label
  input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)
  predictions = estimator.predict(predict_input_fn)
  return pd.DataFrame([(sentence[0], sentence[1], np.exp(prediction['probabilities'][1]), labels[prediction['labels']]) for sentence, prediction in zip(in_sentence_pairs, predictions)], columns=['comment', 'reply', 'prob_real','label'])

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [0]:
print(f'Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

Beginning Training!


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Training took time  0:08:40.117440


In [0]:
estimator.evaluate(input_fn=test_input_fn, steps=None)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


{'auc': 0.99014676,
 'eval_accuracy': 0.99841666,
 'f1_score': 0.9991478,
 'false_negatives': 2.0,
 'false_positives': 15.0,
 'global_step': 9059,
 'loss': 0.008924643,
 'precision': 0.9984971,
 'recall': 0.9997994,
 'true_negatives': 754.0,
 'true_positives': 9966.0}

In [0]:
real_sent = test.loc[test['real']==1,['comment','reply']].values.tolist()
predictions_real = getPrediction(real_sent)
predictions_real.sort_values('prob_real')

Unnamed: 0,comment,reply,prob_real,label
1055,Physicists are welcome to poach on biological ...,"&gt;""Why"" it evolved is not a useful question....",0.000186,Fake
5771,There's an episode of the original *Battlestar...,https://www.imdb.com/title/tt0519755/,0.061550,Fake
9177,"Yeah, but where can it be streamed? Its not on...",[deleted],0.544600,Real
8042,Remindme! 2 hours,[deleted],0.623466,Real
8764,Why would you expect larger batch sizes to red...,[deleted],0.726553,Real
...,...,...,...,...
8213,The tutorial is json for me. Not sure if this ...,It's a Jupyter notebook lol,0.999986,Real
8491,"only have one monitor turned on while writing,...","Phew, good thing I use Firefox",0.999986,Real
7972,Dont worry about the monsters at all.\n\n&amp;...,"This. I once asked my brother, who’s an actor,...",0.999986,Real
4730,Jessica woke up in the morning and yawned and ...,"Until, she realised the sun was actually a poe...",0.999986,Real


In [0]:
gpt2.copy_file_from_gdrive("proposed_replies.csv")
proposed_replies = pd.read_csv('proposed_replies.csv').dropna()
replies_to_test = proposed_replies[['comment','proposed_reply']].values.tolist()
predictions_proposed = getPrediction(replies_to_test)
predictions_proposed.sort_values('prob_real', ascending=False)

Unnamed: 0,comment,reply,prob_real,label
116,"""They answered the question at the same time. ...","This is a great post, thank you. Keep up the g...",0.999974,Real
233,I love using semi-colons; they're highly versa...,I still use them for the same reason people us...,0.999967,Real
10,"""They blurted out opposite answers simultaneou...",They were answering the same question.,0.999959,Real
191,When to use a semicolon: at the end of every s...,The semicolon is like a semicolon in that it a...,0.999926,Real
234,I love using semi-colons; they're highly versa...,"I don't use them to separate two words, I use ...",0.999908,Real
...,...,...,...,...
61,"""Yes,"" said Joe.\n\n""No,"" said Jane at the sam...",I'm not sure why you would want to read this. ...,0.000078,Fake
253,This is the end-all-be-all of writing. Just re...,I've been meaning to do this for years.\nIn fa...,0.000078,Fake
254,This is the end-all-be-all of writing. Just re...,I don't think it is possible to follow these s...,0.000076,Fake
215,"...man, I love me some semicolons. Once you fi...",This is a great post. I hope I can contribute ...,0.000074,Fake


In [0]:
best_predictions = predictions_proposed.sort_values('prob_real', ascending=False).groupby('comment').first().reset_index()

In [0]:
return_table = proposed_replies.drop(['proposed_reply','Unnamed: 0'], axis=1).drop_duplicates().merge(best_predictions,on='comment')

In [0]:
return_table.loc[return_table['prob_real']>0.9].to_csv('realistic_replies.csv')
gpt2.copy_file_to_gdrive('realistic_replies.csv')


In [0]:
!pip install praw
import praw
reddit = praw.Reddit(client_id='kik3_XlQn0DcHQ', 
                     client_secret='',
                     password='',
                     username='',
                     user_agent='')
subreddit = reddit.subreddit('writing')

Collecting praw
[?25l  Downloading https://files.pythonhosted.org/packages/f6/df/b42c0a3b86a43a62a46e5b2f07930230ac7719624800a2052218993fb767/praw-6.4.0-py2.py3-none-any.whl (126kB)
[K     |████████████████████████████████| 133kB 4.7MB/s 
[?25hCollecting websocket-client>=0.54.0
[?25l  Downloading https://files.pythonhosted.org/packages/29/19/44753eab1fdb50770ac69605527e8859468f3c0fd7dc5a76dd9c4dbd7906/websocket_client-0.56.0-py2.py3-none-any.whl (200kB)
[K     |████████████████████████████████| 204kB 13.3MB/s 
[?25hCollecting prawcore<2.0,>=1.0.1
  Downloading https://files.pythonhosted.org/packages/76/b5/ce6282dea45cba6f08a30e25d18e0f3d33277e2c9fcbda75644b8dc0089b/prawcore-1.0.1-py2.py3-none-any.whl
Collecting update-checker>=0.16
  Downloading https://files.pythonhosted.org/packages/17/c9/ab11855af164d03be0ff4fddd4c46a5bd44799a9ecc1770e01a669c21168/update_checker-0.16-py2.py3-none-any.whl
Installing collected packages: websocket-client, prawcore, update-checker, praw
Successfu

In [0]:
for h in subreddit.rising(limit=5):
  already_replied = False
  for c in h.comments:
    if c.author == 'tupperware-party':
      already_replied = True
    for r in c.replies:
      if r.author == 'tupperware-party':
        already_replied = True
    if already_replied:
      break
  if already_replied:
    continue
  for c in h.comments:
    if c.score > 3:
      replies = generate_reply(c.body)
      reply_record.extend([(h.id, h.title, h.score, c.id, c.score, c.body, r) for r in replies])


In [0]:
# file_path = 'tuned_BERT_discriminator.tar'

# with tarfile.open(file_path, 'w') as tar:
#     tar.add(OUTPUT_DIR)

# shutil.copyfile(file_path, "/content/drive/My Drive/" + file_path)

'/content/drive/My Drive/tuned_BERT_discriminator.tar'