In [None]:
# Uncomment line below, FOR COLAB ONLY: YOU NEED TO UPDATE NLTK
#!pip install --upgrade nltk
#reqs
import pickle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import random
import numpy as np
import pickle
from google.colab import drive
from nltk.util import ngrams
from nltk.lm import NgramCounter
import matplotlib
from operator import itemgetter
%matplotlib inline

# Mount Google Drive so the pickled artifacts below can be read from it.
drive.mount('/content/drive')

# Load the test data and the tokenizer.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('/content/drive/MyDrive/testing_data3.pkl', 'rb') as data_file:
    data = pickle.load(data_file)

with open('/content/drive/MyDrive/tokenizer3.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)


In [None]:
#Some functions needed to load the model
def get_lr_metric(optimizer):
    """Build a metric function that reports the optimizer's learning rate.

    Args:
        optimizer: Object exposing its learning rate as ``learning_rate``
            (preferred) or as the legacy ``lr`` attribute.

    Returns:
        A function ``lr(y_true, y_pred)`` that ignores both arguments and
        returns the optimizer's current learning rate.
    """
    # The inner function keeps the name `lr`: Keras labels function metrics
    # by the function's name, so renaming it would rename the metric.
    def lr(y_true, y_pred):
        # `learning_rate` is the documented attribute; `lr` is a legacy alias
        # that is not available on every Keras release, so it is only a fallback.
        if hasattr(optimizer, "learning_rate"):
            return optimizer.learning_rate
        return optimizer.lr
    return lr
# Adam with default settings; the same instance is handed to the metric
# factory here and to model.compile further down.
optimizer = tf.keras.optimizers.Adam()
lr_metric = get_lr_metric(optimizer=optimizer)

In [None]:
# Load the saved model without its stored compile state, then compile it
# here with the optimizer and learning-rate metric created above.
model = tf.keras.models.load_model(
    '/content/drive/MyDrive/saved_model3/model_v3_final',
    compile=False,
)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy', lr_metric],
)

In [None]:
#Define the data class
#TODO: Move into its own file, so it can be used in multiple places
class Data:
  """Serves batches drawn from per-author sequences of examples.

  ``data`` is indexed by an integer author id (``0 .. len(data) - 1``). Each
  entry is a sequence of examples, and each example is indexed positionally:
  ``example[0]`` is the model input, ``example[1]`` the class target and
  ``example[2]`` the generation target (an integer class index that gets
  one-hot encoded against ``vocab_length``).

  All methods read ``self.data`` and ``self.vocab_length``; nothing is taken
  from notebook globals, so the class works with whatever it was built from.
  """

  # Multiplier applied to batch_size when drawing test samples.
  # NOTE(review): presumably the number of authors - confirm before changing.
  _TEST_BATCH_MULTIPLIER = 6

  def __init__ (self, data, batch_size, vocab_length):
    """Store the inputs and draw the initial read positions.

    Args:
      data: Per-author sequences of examples (see the class docstring).
      batch_size: Number of examples taken from each author per batch.
      vocab_length: Number of classes used to one-hot encode example[2].

    Raises:
      ValueError: If any author has fewer than ``batch_size + 1`` examples
        (raised by ``random.sample`` in ``create_counters``).
    """
    self.data = data
    self.batch_size = batch_size
    self.lengths = [len(x) for x in data]
    self.counters = self.create_counters()
    self.vocab_length = vocab_length

  def next_batch(self):
    """Return the next batch and advance every read position by one.

    Returns:
      A tuple ``(X, y_gen, y_class)``: inputs as an array, one-hot encoded
      generation targets, and class targets as an array. Note that this
      order differs from ``get_random_starts_for_testing``.
    """
    X = []
    y_class = []
    y_gen = []
    for author in range(len(self.data)):
      examples = self.data[author]
      positions = self.counters[author]
      for slot in range(self.batch_size):
        example = examples[positions[slot]]
        X.append(example[0])
        y_class.append(example[1])
        y_gen.append(example[2])
    self.update_counters()
    return np.asarray(X), keras.utils.to_categorical(y_gen, num_classes=self.vocab_length), np.asarray(y_class)

  def create_counters(self):
    """Draw ``batch_size`` distinct start positions for every author.

    Positions are sampled from ``range(len(examples) - 1)``, so the last
    example of an author is never an initial position.

    Returns:
      Dict mapping author index to a list of ``batch_size`` positions.
    """
    return {
      author: random.sample(range(len(self.data[author]) - 1), self.batch_size)
      for author in range(len(self.data))
    }

  def update_counters(self):
    """Advance every read position by one, wrapping at the author's length."""
    for author, positions in self.counters.items():
      limit = self.lengths[author]
      for slot in range(len(positions)):
        positions[slot] = (positions[slot] + 1) % limit

  def get_random_starts_for_testing(self, author):
    """Sample ``batch_size * 6`` random examples (with replacement) of one author.

    Args:
      author: Index into ``self.data``.

    Returns:
      A tuple ``(X, y_class, y_gen)``: inputs, class targets, and one-hot
      encoded generation targets. Note that this order differs from
      ``next_batch``.
    """
    examples = self.data[author]
    sample_count = self.batch_size * self._TEST_BATCH_MULTIPLIER
    starts = [random.randint(0, len(examples) - 1) for _ in range(sample_count)]
    chosen = [examples[start] for start in starts]
    X = np.asarray([example[0] for example in chosen])
    y_class = np.asarray([example[1] for example in chosen])
    y_gen = tf.keras.utils.to_categorical([example[2] for example in chosen], num_classes=self.vocab_length)
    return X, y_class, y_gen


In [None]:
#Instantiate the batch provider: 150 examples per author, vocabulary size = tokenizer words + 1
data_getter = Data(
    data=data,
    batch_size=150,
    vocab_length=len(tokenizer.word_index) + 1,
)

In [None]:
#Evaluate the model on iter_num test batches
#reset_metrics=False keeps the metric state across calls instead of resetting it per batch
iter_num = 20
for batch_index in range(iter_num):
    batch = data_getter.next_batch()
    totalX, total_gen_y, total_class_y = batch
    batch_results = model.test_on_batch(
        totalX,
        [total_gen_y, total_class_y],
        reset_metrics=False,
    )
    total_loss, loss_g, loss_c, acc_g, lr_g, acc_c, lr_c = batch_results
print(total_loss)

In [None]:
#Check accuracy: acc_g is the fourth value returned by the last model.test_on_batch call above
#NOTE(review): presumably the accuracy of the generation output - confirm against the model's output order
print(acc_g)

In [None]:
#Dictionary of generated samples: author index -> {'real': [...], 'fake': [...]}
#Every author gets its own fresh pair of lists.
samples = {
    author: {'real': [], 'fake': []}
    for author in range(6)
}

In [None]:
#Iterate, generate samples of length 100. Store in dictionary, separated into real and fake samples. THIS TAKES A WHILE AND COLAB LIKES TO TIME OUT :(
#The dictionary is re-saved after every outer iteration, so a timeout loses at most one iteration.
num_iter = 100
for i in range(num_iter):
  print(i)
  for author in range(6):
    test = data_getter.get_random_starts_for_testing(author)
    X = test[0]
    for ii in range(100):
      # Predict from the last 100 columns of X only.
      results = model.predict_on_batch(X[:,-100:])
      # Greedy pick from the model's first output, shaped into one column so
      # it can be appended to X. reshape(-1, 1) infers the row count, so this
      # no longer breaks when batch_size * 6 is not 900.
      processed_results = np.argmax(results[0], axis=1).reshape(-1, 1)
      X = np.concatenate((X, processed_results), axis=1)
    # 'real' = first 100 columns (the sampled input), 'fake' = last 100 columns (generated tokens).
    # NOTE(review): assumes the sampled inputs are exactly 100 tokens wide - confirm.
    samples[author]['real'].append(np.asarray(tokenizer.sequences_to_texts(X[:,:100].tolist())))
    samples[author]['fake'].append(np.asarray(tokenizer.sequences_to_texts(X[:,-100:].tolist())))
    print("completed")
  with open('/content/drive/MyDrive/evaluate_data', 'wb+') as file:
      pickle.dump(samples, file)



In [None]:
#Reload the generated samples from Drive, if they've already been generated
#NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('/content/drive/MyDrive/evaluate_data', 'rb') as samples_file:
    samples = pickle.load(samples_file)

In [None]:
#Concatenate the per-iteration arrays into one array per author and kind, so it's easier to work with
for key in samples:
  for truth in ('real', 'fake'):
    batches = samples[key][truth]
    # Only merge while the entry is still the list of per-iteration arrays.
    # Once merged it is a single array, and concatenating that again raises,
    # so this guard makes the cell safe to re-run.
    if isinstance(batches, list):
      samples[key][truth] = np.concatenate(batches, axis=0)

In [None]:
#Create Ngram counter for Unigrams, Bigrams, and Trigrams. Then save it
#Resulting layout: ngram_total[author][truth] = {'ngrams': [unigrams, bigrams, trigrams], 'counter': NgramCounter}
ngram_total = {}
for key in samples:
  ngram_total[key] = {}
  for truth in ['real', 'fake']:
    print("starting")
    tokenized = [sent.split() for sent in samples[key][truth].tolist()]
    print(tokenized[0])
    # ngrams() returns a single-use iterator. The counter below consumes it,
    # so each sentence's n-grams are materialised as a list first; otherwise
    # the stored 'ngrams' entry would hold exhausted iterators, which are
    # useless and may not be picklable.
    # NOTE: this keeps every n-gram in memory; drop the 'ngrams' entry if only
    # the counter is needed and memory is tight.
    text_unigrams = [list(ngrams(tokens, 1)) for tokens in tokenized]
    text_bigrams = [list(ngrams(tokens, 2)) for tokens in tokenized]
    text_trigrams = [list(ngrams(tokens, 3)) for tokens in tokenized]
    ngram_counts = NgramCounter(text_bigrams + text_unigrams + text_trigrams)
    ngram_total[key][truth] = {}
    ngram_total[key][truth]['ngrams'] = [text_unigrams, text_bigrams, text_trigrams]
    ngram_total[key][truth]['counter'] = ngram_counts
    print("ending")
with open('/content/drive/MyDrive/ngrams_data', 'wb+') as file:
    pickle.dump(ngram_total, file)


In [None]:
# Print how often one chosen word occurs for every author, in real and in
# generated text, to spot words that are especially prevalent in one text.
for author in range(6):
  for truth in ('real', 'fake'):
    # Index 1 of the counter selects the unigram counts.
    unigram_counts = ngram_total[author][truth]['counter'][1]
    print(unigram_counts['expect'])


  

In [None]:
#Print every unigram of the generated text together with its count for each author.
#Used to look for words that are present more in one text than another.
#A word is printed once for every author whose counter contains it.
for truth in ['fake']:
  for author in range(6):
    for item in ngram_total[author][truth]['counter'][1]:
      per_author_counts = [
        ngram_total[other][truth]['counter'][1][item]
        for other in range(6)
      ]
      print([item] + per_author_counts)





  