In [0]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import functools
import numpy as np
import pandas as pd
import tensorflow_datasets as tfds
from tensorflow.python.keras import regularizers


# prepare kaggle dataset
###################

df = pd.read_json("test_set.json")
# print(df)
Xd = df.iloc[: , 1]
# print(Xd)
kaggle_x = Xd.to_numpy()

# use the ids as dummy y values.
# makes code reuse easier. they do not impact the results whatsoever.

ids_kaggle = df.iloc[:, 0]

###################

# prepare LIAR dataset
# ##################

def func(x):
    if (x=="true" or x=="mostly-true" or x=="half-true"):
        return 1
    elif(x=="false" or x=="barely-true" or x=="pants-fire" ):
        return 0

df_liar = pd.read_csv('test.tsv', sep='\t')
Xd = df_liar.iloc[:,2]
Yd = df_liar.iloc[:,1]

X_arr = Xd.to_numpy()
print(X_arr.shape)
Y_arr = Yd.map(func)
Y_arr = Y_arr.to_numpy()

liar_x = X_arr
liar_y = Y_arr

####################

BATCH_SIZE = 1

train_data = pd.read_csv("train.csv")
valid_data = pd.read_csv("valid.csv")
test_data = pd.read_csv("test.csv")


train_x = train_data['content'].to_numpy()
train_y = train_data['type_id'].to_numpy()

valid_x = valid_data['content'].to_numpy()
valid_y = valid_data['type_id'].to_numpy()

test_x = test_data['content'].to_numpy()
test_y = test_data['type_id'].to_numpy()



train_dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y))
valid_dataset = tf.data.Dataset.from_tensor_slices((valid_x, valid_y))
test_dataset = tf.data.Dataset.from_tensor_slices((test_x, test_y))
liar_dataset = tf.data.Dataset.from_tensor_slices((liar_x, liar_y))
kaggle_dataset = tf.data.Dataset.from_tensor_slices(kaggle_x)

tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()

# only build vocabulary on training set
for content, _ in train_dataset:
  some_tokens = tokenizer.tokenize(content.numpy())
  vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)

encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)


def encode(text_tensor, label):
 
  encoded_text = encoder.encode(text_tensor.numpy())
  return encoded_text, label

def encode_map_fn(text, label):
  
  encoded_text, label = tf.py_function(encode, 
                                       inp=[text, label], 
                                       Tout=(tf.int64, tf.int64))

  encoded_text.set_shape([None])
  label.set_shape([])

  return encoded_text, label


def encode_kaggle(text_tensor):
  
  encoded_text = encoder.encode(text_tensor.numpy())
  return encoded_text,


def encode_kaggle_map_fn(text):

  encoded_text = tf.py_function(encode_kaggle, inp=[text], Tout=tf.int64)
  encoded_text.set_shape([None])
  return encoded_text,



train_encoded_data = train_dataset.map(encode_map_fn)  
valid_encoded_data = valid_dataset.map(encode_map_fn)  
test_encoded_data = test_dataset.map(encode_map_fn)
liar_encoded_data = liar_dataset.map(encode_map_fn)
kaggle_encoded_data = kaggle_dataset.map(encode_kaggle_map_fn)


train_batches = train_encoded_data.padded_batch(BATCH_SIZE)
valid_batches = valid_encoded_data.padded_batch(BATCH_SIZE)
test_batches = test_encoded_data.padded_batch(BATCH_SIZE)
liar_batches = liar_encoded_data.padded_batch(BATCH_SIZE)
kaggle_batches = kaggle_encoded_data.padded_batch(BATCH_SIZE)


embedding_dim=8

model = keras.Sequential([
  layers.Embedding(encoder.vocab_size, embedding_dim, ),
  layers.GlobalAveragePooling1D(),
  layers.Dense(8, activation='relu', kernel_regularizer=regularizers.l2()),
  layers.Dense(1)
])

# model.summary()


model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

history = model.fit(
    train_batches,
    validation_data=valid_batches,
    epochs=1,
    )



result = model.evaluate(liar_batches)
result2 = model.evaluate(test_batches)


#####################

predictions = (model.predict(kaggle_batches) > 0.5).astype("int32")


truecounts=0
falsecounts=0
for res in predictions:
  if (res[0]==0):
      falsecounts+=1
  else:
      truecounts+=1    

print(falsecounts)
print(truecounts)

####################

def decodeKaggle(x):
  if (x == 0):
    return 'FAKE'
  else:
    return 'TRUE'  

labels = pd.DataFrame(data=predictions, columns=['label']).applymap(decodeKaggle)
ids_kaggle = ids_kaggle.to_frame()
ids_kaggle.columns = ['id']

kaggle_res = ids_kaggle.join(labels)

kaggle_res.to_csv('predictions.csv', index=False)

####################


(1266,)
5454
881


