In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, TFGPT2Model, TFGPT2ForSequenceClassification
import tensorflow as tf
import re
import os
from tensorflow.keras.metrics import AUC

In [14]:
os.environ['TF_ENABLE_EAGER_CLIENT_STREAMING_ENQUEUE'] = 'False'

In [15]:
model_dir = "../../dem_rep/dem_rep_model"
democrat_file = "../../newtweets/democrat.csv"
republican_file = "../../newtweets/republican.csv"

model_name = "gpt2"

batch_size = 8
batch_size_fitting = 1024

In [16]:
democrat_df = pd.read_csv(democrat_file)
republican_df = pd.read_csv(republican_file)

In [17]:
democrat_df["label"] = 1
republican_df["label"] = 0

amount_of_entries = min(len(democrat_df), len(republican_df)) - 10

democrat_df = democrat_df.head(amount_of_entries)
republican_df = republican_df.head(amount_of_entries)

In [18]:
combined_df = pd.concat([democrat_df, republican_df], ignore_index=True)
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [19]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    combined_df["text"].values,
    combined_df["label"].values,
    test_size=0.1,
    random_state=42
)

In [20]:
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [21]:
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512, return_tensors='tf')
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=512, return_tensors='tf')

In [22]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
))

In [None]:
class GPT2Classifier(tf.keras.Model):
    def __init__(self, num_classes, l2_lambda=0.01):
        super(GPT2Classifier, self).__init__()
        self.gpt2 = TFGPT2Model.from_pretrained(model_name)
        self.dropout = tf.keras.layers.Dropout(0.1)
        self.dense = tf.keras.layers.Dense(num_classes, activation='softmax',
                                           kernel_regularizer=tf.keras.regularizers.L2(l2_lambda))

    def call(self, inputs):
        outputs = self.gpt2(inputs)[0]
        pooled_output = tf.reduce_mean(outputs, axis=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.dense(pooled_output)
        return logits

In [None]:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

strategy = tf.distribute.experimental.TPUStrategy(tpu)
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [10]:
auc_metric = AUC()




In [None]:
with strategy.scope():
    model = GPT2Classifier(num_classes=2)
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5) #orig 3e-5
    loss = tf.keras.losses.SparseCategoricalCrossentropy()
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    )

In [None]:
i = 0

final_dataset = train_dataset.shuffle(amount_of_entries).batch(batch_size_fitting)
amount_of_steps = len(final_dataset)

for batch in final_dataset:
    print(f"Run {i + 1} out of {amount_of_steps}")
    model.fit(batch[0], batch[1], epochs=6, batch_size=batch_size)
    if i % 50 == 0:
        print(f"Saving at step {i}")
        model.save(model_dir)
        print("Saved")
    i += 1

In [None]:
model.save(model_dir)

In [23]:
loss, accuracy, auc = model.evaluate(test_dataset.batch(batch_size))
print(f"Test accuracy: {accuracy}")
print(f"AUC: {auc}")


  38/2756 [..............................] - ETA: 3:10:46 - loss: 0.9244 - accuracy: 0.5789

KeyboardInterrupt: 

In [11]:
model = tf.keras.models.load_model(model_dir)




In [None]:
def predict_news_class(probabilities, threshold=0.5):
    republican_probability, democrat_probability = probabilities[0]
    if (republican_probability - democrat_probability)**2 > 0.3:
        if republican_probability > threshold:
            return "republican", republican_probability, democrat_probability, ((republican_probability - democrat_probability)**2)
        else:
            return "democrat", republican_probability, democrat_probability, ((republican_probability - democrat_probability)**2)
    else:
        return "neutral", republican_probability, democrat_probability, ((republican_probability - democrat_probability)**2)


In [None]:
def remove_emojis_and_links(text):
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]|RT', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.replace("\n", "")
    return text.strip()

In [None]:
while True:
    inputs = tokenizer(remove_emojis_and_links(input()), return_tensors='tf', max_length=512, truncation=True, padding='max_length')
    output = model(inputs)
    print(predict_news_class(output), sep = "\n")

In [2]:
tf. __version__ 

'2.15.0'