In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, TFGPT2Model, TFGPT2ForSequenceClassification
import tensorflow as tf
import re
import os

In [4]:
os.environ['TF_ENABLE_EAGER_CLIENT_STREAMING_ENQUEUE'] = 'False'

In [5]:
democrat_file = "/content/drive/MyDrive/mount/democrat.csv"
republican_file = "/content/drive/MyDrive/mount/republican.csv"

model_name = "gpt2"
model_dir = "/content/drive/MyDrive/mount/results/dem_rep_model"

batch_size = 8
batch_size_fitting = 1024

In [5]:
democrat_df = pd.read_csv(democrat_file)
republican_df = pd.read_csv(republican_file)

In [6]:
democrat_df["label"] = 1
republican_df["label"] = 0

amount_of_entries = min(len(democrat_df), len(republican_df)) - 10

democrat_df = democrat_df.head(amount_of_entries)
republican_df = republican_df.head(amount_of_entries)

In [7]:
combined_df = pd.concat([democrat_df, republican_df], ignore_index=True)
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [8]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    combined_df["text"].values,
    combined_df["label"].values,
    test_size=0.1,
    random_state=42
)

In [9]:
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [10]:
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512, return_tensors='tf')
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=512, return_tensors='tf')

In [11]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
))

In [12]:
class GPT2Classifier(tf.keras.Model):
    def __init__(self, num_classes, l2_lambda=0.01):
        super(GPT2Classifier, self).__init__()
        self.gpt2 = TFGPT2Model.from_pretrained(model_name)
        self.dropout = tf.keras.layers.Dropout(0.1)
        self.dense = tf.keras.layers.Dense(num_classes, activation='softmax',
                                           kernel_regularizer=tf.keras.regularizers.L2(l2_lambda))

    def call(self, inputs):
        outputs = self.gpt2(inputs)[0]
        pooled_output = tf.reduce_mean(outputs, axis=1)
        pooled_output = self.dropout(pooled_output)
        logits = self.dense(pooled_output)
        return logits

In [13]:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

strategy = tf.distribute.experimental.TPUStrategy(tpu)
print("REPLICAS: ", strategy.num_replicas_in_sync)



REPLICAS:  8


In [14]:
with strategy.scope():
    model = GPT2Classifier(num_classes=2)
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5) #orig 3e-5
    loss = tf.keras.losses.SparseCategoricalCrossentropy()
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    )

All PyTorch model weights were used when initializing TFGPT2Model.

All the weights of TFGPT2Model were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


In [15]:
i = 0

final_dataset = train_dataset.shuffle(amount_of_entries).batch(batch_size_fitting)
amount_of_steps = len(final_dataset)

for batch in final_dataset:
    print(f"Run {i + 1} out of {amount_of_steps}")
    model.fit(batch[0], batch[1], epochs=6, batch_size=batch_size)
    if i % 50 == 0:
        print(f"Saving at step {i}")
        model.save(model_dir)
        print("Saved")
    i += 1

Run 1 out of 194
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Saving at step 0
Saved
Run 2 out of 194
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Run 3 out of 194
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Run 4 out of 194
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Run 5 out of 194
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Run 6 out of 194
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Run 7 out of 194
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Run 8 out of 194
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Run 9 out of 194
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Run 10 out of 194
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Run 11 out of 194
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Run 12 out of 194
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Run 13 out of 194
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Run 14 out of 194
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Run 15 out of 194
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Run 16 out of 194
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Run 17 out of 194
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [16]:
model.save(model_dir)

In [17]:
loss, accuracy = model.evaluate(test_dataset.batch(batch_size))
print(f"Test accuracy: {accuracy}")

Test accuracy: 0.5456277132034302


In [6]:
new_model = tf.keras.models.load_model(model_dir)


In [57]:
def predict_news_class(probabilities, threshold=0.5):
    republican_probability, democrat_probability = probabilities[0]
    if (republican_probability - democrat_probability)**2 > 0.3:
        if republican_probability > threshold:
            return "republican", republican_probability, democrat_probability, ((republican_probability - democrat_probability)**2)
        else:
            return "democrat", republican_probability, democrat_probability, ((republican_probability - democrat_probability)**2)
    else:
        return "neutral", republican_probability, democrat_probability, ((republican_probability - democrat_probability)**2)


In [59]:
def remove_emojis_and_links(text):
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]|RT', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.replace("\n", "")
    return text.strip()

In [60]:
while True:
    inputs = tokenizer(remove_emojis_and_links(input()), return_tensors='tf', max_length=512, truncation=True, padding='max_length')
    output = new_model(inputs)
    print(predict_news_class(output), sep = "\n")

This is what Biden's White House was busy with last week. Lord help us.
('republican', <tf.Tensor: shape=(), dtype=float32, numpy=0.97664464>, <tf.Tensor: shape=(), dtype=float32, numpy=0.02335537>, <tf.Tensor: shape=(), dtype=float32, numpy=0.9087604>)
A pro-Palestine mob is blocking the road to O'Hare International Airport, forcing angry passengers to ditch their cars.
('republican', <tf.Tensor: shape=(), dtype=float32, numpy=0.8128468>, <tf.Tensor: shape=(), dtype=float32, numpy=0.18715325>, <tf.Tensor: shape=(), dtype=float32, numpy=0.39149243>)
Texas National Guard stops a mob of illegals from storming the border:
('republican', <tf.Tensor: shape=(), dtype=float32, numpy=0.9584139>, <tf.Tensor: shape=(), dtype=float32, numpy=0.041586023>, <tf.Tensor: shape=(), dtype=float32, numpy=0.8405733>)
I found a guy who goes around Miami asking insanely fit people over 40:  • What they eat • What workouts they do • What supplements they take  They all have really surprising answers...  Here

KeyboardInterrupt: Interrupted by user