# Modules

In [9]:
import pandas as pd
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy

# Dataset

In [10]:
# Load the semicolon-separated tweet dataset and show which columns it provides.
df = pd.read_csv(
    './fake_news_tweets.csv',
    sep=';',
)
print("Dataset columns: ", df.columns)

Dataset columns:  Index(['tweet', 'label'], dtype='object')


# BERT model

## Tokenizer

In [11]:
# Pretrained WordPiece tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize every tweet in one call: pad to a common length, truncate
# over-long inputs, and return TensorFlow tensors.
tokenized_texts = tokenizer(
    df['tweet'].tolist(),
    padding=True,
    truncation=True,
    return_tensors='tf',
)
# Targets, in the same row order as the tokenized tweets.
labels = df['label'].tolist()

## Classification model

In [None]:
# Checkpoint to fine-tune. num_labels=1 gives the classification head a single
# output unit (one score per tweet).
model_name = 'bert-base-uncased'
model = TFBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
)

# Training

## Model compilation

In [13]:
# The classification head was created with num_labels=1, so the model emits one
# raw logit per tweet. Both the loss and the metric must therefore interpret
# logits, not probabilities.
model.compile(
    optimizer=Adam(learning_rate=2e-5),
    loss=BinaryCrossentropy(from_logits=True),
    # The 'accuracy' shortcut resolves to binary accuracy with a 0.5 cut-off,
    # which is only meaningful for probabilities. A logit of 0.0 corresponds
    # to a probability of 0.5, so threshold the logits at 0.0 instead.
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)

## Training data

In [14]:
# Pair each tokenized tweet (as a plain dict of tensors) with its label.
train_dataset = tf.data.Dataset.from_tensor_slices((dict(tokenized_texts), labels))

# Use a shuffle buffer that covers the whole dataset so every epoch sees a
# uniform permutation. A small fixed buffer only mixes neighbouring rows, which
# yields label-skewed batches when the CSV is ordered by label.
# max(..., 1) keeps the buffer size valid even for an empty label list.
shuffle_buffer_size = max(len(labels), 1)
train_dataset = train_dataset.shuffle(buffer_size=shuffle_buffer_size).batch(8)

## Model fitting

In [15]:
# Fine-tune for four passes over the batched training set; `history` keeps the
# per-epoch loss/metric values returned by Keras.
history = model.fit(
    x=train_dataset,
    epochs=4,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


# Inference

In [28]:
def get_class(prediction, threshold=0.5):
  """Turn a model prediction into a 'fake' / 'real' label.

  The model is trained with BinaryCrossentropy(from_logits=True), so
  `prediction.logits` holds raw logits rather than probabilities. The first
  logit of the first example is passed through a sigmoid and the resulting
  probability is compared against `threshold`.

  Args:
    prediction: Object exposing `logits`, indexable as `logits[0][0]`
      (e.g. the output of `model.predict`).
    threshold: Probability cut-off; scores strictly above it are labelled
      'fake'. Defaults to 0.5.

  Returns:
    'fake' if the sigmoid of the logit exceeds `threshold`, otherwise 'real'.
  """
  import math

  logit = float(prediction.logits[0][0])

  # Numerically stable sigmoid: only ever exponentiate a non-positive value so
  # extreme logits cannot overflow math.exp.
  if logit >= 0:
    probability = 1.0 / (1.0 + math.exp(-logit))
  else:
    exp_logit = math.exp(logit)
    probability = exp_logit / (1.0 + exp_logit)

  return 'fake' if probability > threshold else 'real'

In [None]:
# Classify a single unseen tweet using the same tokenizer settings as training.
new_tweet = ["Sample Tweet"]
tokenized_new_tweet = tokenizer(
    new_tweet,
    padding=True,
    truncation=True,
    return_tensors='tf',
)
# Keras expects a plain dict of tensors, hence the dict(...) conversion.
prediction = model.predict(dict(tokenized_new_tweet))
get_class(prediction)