In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [2]:
# Fake news/Real news dataset
df_real = pd.read_csv('True.csv')
df_real['RealNews'] = True
df_fake = pd.read_csv('Fake.csv')
df_fake['RealNews'] = False
df_news = df_real.append(df_fake)

In [3]:
# randomly select 50 rows of real news and 50 rows of fake news
real_news = df_news[df_news['RealNews'] == True].sample(n=50)
fake_news = df_news[df_news['RealNews'] == False].sample(n=50)

# combine the two dataframes
df = pd.concat([real_news, fake_news])

# reset the index of the new dataframe
df = df.reset_index(drop=True)

In [4]:
df.head()

Unnamed: 0,title,text,subject,date,RealNews
0,"Al Gore says Trump driving, not weakening, cli...",LONDON (Reuters) - Former U.S. Vice President ...,politicsNews,"August 11, 2017",True
1,Vietnam gives thumbs-up to U.S. regional role ...,HANOI (Reuters) - Vietnam supports U.S. “inter...,politicsNews,"October 18, 2016",True
2,U.S. Commerce chief warns against China semico...,WASHINGTON (Reuters) - Massive government inve...,politicsNews,"November 3, 2016",True
3,"Trump on Twitter (Aug 14) - Luther Strange, Me...",The following statements were posted to the ve...,politicsNews,"August 14, 2017",True
4,Pentagon confirms 'probable' North Korean miss...,WASHINGTON (Reuters) - The Pentagon said on Tu...,worldnews,"November 28, 2017",True


In [5]:
df = df[df['text'].str.strip() != '']

In [6]:
# Split the dataset
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)

In [7]:
# Example for few-shot training
def prepare_training_examples(n):
    examples = train_df.head(n)[['text', 'RealNews']]
    formatted_examples = ""
    for index, row in examples.iterrows():
        formatted_examples += f"Text: {row['text']}\nRealNews: {row['RealNews']}\n\n"
    return formatted_examples

In [8]:
# Generate prompt function
def generate_prompt(n, row):
    formatted_training_examples = prepare_training_examples(n)
    formatted_validation_example = f"Text: {row['text']}\nRealNews: "
    prompt = formatted_training_examples + formatted_validation_example
    return prompt

In [9]:
# Load GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [10]:
# GPT text generation function
def generate_words(prompt):
    max_length = model.config.n_positions
    inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=max_length, truncation=True)
    
    with torch.no_grad():
        outputs = model(inputs, return_dict=True)
        logits = outputs.logits

    true_token_id = tokenizer.encode("True", add_special_tokens=False)[0]
    false_token_id = tokenizer.encode("False", add_special_tokens=False)[0]

    true_logits = logits[0, -1, true_token_id].item()
    false_logits = logits[0, -1, false_token_id].item()

    if true_logits > false_logits:
        return 1
    else:
        return 0

In [11]:
# Score function
def score(n, dataset):
    y_true = []
    y_pred = []
    for index, row in dataset.iterrows():
        prompt = generate_prompt(n, row)
        prediction = generate_words(prompt)
        y_true.append(row['RealNews'])
        y_pred.append(int(prediction))
        
    y_true = [1 if value == 'True' else 0 for value in y_true]
    f1 = f1_score(y_true, y_pred, average='weighted')
    return f1

In [12]:
# Call the score function using validation set
for n in range(1, 4):
    f1 = score(n, val_df)
    print(f"F1 score for n={n}: {f1}")

F1 score for n=1: 0.888888888888889
F1 score for n=2: 0.9473684210526316
F1 score for n=3: 0.6666666666666666


In few-shot learning,  provide a small number of examples to the model to demonstrate the task. I use the range from 1-3 here.

When increasing the value of n, more examples for the model to learn from. This can potentially improve the model's understanding of the task, leading to better predictions. However, there's a trade-off. If n is too large, it might result in overfitting, which would cause the model to perform poorly on unseen examples. Conversely, if n is too small, the model may not have enough information to understand the task effectively.

Therefore, based on the results of the validation set, I choose n = 2.

In [14]:
# Call the sort function for test set
n = 2
test_f1 = score(n, test_df)
print(f"F1 score on test set for n={n}: {test_f1}")

F1 score on test set for n=2: 0.8571428571428571
