# Import Required Libraries


In [None]:
!pip install transformers



In [None]:
import pandas as pd
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Checkpoint identifier passed to from_pretrained() for both the tokenizer and the classifier
MODEL_NAME = "distilbert/distilbert-base-uncased"

# Load the Dataset

Load the dataset into a pandas DataFrame

In [None]:
# Read the CSV into a DataFrame; the path is relative to the notebook's working directory
dataset = pd.read_csv('clean_dataset.csv')

In [None]:
dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [None]:
dataset.shape

(20800, 5)

In [None]:
# Keep only the first occurrence of fully identical rows
dataset = dataset.drop_duplicates()

In [None]:
# Discard every row that has a missing value in any column
dataset = dataset.dropna()

In [None]:
dataset.shape

In [None]:
dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


**Rename the dataset's `label` column to `labels`**

In [None]:
# The CSV's target column is called `label`; FakeNewsDataset reads it as `labels`.
# rename() returns a new frame and ignores missing keys, so re-running this cell is harmless.
dataset = dataset.rename(columns={"label": "labels"})

In [None]:
dataset.head()

# Split the data into training, validation and test sets


In [None]:
# 80% of the rows go to training; the remaining 20% are held out
train_data, temp_data = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
)

# The held-out 20% is halved into validation and test sets (80:10:10 overall)
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
)


In [None]:
train_data.head()

In [None]:
train_data.shape

In [None]:
test_data.head()

In [None]:
val_data.head()

Unnamed: 0,id,title,author,text,label
11784,11784,Russian Spies and Americas Reality TV Electio...,Finian Cunningham,Russian Spies and Americas Reality TV Electi...,1
6997,6997,A Peek Inside the Strange World of Fake Academ...,Kevin Carey,The caller ID on my office telephone said the ...,0
14903,14903,A Rediscovered Mark Twain Fairy Tale Is Coming...,Alexandra Alter,"One night nearly 140 years ago, Samuel Clemens...",0
14381,14381,’Gays for Trump’ Banned from Participating in ...,Katherine Rodriguez,Members of a gay group say they have been ba...,0
16567,16567,SNIP creó mercado negro e informal,voltairenet.org,Páginas Libres\nSNIP creó mercado negro e info...,1


In [None]:
val_data.shape

In [None]:
# Single tokenizer instance for MODEL_NAME, shared by every FakeNewsDataset below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


class FakeNewsDataset(Dataset):
    """Torch dataset that tokenizes one article per item.

    Each item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``labels``. Returning a dict (instead of a tuple) lets a DataLoader batch
    be moved to the device key by key and passed to the model as
    ``model(**batch)``, which is how the training and evaluation loops use it.

    Args:
        data: DataFrame with a ``text`` column and a ``labels`` column
            (``label`` is accepted as a fallback when ``labels`` is absent).
        max_len: Every sequence is padded or truncated to this many tokens.
    """

    def __init__(self, data, max_len=128):
        self.data = data
        self.max_len = max_len
        # Reuse the notebook-level tokenizer rather than reloading it per dataset.
        self.tokenizer = tokenizer
        # Accept either spelling of the target column.
        self.label_col = 'labels' if 'labels' in data.columns else 'label'

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the row at positional ``index`` and return model-ready tensors."""
        row = self.data.iloc[index]
        encoded = self.tokenizer(
            row['text'],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # return_tensors='pt' adds a leading batch dimension of size 1;
        # squeeze it so the DataLoader can stack items into a batch itself.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(row[self.label_col], dtype=torch.long),
        }

# DataLoader

Create DataLoaders for the training and validation datasets so the data is iterated in batches

In [None]:
# Wrap each split in the tokenizing dataset
train_dataset = FakeNewsDataset(train_data)
val_dataset = FakeNewsDataset(val_data)

# Only the training loader shuffles; the validation loader keeps row order
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

# Load the model

We use `AutoModelForSequenceClassification` to load the base model named by `MODEL_NAME` with a two-label head for binary text classification.

In [None]:
# Load the MODEL_NAME checkpoint as a sequence classifier configured for 2 labels
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Optimizer and learning rate scheduler

Create an optimizer and learning rate scheduler to fine-tune the model.

We use the AdamW optimizer from PyTorch.


In [None]:
# Training Hyperparameters
batch_size = 32  # NOTE(review): the DataLoaders are built earlier with a literal 32 — keep the two in sync
num_epochs = 3  # number of full passes over train_loader in the training loop
learning_rate = 2e-5
weight_decay = 0.01


In [None]:
# Build AdamW from the hyperparameters defined in the previous cell instead of repeating the literals,
# so changing learning_rate / weight_decay there actually takes effect.
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

Create the default learning rate scheduler from Trainer:

In [None]:
from transformers import get_scheduler

# Total optimizer steps: one per batch, for every epoch.
num_training_steps = num_epochs * len(train_loader)

# Warm up over the first 20% of training. len(train_loader) already counts
# batches, so it must not be divided by batch_size a second time.
warmup_steps = int(0.2 * num_training_steps)

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=num_training_steps,
)

# Specify the device to use a GPU if available

In [None]:
from accelerate.test_utils.testing import get_backend

# get_backend() is unpacked into three values; only the first (the device) is used here.
device, _, _ = get_backend() # automatically detects the device type (CUDA, CPU, XPU, MPS, etc.)

# Move the model onto the detected device; as the cell's last expression, the result is also displayed.
model.to(device)

In [None]:
# Report whether PyTorch can see a CUDA device
print(
    "GPU is available and PyTorch can use it."
    if torch.cuda.is_available()
    else "GPU is not available or PyTorch cannot use it."
)

# Train the Model

In [None]:
# Switch on training-mode behaviour before the first forward pass
model.train()

# One tick of the bar per optimizer step
progress_bar = tqdm(range(num_training_steps))

# Training Loop
for epoch in range(num_epochs):
    for batch in train_loader:
        # Move every tensor of the batch onto the model's device
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; the batch is expanded into keyword arguments and the loss read from the output
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass, then advance the optimizer and the LR schedule
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Clear gradients so they do not accumulate into the next step
        optimizer.zero_grad()
        progress_bar.update(1)

# Evaluate

In [None]:
import evaluate

metric = evaluate.load("accuracy")

# Switch to inference-mode behaviour before scoring the validation set
model.eval()

for batch in val_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

# compute() is not the last expression of this cell, so its value would not be
# displayed automatically; keep the result and print it explicitly.
eval_results = metric.compute()
print(eval_results)

# Save with save_pretrained() instead of torch.save(model): model and tokenizer
# are written into one structured folder.
model.save_pretrained("fake_news_classification")
tokenizer.save_pretrained("fake_news_classification")