In [1]:
pip install transformers[torch]

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
news=pd.read_csv('/kaggle/input/fakenews/output (1).csv')
news.head()

Unnamed: 0,clean_text,class
0,donald trump sends embarrassing year’s message...,0
1,drunk bragging trump staffer started russian c...,0
2,sheriff david clarke becomes internet joke thr...,0
3,trump obsessed even obama’s name coded website...,0
4,pope francis called donald trump christmas spe...,0


In [4]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilroberta-base")

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilroberta-base",clean_up_tokenization_spaces=True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [25]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    report_to=None
)



In [8]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
from sklearn.model_selection import train_test_split
seed=18
features = news['clean_text'].tolist()
targets = news['class'].tolist()

# splitting data into training, testing, and validation sets
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.20, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.10, random_state=seed)

In [10]:
train_encodings = tokenizer(X_train,max_length = tokenizer.model_max_length,truncation=True,padding=True,return_tensors='pt')
test_encodings = tokenizer(X_test,max_length = tokenizer.model_max_length,truncation=True,padding=True,return_tensors='pt')
val_encodings = tokenizer(X_val,max_length = tokenizer.model_max_length,truncation=True,padding=True,return_tensors='pt')


In [11]:
from torch.utils.data import Dataset
class NewsDataset(Dataset):
    
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    # convert to pytorch tensors
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item 
    
    def __len__(self):
        return len(self.labels)

In [12]:
train_dataset = NewsDataset(train_encodings, y_train)
test_dataset = NewsDataset(test_encodings, y_test)
val_dataset = NewsDataset(val_encodings, y_val)

In [13]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [28]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    acc = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average='macro')
    recall = recall_score(labels, predictions, average='macro')
    f1 = f1_score(labels, predictions, average='macro')
    
    return {"acc": acc, "precision": precision, "recall": recall, "f1 score": f1}



In [15]:
from transformers import Trainer

In [26]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [17]:
import torch
import torch.nn.functional as F

In [23]:
import wandb 
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
secret_value = user_secrets.get_secret("wandb_key")
wandb.login(key=secret_value)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [29]:
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss,Validation Loss,Acc,Precision,Recall,F1 score
100,0.0069,0.040072,0.992762,0.992668,0.992954,0.992759
200,0.0262,0.014292,0.997494,0.997459,0.997531,0.997493
300,0.0118,0.009846,0.99833,0.998314,0.998344,0.998328
400,0.0135,0.008778,0.998886,0.998901,0.99887,0.998886
500,0.0163,0.013633,0.998051,0.998029,0.998073,0.99805
600,0.0078,0.011402,0.998608,0.998586,0.99863,0.998607
700,0.0103,0.013246,0.99833,0.998314,0.998344,0.998328
800,0.0041,0.012163,0.998608,0.998615,0.998599,0.998607
900,0.0098,0.007828,0.998886,0.998886,0.998886,0.998886
1000,0.0045,0.00471,0.998886,0.998886,0.998886,0.998886


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=1011, training_loss=0.011009628428966101, metrics={'train_runtime': 1199.2294, 'train_samples_per_second': 26.956, 'train_steps_per_second': 0.843, 'total_flos': 4282141128953856.0, 'train_loss': 0.011009628428966101, 'epoch': 1.0})

In [30]:
trainer.evaluate(test_dataset)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 0.0021776731591671705,
 'eval_acc': 0.9996659242761693,
 'eval_precision': 0.9996697246138644,
 'eval_recall': 0.9996611611942784,
 'eval_f1 score': 0.999665430452473,
 'eval_runtime': 83.2502,
 'eval_samples_per_second': 107.868,
 'eval_steps_per_second': 3.375,
 'epoch': 1.0}

In [32]:
trainer.save_model('ake_detector')

In [33]:
!tar -zcvf outputname.tar.gz /kaggle/working

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


tar: Removing leading `/' from member names
/kaggle/working/
/kaggle/working/results/
/kaggle/working/results/checkpoint-500/
/kaggle/working/results/checkpoint-500/rng_state.pth
/kaggle/working/results/checkpoint-500/trainer_state.json
/kaggle/working/results/checkpoint-500/config.json
/kaggle/working/results/checkpoint-500/optimizer.pt
/kaggle/working/results/checkpoint-500/model.safetensors
/kaggle/working/results/checkpoint-500/training_args.bin
/kaggle/working/results/checkpoint-500/scheduler.pt
/kaggle/working/results/checkpoint-1000/
/kaggle/working/results/checkpoint-1000/rng_state.pth
/kaggle/working/results/checkpoint-1000/trainer_state.json
/kaggle/working/results/checkpoint-1000/config.json
/kaggle/working/results/checkpoint-1000/optimizer.pt
/kaggle/working/results/checkpoint-1000/model.safetensors
/kaggle/working/results/checkpoint-1000/training_args.bin
/kaggle/working/results/checkpoint-1000/scheduler.pt
/kaggle/working/results/checkpoint-1011/
/kaggle/working/results/c

In [34]:
from huggingface_hub import notebook_login

In [35]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [36]:
trainer.push_to_hub("Am09/distilroberta-base-fake_news_detector-Am09")

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/Am09/results/commit/9e74eaad5f602581b1f90451430d76a61b73f32b', commit_message='Am09/distilroberta-base-fake_news_detector-Am09', commit_description='', oid='9e74eaad5f602581b1f90451430d76a61b73f32b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Am09/results', endpoint='https://huggingface.co', repo_type='model', repo_id='Am09/results'), pr_revision=None, pr_num=None)