In [None]:
!pip install kaggle
!pip install transformers==4.28.0
!pip install sentencepiece
!pip install datasets
!pip install python-decouple

In [None]:
import os
from pathlib import Path

# Value of the KAGGLE_KERNEL_RUN_TYPE environment variable, or '' when it is unset.
iskaggle = os.getenv('KAGGLE_KERNEL_RUN_TYPE', '')

In [None]:
path = Path("clmentbisaillon/fake-and-real-news-dataset")
!cat ./kaggle.json > /root/.kaggle/kaggle.json

In [None]:
# Download the training dataset archive with the Kaggle CLI.
# NOTE(review): the extraction cell reads the zip from /content, so this
# presumably runs with /content as the working directory - confirm.
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset

In [None]:
import zipfile
# Unpack the downloaded training archive into /content.
with zipfile.ZipFile("/content/fake-and-real-news-dataset.zip") as archive:
    archive.extractall(path="/content")

In [None]:
# List /content to check which files are present after extraction.
!ls /content

In [None]:
import pandas as pd

In [None]:
# Directory holding the extracted CSV files.
path = "/content/"
# Only the first 100 rows of each file are used, so let the parser stop there
# instead of reading the whole file and discarding the rest.
fake = pd.read_csv(path+'Fake.csv', nrows=100)
true = pd.read_csv(path+'True.csv', nrows=100)
# NOTE(review): despite the column name, rows from Fake.csv get 0.0 and rows
# from True.csv get 1.0. The encoding is kept unchanged - confirm it is intended.
fake['fake'] = 0.0
true['fake'] = 1.0

In [None]:
# Display the sample loaded from Fake.csv.
fake

In [None]:
# Stack the real and fake samples into one frame. DataFrame.append was removed
# in pandas 2.0; pd.concat is the supported replacement. ignore_index=True gives
# every row a unique label instead of repeating each source frame's own index.
df = pd.concat([true, fake], ignore_index=True)
# Text fed to the model. A missing title or text is treated as empty so that a
# single NaN does not turn the whole input into NaN.
df['input'] = "TITLE: " + df['title'].fillna("") + " TEXT: " + df["text"].fillna("")

In [None]:
# Hub identifier of the checkpoint used for both the tokenizer and the model.
model_nm = "microsoft/deberta-v3-small"


In [None]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer
# Load the tokenizer that belongs to the checkpoint named in model_nm.
tokz = AutoTokenizer.from_pretrained(model_nm)

In [None]:
# Sanity check: show how the tokenizer splits a sample sentence into tokens.
tokz.tokenize("G'day folks, I'm Jeremy from fast.ai!")

In [None]:
# 'subject' and 'date' are not part of the model input; remove them.
df = df.drop(columns=['subject', 'date'])

In [None]:
from datasets import Dataset,DatasetDict

# preserve_index=False keeps the pandas index from being stored in the dataset
# as an extra column.
ds = Dataset.from_pandas(df, preserve_index=False)

def tok_func(x, max_length=512):
    """Tokenize the "input" field of a batch of rows.

    Args:
        x: Mapping with an "input" entry holding the text(s) to tokenize.
        max_length: Upper bound on tokens per example. It is passed explicitly
            so truncation does not depend on the tokenizer configuration
            declaring a maximum length.

    Returns:
        The tokenizer output for the given texts.

    No padding is applied here. Padding inside a batched map would pad every
    example to the longest one in the map batch; the Trainer's collator pads
    each training batch to its own longest sequence instead.
    """
    return tokz(x["input"], truncation=True, max_length=max_length)

tok_ds = ds.map(tok_func, batched=True)

In [None]:
# Look up the vocabulary entry stored for the token 'of'.
tokz.vocab['of']

In [None]:
# The Trainer reads its targets from a column named "labels".
tok_ds = tok_ds.rename_column("fake", "labels")
# First tokenized example, kept for inspection.
row = tok_ds[0]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the examples for evaluation; the fixed seed keeps the split reproducible.
dds = tok_ds.train_test_split(test_size=0.25, seed=42)

In [None]:
from transformers import TrainingArguments,Trainer
# Training hyperparameters: per-device batch size, number of epochs, learning rate.
bs, epochs, lr = 2, 4, 8e-5

In [None]:
import torch

# Training configuration: cosine schedule with 10% warmup, evaluation after
# every epoch, no external experiment logging.
args = TrainingArguments(
    'outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    # Mixed precision needs a CUDA device; a hard-coded True fails on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none',
)

In [None]:
import numpy as np

def corr(x, y):
    """Return the Pearson correlation coefficient between x and y.

    Both inputs are flattened first, so a column of predictions with shape
    (N, 1) can be compared with a label vector of shape (N,).
    """
    return np.corrcoef(np.ravel(x), np.ravel(y))[0][1]

def corr_d(eval_pred):
    """Metric callback: unpack (predictions, labels) and report their Pearson correlation.

    Returns:
        A dict with the single key 'pearson'.
    """
    return {'pearson': corr(*eval_pred)}


In [None]:
# Sequence-classification model with a single output (num_labels=1), wired
# into a Trainer together with the train/test split and the correlation metric.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    tokenizer=tokz,
    compute_metrics=corr_d,
)

In [None]:
import gc
import torch
# Run a garbage-collection pass first, then ask PyTorch to release cached GPU memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Run training; the trailing semicolon suppresses the cell's return-value output.
trainer.train();

In [None]:
# Download the second dataset, used below for predictions on unseen articles.
# NOTE(review): the extraction cell reads the zip from /content, so this
# presumably runs with /content as the working directory - confirm.
!kaggle datasets download -d ruchi798/source-based-news-classification

In [None]:
# List /content to check that the second archive is present.
!ls /content

In [None]:
# Unpack the second archive into /content.
with zipfile.ZipFile("/content/source-based-news-classification.zip") as archive:
    archive.extractall(path="/content")

In [None]:
# Only ten articles are scored, so stop parsing after ten rows instead of
# reading the whole file and keeping its head.
eval_df = pd.read_csv("/content/news_articles.csv", encoding="ISO-8859-1", nrows=10)
# Same input format as the training data. A missing title or text is treated
# as empty so that a single NaN does not turn the whole input into NaN.
eval_df["input"] = "TITLE: " + eval_df['title'].fillna("") + " TEXT: " + eval_df['text'].fillna("")
# Drop the columns that are not needed for prediction.
eval_df = eval_df.drop(columns=["author", "published", "language", "site_url", "main_img_url", "type", "title_without_stopwords", "text_without_stopwords", "hasImage"])
eval_df

In [None]:
mapping = {True: 1.0, False: 0.0}
# Keep the encoded labels in their own variable, so they stay available for
# comparison with the predictions after the column is dropped below.
# NOTE(review): values of 'label' that are not keys of `mapping` become NaN -
# confirm the column really holds booleans.
eval_labels = eval_df['label'].map(mapping)
# The label column must not be part of the frame that is tokenized for prediction.
eval_df = eval_df.drop(columns=['label'])
eval_ds = Dataset.from_pandas(eval_df).map(tok_func, batched=True)

In [None]:
# Run the trained model on the evaluation set and show its raw outputs as Python floats.
pred_output = trainer.predict(eval_ds)
preds = pred_output.predictions.astype(float)
preds

In [None]:
# Save the trained model to /content/model.
trainer.save_model("/content/model")