In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! nvidia-smi
! pip install transformers -q

Mon Mar 28 03:23:13 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
%%capture
from sklearn.model_selection import train_test_split
from transformers import DistilBertForSequenceClassification, Trainer, EarlyStoppingCallback ,DistilBertTokenizerFast
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score ,classification_report

import numpy as np
import torch
import pandas as pd
import re


tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
RANDOM_SEED = 42

model = DistilBertForSequenceClassification.from_pretrained(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/Fine tuning WELFake/Fine-tuned Model Improved V3", num_labels=2)

# Define test trainer
trainer = Trainer(model)

In [4]:
def label_map(label):
  if label in ['true', 'mostly-true', 'half-true', 'real', 'Real', 0, 'REAL']:
    return 0
  if label in ['false', 'pants-fire', 'barely-true', 'fake', 'Fake', 1, 'FAKE']:
    return 1

In [5]:
# class Dataset(torch.utils.data.Dataset):
#     def __init__(self, encodings, labels):
#         self.encodings = encodings
#         self.labels = labels

#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#         item['labels'] = torch.tensor(self.labels[idx])
#         return item

#     def __len__(self):
#         return len(self.labels)

# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

# train_dataset = Dataset(train_encodings, train_labels)
# val_dataset = Dataset(val_encodings, val_labels)
# test_dataset = Dataset(test_encodings, labels)

In [6]:
# Define Trainer parameters
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [7]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_nonascii(sent):
  return "".join([i for i in sent if i.isascii()])

def remove_mul_space(text):
  return " ".join(text.split())

def clean(text):
  
  text = remove_urls(text)
  text = remove_nonascii(text)
  text = remove_mul_space(text)

  return text

In [8]:
# field name of id , label , text_column of respective datasets 
id={"codalab":"id", "fakenewsnet":"id_1", "isot":"id", "kaggle":"id","kagglerealfake":"id", "liar":"ID", "politifact":"claim_id", "welfake":"id","FA-KES":"unit_id","Politifact_test":"id","COVID19_test":"id"}
label={"codalab":"label", "fakenewsnet":"label", "isot":"label", "kaggle":"label","kagglerealfake":"label","liar":"label", "politifact":"cred_label", "welfake":"label","FA-KES":"label","Politifact_test":"target","COVID19_test":"label"}
features=["lexicon","sementic","sentiment","embedding"]
text_column={"codalab":"tweet", "fakenewsnet":"title", "isot":"text", "kaggle":"text","kagglerealfake":"text", "liar":"statement", "politifact":"text", "welfake":"text","FA-KES":"article_content","Politifact_test":"statement","COVID19_test":"text"}

datasetBasePath="/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/"

inputPath={
    "codalab": datasetBasePath+"CodaLab Covid/Constraint_English_All.csv",
    "fakenewsnet" :datasetBasePath+"FakeNewsNet/FakeNewsNet_All.csv",
    # "kaggle" :datasetBasePath+Kaggle/Kaggle.csv,
    "kagglerealfake":datasetBasePath+"Kaggle_real_fake/fake_or_real_news.csv",
    "liar":datasetBasePath+"LIAR/Liar_all.csv",
    "isot":datasetBasePath+"ISOT/ISOT.csv",
    "FA-KES":datasetBasePath+"/FA-KES/FA-KES.csv",
    "Politifact_test":datasetBasePath+"Politifact_test/Politifact_testset.csv" ,
    "COVID19_test":datasetBasePath+"COVID19_test/COVID19_test.csv",

    # "politifact":datasetBasePath+"Politifact/Politifact.tsv",
    "welfake":datasetBasePath+"WELFake/WELFake.csv",
    }

outFolderName={
    # "codalab":"CodaLab Covid",
    # "fakenewsnet" : "FakeNewsNet",
    # # # "kaggle" ,
    # "kagglerealfake":"Kaggle_real_fake",
    # "liar":"LIAR",
    # "isot":"ISOT",
    # # # "politifact",
    # #  "welfake":"Welfake"
    # "FA-KES":"FA-KES",
    # "Politifact_test":"Politifact_test"
    # "COVID19_test":"COVID19_test"
}    
# columns= ['cred_label', 'claim_id', 'claim_text', 'claim_source', 'evidence', 'evidence_source' ]

**WARNING uncomment df_to_csv to override the result**

In [None]:
## load all data set in to a dataframe dictionary (UPDATE :not  NORMALIZED)
all_df={}
for key,value in inputPath.items():
  print("----------",key,"--------------")
  error=False
  ID=id[key]
  TEXT=text_column[key]
  LABEL=label[key]

  df=pd.read_csv(value)

  df["label"]=df[LABEL].apply(label_map)
  
  print(df["label"].value_counts())
  
  if (key=="codalab" or key=="liar"):  
    df=df[[TEXT,"label",ID,"split"]]
  else :
    df=df[[TEXT,"label",ID,]]     

  print("null rows : ",df.isnull().any(axis=1).sum())
  df.replace([np.inf, -np.inf], np.nan, inplace=True)
  print("inf rows : ",df.isnull().any(axis=1).sum())
  df.dropna(inplace=True)

  df[TEXT]=df[TEXT].apply(clean)
  texts=df[TEXT].to_list()
# Runcomment for welfake
  # output0=[]
  # output1=[]
  # for text in texts:
  #   encodings = tokenizer([text], truncation=True, padding=True)
  #   dataset = Dataset(encodings)

  #   raw_pred, _, _ = trainer.predict(dataset)

  #   output0.append(raw_pred[:,0][0])
  #   output1.append(raw_pred[:,0][0])


  encodings = tokenizer(texts, truncation=True, padding=True)
  dataset = Dataset(encodings)

  raw_pred, _, _ = trainer.predict(dataset)
  

  df["embd_true"]=raw_pred[:,1]
  df["embd_fake"]=raw_pred[:,0]
  outDir="/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/"+outFolderName[key]+"/"+outFolderName[key]+"_title_embedding_new.csv"

  # df.to_csv(outDir,index=False)
  # all_df[key]=df.copy(deep=True)

---------- FA-KES --------------
0    418
1    371
Name: label, dtype: int64
null rows :  0
inf rows :  0


In [None]:
# outDir="/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/"+outFolderName[key]+"/"+outFolderName[key]+"_embedding_new.csv"

# df.to_csv(outDir,index=False)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3119 entries, 0 to 3118
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   text       3119 non-null   object 
 1   label      3119 non-null   int64  
 2   id         3119 non-null   int64  
 3   embd_true  3119 non-null   float32
 4   embd_fake  3119 non-null   float32
dtypes: float32(2), int64(2), object(1)
memory usage: 121.8+ KB


In [None]:
outDir

'/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/COVID19_test/COVID19_test_title_embedding_new.csv'

## example

In [None]:
encodings = tokenizer([text], truncation=True, padding=True)
dataset = Dataset(encodings)

In [None]:
dataset

<__main__.Dataset at 0x7f5a070d8290>

In [None]:
raw_pred, _, _ = trainer.predict(dataset)

In [None]:
raw_pred

array([[-4.6981244,  3.8629055]], dtype=float32)

In [None]:
raw_pred=np.array([[1,2]])

In [None]:
np.append(raw_pred,raw_pred,axis=0)

array([[1, 2],
       [2, 3],
       [1, 2],
       [2, 3]])

In [None]:
raw_pred[:,1]

array([3.8629055], dtype=float32)

In [None]:
df.head()

In [None]:
for key,value in inputPath.items():
  outDir="/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/"+outFolderName[key]+"/"+outFolderName[key]+"_embedding.csv"
  print(outDir)

/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/CodaLab Covid/CodaLab Covid_embedding.csv
/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/FakeNewsNet/FakeNewsNet_embedding.csv
/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/ISOT/ISOT_embedding.csv
/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/Kaggle_real_fake/Kaggle_real_fake_embedding.csv
/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/LIAR/LIAR_embedding.csv
/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/Welfake/Welfake_embedding.csv


## predict for a dataset

In [26]:
key="isot"
df=pd.read_csv(inputPath[key])

In [27]:
print("null rows : ",df.isnull().any(axis=1).sum())
df.replace([np.inf, -np.inf], np.nan, inplace=True)
print("inf rows : ",df.isnull().any(axis=1).sum())
df.dropna(inplace=True)
df["text"]=df[text_column[key]].apply(clean)
texts=df["text"].to_list()
labels=df["label"].apply(label_map).to_list()

null rows :  0
inf rows :  0


In [28]:
encodings = tokenizer(texts, truncation=True, padding=True)
dataset = Dataset(encodings)

raw_pred, _, _ = trainer.predict(dataset)

# Preprocess raw predictions
# y_pred = np.argmax(raw_pred, axis=1)

# classification_report(labels,y_pred)
compute_metrics([raw_pred,labels])

'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00     21417\n           1       1.00      1.00      1.00     23481\n\n    accuracy                           1.00     44898\n   macro avg       1.00      1.00      1.00     44898\nweighted avg       1.00      1.00      1.00     44898\n'

In [25]:
compute_metrics([raw_pred,labels])

{'accuracy': 0.4779439252336449,
 'f1': 0.6027592092163276,
 'precision': 0.4728855166257532,
 'recall': 0.8309803921568627}

In [None]:
{'accuracy': 0.4779439252336449,
 'f1': 0.6027592092163276,
 'precision': 0.4728855166257532,
 'recall': 0.8309803921568627} coda
