In [None]:
# Install the notebook's third-party dependencies (-U upgrades, -q keeps pip quiet).
# NOTE(review): no versions are pinned, so a rerun may resolve to different releases.
! pip install -U transformers -q
! pip install -U annoy -q
! pip install psutil -q
! pip install pandarallel -q

[K     |████████████████████████████████| 3.4 MB 5.4 MB/s 
[K     |████████████████████████████████| 596 kB 46.7 MB/s 
[K     |████████████████████████████████| 61 kB 490 kB/s 
[K     |████████████████████████████████| 3.3 MB 41.4 MB/s 
[K     |████████████████████████████████| 895 kB 48.6 MB/s 
[K     |████████████████████████████████| 646 kB 5.3 MB/s 
[?25h  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Building wheel for pandarallel (setup.py) ... [?25l[?25hdone


In [None]:
import psutil, re, torch
from pandarallel.utils import progress_bars
from pandarallel import pandarallel
from transformers import AutoTokenizer, AutoModel, DistilBertTokenizerFast
import pandas as pd
from annoy import AnnoyIndex
from tqdm.notebook import trange, tqdm
from sklearn.metrics import classification_report

In [None]:
# Use every CPU reported by psutil as a pandarallel worker.
workers = psutil.cpu_count()

# Replace pandarallel's notebook-detection hook with one that always returns True
# (presumably to force notebook-style progress bars in this environment -- TODO confirm).
progress_bars.is_notebook_lab = lambda : True

# use_memory_fs=False: per the INFO output, data is exchanged with the workers over pipes.
# NOTE(review): no pandarallel call is visible in the cells of this notebook -- confirm it is still needed.
pandarallel.initialize(progress_bar=True, nb_workers=workers, use_memory_fs=False)

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [None]:
# Batch size: how many texts the dataset cells pass to generate_embeddings() per call.
EMBEDDING_RANGE = 200

def lower(text):
  """Return ``text`` lowercased via its ``lower()`` method."""
  lowered = text.lower()
  return lowered

# Strip URLs
def remove_urls(text):
    """Delete every ``http://``/``https://`` or ``www.`` token from ``text``.

    Each match runs up to the next whitespace character; the surrounding
    whitespace itself is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_nonascii(sent):
  """Return ``sent`` with every non-ASCII character dropped."""
  ascii_only = filter(lambda ch: ch.isascii(), sent)
  return "".join(ascii_only)

def remove_punctuations(text):
  """Remove every character that is neither a word character nor whitespace.

  Word characters include letters, digits and the underscore, so underscores
  survive this step.
  """
  not_word_or_space = re.compile(r'[^\w\s]')
  return not_word_or_space.sub('', text)

def remove_num(text):
  """Return ``text`` without any character for which ``str.isdigit()`` is true."""
  non_digits = filter(lambda ch: not ch.isdigit(), text)
  return "".join(non_digits)

# Collapse runs of whitespace
def remove_mul_space(text):
  """Split ``text`` on whitespace and rejoin the pieces with single spaces.

  Leading and trailing whitespace disappears as a consequence of the split.
  """
  pieces = text.split()
  return " ".join(pieces)

def clean(text):
  """Normalise ``text`` by applying the cleaning helpers in a fixed order.

  Order: lowercase, strip URLs, drop non-ASCII characters, drop punctuation,
  drop digits, collapse whitespace.
  """
  # URLs are removed before punctuation because the URL pattern relies on
  # characters such as "://" and "." that the punctuation step would delete.
  pipeline = (
      lower,
      remove_urls,
      remove_nonascii,
      remove_punctuations,
      remove_num,
      remove_mul_space,
  )

  for step in pipeline:
    text = step(text)

  return text

def multiply(x,y):
  """Return ``x / y``.

  NOTE(review): despite its name this helper divides; callers use it to build
  the ``cumm_*`` columns. Confirm whether a ratio or a product is intended
  before renaming or changing the operator.
  """
  ratio = x / y
  return ratio

def convert_label(label):
  """Map a dataset-specific label to 0 (real/true family) or 1 (fake/false family).

  Matching is exact and case-sensitive against the spellings listed below.
  Any other value yields ``None``.
  """
  real_labels = ('true', 'mostly-true', 'half-true', 'real', 'Real', 0, 'REAL')
  fake_labels = ('false', 'pants-fire', 'barely-true', 'fake', 'Fake', 1, 'FAKE')

  if label in real_labels:
    return 0
  # Unrecognised labels fall through to None, exactly like the implicit return did.
  return 1 if label in fake_labels else None

# Run on the first CUDA GPU when one is available; otherwise stay on the CPU.
if torch.cuda.is_available():
  device = "cuda:0"
else:
  device = "cpu"

# Mean pooling: average token embeddings, counting only positions the attention mask keeps
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, weighted by its attention mask.

    ``model_output[0]`` is taken as the token embeddings. The mask gets a trailing
    axis and is expanded to the embeddings' size, so masked-out positions contribute
    nothing to the sum. The divisor is clamped to at least 1e-9 so a fully masked
    sequence does not divide by zero.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

    masked_sum = (token_embeddings * mask).sum(dim=1)
    kept_counts = mask.sum(dim=1).clamp(min=1e-9)

    return masked_sum / kept_counts

# Load the tokenizer from the HuggingFace Hub and the fine-tuned model from the shared Drive.
# NOTE(review): the tokenizer is the stock 'distilbert-base-uncased' one while the weights come
# from a fine-tuned checkpoint; this assumes the checkpoint kept that vocabulary -- TODO confirm.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# AutoModel resolves to DistilBertModel here; the load log reports the checkpoint's
# classifier / pre_classifier weights as unused.
model = AutoModel.from_pretrained(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/Fine tuning WELFake/Fine-tuned Model Improved")
# Move the model to the device chosen above (GPU when available).
model = model.to(device)

# Generate embeddings: one mean-pooled vector per input text
def generate_embeddings(sentences, model, tokenizer):
  """Tokenize ``sentences``, run ``model`` on them and return mean-pooled embeddings.

  The batch is padded and truncated by the tokenizer, moved to the
  module-level ``device``, and passed through the model with gradients
  disabled. The result is returned as a nested Python list, one vector per
  input text.
  """
  batch = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
  batch = batch.to(device)

  with torch.no_grad():
    outputs = model(**batch)

  pooled = mean_pooling(outputs, batch['attention_mask'])

  # Detach and copy to host memory before converting to plain lists.
  return pooled.detach().cpu().numpy().tolist()

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Some weights of the model checkpoint at /content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/Fine tuning WELFake/Fine-tuned Model Improved were not used when initializing DistilBertModel: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Load the WELFake lexicon (one row per word with its score columns) and print its schema.
words_df = pd.read_csv(r'/content/drive/Shareddrives/[FYP] Fake News Detection/Kogul_Language_Modelling/ghanashyamvtatti roberta-fake-news/dictionaries/lexicon_WELFake.csv')
words_df.info()

# Embed every lexicon word and keep the vector next to its row.
# NOTE(review): all words go through the model in a single call (no EMBEDDING_RANGE batching).
all_words = words_df['word'].tolist()
all_words_embeddings = generate_embeddings(all_words, model, tokenizer)
words_df['embedding'] = all_words_embeddings

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1406 entries, 0 to 1405
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   word              1406 non-null   object
 1   common_score      1406 non-null   int64 
 2   true_score        1406 non-null   int64 
 3   fake_score        1406 non-null   int64 
 4   doc_common_score  1406 non-null   int64 
 5   doc_true_score    1406 non-null   int64 
 6   doc_fake_score    1406 non-null   int64 
dtypes: int64(6), object(1)
memory usage: 77.0+ KB


In [None]:
# Build an approximate nearest-neighbour index over the lexicon word embeddings.
embeddings = words_df['embedding'].tolist()
f = 768  # length of each item vector that will be indexed

ann = AnnoyIndex(f, 'angular')

# Item ids are the row positions in words_df, so a neighbour id maps straight back to a lexicon row.
for i, vector in enumerate(embeddings):
  ann.add_item(i, vector)

ann.build(1000, n_jobs=-1)  # 1000 trees

True

In [None]:
# Derive the cumm_* columns: each is the word-level score passed through multiply()
# together with the matching doc-level score.
# NOTE(review): multiply() currently returns x / y, so these columns hold ratios -- confirm intent.
for target, word_col, doc_col in (
    ('cumm_true_score', 'true_score', 'doc_true_score'),
    ('cumm_fake_score', 'fake_score', 'doc_fake_score'),
    ('cumm_common_score', 'common_score', 'doc_common_score'),
):
  words_df[target] = multiply(words_df[word_col], words_df[doc_col])

In [None]:
# Map each lexicon word to a dict holding all of its score columns.
# If a word appears in several rows, the values of its last row win.
word_dict = {}

for i in range(len(words_df)):
  word_dict[words_df['word'][i]] = {
      column: words_df[column][i]
      for column in (
          'fake_score', 'true_score', 'common_score',
          'doc_fake_score', 'doc_true_score', 'doc_common_score',
          'cumm_fake_score', 'cumm_true_score', 'cumm_common_score',
      )
  }

## Analysis using LIAR

In [None]:
# Load the combined LIAR dataset from the shared Drive and print its column summary.
df = pd.read_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/LIAR/Liar_all.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12791 entries, 0 to 12790
Data columns (total 15 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   ID                                  12791 non-null  object 
 1   label                               12791 non-null  object 
 2   statement                           12791 non-null  object 
 3   subject(s)                          12789 non-null  object 
 4   speaker                             12789 non-null  object 
 5   speaker_job_title                   9224 non-null   object 
 6   state                               10042 non-null  object 
 7   party                               12789 non-null  object 
 8   credit_history_count_barely_true    12789 non-null  float64
 9   credit_history_count_false          12789 non-null  float64
 10  credit_history_count_half_true      12789 non-null  float64
 11  credit_history_count_mostly_true    12789

In [None]:
# Keep only the rows marked as the test split when the dataset has a `split` column;
# datasets without one are evaluated in full.
if 'split' in df.columns:
  # .copy() makes the subset an independent frame, so later column assignments
  # on `df` no longer trigger SettingWithCopyWarning.
  df = df.loc[df['split'] == 'test'].copy()

In [None]:
# Convert labels to 0 (real) / 1 (fake) and keep one row per distinct statement.
# assign() builds a new frame instead of writing into `df`, which may be a filtered
# slice; the in-place write is what raised SettingWithCopyWarning.
df = (
    df.assign(label=df['label'].apply(convert_label))
      .drop_duplicates(subset=["statement"])
      .reset_index(drop=True)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Embed every statement in batches of EMBEDDING_RANGE texts; one vector per row, in row order.
all_words = df['statement'].tolist()
all_words_embeddings = [
    vector
    for start in range(0, len(all_words), EMBEDDING_RANGE)
    for vector in generate_embeddings(all_words[start:start + EMBEDDING_RANGE], model, tokenizer)
]

df['embedding'] = all_words_embeddings

In [None]:
# Nearest-lexicon-word vote: for each text, sum the doc-level true/fake scores of its
# k nearest lexicon words and predict whichever total is larger.
y_true = df['label'].tolist()

# Loop-invariant lookups hoisted out of the hot loops. Positional access is valid because
# Annoy item ids were assigned as words_df row positions and both frames use a 0..n-1 index.
doc_true_scores = words_df['doc_true_score'].to_numpy()
doc_fake_scores = words_df['doc_fake_score'].to_numpy()
query_vectors = df['embedding'].tolist()

for k in range(10, 200, 10):  # k = 10, 20, ..., 190

  y_pred = []

  for vector in query_vectors:
    # Only neighbour ids are needed for the vote, so distances are not requested.
    indexes = ann.get_nns_by_vector(vector, k, search_k=-1)

    # An earlier variant summed the word-level `true_score` / `fake_score` columns instead.
    true_total = doc_true_scores[indexes].sum()
    fake_total = doc_fake_scores[indexes].sum()

    # Predict 0 only on a strictly larger "true" total; ties fall to 1.
    y_pred.append(0 if true_total > fake_total else 1)

  print(f"The Classification Report for {k} words")
  print()
  print(classification_report(y_true, y_pred, digits = 4))
  print("===============================================================")

The Classification Report for 10 words

              precision    recall  f1-score   support

           0     0.5638    0.9958    0.7200       714
           1     0.5000    0.0054    0.0107       553

    accuracy                         0.5635      1267
   macro avg     0.5319    0.5006    0.3654      1267
weighted avg     0.5360    0.5635    0.4104      1267

The Classification Report for 20 words

              precision    recall  f1-score   support

           0     0.5630    0.9944    0.7190       714
           1     0.3333    0.0036    0.0072       553

    accuracy                         0.5620      1267
   macro avg     0.4482    0.4990    0.3631      1267
weighted avg     0.4628    0.5620    0.4083      1267

The Classification Report for 30 words

              precision    recall  f1-score   support

           0     0.5634    0.9958    0.7196       714
           1     0.4000    0.0036    0.0072       553

    accuracy                         0.5627      1267
   macro

## Analysis using FakeNewsNet

In [None]:
# Load the combined FakeNewsNet dataset from the shared Drive and print its column summary.
df = pd.read_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/FakeNewsNet/FakeNewsNet_All.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23196 entries, 0 to 23195
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         23196 non-null  object
 1   news_url   22866 non-null  object
 2   title      23196 non-null  object
 3   tweet_ids  21695 non-null  object
 4   label      23196 non-null  object
 5   source     22140 non-null  object
 6   id_1       23196 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 1.2+ MB


In [None]:
# Keep only the rows marked as the test split when the dataset has a `split` column;
# datasets without one are evaluated in full.
if 'split' in df.columns:
  # .copy() makes the subset an independent frame, so later column assignments
  # on `df` do not trigger SettingWithCopyWarning.
  df = df.loc[df['split'] == 'test'].copy()

In [None]:
# Convert labels to 0 (real) / 1 (fake) and keep one row per distinct title.
df = (
    df.assign(label=df['label'].apply(convert_label))
      .drop_duplicates(subset=["title"])
      .reset_index(drop=True)
)

In [None]:
# Embed every title in batches of EMBEDDING_RANGE texts; one vector per row, in row order.
all_words = df['title'].tolist()
all_words_embeddings = [
    vector
    for start in range(0, len(all_words), EMBEDDING_RANGE)
    for vector in generate_embeddings(all_words[start:start + EMBEDDING_RANGE], model, tokenizer)
]

df['embedding'] = all_words_embeddings

In [None]:
# Nearest-lexicon-word vote: for each text, sum the doc-level true/fake scores of its
# k nearest lexicon words and predict whichever total is larger.
y_true = df['label'].tolist()

# Loop-invariant lookups hoisted out of the hot loops. Positional access is valid because
# Annoy item ids were assigned as words_df row positions and both frames use a 0..n-1 index.
doc_true_scores = words_df['doc_true_score'].to_numpy()
doc_fake_scores = words_df['doc_fake_score'].to_numpy()
query_vectors = df['embedding'].tolist()

for k in range(10, 200, 10):  # k = 10, 20, ..., 190

  y_pred = []

  for vector in query_vectors:
    # Only neighbour ids are needed for the vote, so distances are not requested.
    indexes = ann.get_nns_by_vector(vector, k, search_k=-1)

    # An earlier variant summed the word-level `true_score` / `fake_score` columns instead.
    true_total = doc_true_scores[indexes].sum()
    fake_total = doc_fake_scores[indexes].sum()

    # Predict 0 only on a strictly larger "true" total; ties fall to 1.
    y_pred.append(0 if true_total > fake_total else 1)

  print(f"The Classification Report for {k} words")
  print()
  print(classification_report(y_true, y_pred, digits = 4))
  print("===============================================================")

The Classification Report for 10 words

              precision    recall  f1-score   support

           0     0.7508    0.6925    0.7205     16402
           1     0.2353    0.2916    0.2604      5322

    accuracy                         0.5943     21724
   macro avg     0.4930    0.4920    0.4905     21724
weighted avg     0.6245    0.5943    0.6078     21724

The Classification Report for 20 words

              precision    recall  f1-score   support

           0     0.7511    0.7318    0.7413     16402
           1     0.2340    0.2525    0.2429      5322

    accuracy                         0.6144     21724
   macro avg     0.4926    0.4922    0.4921     21724
weighted avg     0.6244    0.6144    0.6192     21724

The Classification Report for 30 words

              precision    recall  f1-score   support

           0     0.7529    0.7495    0.7512     16402
           1     0.2387    0.2420    0.2403      5322

    accuracy                         0.6252     21724
   macro

## Analysis using CodaLab

In [None]:
# Load the CodaLab Constraint (COVID) English dataset from the shared Drive and print its column summary.
df = pd.read_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/CodaLab Covid/Constraint_English_All.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10700 entries, 0 to 10699
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10700 non-null  object
 1   tweet   10700 non-null  object
 2   label   10700 non-null  object
 3   split   10700 non-null  object
dtypes: object(4)
memory usage: 334.5+ KB


In [None]:
# Keep only the rows marked as the test split when the dataset has a `split` column;
# datasets without one are evaluated in full.
if 'split' in df.columns:
  # .copy() makes the subset an independent frame, so later column assignments
  # on `df` no longer trigger SettingWithCopyWarning.
  df = df.loc[df['split'] == 'test'].copy()

In [None]:
# Convert labels to 0 (real) / 1 (fake) and keep one row per distinct tweet.
# assign() builds a new frame instead of writing into `df`, which may be a filtered
# slice; the in-place write is what raised SettingWithCopyWarning.
df = (
    df.assign(label=df['label'].apply(convert_label))
      .drop_duplicates(subset=["tweet"])
      .reset_index(drop=True)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Embed every tweet in batches of EMBEDDING_RANGE texts; one vector per row, in row order.
all_words = df['tweet'].tolist()
all_words_embeddings = [
    vector
    for start in range(0, len(all_words), EMBEDDING_RANGE)
    for vector in generate_embeddings(all_words[start:start + EMBEDDING_RANGE], model, tokenizer)
]

df['embedding'] = all_words_embeddings

In [None]:
# Nearest-lexicon-word vote: for each text, sum the doc-level true/fake scores of its
# k nearest lexicon words and predict whichever total is larger.
y_true = df['label'].tolist()

# Loop-invariant lookups hoisted out of the hot loops. Positional access is valid because
# Annoy item ids were assigned as words_df row positions and both frames use a 0..n-1 index.
doc_true_scores = words_df['doc_true_score'].to_numpy()
doc_fake_scores = words_df['doc_fake_score'].to_numpy()
query_vectors = df['embedding'].tolist()

for k in range(10, 200, 10):  # k = 10, 20, ..., 190

  y_pred = []

  for vector in query_vectors:
    # Only neighbour ids are needed for the vote, so distances are not requested.
    indexes = ann.get_nns_by_vector(vector, k, search_k=-1)

    # An earlier variant summed the word-level `true_score` / `fake_score` columns instead.
    true_total = doc_true_scores[indexes].sum()
    fake_total = doc_fake_scores[indexes].sum()

    # Predict 0 only on a strictly larger "true" total; ties fall to 1.
    y_pred.append(0 if true_total > fake_total else 1)

  print(f"The Classification Report for {k} words")
  print()
  print(classification_report(y_true, y_pred, digits = 4))
  print("===============================================================")

The Classification Report for 10 words

              precision    recall  f1-score   support

           0     0.3126    0.2286    0.2641      1120
           1     0.3460    0.4480    0.3904      1020

    accuracy                         0.3332      2140
   macro avg     0.3293    0.3383    0.3272      2140
weighted avg     0.3285    0.3332    0.3243      2140

The Classification Report for 20 words

              precision    recall  f1-score   support

           0     0.3302    0.2536    0.2869      1120
           1     0.3469    0.4353    0.3861      1020

    accuracy                         0.3402      2140
   macro avg     0.3386    0.3444    0.3365      2140
weighted avg     0.3382    0.3402    0.3342      2140

The Classification Report for 30 words

              precision    recall  f1-score   support

           0     0.3326    0.2616    0.2929      1120
           1     0.3431    0.4235    0.3791      1020

    accuracy                         0.3388      2140
   macro

## Analysis using ISOT

In [None]:
# Load the ISOT dataset from the shared Drive and print its column summary.
df = pd.read_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/ISOT/ISOT.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
 5   id       44898 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 2.1+ MB


In [None]:
# Keep only the rows marked as the test split when the dataset has a `split` column;
# datasets without one are evaluated in full.
if 'split' in df.columns:
  # .copy() makes the subset an independent frame, so later column assignments
  # on `df` do not trigger SettingWithCopyWarning.
  df = df.loc[df['split'] == 'test'].copy()

In [None]:
# Convert labels to 0 / 1, build `total_text` as "<title> <text>" (missing parts become
# empty strings) and keep one row per distinct total_text.
df = (
    df.assign(
        label=df['label'].apply(convert_label),
        total_text=df['title'].fillna('') + " " + df['text'].fillna(''),
    )
    .drop_duplicates(subset=["total_text"])
    .reset_index(drop=True)
)

In [None]:
# Embed every title+text string in batches of EMBEDDING_RANGE texts; one vector per row, in row order.
all_words = df['total_text'].tolist()
all_words_embeddings = [
    vector
    for start in range(0, len(all_words), EMBEDDING_RANGE)
    for vector in generate_embeddings(all_words[start:start + EMBEDDING_RANGE], model, tokenizer)
]

df['embedding'] = all_words_embeddings

In [None]:
# Nearest-lexicon-word vote: for each text, sum the doc-level true/fake scores of its
# k nearest lexicon words and predict whichever total is larger.
y_true = df['label'].tolist()

# Loop-invariant lookups hoisted out of the hot loops. Positional access is valid because
# Annoy item ids were assigned as words_df row positions and both frames use a 0..n-1 index.
doc_true_scores = words_df['doc_true_score'].to_numpy()
doc_fake_scores = words_df['doc_fake_score'].to_numpy()
query_vectors = df['embedding'].tolist()

for k in range(10, 200, 10):  # k = 10, 20, ..., 190

  y_pred = []

  for vector in query_vectors:
    # Only neighbour ids are needed for the vote, so distances are not requested.
    indexes = ann.get_nns_by_vector(vector, k, search_k=-1)

    # An earlier variant summed the word-level `true_score` / `fake_score` columns instead.
    true_total = doc_true_scores[indexes].sum()
    fake_total = doc_fake_scores[indexes].sum()

    # Predict 0 only on a strictly larger "true" total; ties fall to 1.
    y_pred.append(0 if true_total > fake_total else 1)

  print(f"The Classification Report for {k} words")
  print()
  print(classification_report(y_true, y_pred, digits = 4))
  print("===============================================================")

The Classification Report for 10 words

              precision    recall  f1-score   support

           0     0.7724    0.9986    0.8711     21197
           1     0.9975    0.6517    0.7883     17908

    accuracy                         0.8397     39105
   macro avg     0.8850    0.8251    0.8297     39105
weighted avg     0.8755    0.8397    0.8332     39105

The Classification Report for 20 words

              precision    recall  f1-score   support

           0     0.7453    0.9992    0.8537     21197
           1     0.9983    0.5958    0.7462     17908

    accuracy                         0.8144     39105
   macro avg     0.8718    0.7975    0.8000     39105
weighted avg     0.8611    0.8144    0.8045     39105

The Classification Report for 30 words

              precision    recall  f1-score   support

           0     0.7199    0.9995    0.8369     21197
           1     0.9989    0.5396    0.7007     17908

    accuracy                         0.7889     39105
   macro

## Analysis using Kaggle

In [None]:
# Load the Kaggle fake_or_real_news dataset from the shared Drive and print its column summary.
df = pd.read_csv(r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/Kaggle_real_fake/fake_or_real_news.csv")
df.info()

In [None]:
# Keep only the rows marked as the test split when the dataset has a `split` column;
# datasets without one are evaluated in full.
if 'split' in df.columns:
  # .copy() makes the subset an independent frame, so later column assignments
  # on `df` do not trigger SettingWithCopyWarning.
  df = df.loc[df['split'] == 'test'].copy()

In [None]:
# Convert labels to 0 (real) / 1 (fake), build `total_text` as "<title> <text>" (missing
# parts become empty strings) and keep one row per distinct total_text.
# assign() builds a new frame instead of writing into `df`, which may be a filtered
# slice, so no SettingWithCopyWarning is raised.
df = (
    df.assign(
        label=df['label'].apply(convert_label),
        total_text=df['title'].fillna('') + " " + df['text'].fillna(''),
    )
    .drop_duplicates(subset=["total_text"])
    .reset_index(drop=True)
)

In [None]:
# Embed every title+text string in batches of EMBEDDING_RANGE texts; one vector per row, in row order.
all_words = df['total_text'].tolist()
all_words_embeddings = [
    vector
    for start in range(0, len(all_words), EMBEDDING_RANGE)
    for vector in generate_embeddings(all_words[start:start + EMBEDDING_RANGE], model, tokenizer)
]

df['embedding'] = all_words_embeddings

In [None]:
# Nearest-lexicon-word vote: for each text, sum the doc-level true/fake scores of its
# k nearest lexicon words and predict whichever total is larger.
y_true = df['label'].tolist()

# Loop-invariant lookups hoisted out of the hot loops. Positional access is valid because
# Annoy item ids were assigned as words_df row positions and both frames use a 0..n-1 index.
doc_true_scores = words_df['doc_true_score'].to_numpy()
doc_fake_scores = words_df['doc_fake_score'].to_numpy()
query_vectors = df['embedding'].tolist()

for k in range(10, 200, 10):  # k = 10, 20, ..., 190

  y_pred = []

  for vector in query_vectors:
    # Only neighbour ids are needed for the vote, so distances are not requested.
    indexes = ann.get_nns_by_vector(vector, k, search_k=-1)

    # An earlier variant summed the word-level `true_score` / `fake_score` columns instead.
    true_total = doc_true_scores[indexes].sum()
    fake_total = doc_fake_scores[indexes].sum()

    # Predict 0 only on a strictly larger "true" total; ties fall to 1.
    y_pred.append(0 if true_total > fake_total else 1)

  print(f"The Classification Report for {k} words")
  print()
  print(classification_report(y_true, y_pred, digits = 4))
  print("===============================================================")