In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset CSV paths
# configured later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **1) IMPORTING LIBRARIES AND DEFINING FUNCTIONS**

<ins>**Functions**</ins>
<br>
<br>
The following table defines and describes the functions used throughout this notebook.
<br>
<br>

\begin{array}{|c|c|} \hline
\textbf{Function} & \textbf{Description} \\ \hline
\textbf{Convert Label} & Converts\hspace{0.15cm}all\hspace{0.15cm}types\hspace{0.15cm}of\hspace{0.15cm}true\hspace{0.15cm}news\hspace{0.15cm}to\hspace{0.15cm}0\hspace{0.15cm}and\hspace{0.15cm}fake\hspace{0.15cm}to\hspace{0.15cm}1\hspace{0.15cm} \\ \hline
\textbf{Get Fake Score, Get True Score} & Get\hspace{0.15cm}respective\hspace{0.15cm}scores\hspace{0.15cm}from\hspace{0.15cm}array \\ \hline
\textbf{Get Scaler} & Returns\hspace{0.15cm}the\hspace{0.15cm}scaler\hspace{0.15cm}object\hspace{0.15cm}to\hspace{0.15cm}transform\hspace{0.15cm}data \\  \hline
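\textbf{Get All Data} & Given\hspace{0.15cm}dataset\hspace{0.15cm}name\hspace{0.15cm}returns\hspace{0.15cm}X\hspace{0.15cm}and\hspace{0.15cm}y\hspace{0.15cm}for\hspace{0.15cm}the\hspace{0.15cm}whole\hspace{0.15cm}dataset \\ \hline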
\textbf{Evaluate Model} & Given\hspace{0.15cm}data\hspace{0.15cm}and\hspace{0.15cm}model\hspace{0.15cm}returns\hspace{0.15cm}accuracy,\hspace{0.15cm}f1,\hspace{0.15cm}recall\hspace{0.15cm}and\hspace{0.15cm}precision \\ \hline
\textbf{Get Dataset} & Given\hspace{0.15cm}dataset\hspace{0.15cm}name\hspace{0.15cm}returns\hspace{0.15cm}X\_train,\hspace{0.15cm}y\_train,\hspace{0.15cm}X\_test\hspace{0.15cm}and\hspace{0.15cm}y\_test \\ \hline
\textbf{Get Dataset One vs Rest} & Given\hspace{0.15cm}dataset\hspace{0.15cm}name\hspace{0.15cm}returns\hspace{0.15cm}X\_train\hspace{0.15cm}and\hspace{0.15cm}y\_train\hspace{0.15cm}from\hspace{0.15cm}it\hspace{0.15cm}and\hspace{0.15cm}X\_test\hspace{0.15cm}and\hspace{0.15cm}y\_test\hspace{0.15cm}from\hspace{0.15cm}the\hspace{0.15cm}rest\hspace{0.15cm} \\ \hline
\end{array}

<br>

In [None]:
# ============================================================================================================================================= #
# Feature groups. Each list holds column names that are looked up in the merged
# per-dataset dataframe; ALL_FEATURES (below) picks the group used by the
# outlier shaping, the dataset getters and therefore every classifier.
# SEMANTIC_FEATURES = ["fake_score", "true_score", "embd_true","embd_fake"]
LANGUAGE_FEATURES = ["fake_score_occ", "true_score_occ", "fake_score_doc", "true_score_doc", "embd_true","embd_fake"]
EMOTION_FEATURES  = ["anger", "anticipation", "disgust", "fear", "joy", "sadness", "surprise", "trust"]
SEMANTIC_FEATURES = ['words_per_sentence', 'characters_per_word', 'punctuations_per_sentence', 'get_sentiment_polarity', 
                    'lexical_diversity', 'content_word_diversity', 'redundancy', 'noun', 'verb', 'adj', 'adv', "qn_symbol_per_sentence", 
                    "num_exclamation_per_sentence", "url_count_per_sentence"]

# "fake_score_occ", "true_score_occ", "fake_score_doc", "true_score_doc"

# Active feature set. This binds the SAME list object as EMOTION_FEATURES (no
# copy), so mutating one in place would also change the other.
ALL_FEATURES = EMOTION_FEATURES
# ALL_FEATURES = LANGUAGE_FEATURES.copy()
# ============================================================================================================================================= #
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost 
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from lightgbm import LGBMClassifier

import numpy as np
rng = np.random.RandomState(42)
import pandas as pd
from google.colab import data_table
data_table.enable_dataframe_formatter()
import pickle

# ============================================================================================================================================= #
def write_to_pickle(path,model):
  """Pickle `model` into the file at `path`, replacing any existing file.

  path  : destination file path, opened in binary write mode.
  model : any picklable object (used here for fitted classifiers).
  """
  with open(path, mode='wb') as sink:
    pickle.dump(model, sink)

def read_pickle_model(path):
  """Return the object stored in the pickle file at `path`.

  NOTE: unpickling can execute arbitrary code embedded in the file, so only
  call this on files this notebook (or another trusted source) produced.
  """
  with open(path, mode='rb') as source:
    loaded = pickle.load(source)
  return loaded
# ============================================================================================================================================= #
def convert_label(label):
  """Map a dataset-specific truthfulness label to a binary class.

  Returns 0 for true/real news and 1 for fake news. String labels are matched
  case-insensitively and with surrounding whitespace ignored, so 'Real',
  'REAL', ' true ' and 'Mostly-True' all map to 0. Non-string labels are
  compared by equality with 0 and 1 (so 0.0 -> 0 and, as before, True -> 1).

  Any label that is not recognised returns None, exactly as the original
  implementation did; callers that cannot tolerate missing labels should
  drop or inspect those rows.
  """
  true_labels = {'true', 'mostly-true', 'half-true', 'real'}
  fake_labels = {'false', 'pants-fire', 'barely-true', 'fake'}

  if isinstance(label, str):
    key = label.strip().lower()
    if key in true_labels:
      return 0
    if key in fake_labels:
      return 1
    return None

  # Numeric (already-encoded) labels: keep the original equality semantics.
  if label == 0:
    return 0
  if label == 1:
    return 1
  return None
# ============================================================================================================================================= #
def _score_at(scores, index):
  """Return element `index` of a score vector as a float.

  `scores` is normally the textual form stored in the CSV, e.g.
  "[0.1, 0.2, 0.3, 0.4, 0.5, 0.6]": the surrounding brackets are stripped and
  the remainder is split on commas. An already-parsed sequence (list, tuple,
  array) is also accepted and indexed directly.

  Raises IndexError when the vector has no element at `index` and ValueError
  when that element is not a number.
  """
  if isinstance(scores, str):
    return float(scores.strip("[]").split(",")[index].strip())
  return float(scores[index])

def get_fake_score(scores):
  """Return position 0 of the score vector (fake score)."""
  return _score_at(scores, 0)

def get_true_score(scores):
  """Return position 1 of the score vector (true score)."""
  return _score_at(scores, 1)

def get_fake_score_occ(scores):
  """Return position 2 of the score vector (fake score, 'occ' variant)."""
  return _score_at(scores, 2)

def get_fake_score_doc(scores):
  """Return position 3 of the score vector (fake score, 'doc' variant)."""
  return _score_at(scores, 3)

def get_true_score_occ(scores):
  """Return position 4 of the score vector (true score, 'occ' variant)."""
  return _score_at(scores, 4)

def get_true_score_doc(scores):
  """Return position 5 of the score vector (true score, 'doc' variant)."""
  return _score_at(scores, 5)
# ============================================================================================================================================= #
def get_dataset_test(dataset_name, data):
  """Return (X, y) numpy arrays for evaluating on `dataset_name`.

  data : mapping of dataset name -> dict holding a "combined_dataframe".

  When the combined dataframe has a 'split' column only the rows whose split
  is 'test' are returned; otherwise every row is used. X holds the
  ALL_FEATURES columns and y the 'label' column.
  """
  frame = data[dataset_name]["combined_dataframe"]

  # Narrow to the held-out rows only when the frame carries a split column.
  if 'split' in frame.columns:
    frame = frame.loc[frame['split'] == 'test']

  features = frame[ALL_FEATURES].to_numpy()
  labels = frame['label'].to_numpy()

  return features, labels
# ============================================================================================================================================= #
def get_dataset_all(dataset_name, data):
  """Return (X, y) numpy arrays built from every row of `dataset_name`.

  data : mapping of dataset name -> dict holding a "combined_dataframe".

  X holds the ALL_FEATURES columns and y the 'label' column; no split
  filtering is applied.
  """
  frame = data[dataset_name]["combined_dataframe"]
  return frame[ALL_FEATURES].to_numpy(), frame['label'].to_numpy()
# ============================================================================================================================================= #
def shape_outliers(dataFrame, features, binary_features=None):
  """Return a copy of `dataFrame` with the `features` columns clipped and scaled.

  dataFrame       : input frame; it is deep-copied and never modified.
  features        : list of column names to transform (KeyError if one is missing).
  binary_features : column names to binarise instead of scale. Defaults to the
                    four columns the notebook has always treated this way.

  For every column in `features`:
    * binary columns become 1 where the value is > 0 and 0 otherwise
      (missing values therefore become 0);
    * all other columns are clipped to [Q1 - 1.5*IQR, Q3 + 1.5*IQR] and then
      standardised with the mean and sample standard deviation of the clipped
      values.

  A column that is constant after clipping has zero standard deviation; it is
  only centred (all zeros) instead of being divided by zero, which previously
  turned the entire column into NaN.
  """
  if binary_features is None:
    # NOTE(review): "lexical_diversity" is binarised like the count-based
    # columns; confirm that is intended for a ratio-valued feature.
    binary_features = ["qn_symbol_per_sentence", "num_exclamation_per_sentence",
                       "lexical_diversity", "url_count_per_sentence"]

  dataframe = dataFrame.copy(deep=True)

  for column in dataframe[features].columns.tolist():

    values = dataframe[column]

    if column in binary_features:
      dataframe[column] = (values > 0).astype(int)
      continue

    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = (Q3 - Q1)

    # Winsorise: pull values outside the Tukey fences back onto the fence.
    clipped = values.clip(lower=Q1 - 1.5*IQR, upper=Q3 + 1.5*IQR)

    mean = clipped.mean()
    std  = clipped.std()

    if std > 0:
      dataframe[column] = (clipped - mean) / std
    else:
      # Zero (or undefined) spread: centring alone keeps the column finite.
      dataframe[column] = clipped - mean

  return dataframe
# ============================================================================================================================================= #

# **2) IMPORTING DATASETS**

<ins>**Datasets Used**</ins>
<br>
<br>
\begin{array}{|c|c|c|c|} \hline
\textbf{Dataset} & \textbf{Description} & \textbf{No. True} & \textbf{No. Fake}  \\ \hline \hline
\textbf{LIAR} & News\hspace{0.15cm}from\hspace{0.15cm}Politifact & foo & bar \\ \hline
\textbf{CodaLab} & COVID\hspace{0.15cm}Tweets & foo & bar \\ \hline
\textbf{FakeNewsNet} & Politifact\hspace{0.15cm}and\hspace{0.15cm}GossipCop & foo & bar \\  \hline
\textbf{Kaggle} & bar & foo & bar \\ \hline
\textbf{ISOT} & Long\hspace{0.15cm}text & foo & bar \\ \hline
\end{array}

<br>

In [None]:
# Per-dataset CSV locations on the mounted shared drive. Every entry lists the
# emotion, lexicon, semantic and embedding score files for one dataset; the
# loading cell below reads them and stores the resulting "*_dataframe" and
# "combined_dataframe" objects back into the same entry.
datasets = {
    
    "LIAR" : {
        "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/LIAR/LIAR_emotion_scores_modified.csv",
        "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/LIAR/Liar_with_WELFAKE_Lexicon_Scores_Modified.csv",
        "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/LIAR/LIAR_sementic.csv",
        "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/LIAR/LIAR_embedding_new.csv"
    },

    "ISOT" : {
        "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/ISOT/ISOT_emotion_scores_modified.csv",
        "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/ISOT/ISOT_with_WELFAKE_Lexicon_Scores_Modified.csv",
        "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/ISOT/ISOT_sementic.csv",
        "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/ISOT/ISOT_embedding_new.csv"
    },

    "Kaggle_real_fake" : {
        "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Kaggle_real_fake/Kaggle_real_fake_emotion_scores_modified.csv",
        "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Kaggle_real_fake/Kaggle_real_fake_with_WELFAKE_Lexicon_Scores_Modified.csv",
        "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Kaggle_real_fake/Kaggle_real_fake_sementic.csv",
        "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/Kaggle_real_fake/Kaggle_real_fake_embedding_new.csv" 
    },

    # NOTE(review): the FA-KES emotion and lexicon files are read from Datasets/
    # while every other dataset reads them from Results/ - confirm intentional.
    "FA-KES" : {
        "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/FA-KES/FA-KES_emotion_scores_modified.csv",
        "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/FA-KES/FA-KES_with_WELFAKE_Lexicon_Scores_Modified.csv",
        "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/FA-KES/FA-KES_sementic.csv",
        "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/FA-KES/FA-KES_embedding_new.csv" 
    },

    "CodaLab" : {
        "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/CodaLab Covid/Codalab_emotion_scores_modified.csv",
        "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/CodaLab Covid/CodaLab_with_WELFAKE_Lexicon_Scores_Modified.csv",
        "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/CodaLab Covid/CodaLab Covid_sementic.csv",
        "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/CodaLab Covid/CodaLab Covid_embedding_new.csv"
    },

    "FakeNewsNet" : {
        "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/FakeNewsNet/FakeNewsNet_emotion_scores_modified.csv",
        "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/FakeNewsNet/FakeNewsNet_with_WELFAKE_Lexicon_Scores_Modified.csv",
        "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/FakeNewsNet/FakeNewsNet_sementic.csv",
        "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/FakeNewsNet/FakeNewsNet_embedding_new.csv"
    }
    
}

In [None]:
# Name of the join-key column in each dataset's CSV files. Built once, outside
# the loop, under a name that does not shadow the builtin `id`.
ID_COLUMNS = {"CodaLab":"id", "FakeNewsNet":"id_1", "ISOT":"id", "Kaggle_real_fake":"id", "LIAR":"ID", "FA-KES":"unit_id"}

# Columns kept in every combined dataframe; "split" is only added when the
# merged frame actually has it.
ALL = ALL_FEATURES + ["label"]
ALL_WITH_SPLIT = ALL + ["split"]

# For every configured dataset: read its four score files, inner-join them on
# the dataset's id column, derive the lexicon score columns, keep English rows,
# binarise the label, shape/scale the active features and store the result as
# datasets[d]["combined_dataframe"].
for d in datasets:
  
  print("=========================")
  print("Reading {}".format(d))
  print("=========================\n")
  
  datasets[d]["lexicon_dataframe"] = pd.read_csv(datasets[d]["lexicon_file_path"]) 
  datasets[d]["emotion_dataframe"] = pd.read_csv(datasets[d]["emotion_file_path"]) 
  datasets[d]["semantic_dataframe"] = pd.read_csv(datasets[d]["semantic_file_path"])
  datasets[d]["embedding_dataframe"] = pd.read_csv(datasets[d]["embedding_file_path"])

  ID = ID_COLUMNS[d]

  # Inner joins: a row survives only if its id appears in all four files.
  # Columns duplicated across files get the suffixes below, so the unsuffixed
  # "label"/"scores"/"lang" columns used afterwards are whichever survive the
  # suffixing rules of these three merges.
  df = datasets[d]["emotion_dataframe"].merge(datasets[d]["semantic_dataframe"], how='inner', on=ID,suffixes=('_Sentiment', '_Semantic'))
  df = df.merge(datasets[d]["lexicon_dataframe"], how='inner', on = ID,suffixes=('', '_Lexicon'))
  df = df.merge(datasets[d]["embedding_dataframe"], how='inner', on = ID,suffixes=('', '_Embedding'))

  # df["fake_score"], df["true_score"] = df["scores"].apply(get_fake_score), df["scores"].apply(get_true_score)

  df["fake_score_occ"], df["true_score_occ"] = df["scores"].apply(get_fake_score_occ), df["scores"].apply(get_true_score_occ)
  df["fake_score_doc"], df["true_score_doc"] = df["scores"].apply(get_fake_score_doc), df["scores"].apply(get_true_score_doc)

  # Keep only rows tagged as English.
  df = df.loc[df["lang"]=="en"].copy(deep=True)

  # convert_label returns None for labels it does not recognise; such rows are
  # kept here and will carry a missing label.
  df["label"] = df["label"].apply(convert_label)

  # df["qn_symbol_per_sentence"] = df["qn_symbol"] / df["num_sentences"]
  # df["num_exclamation_per_sentence"] = df["num_exclamation"] / df["num_sentences"]
  # df["url_count_per_sentence"] = df["url_count"] / df["num_sentences"]
  
  # NOTE(review): scaling statistics are computed over all rows of the dataset,
  # including any rows later selected as the test split.
  df = shape_outliers(df, ALL_FEATURES)

  if "split" in df.columns:
    datasets[d]["combined_dataframe"] = df[ALL_WITH_SPLIT]
  else:
    datasets[d]["combined_dataframe"] = df[ALL]

  print("Total number of datapoints : {}".format(len(df)))

  del df

  print("\nDone !!!\n")

Reading ISOT

Total number of datapoints : 44140

Done !!!

Reading FA-KES

Total number of datapoints : 789

Done !!!



# **3) RESULTS**

In [None]:
# Datasets to train on. After this cell runs, classifiers[dataset][model_name]
# holds the path of the pickle file containing the fitted model.
classifiers = {
    # "LIAR" : {},
    # "CodaLab" : {},
    # "Kaggle_real_fake" : {},
    # "FakeNewsNet" : {},
    "FA-KES" : {},
    "ISOT" : {}
}
from sklearn.pipeline import make_pipeline

# Short model name -> zero-argument factory returning a fresh, unfitted
# estimator. Insertion order is the training order. A factory (rather than an
# instance) guarantees that no fitted state is shared between datasets.
MODEL_FACTORIES = {
    # "SVC" : lambda: make_pipeline(StandardScaler(), SVC(kernel = 'linear', gamma='auto')),
    "SVC" : lambda: SVC(kernel = 'linear', gamma='auto'),
    "ETC" : lambda: ExtraTreesClassifier(n_estimators=300, random_state=0),
    "LGR" : lambda: LogisticRegression(random_state=0,max_iter=1000),
    "DTC" : lambda: DecisionTreeClassifier(random_state=0),
    "GNB" : lambda: GaussianNB(),
    "RFC" : lambda: RandomForestClassifier(random_state=0),
    "KNC" : lambda: KNeighborsClassifier(),
    "ABC" : lambda: AdaBoostClassifier(random_state=0),
    "XGB" : lambda: xgboost.XGBClassifier(),
    "BGC" : lambda: BaggingClassifier(random_state=0),
    "LGM" : lambda: LGBMClassifier(),
}

for dataset in classifiers:

  print("===========================================")
  print("Training Classifiers for {}".format(dataset))
  print("===========================================")
  print()

  # Models are trained on every row of the dataset (no split filtering).
  X_train, y_train = get_dataset_all(dataset, datasets)

  for model_name, make_model in MODEL_FACTORIES.items():

    print("Training {}".format(model_name))
    print("------------\n")
    model = make_model().fit(X_train, y_train)
    # Pickles are written to the current working directory as "<dataset>_<model>.pkl".
    file_path = "{}_{}.pkl".format(dataset, model_name)
    write_to_pickle(file_path, model)
    classifiers[dataset][model_name] = file_path
    del model  # release the fitted model before training the next one
    print("Done !!!\n")

  del X_train
  del y_train

Training Classifiers for FA-KES

Training SVC
------------

Done !!!

Training ETC
------------

Done !!!

Training LGR
------------

Done !!!

Training DTC
------------

Done !!!

Training GNB
------------

Done !!!

Training RFC
------------

Done !!!

Training KNC
------------

Done !!!

Training ABC
------------

Done !!!

Training XGB
------------

Done !!!

Training BGC
------------

Done !!!

Training LGM
------------

Done !!!

Training Classifiers for ISOT

Training SVC
------------

Done !!!

Training ETC
------------

Done !!!

Training LGR
------------

Done !!!

Training DTC
------------

Done !!!

Training GNB
------------

Done !!!

Training RFC
------------

Done !!!

Training KNC
------------

Done !!!

Training ABC
------------

Done !!!

Training XGB
------------

Done !!!

Training BGC
------------

Done !!!

Training LGM
------------

Done !!!



In [None]:
from scipy.stats import mode

# Cross-dataset evaluation: every model trained on one dataset is scored on the
# test data of each OTHER dataset, and the plain (unweighted) mean of those
# accuracies is recorded per training dataset.

# all_datasets = ["LIAR", "CodaLab", "Kaggle_real_fake", "FakeNewsNet", "ISOT"]
all_datasets = list(classifiers.keys())
models = ["SVC", "ETC", "LGR", "DTC", "GNB", "RFC", "KNC", "ABC", "XGB", "BGC", "LGM"]
# models = ["ETC", "LGR", "DTC", "GNB", "RFC", "KNC", "ABC", "XGB", "BGC", "LGM"]
LIAR, CodaLab, Kaggle, FakeNewsNet, ISOT, FA_KES = [], [], [], [], [], []

# Training-dataset name -> the result list it feeds (lists keep their original
# names because the results table is built from them).
results_by_dataset = {
    "LIAR" : LIAR,
    "CodaLab" : CodaLab,
    "Kaggle_real_fake" : Kaggle,
    "FakeNewsNet" : FakeNewsNet,
    "ISOT" : ISOT,
    "FA-KES" : FA_KES,
}

# The test arrays do not depend on the model, so build them once per dataset
# instead of once per (training dataset, model, test dataset) combination.
test_sets = {name: get_dataset_test(name, datasets) for name in all_datasets}

for data in all_datasets:
  
  print("========================================")
  print("Testing Model Trained on {}".format(data))
  print("========================================")
  print()

  # A model is never scored on the dataset it was trained on.
  other_datasets = [d for d in all_datasets if d != data]

  for model_path in models:
    
    print("Model Name : {}".format(model_path))
    print("-----------------\n")

    model = read_pickle_model(classifiers[data][model_path])

    total_accuracy = 0

    for d in other_datasets:

      X_test, y_test = test_sets[d]

      y_pred = model.predict(X_test)

      total_accuracy += accuracy_score(y_test, y_pred)

    # With a single dataset there is nothing to test on; report NaN rather
    # than dividing by zero.
    if other_datasets:
      average_accuracy = round(total_accuracy/len(other_datasets), 2)
    else:
      average_accuracy = float("nan")

    if data in results_by_dataset:
      results_by_dataset[data].append(average_accuracy)
    
    print("Average Accuracy is {}\n".format(average_accuracy))

    del model

Testing Model Trained on FA-KES

Model Name : SVC
-----------------

Average Accuracy is 0.39

Model Name : ETC
-----------------

Average Accuracy is 0.45

Model Name : LGR
-----------------

Average Accuracy is 0.42

Model Name : DTC
-----------------

Average Accuracy is 0.47

Model Name : GNB
-----------------

Average Accuracy is 0.42

Model Name : RFC
-----------------

Average Accuracy is 0.46

Model Name : KNC
-----------------

Average Accuracy is 0.46

Model Name : ABC
-----------------

Average Accuracy is 0.53

Model Name : XGB
-----------------

Average Accuracy is 0.49

Model Name : BGC
-----------------

Average Accuracy is 0.49

Model Name : LGM
-----------------

Average Accuracy is 0.49

Testing Model Trained on ISOT

Model Name : SVC
-----------------

Average Accuracy is 0.48

Model Name : ETC
-----------------

Average Accuracy is 0.47

Model Name : LGR
-----------------

Average Accuracy is 0.48

Model Name : DTC
-----------------

Average Accuracy is 0.48

Model 

In [None]:
# Results table: one row per model, one column per training dataset holding
# the average accuracy obtained on the other datasets. Columns for datasets
# that were not evaluated in this run stay commented out.
results_final = pd.DataFrame({
    "Models" : models,
    # "LIAR" : LIAR,
    # "CodaLab" : CodaLab,
    # "Kaggle" : Kaggle,
    # "FakeNewsNet" : FakeNewsNet,
    "ISOT" : ISOT,
    "FA-KES" : FA_KES,
})

In [None]:
# Display the per-model average cross-dataset accuracy table.
results_final

Unnamed: 0,Models,ISOT,FA-KES
0,SVC,0.48,0.39
1,ETC,0.47,0.45
2,LGR,0.48,0.42
3,DTC,0.48,0.47
4,GNB,0.49,0.42
5,RFC,0.47,0.46
6,KNC,0.49,0.46
7,ABC,0.47,0.53
8,XGB,0.46,0.49
9,BGC,0.5,0.49
