In [1]:
# Mount Google Drive at /content/drive; the dataset CSV paths configured further down live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **1) IMPORTING LIBRARIES AND DEFINING FUNCTIONS**

<ins>**Functions**</ins>
<br>
<br>
The following table defines and describes the functions used throughout this notebook.
<br>
<br>

\begin{array}{|c|c|} \hline
\textbf{Function} & \textbf{Description} \\ \hline
\textbf{Convert Label} & Converts\hspace{0.15cm}all\hspace{0.15cm}types\hspace{0.15cm}of\hspace{0.15cm}true\hspace{0.15cm}news\hspace{0.15cm}to\hspace{0.15cm}0\hspace{0.15cm}and\hspace{0.15cm}fake\hspace{0.15cm}to\hspace{0.15cm}1\hspace{0.15cm} \\ \hline
\textbf{Get Fake Score, Get True Score} & Get\hspace{0.15cm}respective\hspace{0.15cm}scores\hspace{0.15cm}from\hspace{0.15cm}array \\ \hline
\textbf{Get Scaler} & Returns\hspace{0.15cm}the\hspace{0.15cm}scaler\hspace{0.15cm}object\hspace{0.15cm}to\hspace{0.15cm}transform\hspace{0.15cm}data \\  \hline
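\textbf{Get All Data} & Stacks\hspace{0.15cm}the\hspace{0.15cm}feature\hspace{0.15cm}rows\hspace{0.15cm}of\hspace{0.15cm}all\hspace{0.15cm}given\hspace{0.15cm}datasets\hspace{0.15cm}into\hspace{0.15cm}one\hspace{0.15cm}array \\ \hline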
\textbf{Evaluate Model} & Given\hspace{0.15cm}data\hspace{0.15cm}and\hspace{0.15cm}model\hspace{0.15cm}returns\hspace{0.15cm}accuracy,\hspace{0.15cm}f1,\hspace{0.15cm}recall\hspace{0.15cm}and\hspace{0.15cm}precision \\ \hline
\textbf{Get Dataset} & Given\hspace{0.15cm}dataset\hspace{0.15cm}name\hspace{0.15cm}returns\hspace{0.15cm}X\_train,\hspace{0.15cm}y\_train,\hspace{0.15cm}X\_test\hspace{0.15cm}and\hspace{0.15cm}y\_test \\ \hline
\textbf{Get Dataset One vs Rest} & Given\hspace{0.15cm}dataset\hspace{0.15cm}name\hspace{0.15cm}returns\hspace{0.15cm}X\_train\hspace{0.15cm}and\hspace{0.15cm}y\_train\hspace{0.15cm}from\hspace{0.15cm}it\hspace{0.15cm}and\hspace{0.15cm}X\_test\hspace{0.15cm}and\hspace{0.15cm}y\_test\hspace{0.15cm}from\hspace{0.15cm}the\hspace{0.15cm}rest\hspace{0.15cm} \\ \hline
\end{array}

<br>

In [2]:
# Install the two ensemble libraries imported below (mlens, deslib); -q keeps pip's output short.
# NOTE(review): versions are not pinned, so a fresh runtime may pull different releases.
! pip install mlens -q
! pip install deslib -q

[K     |████████████████████████████████| 227 kB 3.2 MB/s 
[K     |████████████████████████████████| 158 kB 3.3 MB/s 
[?25h

In [3]:
import pickle

# read and write model to pickle
def write_to_pickle(path,model):
  """Serialise `model` with pickle into the file at `path`.

  The file is opened in binary write mode, so existing content is replaced.
  """
  with open(path, 'wb') as handle:
    pickle.Pickler(handle).dump(model)

def read_pickle_model(path):
  """Return the object stored in the pickle file at `path`."""
  # NOTE: unpickling can run arbitrary code; only load files this notebook produced itself.
  with open(path, 'rb') as handle:
    return pickle.Unpickler(handle).load()

In [4]:
# ============================================================================================================================================= #
# Alternative (currently disabled) semantic feature set using the combined fake/true scores:
# SEMANTIC_FEATURES = ["fake_score", "true_score", "embd_true","embd_fake"]
# Column names of the three feature groups. The helper functions below use these lists
# to select columns from a dataset's "combined_dataframe".
SEMANTIC_FEATURES = ["fake_score_occ", "true_score_occ", "fake_score_doc", "true_score_doc", "embd_true","embd_fake"]
EMOTION_FEATURES  = ["anger", "anticipation", "disgust", "fear", "joy", "sadness", "surprise", "trust"]
LANGUAGE_FEATURES = ['words_per_sentence', 'characters_per_word', 'punctuations_per_sentence', 'get_sentiment_polarity', 
                    'lexical_diversity', 'content_word_diversity', 'redundancy', 'noun', 'verb', 'adj', 'adv', "qn_symbol_per_sentence", 
                    "num_exclamation_per_sentence", "url_count_per_sentence"]

# The four score columns currently included in SEMANTIC_FEATURES:
# "fake_score_occ", "true_score_occ", "fake_score_doc", "true_score_doc"

# Every feature column, in the order semantic -> emotion -> language.
ALL_FEATURES = SEMANTIC_FEATURES + EMOTION_FEATURES + LANGUAGE_FEATURES
# ============================================================================================================================================= #
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost 
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from lightgbm import LGBMClassifier

from deslib.des.knora_e import KNORAE
from deslib.des import METADES
from deslib.dcs.mcb import MCB
from mlens.ensemble import SuperLearner

import numpy as np
rng = np.random.RandomState(42)
import pandas as pd
from google.colab import data_table
data_table.enable_dataframe_formatter()
# ============================================================================================================================================= #
def convert_label(label):
  """Map a dataset-specific veracity label onto the binary target.

  Returns 0 for the "true" family ('true', 'mostly-true', 'half-true', 'real'
  or the number 0) and 1 for the "fake" family ('false', 'pants-fire',
  'barely-true', 'fake' or the number 1). String labels are matched
  case-insensitively and with surrounding whitespace ignored, so spellings
  such as 'True' or 'FALSE' are handled like 'Real'/'FAKE' already were.

  Returns None for any label that belongs to neither family, so callers can
  detect and drop unmapped rows.
  """
  if isinstance(label, str):
    key = label.strip().lower()
    if key in ('true', 'mostly-true', 'half-true', 'real'):
      return 0
    if key in ('false', 'pants-fire', 'barely-true', 'fake'):
      return 1
    return None

  # Non-string labels are compared by value, so 0, 0.0 and numpy integers all match.
  if label == 0:
    return 0
  if label == 1:
    return 1

  return None
# ============================================================================================================================================= #
def _score_at(scores, position):
  """Return entry `position` of a stringified score list as a float.

  `scores` is text such as "[0.1, 0.2, 0.3]": the surrounding brackets are
  stripped, the remainder is split on commas and the chosen entry is parsed.
  Raises IndexError when the list has too few entries and ValueError when
  the entry is not a number.
  """
  return float(scores.strip("[]").split(",")[position].strip())

# NOTE(review): the position of each score inside the list (0..5) is taken from the
# original getters; confirm it against the code that writes the "scores" column.
def get_fake_score(scores):
  """Entry 0 of the score list."""
  return _score_at(scores, 0)

def get_true_score(scores):
  """Entry 1 of the score list."""
  return _score_at(scores, 1)

def get_fake_score_occ(scores):
  """Entry 2 of the score list."""
  return _score_at(scores, 2)

def get_fake_score_doc(scores):
  """Entry 3 of the score list."""
  return _score_at(scores, 3)

def get_true_score_occ(scores):
  """Entry 4 of the score list."""
  return _score_at(scores, 4)

def get_true_score_doc(scores):
  """Entry 5 of the score list."""
  return _score_at(scores, 5)
# ============================================================================================================================================= #
def get_scaler(X, type_of_scaler="standard"):
  """Fit and return a scikit-learn scaler/transformer on `X`.

  type_of_scaler: one of "standard", "power", "robust", "quantile",
    "normalizer", "minmax", or "get_all".

  Returns the fitted object, or for "get_all" a dict mapping each of the six
  names to its fitted object.

  Raises ValueError for an unrecognised `type_of_scaler`.
  """
  # Factories rather than instances, so only the requested scaler(s) are built and fitted.
  factories = {
    "standard": StandardScaler,
    "power": PowerTransformer,
    "robust": RobustScaler,
    "quantile": lambda: QuantileTransformer(random_state=0),
    "normalizer": Normalizer,
    # The original "get_all" branch called MinMaxScaler.fit(X) on the class itself,
    # which raised TypeError; every entry is now instantiated before fitting.
    "minmax": MinMaxScaler,
  }

  if type_of_scaler == "get_all":
    return {name: make().fit(X) for name, make in factories.items()}

  if type_of_scaler not in factories:
    raise ValueError(
      "Unknown type_of_scaler {!r}; expected 'get_all' or one of {}".format(
        type_of_scaler, sorted(factories)))

  return factories[type_of_scaler]().fit(X)
# ============================================================================================================================================= #
def get_all_data(datasets):
  """Stack the ALL_FEATURES columns of every dataset into one 2-D array.

  datasets: mapping of dataset name -> dict holding a "combined_dataframe"
    (the same structure the other helpers receive as `data`).

  Returns an array with one row per datapoint across all datasets, in the
  iteration order of `datasets`; an empty array when `datasets` is empty.
  """
  # The original body indexed an undefined global (`data[dataset_name]`) instead of
  # the argument and loop variable, so it could never run.
  feature_blocks = [
    datasets[dataset_name]["combined_dataframe"][ALL_FEATURES].to_numpy()
    for dataset_name in datasets
  ]

  if not feature_blocks:
    return np.array([])

  return np.vstack(feature_blocks)
# ============================================================================================================================================= #
def evaluate_model(X_train, y_train, X_test, y_test, model_name):
  """Fit the classifier selected by `model_name` and score it on the test data.

  model_name: 'lgr', 'dtc', 'gnb', 'rfc', 'knc', 'abc', 'xgb', 'etc', 'bgc'
    or 'lgm'.

  Returns a 7-tuple: accuracy, micro F1, macro F1, micro precision,
  macro precision, micro recall, macro recall.

  Raises ValueError for an unrecognised `model_name` (previously this ended
  in an UnboundLocalError at predict time).
  """
  # Factories so that only the requested estimator is constructed.
  model_factories = {
    'lgr': lambda: LogisticRegression(random_state=0),
    'dtc': lambda: DecisionTreeClassifier(random_state=0),
    'gnb': lambda: GaussianNB(),
    'rfc': lambda: RandomForestClassifier(random_state=0),
    'knc': lambda: KNeighborsClassifier(),
    'abc': lambda: AdaBoostClassifier(random_state=0),
    'xgb': lambda: xgboost.XGBClassifier(),
    'etc': lambda: ExtraTreesClassifier(random_state=0),
    'bgc': lambda: BaggingClassifier(random_state=0),
    'lgm': lambda: LGBMClassifier(),
  }

  if model_name not in model_factories:
    raise ValueError(
      "Unknown model_name {!r}; expected one of {}".format(model_name, sorted(model_factories)))

  model = model_factories[model_name]().fit(X_train, y_train)
  y_pred = model.predict(X_test)

  return (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average="micro"),
    f1_score(y_test, y_pred, average="macro"),
    precision_score(y_test, y_pred, average="micro"),
    precision_score(y_test, y_pred, average="macro"),
    recall_score(y_test, y_pred, average="micro"),
    recall_score(y_test, y_pred, average="macro"),
  )
# ============================================================================================================================================= #
def get_dataset(dataset_name, data):
  """Return X_train, y_train, X_test, y_test for one dataset.

  When the combined dataframe has a 'split' column, rows marked 'test' form
  the test set and all other rows the training set. Otherwise the rows are
  split randomly with a fixed seed. Labels are returned as column vectors.
  """
  frame = data[dataset_name]["combined_dataframe"]

  if 'split' in frame.columns:
    # Use the predefined split shipped with the dataset.
    is_test = frame['split'] == 'test'
    train_rows, test_rows = frame.loc[~is_test], frame.loc[is_test]
    return (
      train_rows[ALL_FEATURES].to_numpy(),
      train_rows[['label']].to_numpy(),
      test_rows[ALL_FEATURES].to_numpy(),
      test_rows[['label']].to_numpy(),
    )

  from sklearn.model_selection import train_test_split

  features = frame[ALL_FEATURES].to_numpy()
  labels = frame[['label']].to_numpy()

  # NOTE(review): test_size=0.8 holds out 80% of the rows for testing - confirm this is intended.
  X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.8, random_state=0)
  return X_train, y_train, X_test, y_test
# ============================================================================================================================================= #
def get_dataset_one_v_rest(dataset_name, data):
  """Train on one dataset, test on all the others.

  Returns X_train, y_train built from `dataset_name` and X_test, y_test
  stacked from every other entry of `data` (in iteration order). Labels are
  column vectors. The test arrays are empty when `data` holds no other dataset.
  """
  df_train = data[dataset_name]["combined_dataframe"]
  X_train, y_train = df_train[ALL_FEATURES].to_numpy(), df_train[['label']].to_numpy()

  # The original loop read an undefined `df` instead of the per-dataset frame.
  test_frames = [
    data[name]["combined_dataframe"] for name in data if name != dataset_name
  ]

  if test_frames:
    X_test = np.vstack([frame[ALL_FEATURES].to_numpy() for frame in test_frames])
    y_test = np.vstack([frame[['label']].to_numpy() for frame in test_frames])
  else:
    X_test, y_test = np.array([]), np.array([])

  return X_train, y_train, X_test, y_test
# ============================================================================================================================================= #
def get_emotion_language_semantic(dataset_name, data):
  """Return the emotion, language and semantic feature matrices plus labels.

  All rows of the dataset's combined dataframe are used. The labels come back
  as a 1-D array.
  """
  frame = data[dataset_name]["combined_dataframe"]

  emotion, language, semantic = (
    frame[columns].to_numpy()
    for columns in (EMOTION_FEATURES, LANGUAGE_FEATURES, SEMANTIC_FEATURES)
  )

  return emotion, language, semantic, frame['label'].to_numpy()
# ============================================================================================================================================= #
def get_emotion_language_semantic_test(dataset_name, data):
  """Return emotion, language and semantic matrices plus labels for testing.

  When the combined dataframe has a 'split' column only its 'test' rows are
  used; without that column every row is returned.
  """
  frame = data[dataset_name]["combined_dataframe"]

  if 'split' in frame.columns:
    frame = frame.loc[frame['split'] == 'test']

  emotion, language, semantic = (
    frame[columns].to_numpy()
    for columns in (EMOTION_FEATURES, LANGUAGE_FEATURES, SEMANTIC_FEATURES)
  )

  return emotion, language, semantic, frame['label'].to_numpy()
# ============================================================================================================================================= #
def get_dataset_all(dataset_name, data):
  """Return (X, y) for the whole dataset: every feature column and the 1-D labels."""
  frame = data[dataset_name]["combined_dataframe"]
  return frame[ALL_FEATURES].to_numpy(), frame['label'].to_numpy()
# ============================================================================================================================================= #
def shape_outliers(dataFrame, features, binary_features=None):
  """Clip outliers and standardise feature columns; binarise sparse ones.

  Works on a deep copy, so `dataFrame` itself is left untouched.

  For each column named in `features`:
    * columns listed in `binary_features` become 1 where the value is > 0 and
      0 otherwise (missing values also become 0);
    * every other column is clipped to [Q1 - 1.5*IQR, Q3 + 1.5*IQR] and then
      z-scored with the mean and standard deviation of the clipped values.

  binary_features: optional collection of column names to binarise; defaults
    to the four columns that were previously hard-coded.

  Returns the transformed copy.
  """
  if binary_features is None:
    # NOTE(review): "lexical_diversity" is binarised with x > 0, which is almost
    # always 1 for a diversity ratio - confirm it belongs in this list.
    binary_features = ("qn_symbol_per_sentence", "num_exclamation_per_sentence",
                       "lexical_diversity", "url_count_per_sentence")

  dataframe = dataFrame.copy(deep=True)

  for column in dataframe[features].columns.tolist():
    values = dataframe[column]

    if column in binary_features:
      dataframe[column] = (values > 0).astype(int)
      continue

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    clipped = values.clip(lower=q1 - 1.5*iqr, upper=q3 + 1.5*iqr)
    centered = clipped - clipped.mean()
    std = clipped.std()

    # A constant (or single-row) column has std 0 or NaN; dividing would turn the
    # whole column into NaN, so it is only centred in that case.
    dataframe[column] = centered / std if std > 0 else centered

  return dataframe
# ============================================================================================================================================= #

[MLENS] backend: threading


# **2) IMPORTING DATASETS**

<ins>**Datasets Used**</ins>
<br>
<br>
\begin{array}{|c|c|c|c|} \hline
\textbf{Dataset} & \textbf{Description} & \textbf{No. True} & \textbf{No. Fake}  \\ \hline \hline
\textbf{LIAR} & News\hspace{0.15cm}from\hspace{0.15cm}Politifact & foo & bar \\ \hline
\textbf{CodaLab} & COVID\hspace{0.15cm}Tweets & foo & bar \\ \hline
\textbf{FakeNewsNet} & Politifact\hspace{0.15cm}and\hspace{0.15cm}GossipCop & foo & bar \\  \hline
\textbf{Kaggle} & bar & foo & bar \\ \hline
\textbf{ISOT} & Long\hspace{0.15cm}text & foo & bar \\ \hline
\end{array}

<br>

In [5]:
# Registry of the datasets used in this notebook. Each entry maps a dataset name to the
# CSV files holding its emotion, lexicon (language), semantic and embedding features;
# the loading cell below reads these four paths and adds the resulting dataframes to
# the same entry. The commented-out paths are alternative locations that are currently disabled.
datasets = {
    "LIAR" : {
        # "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/LIAR/LIAR_emotion_scores_modified.csv",
        # "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/LIAR/Liar_with_WELFAKE_Lexicon_Scores.csv",
        # "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/LIAR/Liar_sementic.csv",
        # "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/LIAR/LIAR_embedding.csv"
        "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/LIAR/LIAR_emotion_scores_modified.csv",
        "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/LIAR/Liar_with_WELFAKE_Lexicon_Scores_Modified.csv",
        "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/LIAR/LIAR_sementic.csv",
        "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/LIAR/LIAR_embedding_new.csv"
    },

    "ISOT" : {
        # "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/ISOT/ISOT_emotion_scores_modified.csv",
        # "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/ISOT/ISOT_with_WELFAKE_Lexicon_Scores.csv",
        # "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/ISOT/ISOT_sementic.csv",
        # "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/ISOT/ISOT_embedding.csv"
        "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/ISOT/ISOT_emotion_scores_modified.csv",
        "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/ISOT/ISOT_with_WELFAKE_Lexicon_Scores_Modified.csv",
        "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/ISOT/ISOT_sementic.csv",
        "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/ISOT/ISOT_embedding_new.csv"
    },

    "Kaggle_real_fake" : {
        # "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/Kaggle_real_fake/Kaggle_real_fake_emotion_scores_modified.csv",
        # "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/Kaggle_real_fake/Kaggle_real_fake_with_WELFAKE_Lexicon_Scores.csv",
        # "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Kaggle_real_fake/Kaggle_real_fake_sementic.csv",
        # "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/Kaggle_real_fake/Kaggle_real_fake_embedding.csv" 
        "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Kaggle_real_fake/Kaggle_real_fake_emotion_scores_modified.csv",
        "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Kaggle_real_fake/Kaggle_real_fake_with_WELFAKE_Lexicon_Scores_Modified.csv",
        "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Kaggle_real_fake/Kaggle_real_fake_sementic.csv",
        "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/Kaggle_real_fake/Kaggle_real_fake_embedding_new.csv" 
    },

    "CodaLab" : {
        # "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/CodaLab Covid/Codalab_emotion_scores_modified.csv",
        # "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/CodaLab Covid/CodaLab_with_WELFAKE_Lexicon_Scores.csv",
        # "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/CodaLab Covid/CodaLab_sementic.csv",
        # "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/CodaLab Covid/CodaLab Covid_embedding.csv"
        "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/CodaLab Covid/Codalab_emotion_scores_modified.csv",
        "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/CodaLab Covid/CodaLab_with_WELFAKE_Lexicon_Scores_Modified.csv",
        "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/CodaLab Covid/CodaLab Covid_sementic.csv",
        "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/CodaLab Covid/CodaLab Covid_embedding_new.csv"
    },

    "FakeNewsNet" : {
        # "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/FakeNewsNet/FakeNewsNet_emotion_scores_modified.csv",
        # "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/FakeNewsNet/FakeNewsNet_with_WELFAKE_Lexicon_Scores.csv",
        # "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/FakeNewsNet/FakeNewsNet_sementic.csv",
        # "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/FakeNewsNet/FakeNewsNet_embedding.csv"
        "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/FakeNewsNet/FakeNewsNet_emotion_scores_modified.csv",
        "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/FakeNewsNet/FakeNewsNet_with_WELFAKE_Lexicon_Scores_Modified.csv",
        "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/FakeNewsNet/FakeNewsNet_sementic.csv",
        "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/FakeNewsNet/FakeNewsNet_embedding_new.csv"
    }
    
}

In [6]:
# Name of the join-key column in each dataset's CSV files. Defined once, outside the
# loop, and under a name that does not shadow the builtin `id`.
ID_COLUMN_BY_DATASET = {"CodaLab":"id", "FakeNewsNet":"id_1", "ISOT":"id", "Kaggle_real_fake":"id", "LIAR":"ID"}

# For every dataset: read its four feature CSVs, inner-join them on the id column,
# derive the model features and store the result as datasets[d]["combined_dataframe"].
for d in datasets:
  
  print("=========================")
  print("Reading {}".format(d))
  print("=========================\n")
  
  datasets[d]["lexicon_dataframe"] = pd.read_csv(datasets[d]["lexicon_file_path"]) 
  datasets[d]["emotion_dataframe"] = pd.read_csv(datasets[d]["emotion_file_path"]) 
  datasets[d]["semantic_dataframe"] = pd.read_csv(datasets[d]["semantic_file_path"])
  datasets[d]["embedding_dataframe"] = pd.read_csv(datasets[d]["embedding_file_path"])

  id_column = ID_COLUMN_BY_DATASET[d]

  # Inner joins: only rows present in all four feature files survive.
  df = datasets[d]["emotion_dataframe"].merge(datasets[d]["semantic_dataframe"], how='inner', on=id_column,suffixes=('_Sentiment', '_Semantic'))
  df = df.merge(datasets[d]["lexicon_dataframe"], how='inner', on = id_column,suffixes=('', '_Lexicon'))
  df = df.merge(datasets[d]["embedding_dataframe"], how='inner', on = id_column,suffixes=('', '_Embedding'))

  # df["fake_score"], df["true_score"] = df["scores"].apply(get_fake_score), df["scores"].apply(get_true_score)

  # Unpack the stringified "scores" list into separate numeric columns.
  df["fake_score_occ"], df["true_score_occ"] = df["scores"].apply(get_fake_score_occ), df["scores"].apply(get_true_score_occ)
  df["fake_score_doc"], df["true_score_doc"] = df["scores"].apply(get_fake_score_doc), df["scores"].apply(get_true_score_doc)

  # Keep only rows whose "lang" column is "en".
  df = df.loc[df["lang"]=="en"].copy(deep=True)

  df["label"] = df["label"].apply(convert_label)

  # Per-sentence rates for the raw counts.
  df["qn_symbol_per_sentence"] = df["qn_symbol"] / df["num_sentences"]
  df["num_exclamation_per_sentence"] = df["num_exclamation"] / df["num_sentences"]
  df["url_count_per_sentence"] = df["url_count"] / df["num_sentences"]
  
  df = shape_outliers(df, ALL_FEATURES)

  datasets[d]["combined_dataframe"] = df

  print("Total number of datapoints : {}\n".format(len(df)))

  del df

  print("\nDone !!!\n")

Reading LIAR

Total number of datapoints : 12709


Done !!!

Reading ISOT

Total number of datapoints : 44140


Done !!!

Reading Kaggle_real_fake

Total number of datapoints : 6294


Done !!!

Reading CodaLab

Total number of datapoints : 10620


Done !!!

Reading FakeNewsNet

Total number of datapoints : 21715


Done !!!



# **3) RESULTS**

| Model Used       | Dataset Tested On | Accuracy    |
|------------------|-------------------|-------------|
| LIAR             | CodaLab           | 0.503505656 |
| LIAR             | Kaggle_real_fake  | 0.508580858 |
| LIAR             | FakeNewsNet       | 0.600073651 |
| LIAR             | ISOT              | 0.5172832   |
| CodaLab          | LIAR              | 0.52972973  |
| CodaLab          | Kaggle_real_fake  | 0.541254125 |
| CodaLab          | FakeNewsNet       | 0.58198306  |
| CodaLab          | ISOT              | 0.456242728 |
| Kaggle_real_fake | LIAR              | 0.518292205 |
| Kaggle_real_fake | CodaLab           | 0.430214079 |
| Kaggle_real_fake | FakeNewsNet       | 0.486190389 |
| Kaggle_real_fake | ISOT              | 0.501232129 |
| FakeNewsNet      | LIAR              | 0.555111633 |
| FakeNewsNet      | CodaLab           | 0.53398149  |
| FakeNewsNet      | Kaggle_real_fake  | 0.507920792 |
| FakeNewsNet      | ISOT              | 0.502954649 |
| ISOT             | LIAR              | 0.550724638 |
| ISOT             | CodaLab           | 0.524726559 |
| ISOT             | Kaggle_real_fake  | 0.50709571  |
| ISOT             | FakeNewsNet       | 0.723899834 |


In [None]:
# For every dataset, train one model per feature group (emotion, language, semantic).
# Each model is a METADES dynamic-ensemble selector built on an ExtraTrees pool.
# The fitted model is pickled to disk and only its file path is kept in `classifiers`.
classifiers = {
    "LIAR" : {},
    "CodaLab" : {},
    "Kaggle_real_fake" : {},
    "FakeNewsNet" : {},
    "ISOT" : {}
}

# Fraction of each dataset held out as the DSEL set used to fit METADES.
test_size = 0.5

# Earlier experiments that were commented out in this cell (one plain ExtraTreesClassifier
# per dataset on scaled features, and one per feature group without dynamic selection)
# have been removed; the loop below is the variant that was active.

for dataset in classifiers:

  print("==================================")
  print("Training {}".format(dataset))
  print("==================================")
  print()

  X_emotion, X_language, X_semantic, y = get_emotion_language_semantic(dataset, datasets)

  # (printed title, key/file-name stem, feature matrix) for each feature group.
  feature_groups = [
    ("Emotion", "emotion", X_emotion),
    ("Language", "language", X_language),
    ("Semantic", "semantic", X_semantic),
  ]

  for title, group_key, X_group in feature_groups:

    heading = "Training {}".format(title)
    print(heading)
    print("-" * len(heading))

    # Half of the rows train the pool, the other half (DSEL) fit the selector.
    X_train, X_dsel, y_train, y_dsel = train_test_split(X_group, y, test_size=test_size, random_state=0, stratify=y)

    print("- Training Extra Tree Classifier")
    # random_state makes the pool reproducible across runs (it was unseeded before).
    pool_classifiers = ExtraTreesClassifier(n_estimators=1000, random_state=0).fit(X_train, y_train.ravel())

    print("- Training DES")
    model = METADES(pool_classifiers = pool_classifiers, k = 20)
    model.fit(X_dsel, y_dsel.ravel())

    file_path = "{}_{}_model.pkl".format(dataset, group_key)
    write_to_pickle(file_path, model)
    classifiers[dataset]["{}_model".format(group_key)] = file_path

    # Free the fitted ensemble before training the next one.
    del pool_classifiers
    del model
    print()

  del X_train
  del X_dsel
  del y_train
  del y_dsel

  print("\nDone !!!\n")

Training LIAR

Training Emotion
----------------
- Training Extra Tree Classifier
- Training DES

Training Language
-----------------
- Training Extra Tree Classifier
- Training DES

Training Semantic
-----------------
- Training Extra Tree Classifier
- Training DES


Done !!!

Training CodaLab

Training Emotion
----------------
- Training Extra Tree Classifier
- Training DES

Training Language
-----------------
- Training Extra Tree Classifier
- Training DES

Training Semantic
-----------------
- Training Extra Tree Classifier
- Training DES


Done !!!

Training Kaggle_real_fake

Training Emotion
----------------
- Training Extra Tree Classifier
- Training DES

Training Language
-----------------
- Training Extra Tree Classifier
- Training DES

Training Semantic
-----------------
- Training Extra Tree Classifier
- Training DES


Done !!!

Training FakeNewsNet

Training Emotion
----------------
- Training Extra Tree Classifier
- Training DES


In [None]:
from scipy.stats import mode

# Cross-dataset evaluation: the three per-group models trained on one dataset are
# applied to every other dataset and combined by majority vote.
all_datasets = list(classifiers.keys())
which_model, which_dataset, accuracies = [], [], []

for trained_on in classifiers:
  print("========================================")
  print("Testing Model Trained on {}".format(trained_on))
  print("========================================")
  print()

  total_accuracy = 0

  for tested_on in all_datasets:
    
    # Never test a model on the dataset it was trained on.
    if tested_on == trained_on:
      continue 

    X_emotion, X_language, X_semantic, y_test = get_emotion_language_semantic_test(tested_on, datasets)

    # Models are loaded one at a time so only one fitted ensemble is in memory at once.
    model = read_pickle_model(classifiers[trained_on]["emotion_model"])
    y_pred_emotion = model.predict(X_emotion)

    model = read_pickle_model(classifiers[trained_on]["language_model"])
    y_pred_language = model.predict(X_language)

    model = read_pickle_model(classifiers[trained_on]["semantic_model"])
    y_pred_semantic = model.predict(X_semantic)

    # One row per model, one column per sample; the per-column mode is the majority vote.
    votes = np.vstack((y_pred_emotion, y_pred_language, y_pred_semantic))
    # mode(...)[0] has shape (1, n) on older SciPy and (n,) where keepdims defaults to
    # False; ravel yields (n,) in both cases (the former `[0][0]` became a scalar).
    y_pred = np.ravel(mode(votes, axis=0)[0])

    accuracy = accuracy_score(y_test, y_pred)

    print("Accuracy on {} is {}\n".format(tested_on, accuracy))
    total_accuracy += accuracy
    which_model.append(trained_on)
    which_dataset.append(tested_on)
    accuracies.append(accuracy)
    
    del model

  print("Average Accuracy Across All datasets is {}".format(total_accuracy/(len(all_datasets)-1)))
  print()

In [None]:
# One row per (training dataset, test dataset) pair collected by the evaluation cell.
results_final = pd.DataFrame({
    "Model Used": which_model,
    "Dataset Tested On": which_dataset,
    "Accuracy": accuracies,
})

In [None]:
results_final