In [12]:
! pip install deslib -q

In [13]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Dynamic Classifier Selection Algorithms
from deslib.dcs.mcb import MCB
from deslib.dcs.ola import OLA

# Importing the Garbage Collector
import gc

In [14]:
# ============================================================================================================================================= #
# Feature groups, listed by column name. ALL_FEATURES is their concatenation, in this order.
LANGUAGE_FEATURES = [
    "fake_score_occ",
    "true_score_occ",
    "fake_score_doc",
    "true_score_doc",
    "embd_true",
    "embd_fake",
]

EMOTION_FEATURES = [
    "anger",
    "anticipation",
    "disgust",
    "fear",
    "joy",
    "sadness",
    "surprise",
    "trust",
]

SEMANTIC_FEATURES = [
    "words_per_sentence",
    "characters_per_word",
    "punctuations_per_sentence",
    "get_sentiment_polarity",
    "lexical_diversity",
    "content_word_diversity",
    "redundancy",
    "noun",
    "verb",
    "adj",
    "adv",
    "qn_symbol_per_sentence",
    "num_exclamation_per_sentence",
    "url_count_per_sentence",
]

ALL_FEATURES = [*LANGUAGE_FEATURES, *EMOTION_FEATURES, *SEMANTIC_FEATURES]
# ============================================================================================================================================= #
def convert_label(label):
  """Map a dataset-specific label onto a binary class.

  Returns 0 when `label` is one of the listed "real" values, 1 when it is one
  of the listed "fake" values, and None for any other value.
  """
  real_labels = ('true', 'mostly-true', 'half-true', 'real', 'Real', 0, 'REAL')
  fake_labels = ('false', 'pants-fire', 'barely-true', 'fake', 'Fake', 1, 'FAKE')

  # The real group is checked first, matching the original order of the tests.
  for target, group in ((0, real_labels), (1, fake_labels)):
    if label in group:
      return target

  return None
# ============================================================================================================================================= #
def _score_at(scores, position):
  """Return, as a float, the entry at `position` of a serialized score list.

  `scores` is a string such as "[0.1, 0.2, ...]": surrounding square brackets
  are stripped and the remainder is split on commas.
  """
  fields = scores.strip("[]").split(",")
  return float(fields[position].strip())


def get_fake_score(scores):
  """Return entry 0 of the serialized score list."""
  return _score_at(scores, 0)


def get_true_score(scores):
  """Return entry 1 of the serialized score list."""
  return _score_at(scores, 1)


def get_fake_score_occ(scores):
  """Return entry 2 of the serialized score list."""
  return _score_at(scores, 2)


def get_fake_score_doc(scores):
  """Return entry 3 of the serialized score list."""
  return _score_at(scores, 3)


def get_true_score_occ(scores):
  """Return entry 4 of the serialized score list."""
  return _score_at(scores, 4)


def get_true_score_doc(scores):
  """Return entry 5 of the serialized score list."""
  return _score_at(scores, 5)
# ============================================================================================================================================= #
def shape_outliers(dataFrame, features, binary_features=None):
  """Return a copy of `dataFrame` with the `features` columns clipped and scaled.

  For every column in `features`:

  * columns named in `binary_features` are replaced by a 0/1 indicator of
    "value > 0";
  * every other column is clipped to [Q1 - 1.5*IQR, Q3 + 1.5*IQR] and then
    standardized with the mean and (sample) standard deviation of the clipped
    values.

  Args:
    dataFrame: input pandas DataFrame; it is not modified.
    features: column labels to transform (must all exist in `dataFrame`).
    binary_features: optional collection of column names to binarize. Defaults
      to the four names the function has always binarized.

  Returns:
    A deep copy of `dataFrame` with the selected columns transformed.
  """
  if binary_features is None:
    # NOTE(review): "lexical_diversity" is binarized together with the count-style
    # columns; kept as in the original, confirm this is intended.
    binary_features = ("qn_symbol_per_sentence", "num_exclamation_per_sentence",
                       "lexical_diversity", "url_count_per_sentence")

  dataframe = dataFrame.copy(deep=True)

  for column in dataframe[features].columns.tolist():
    values = dataframe[column]

    if column in binary_features:
      # NaN compares False against 0, so missing values map to 0.
      dataframe[column] = (values > 0).astype("int64")
      continue

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    clipped = values.clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)

    # A constant column has std == 0 and a single-row column has std == NaN;
    # dividing by either would fill the column with NaN/inf. Fall back to a
    # scale of 1 so such columns are only centered.
    spread = clipped.std()
    if not spread > 0:
      spread = 1.0

    dataframe[column] = (clipped - clipped.mean()) / spread

  return dataframe
# ============================================================================================================================================= #

In [15]:
# Per-dataset locations of the four feature CSVs (emotion, lexicon, semantic, embedding).
# The loading loop below adds the loaded DataFrames to each entry under further keys.
datasets = {

    "LIAR" : {
        "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/LIAR/LIAR_emotion_scores_modified.csv",
        "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/LIAR/Liar_with_WELFAKE_Lexicon_Scores_Modified.csv",
        "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/LIAR/LIAR_sementic.csv",
        "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/LIAR/LIAR_embedding_new.csv"
    },

    "ISOT" : {
        "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/ISOT/ISOT_emotion_scores_modified.csv",
        "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/ISOT/ISOT_with_WELFAKE_Lexicon_Scores_Modified.csv",
        "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/ISOT/ISOT_sementic.csv",
        "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/ISOT/ISOT_embedding_new.csv"
    },

    "Kaggle_real_fake" : {
        "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Kaggle_real_fake/Kaggle_real_fake_emotion_scores_modified.csv",
        "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Kaggle_real_fake/Kaggle_real_fake_with_WELFAKE_Lexicon_Scores_Modified.csv",
        "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Kaggle_real_fake/Kaggle_real_fake_sementic.csv",
        "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/Kaggle_real_fake/Kaggle_real_fake_embedding_new.csv"
    },

    "FA-KES" : {
        "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/FA-KES/FA-KES_emotion_scores_modified.csv",
        "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/FA-KES/FA-KES_with_WELFAKE_Lexicon_Scores_Modified.csv",
        "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/FA-KES/FA-KES_sementic.csv",
        "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/FA-KES/FA-KES_embedding_new.csv"
    },

    # Re-enabled: the evaluation cell holds out `dataset_in_consideration = "CodaLab"`
    # and indexes `datasets` with it, so this entry must be loaded for the notebook to run.
    "CodaLab" : {
        "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/CodaLab Covid/Codalab_emotion_scores_modified.csv",
        "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/CodaLab Covid/CodaLab_with_WELFAKE_Lexicon_Scores_Modified.csv",
        "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/CodaLab Covid/CodaLab Covid_sementic.csv",
        "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/CodaLab Covid/CodaLab Covid_embedding_new.csv"
    },

    # "FakeNewsNet" : {
    #     "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/FakeNewsNet/FakeNewsNet_emotion_scores_modified.csv",
    #     "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/FakeNewsNet/FakeNewsNet_with_WELFAKE_Lexicon_Scores_Modified.csv",
    #     "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/FakeNewsNet/FakeNewsNet_sementic.csv",
    #     "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/FakeNewsNet/FakeNewsNet_embedding_new.csv"
    # }

}


# Name of the join-key column in each dataset's CSV files.
# (Previously a dict called `id`, rebuilt on every iteration and shadowing the builtin.)
ID_COLUMNS = {"CodaLab":"id", "FakeNewsNet":"id_1", "ISOT":"id", "Kaggle_real_fake":"id", "LIAR":"ID", "FA-KES":"unit_id"}

# Column selections for the combined frame, without and with the optional "split" column.
ALL = ALL_FEATURES + ["label"]
ALL_WITH_SPLIT = ALL + ["split"]

for d in datasets:
  print("=========================")
  print("Reading {}".format(d))
  print("=========================\n")

  # Load the four per-dataset feature tables.
  datasets[d]["lexicon_dataframe"] = pd.read_csv(datasets[d]["lexicon_file_path"])
  datasets[d]["emotion_dataframe"] = pd.read_csv(datasets[d]["emotion_file_path"])
  datasets[d]["semantic_dataframe"] = pd.read_csv(datasets[d]["semantic_file_path"])
  datasets[d]["embedding_dataframe"] = pd.read_csv(datasets[d]["embedding_file_path"])

  ID = ID_COLUMNS[d]

  # Inner-join the four tables on the dataset's id column; only ids present in all of them survive.
  df = datasets[d]["emotion_dataframe"].merge(datasets[d]["semantic_dataframe"], how='inner', on=ID,suffixes=('_Sentiment', '_Semantic'))
  df = df.merge(datasets[d]["lexicon_dataframe"], how='inner', on = ID,suffixes=('', '_Lexicon'))
  df = df.merge(datasets[d]["embedding_dataframe"], how='inner', on = ID,suffixes=('', '_Embedding'))

  # Expand the serialized "scores" column into the four lexicon score features.
  df["fake_score_occ"] = df["scores"].apply(get_fake_score_occ)
  df["true_score_occ"] = df["scores"].apply(get_true_score_occ)
  df["fake_score_doc"] = df["scores"].apply(get_fake_score_doc)
  df["true_score_doc"] = df["scores"].apply(get_true_score_doc)

  # Keep only rows whose "lang" column is "en".
  df = df.loc[df["lang"]=="en"].copy(deep=True)

  df["label"] = df["label"].apply(convert_label)

  df = shape_outliers(df, EMOTION_FEATURES + SEMANTIC_FEATURES)

  # "split" is optional: keep it when the dataset provides it. A missing feature
  # or label column still raises KeyError, as before.
  selected_columns = ALL_WITH_SPLIT if "split" in df.columns else ALL
  datasets[d]["combined_dataframe"] = df[selected_columns]

  print("Total number of datapoints : {}".format(len(df)))

  del df

  print("\nDone !!!\n")

gc.collect()

Reading LIAR

Total number of datapoints : 12709

Done !!!

Reading ISOT

Total number of datapoints : 44140

Done !!!

Reading Kaggle_real_fake

Total number of datapoints : 6294

Done !!!

Reading CodaLab

Total number of datapoints : 10620

Done !!!

Reading FakeNewsNet

Total number of datapoints : 21715

Done !!!



48

In [18]:
# Dataset held out for testing; every other loaded dataset contributes training/DSEL rows.
dataset_in_consideration = "CodaLab"

# Fail before doing any work, and say which datasets are actually loaded.
if dataset_in_consideration not in datasets:
  raise KeyError(
      "Held-out dataset {!r} is not loaded; available datasets: {}".format(
          dataset_in_consideration, sorted(datasets)))

X_parts, y_parts = [], []

for d in datasets:

  if d == dataset_in_consideration:
    continue
  temp_df = datasets[d]["combined_dataframe"]

  # Datasets that carry a "split" column contribute only their non-test rows.
  if "split" in temp_df.columns:
    temp_df = temp_df.loc[temp_df["split"] != "test"]

  X_parts.append(temp_df[ALL_FEATURES].to_numpy())
  y_parts.append(temp_df["label"].to_numpy())

# One concatenation per array instead of accumulating rows in Python lists.
X_train = np.concatenate(X_parts)
y_train = np.concatenate(y_parts)

rng = np.random.RandomState(42)

# Split the data into training and DSEL for DS techniques
X_train, X_dsel, y_train, y_dsel = train_test_split(X_train, y_train,
                                                    test_size=0.5,
                                                    random_state=rng)

# The held-out dataset is used in full as the test set.
temp_df = datasets[dataset_in_consideration]["combined_dataframe"]

X_test, y_test = temp_df[ALL_FEATURES].to_numpy(), temp_df["label"].to_numpy()

In [19]:
# Base classifiers of the pool, fitted on the training half of the data.
# model_svc = SVC(probability = True, kernel = 'linear', gamma='auto').fit(X_train, y_train)  # currently excluded from the pool
model_bayes = GaussianNB().fit(X_train, y_train)
model_knn = KNeighborsClassifier().fit(X_train, y_train)

pool_classifiers = [
                    model_bayes,
                    model_knn
                  ]

# DCS technique. MCB accepts a `random_state`; seeding it keeps the reported
# accuracy reproducible across runs (see the DESlib MCB documentation).
mcb = MCB(pool_classifiers, random_state=42)

# Fit the selector on the DSEL half, then score on the held-out dataset.
mcb.fit(X_dsel, y_dsel)

print('Classification accuracy of MCB: ', mcb.score(X_test, y_test))

Classification accuracy of MCB:  0.4382297551789077
