In [4]:
! pip install minisom -q

# **1) IMPORTING LIBRARIES AND DEFINING FUNCTIONS**

<ins>**Functions**</ins>
<br>
<br>
The following table defines and describes the functions used throughout this notebook.
<br>
<br>

\begin{array}{|c|c|} \hline
\textbf{Function} & \textbf{Description} \\ \hline
\textbf{Convert Label} & Converts\hspace{0.15cm}all\hspace{0.15cm}types\hspace{0.15cm}of\hspace{0.15cm}true\hspace{0.15cm}news\hspace{0.15cm}to\hspace{0.15cm}0\hspace{0.15cm}and\hspace{0.15cm}fake\hspace{0.15cm}to\hspace{0.15cm}1\hspace{0.15cm} \\ \hline
\textbf{Get Fake Score, Get True Score} & Get\hspace{0.15cm}respective\hspace{0.15cm}scores\hspace{0.15cm}from\hspace{0.15cm}array \\ \hline
\textbf{Get Scaler} & Returns\hspace{0.15cm}the\hspace{0.15cm}scaler\hspace{0.15cm}object\hspace{0.15cm}to\hspace{0.15cm}transform\hspace{0.15cm}data \\  \hline
\textbf{Get All Data} &  \\ \hline
\textbf{Evaluate Model} & Given\hspace{0.15cm}data\hspace{0.15cm}and\hspace{0.15cm}model\hspace{0.15cm}returns\hspace{0.15cm}accuracy,\hspace{0.15cm}f1,\hspace{0.15cm}recall\hspace{0.15cm}and\hspace{0.15cm}precision \\ \hline
\textbf{Get Dataset} & Given\hspace{0.15cm}dataset\hspace{0.15cm}name\hspace{0.15cm}returns\hspace{0.15cm}X\_train,\hspace{0.15cm}y\_train,\hspace{0.15cm}X\_test\hspace{0.15cm}and\hspace{0.15cm}y\_test \\ \hline
\textbf{Get Dataset One vs Rest} & Given\hspace{0.15cm}dataset\hspace{0.15cm}name\hspace{0.15cm}returns\hspace{0.15cm}X\_train\hspace{0.15cm}and\hspace{0.15cm}y\_train\hspace{0.15cm}from\hspace{0.15cm}it\hspace{0.15cm}and\hspace{0.15cm}X\_test\hspace{0.15cm}and\hspace{0.15cm}y\_test\hspace{0.15cm}from\hspace{0.15cm}the\hspace{0.15cm}rest\hspace{0.15cm} \\ \hline
\end{array}

<br>

In [20]:
# ============================================================================================================================================= #
LANGUAGE_FEATURES = ["fake_score_occ", "true_score_occ", "fake_score_doc", "true_score_doc", "embd_true","embd_fake"]
EMOTION_FEATURES  = ["anger", "anticipation", "disgust", "fear", "joy", "sadness", "surprise", "trust"]
SEMANTIC_FEATURES = ['words_per_sentence', 'characters_per_word', 'punctuations_per_sentence', 'get_sentiment_polarity', 
                    'lexical_diversity', 'content_word_diversity', 'redundancy', 'noun', 'verb', 'adj', 'adv', "qn_symbol_per_sentence", 
                    "num_exclamation_per_sentence", "url_count_per_sentence"]

ALL_FEATURES = EMOTION_FEATURES + LANGUAGE_FEATURES + SEMANTIC_FEATURES
# ============================================================================================================================================= #
from minisom import MiniSom
import gc
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import numpy as np
rng = np.random.RandomState(42)
import pandas as pd
from google.colab import data_table
data_table.enable_dataframe_formatter()
import pickle
# ============================================================================================================================================= #
def convert_label(label):

  if label in ['true', 'mostly-true', 'half-true', 'real', 'Real', 0, 'REAL']:
    return 0

  if label in ['false', 'pants-fire', 'barely-true', 'fake', 'Fake', 1, 'FAKE']:
    return 1
# ============================================================================================================================================= #
def get_fake_score(scores):
  return float(scores.strip("[]").split(",")[0].strip())

def get_true_score(scores):
  return float(scores.strip("[]").split(",")[1].strip())

def get_fake_score_occ(scores):
  return float(scores.strip("[]").split(",")[2].strip())

def get_fake_score_doc(scores):
  return float(scores.strip("[]").split(",")[3].strip())

def get_true_score_occ(scores):
  return float(scores.strip("[]").split(",")[4].strip())

def get_true_score_doc(scores):
  return float(scores.strip("[]").split(",")[5].strip())
# ============================================================================================================================================= #
def get_dataset_test(dataset_name, data):
  
  df = data[dataset_name]["combined_dataframe"]
  test_split_available = False

  try:
    df_test = df.loc[df['split'] == 'test']
    test_split_available = True

  except KeyError:
    pass

  if(test_split_available):

    X, y = df_test[ALL_FEATURES].to_numpy(), df_test['label'].to_numpy()

  else:

    X, y = df[ALL_FEATURES].to_numpy(), df['label'].to_numpy()

  return X, y
# ============================================================================================================================================= #
def get_dataset_all(dataset_name, data):

  df = data[dataset_name]["combined_dataframe"]

  X = df[ALL_FEATURES].to_numpy()

  y = df['label'].to_numpy()

  return X, y
# ============================================================================================================================================= #
def shape_outliers(dataFrame, features):

  dataframe = dataFrame.copy(deep=True)

  for column in dataframe[features].columns.tolist():

    Q1 = dataframe[column].quantile(0.25)
    Q3 = dataframe[column].quantile(0.75)
    
    IQR = (Q3 - Q1)
    minV = Q1 - 1.5*IQR
    maxV = Q3 + 1.5*IQR
    
    temp = dataframe[column].copy()

    if ( column not in ["qn_symbol_per_sentence" , "num_exclamation_per_sentence" ,"lexical_diversity" ,"url_count_per_sentence"] ) :
      dataframe[column]=dataframe[column].apply(lambda x:minV if x< minV else maxV if x>maxV else x)

      mean = dataframe[column].mean()
      std  = dataframe[column].std() 

      dataframe[column]=dataframe[column].apply(lambda x: (x-mean)/std )
      
    else:
      dataframe[column]=dataframe[column].apply(lambda x : 1 if x>0 else 0)
    
  return dataframe
# ============================================================================================================================================= #

# **2) IMPORTING DATASETS**

<ins>**Datasets Used**</ins>
<br>
<br>
\begin{array}{|c|c|} \hline
\textbf{Dataset} & \textbf{Description} & \textbf{No. True} & \textbf{No. Fake}  \\ \hline \hline
\textbf{LIAR} & News\hspace{0.15cm}from\hspace{0.15cm}Politifact & foo & bar \\ \hline
\textbf{CodaLab} & COVID\hspace{0.15cm}Tweets & foo & bar \\ \hline
\textbf{FakeNewsNet} & Politifact\hspace{0.15cm}and\hspace{0.15cm}GossipCop & foo & bar \\  \hline
\textbf{Kaggle} & bar & foo & bar \\ \hline
\textbf{ISOT} & Long\hspace{0.15cm}text & foo & bar \\ \hline
\end{array}

<br>

In [21]:
datasets = {

    "ISOT" : {
        "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/ISOT/ISOT_emotion_scores_modified.csv",
        "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/ISOT/ISOT_with_WELFAKE_Lexicon_Scores_Modified.csv",
        "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/ISOT/ISOT_sementic.csv",
        "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/ISOT/ISOT_embedding_new.csv"
    },

    "FA-KES" : {
        "emotion_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/FA-KES/FA-KES_emotion_scores_modified.csv",
        "lexicon_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/FA-KES/FA-KES_with_WELFAKE_Lexicon_Scores_Modified.csv",
        "semantic_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/FA-KES/FA-KES_sementic.csv",
        "embedding_file_path" : r"/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/FA-KES/FA-KES_embedding_new.csv" 
    }
    
}

In [22]:
for d in datasets:
  
  print("=========================")
  print("Reading {}".format(d))
  print("=========================\n")
  
  datasets[d]["lexicon_dataframe"] = pd.read_csv(datasets[d]["lexicon_file_path"]) 
  datasets[d]["emotion_dataframe"] = pd.read_csv(datasets[d]["emotion_file_path"]) 
  datasets[d]["semantic_dataframe"] = pd.read_csv(datasets[d]["semantic_file_path"])
  datasets[d]["embedding_dataframe"] = pd.read_csv(datasets[d]["embedding_file_path"])

  id = {"CodaLab":"id", "FakeNewsNet":"id_1", "ISOT":"id", "Kaggle_real_fake":"id", "LIAR":"ID", "FA-KES":"unit_id"}
  ID = id[d]

  df = datasets[d]["emotion_dataframe"].merge(datasets[d]["semantic_dataframe"], how='inner', on=ID,suffixes=('_Sentiment', '_Semantic'))
  df = df.merge(datasets[d]["lexicon_dataframe"], how='inner', on = ID,suffixes=('', '_Lexicon'))
  df = df.merge(datasets[d]["embedding_dataframe"], how='inner', on = ID,suffixes=('', '_Embedding'))

  df["fake_score_occ"], df["true_score_occ"] = df["scores"].apply(get_fake_score_occ), df["scores"].apply(get_true_score_occ)
  df["fake_score_doc"], df["true_score_doc"] = df["scores"].apply(get_fake_score_doc), df["scores"].apply(get_true_score_doc)

  df = df.loc[df["lang"]=="en"].copy(deep=True)

  df["label"] = df["label"].apply(convert_label)
  
  df = shape_outliers(df, SEMANTIC_FEATURES + EMOTION_FEATURES)

  ALL = ALL_FEATURES.copy()
  ALL.append("label")

  ALL_WITH_SPLIT = ALL.copy()
  ALL_WITH_SPLIT.append("split")

  try:

    datasets[d]["combined_dataframe"] = df[ALL_WITH_SPLIT]

  except KeyError:

    datasets[d]["combined_dataframe"] = df[ALL]

  print("Total number of datapoints : {}".format(len(df)))

  del df
  del datasets[d]["lexicon_dataframe"]
  del datasets[d]["emotion_dataframe"]
  del datasets[d]["semantic_dataframe"] 
  del datasets[d]["embedding_dataframe"]

  gc.collect()

  print("\nDone !!!\n")

Reading ISOT

Total number of datapoints : 44140

Done !!!

Reading FA-KES

Total number of datapoints : 789

Done !!!



# **3) RESULTS**

In [23]:
def classify(som, data):
    """Classifies each sample in data in one of the classes definited
    using the method labels_map.
    Returns a list of the same length of data where the i-th element
    is the class assigned to data[i].
    """
    winmap = som.labels_map(X_train, y_train)
    default_class = np.sum(list(winmap.values())).most_common()[0][0]
    result = []
    for d in data:
        win_position = som.winner(d)
        if win_position in winmap:
            result.append(winmap[win_position].most_common()[0][0])
        else:
            result.append(default_class)
    return result

In [39]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

X, y = get_dataset_all("FA-KES", datasets)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y)
# X_test, y_test = get_dataset_test("FA-KES", datasets)

som = MiniSom(11, 11, 28, sigma=3, learning_rate=1, 
              neighborhood_function='gaussian', activation_distance="cosine")

som.pca_weights_init(X_train)
som.train(X_train, 500, verbose=True)

print(classification_report(y_test, classify(som, X_test)))

 [ 500 / 500 ] 100% - 0:00:00 left 
 quantization error: 3.485957442431342
              precision    recall  f1-score   support

           0       0.57      0.58      0.58        84
           1       0.51      0.50      0.51        74

    accuracy                           0.54       158
   macro avg       0.54      0.54      0.54       158
weighted avg       0.54      0.54      0.54       158

