In [1]:
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook all live under /content/drive/Shareddrives/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import libraries and define helper functions

In [2]:
import pandas as pd
import time
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, precision_recall_fscore_support, \
    confusion_matrix
import torch
import time
import numpy as np
from sklearn.model_selection import train_test_split ,KFold
import pickle
import re
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier ,GradientBoostingClassifier ,AdaBoostClassifier ,ExtraTreesClassifier ,VotingClassifier ,StackingClassifier
 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
 
import xgboost as xgb
 

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline 

In [3]:
# read and write model to pickle
def write_to_pickle(Pkl_File_path,model):
  """Pickle ``model`` into the file at ``Pkl_File_path``.

  The target is opened in binary write mode, so an existing file at that
  path is overwritten. Returns None.
  """
  with open(Pkl_File_path, 'wb') as handle:
    handle.write(pickle.dumps(model))

def read_pickle_model(path):
  """Load and return the object pickled in the file at ``path``.

  NOTE: unpickling executes code embedded in the file, so only use this on
  pickle files you produced yourself or otherwise trust.
  """
  with open(path, 'rb') as handle:
    raw_bytes = handle.read()
  return pickle.loads(raw_bytes)

In [4]:
# compute model scores
def compute_metrics(pred,ground_labels):
    """Score predictions against the ground-truth labels.

    Every prediction is passed through ``round`` before scoring.

    Args:
        pred: iterable of predicted values.
        ground_labels: true labels, aligned with ``pred``.

    Returns:
        dict with keys 'accuracy', 'f1', 'precision', 'recall' and
        'confusiton_mat'. The last key keeps its historical spelling because
        callers look it up by that name. precision/recall/f1 are whatever
        ``precision_recall_fscore_support`` returns when called without an
        ``average`` argument and with ``zero_division=0``.
    """
    rounded_preds = [round(p) for p in list(pred)]

    prec, rec, f_score, _support = precision_recall_fscore_support(
        ground_labels, rounded_preds, zero_division=0)

    return {
        'accuracy': accuracy_score(ground_labels, rounded_preds),
        'f1': f_score,
        'precision': prec,
        'recall': rec,
        'confusiton_mat': confusion_matrix(ground_labels, rounded_preds),
    }

In [5]:
# mapping of the labels to 0,1 
def label_map(x):
  """Translate a raw dataset label into the binary class id.

  Labels denoting real/true news map to 0 and labels denoting fake/false
  news map to 1. Any value found in neither alias group is returned as-is.
  """
  real_aliases = ('true', 'mostly-true', 'half-true', 'real', 'Real', 0, 'REAL', "TRUE")
  fake_aliases = ('false', 'pants-fire', 'barely-true', 'fake', 'Fake', 1, 'FAKE', "FALSE")

  # the real group is checked first, exactly like the original if/elif chain
  for class_id, aliases in ((0, real_aliases), (1, fake_aliases)):
    if x in aliases:
      return class_id
  return x

In [6]:
def normalize(dataFrame,features , parameterDict=None):
  """Clip outliers and z-score ``features`` on a copy of ``dataFrame``.

  For every feature column:
    * the IQR fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) are computed from the data;
    * if ``parameterDict`` is given (non-empty), its "minV"/"maxV" replace the
      fences, and for z-scored columns its "mean"/"std" replace the statistics
      of the current data. This is how parameters fitted on a training frame
      are re-applied to a test frame;
    * the four "binary" columns listed below are reduced to a 0/1 indicator
      (1 when the value is > 0);
    * every other column is clipped to [minV, maxV] and then standardised as
      (x - mean) / std.

  A zero or NaN std would turn the whole column into inf/NaN, so in that case
  the column is only centred (divided by 1). The recorded "std" stays the raw
  value.

  Args:
    dataFrame: input frame; it is not modified.
    features: list of column names to normalise.
    parameterDict: optional dict as returned by a previous call, keyed by
      column name. When given it must contain an entry for every feature.

  Returns:
    (normalised copy of the frame, dict mapping each column to
    {"Q1","Q3","IQR","minV","maxV","mean","std"}). Q1/Q3/IQR always describe
    the current data. For the binary columns mean/std describe the 0/1 column
    of the current data.
  """
  # NOTE(review): "lexical_diversity" is binarised here although the data
  # preview in this notebook shows continuous values for it - confirm this is intended.
  binary_columns = ("qn_symbol_per_sentence", "num_exclamation_per_sentence",
                    "lexical_diversity", "url_count_per_sentence")
  use_given_params = bool(parameterDict)

  dataframe=dataFrame.copy()
  normDict={}
  for column in dataframe[features].columns.tolist():
    Q1=dataframe[column].quantile(0.25)
    Q3=dataframe[column].quantile(0.75)

    IQR=(Q3-Q1)
    minV=Q1 - 1.5*IQR
    maxV=Q3 + 1.5*IQR

    if use_given_params:
      minV = parameterDict[column]["minV"]
      maxV = parameterDict[column]["maxV"]

    if column not in binary_columns:
      # winsorise: values outside the fences are pulled onto the fence
      dataframe[column]=dataframe[column].clip(lower=minV, upper=maxV)

      mean = dataframe[column].mean()
      std  = dataframe[column].std()
      if use_given_params:
        mean = parameterDict[column]["mean"]
        std = parameterDict[column]["std"]

      # constant column (std == 0) or a single row (std is NaN): centre only
      scale = 1.0 if (pd.isna(std) or std == 0) else std
      dataframe[column]=(dataframe[column] - mean) / scale
    else:
      dataframe[column]=(dataframe[column] > 0).astype("int64")
      mean = dataframe[column].mean()
      std  = dataframe[column].std()

    normDict[column]={"Q1":Q1,"Q3":Q3,"IQR":IQR,"minV":minV,"maxV":maxV,"mean":mean,"std":std}
  return dataframe ,normDict

# Define global variables

In [7]:
# Feature groups fed to the models. All_features is the concatenation
# semantic + emotion + lexicon + embedding, in that order.

# Disabled semantic features: 'url_count', 'qn_symbol', 'num_chars',
# 'num_words', 'num_sentences', 'num_exclamation'
Semantic_features = [
    'words_per_sentence', 'characters_per_word', 'punctuations_per_sentence',
    'get_sentiment_polarity',
    'lexical_diversity', 'content_word_diversity', 'redundancy',
    'noun', 'verb', 'adj', 'adv',
    "qn_symbol_per_sentence", "num_exclamation_per_sentence", "url_count_per_sentence",
]

# Disabled lexicon features: 'fake_score', 'true_score', 'common_score'
LexMod_Features = ['fake_score1', 'true_score1', 'fake_score2', 'true_score2']

# Disabled emotion feature: 'highest_eight_label'
Emotion_features = [
    'anger', 'anticipation', 'disgust', 'fear',
    'joy', 'sadness', 'surprise', 'trust',
]

Embed_features = ["embd_true", "embd_fake"]

All_features = [*Semantic_features, *Emotion_features, *LexMod_Features, *Embed_features]

In [8]:
# set output directory for the models generated 
outputDirectory="/content/model/"
!mkdir model

### Pipeline 

In [9]:
# Input files for every dataset.
# Each list is positional:
#   [0] lexicon scores, [1] semantic features, [2] emotion scores,
#   [3] embedding scores, [4] raw dataset CSV
# The loading cells read indices 0-3 as the feature files and the last entry
# as the raw dataset.
# datasets = ["LIAR", "ISOT", "FakeNewsNet", "Kaggle", "Constraint"]

_ROOT = "/content/drive/Shareddrives/[FYP] Fake News Detection"
_RESULTS = _ROOT + "/Results"
_FINAL = _RESULTS + "/Final"
_DATASETS = _ROOT + "/Datasets"

inputPath = {
    "Constraint": [
        _RESULTS + "/CodaLab Covid/CodaLab_with_WELFAKE_Lexicon_Scores_Modified.csv",
        _RESULTS + "/CodaLab Covid/CodaLab Covid_sementic.csv",
        _RESULTS + "/CodaLab Covid/Codalab_emotion_scores_modified.csv",
        _FINAL + "/CodaLab Covid/CodaLab Covid_embedding_new.csv",
        _DATASETS + "/CodaLab Covid/Constraint_English_All.csv",
    ],
    "FakeNewsNet": [
        _RESULTS + "/FakeNewsNet/FakeNewsNet_with_WELFAKE_Lexicon_Scores_Modified.csv",
        _RESULTS + "/FakeNewsNet/FakeNewsNet_sementic.csv",
        _RESULTS + "/FakeNewsNet/FakeNewsNet_emotion_scores_modified.csv",
        _FINAL + "/FakeNewsNet/FakeNewsNet_embedding_new.csv",
        _DATASETS + "/FakeNewsNet/FakeNewsNet_All.csv",
    ],
    "ISOT": [
        _RESULTS + "/ISOT/ISOT_with_WELFAKE_Lexicon_Scores_Modified.csv",
        _RESULTS + "/ISOT/ISOT_sementic.csv",
        _RESULTS + "/ISOT/ISOT_emotion_scores_modified.csv",
        _FINAL + "/ISOT/ISOT_embedding_new.csv",
        _DATASETS + "/ISOT/ISOT.csv",
    ],
    "Kaggle": [
        _RESULTS + "/Kaggle_real_fake/Kaggle_real_fake_with_WELFAKE_Lexicon_Scores_Modified.csv",
        _RESULTS + "/Kaggle_real_fake/Kaggle_real_fake_sementic.csv",
        _RESULTS + "/Kaggle_real_fake/Kaggle_real_fake_emotion_scores_modified.csv",
        _FINAL + "/Kaggle_real_fake/Kaggle_real_fake_embedding_new.csv",
        _DATASETS + "/Kaggle_real_fake/fake_or_real_news.csv",
    ],
    "LIAR": [
        _RESULTS + "/LIAR/Liar_with_WELFAKE_Lexicon_Scores_Modified.csv",
        _RESULTS + "/LIAR/LIAR_sementic.csv",
        _RESULTS + "/LIAR/LIAR_emotion_scores_modified.csv",
        _FINAL + "/LIAR/LIAR_embedding_new.csv",
        _DATASETS + "/LIAR/Liar_all.csv",
    ],

    # Extra evaluation sets, currently disabled:
    # "FA-KES" :["/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/FA-KES/FA-KES_title_with_WELFAKE_Lexicon_Scores_Modified.csv",
    #            "/content/drive/Shareddrives/[FYP] Fake News Detection/Results/FA-KES/FA-KES_title_sementic.csv",
    #            "/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/FA-KES/FA-KES_title_emotion_scores_modified.csv",
    #            "/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/FA-KES/FA-KES_title_embedding_new.csv",
    #            "/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/FA-KES/FA-KES.csv"
    # ],
    # "Politifact_test" : ["/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/Politifact_test/Politifact_test_with_WELFAKE_Lexicon_Scores_Modified.csv",
    #            "/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Politifact_test/Politifact_test_sementic.csv",
    #            "/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/Politifact_test/Politifact_testset_emotion_scores_modified.csv",
    #            "/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/Politifact_test/Politifact_test_embedding_new.csv",
    #            "/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/Politifact_test/Politifact_test.csv",
    # ],
    # "COVID19_test" :[
    #             "/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/COVID19_test/COVID19_title_test_with_WELFAKE_Lexicon_Scores_Modified.csv",
    #            "/content/drive/Shareddrives/[FYP] Fake News Detection/Results/COVID19_test/COVID19_test_title_sementic.csv",
    #            "/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/COVID19_test/COVID19_test_title_emotion_scores_modified.csv",
    #            "/content/drive/Shareddrives/[FYP] Fake News Detection/Results/Final/COVID19_test/COVID19_test_title_embedding_new.csv",
    #            "/content/drive/Shareddrives/[FYP] Fake News Detection/Datasets/COVID19_test/COVID19_test.csv",
    # ]
}

In [10]:
# Start from an empty ./model directory: remove it (and anything inside) and
# create it again. The paths are relative to the current working directory.
!rm -fr model
!mkdir model

In [11]:
# datasets = ["LIAR", "ISOT", "FakeNewsNet", "Kaggle", "Constraint"]

In [11]:
# Per-dataset column names: the id column, the label column and the text column.
# NOTE: the name `id` shadows Python's builtin id(); it is kept because other
# cells look the mapping up under this name.
id = {
    "Constraint": "id",
    "FakeNewsNet": "id_1",
    "ISOT": "id",
    "Kaggle": "id",
    "LIAR": "ID",
    "FA-KES": "unit_id",
    "Politifact_test": "id",
    "COVID19_test": "id",
}
label = {
    "Constraint": "label",
    "FakeNewsNet": "label",
    "ISOT": "label",
    "Kaggle": "label",
    "LIAR": "label",
    "FA-KES": "label",
    "Politifact_test": "target",
    "COVID19_test": "label",
}
# Names of the four feature files, in the order they appear in each inputPath list.
features = ["lexicon", "sementic", "sentiment", "embedding"]
text_column = {
    "Constraint": "tweet",
    "FakeNewsNet": "title",
    "ISOT": "text",
    "Kaggle": "text",
    "LIAR": "statement",
    "FA-KES": "article_content",
    "Politifact_test": "statement",
    "COVID19_test": "text",
}

In [12]:
# Quick look at every raw dataset: row count and column schema.
for key,value in inputPath.items():
  print("----------",key,"--------------")
  df=pd.read_csv(value[-1])   # the last path of each list is the raw dataset CSV
  # print(df[label[key]].value_counts())
  print("Total number of datapoints : {}".format(len(df)))
  # DataFrame.info() writes its report itself and returns None, so wrapping it
  # in print() only appended a stray "None" line after each report.
  df.info()

---------- Constraint --------------
Total number of datapoints : 10700
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10700 entries, 0 to 10699
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10700 non-null  object
 1   tweet   10700 non-null  object
 2   label   10700 non-null  object
 3   split   10700 non-null  object
dtypes: object(4)
memory usage: 334.5+ KB
None
---------- FakeNewsNet --------------
Total number of datapoints : 23196
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23196 entries, 0 to 23195
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         23196 non-null  object
 1   news_url   22866 non-null  object
 2   title      23196 non-null  object
 3   tweet_ids  21695 non-null  object
 4   label      23196 non-null  object
 5   source     22140 non-null  object
 6   id_1       23196 non-null  int64 
dtypes: int64(1), object(

In [13]:
## Load every dataset into all_df: one class-balanced, cleaned feature frame per dataset.
## (Scaling is NOT done here; normalize() is applied later per train/test split.)

SAMPLES_PER_CLASS = 3000   # rows drawn from each class of every dataset
SAMPLE_SEED = 42           # fixed seed so the sampled subsets are reproducible

all_df={}

# for each dataset
for key,value in inputPath.items():
  print("----------",key,"--------------")
  ID=id[key]
  LABEL=label[key]

  # the first four paths are the feature files; skip the dataset if one is missing
  missing = [features[v] for v in range(4) if value[v]==""]
  if missing:
    print(" Error: missing {:} skipping {:}".format(missing[0],key))
    continue

  # read each of the 4 feature files of the dataset
  dfLexicon  = pd.read_csv(value[0])
  dfSementic = pd.read_csv(value[1])
  dfSentiment = pd.read_csv(value[2])
  dfEmbedding = pd.read_csv(value[3])

  # combine the features with inner joins on the dataset's id column
  dff=dfSentiment.merge(dfSementic, how='inner', on=ID,suffixes=('_Sentiment', '_Sementic'))
  dff=dff.merge(dfLexicon, how='inner', on=ID,suffixes=('', '_Lexicon'))
  df=dff.merge(dfEmbedding, how='inner', on=ID,suffixes=('', '_Lexicon'))

  # drop index columns that earlier CSV round-trips left behind ("Unnamed: 0", ...)
  df=df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1)

  # "scores" is stored as the text of a list; parse it into a list of floats
  df["scores"]=df["scores"].apply(lambda x : list(map(np.float64, x.strip('][').replace('"', '').replace("'","").replace(" ","").split(','))))
  # positions 0/1 (fake_score/true_score) are currently unused
  df["fake_score1"]=df["scores"].apply(lambda x:x[2])
  df["true_score1"]=df["scores"].apply(lambda x:x[3])
  df["fake_score2"]=df["scores"].apply(lambda x:x[4])
  df["true_score2"]=df["scores"].apply(lambda x:x[5])

  df["label"]=df[LABEL+"_Sementic"]  # the label column as suffixed by the first merge

  # if (key=="Politifact_test"):
  #   df=df.loc[df["label"].isin(["FALSE","TRUE"])]
  df = df.loc[df["lang"]=="en"].copy(deep=True)

  df["label"]=df["label"].apply(label_map)   # converting labels to 0,1
  df["id"] = df[ID]
  df['dataset'] = key

  # label_map encodes real/true as 0 and fake/false as 1
  real_rows = df.loc[df['label'] == 0]
  fake_rows = df.loc[df['label'] == 1]
  # never ask for more rows than the smaller class has (sample() would raise)
  n_per_class = min(SAMPLES_PER_CLASS, len(real_rows), len(fake_rows))
  if n_per_class < SAMPLES_PER_CLASS:
    print(" Warning: only {:} rows per class available for {:}".format(n_per_class, key))

  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
  df = pd.concat([
      real_rows.sample(n_per_class, random_state=SAMPLE_SEED),
      fake_rows.sample(n_per_class, random_state=SAMPLE_SEED),
  ])

  df=df[All_features+["label","id","dataset"]]

  print(df["label"].value_counts())

  df=df.loc[df["label"].isin([0,1])].copy()
  df["label"] = df["label"].astype("int64")

  #clean
  print("null rows : ",df.isnull().any(axis=1).sum())
  df = df.replace([np.inf, -np.inf], np.nan)
  # after the replacement this count covers rows with NaN or +/-inf
  print("inf rows : ",df.isnull().any(axis=1).sum())
  df = df.dropna()

  print("Total number of datapoints : {}".format(len(df)))

  all_df[key]=df.copy(deep=True)

print(all_df.keys())

---------- Constraint --------------
0    3000
1    3000
Name: label, dtype: int64
null rows :  0
inf rows :  0
Total number of datapoints : 6000
---------- FakeNewsNet --------------
0    3000
1    3000
Name: label, dtype: int64
null rows :  0
inf rows :  0
Total number of datapoints : 6000
---------- ISOT --------------
0    3000
1    3000
Name: label, dtype: int64
null rows :  0
inf rows :  0
Total number of datapoints : 6000
---------- Kaggle --------------
0    3000
1    3000
Name: label, dtype: int64
null rows :  0
inf rows :  0
Total number of datapoints : 6000
---------- LIAR --------------
0    3000
1    3000
Name: label, dtype: int64
null rows :  0
inf rows :  0
Total number of datapoints : 6000
dict_keys(['Constraint', 'FakeNewsNet', 'ISOT', 'Kaggle', 'LIAR'])


In [14]:
all_df['ISOT'].head()

Unnamed: 0,words_per_sentence,characters_per_word,punctuations_per_sentence,get_sentiment_polarity,lexical_diversity,content_word_diversity,redundancy,noun,verb,adj,...,trust,fake_score1,true_score1,fake_score2,true_score2,embd_true,embd_fake,label,id,dataset
18736,54.111111,5.225873,6.222222,-0.9978,58.811475,45.995893,16.221766,16.837782,12.320329,7.186858,...,0.0,0.002456,1.025272,0.002794,1.436636,-6.14477,6.43352,0,18737,ISOT
7048,70.5,4.549296,10.0,0.3664,64.084507,35.460993,29.078014,11.347518,9.219858,3.546099,...,13.206,0.00264,1.096713,0.002618,1.427903,-6.113999,6.459306,0,7049,ISOT
13656,37.4,4.868449,4.6,-0.9141,52.24359,41.176471,12.834225,17.754011,11.016043,6.096257,...,0.0,0.002461,1.035301,0.002983,1.551837,-6.118306,6.435842,0,13657,ISOT
20607,24.166667,5.219178,3.166667,0.8625,63.69863,51.034483,14.482759,15.862069,14.482759,6.896552,...,5.882,0.002969,1.262128,0.002903,1.59397,-6.118776,6.432452,0,20608,ISOT
4003,31.458333,4.861039,3.75,0.6765,50.396825,38.278146,12.980132,15.364238,9.668874,8.211921,...,3.512,0.002247,0.938875,0.002937,1.575983,-6.131537,6.458004,0,4004,ISOT


In [29]:
from itertools import combinations

datasets = ["LIAR", "ISOT", "FakeNewsNet", "Kaggle", "Constraint"]

# Leave-one-dataset-out splits: every 4-dataset subset is a training group
# and whatever is left over is its test group.
a = list(combinations(datasets, 4))
b = list_of_lists = [list(group) for group in a]

print(b)

# each entry is [train dataset names, test dataset names]
final_comb = [
    [train_group, [name for name in datasets if name not in train_group]]
    for train_group in b
]

print(final_comb)

[['LIAR', 'ISOT', 'FakeNewsNet', 'Kaggle'], ['LIAR', 'ISOT', 'FakeNewsNet', 'Constraint'], ['LIAR', 'ISOT', 'Kaggle', 'Constraint'], ['LIAR', 'FakeNewsNet', 'Kaggle', 'Constraint'], ['ISOT', 'FakeNewsNet', 'Kaggle', 'Constraint']]
[[['LIAR', 'ISOT', 'FakeNewsNet', 'Kaggle'], ['Constraint']], [['LIAR', 'ISOT', 'FakeNewsNet', 'Constraint'], ['Kaggle']], [['LIAR', 'ISOT', 'Kaggle', 'Constraint'], ['FakeNewsNet']], [['LIAR', 'FakeNewsNet', 'Kaggle', 'Constraint'], ['ISOT']], [['ISOT', 'FakeNewsNet', 'Kaggle', 'Constraint'], ['LIAR']]]


In [77]:
# final_comb = [[['LIAR', 'FakeNewsNet', 'Constraint'],['Kaggle', 'ISOT']]]

In [30]:
len(final_comb)

5

In [31]:
final_comb

[[['LIAR', 'ISOT', 'FakeNewsNet', 'Kaggle'], ['Constraint']],
 [['LIAR', 'ISOT', 'FakeNewsNet', 'Constraint'], ['Kaggle']],
 [['LIAR', 'ISOT', 'Kaggle', 'Constraint'], ['FakeNewsNet']],
 [['LIAR', 'FakeNewsNet', 'Kaggle', 'Constraint'], ['ISOT']],
 [['ISOT', 'FakeNewsNet', 'Kaggle', 'Constraint'], ['LIAR']]]

# Voting Classifier

## Combined accuracy

In [None]:
All_features

In [None]:
def get_dataset_comb(comb):
  """Build the normalised train frame and ONE pooled test frame for a split.

  Args:
    comb: [train dataset names, test dataset names]; every name must be a
      key of the global ``all_df``.

  Returns:
    (final_df, train_key, test_key) where ``final_df`` maps ``train_key`` to
    the normalised concatenation of the train datasets and ``test_key`` to the
    normalised concatenation of the test datasets. The test frame is scaled
    with the parameters obtained from the train frame. The keys look like
    'train__A_B' / 'test__C' (double underscore after the prefix).

  Also prints how many test rows come from each dataset.
  """
  train_key = 'train_' + ''.join('_' + name for name in comb[0])
  train_frames = [all_df[name] for name in comb[0]]

  test_key = 'test_' + ''.join('_' + name for name in comb[1])
  test_frames = [all_df[name] for name in comb[1]]

  test_pool = pd.concat(test_frames)
  train_pool = pd.concat(train_frames)

  scaled_columns = Semantic_features+Emotion_features
  # fit the scaling on the train pool, then re-use its parameters for the test pool
  train_norm, parameterDict = normalize(train_pool, scaled_columns)
  test_norm, _ = normalize(test_pool, scaled_columns, parameterDict)

  final_df = {train_key: train_norm}
  final_df[test_key] = test_norm

  print(final_df[test_key]['dataset'].value_counts())

  return final_df,train_key,test_key

# get_dataset_comb(final_comb[0])

In [80]:
# Train using all the features | Voting claasifier 
n_estimators=300
voting="hard"
# print("{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}".format('prec-t','prec-f', 'rec-t','rec-f','f1-t','f1-f','accu','tn', 'fp', 'fn', 'tp',"test_set","trained_model"))  # correct

train_set = []
test_set = []
accuracy = []
train_count = []
test_count = []

for comb in final_comb:

  final_df,train_key,test_key = get_dataset_comb(comb)

  print("========================================")
  print("Testing Model on {}".format(test_key))
  print("========================================")
  print()

  X_train = final_df[train_key][All_features]
  y_train = final_df[train_key]['label']
  X_val = final_df[test_key][All_features]
  y_val = final_df[test_key]['label']
  
  # print(X_train.columns)
  # print(X_val.columns)
  # print(X_train.shape)
  # print(X_val.shape)

  # column transforemers 
  sementic_ct = ColumnTransformer([("sementic","passthrough", Semantic_features )])
  lex_ct_embed_ct = ColumnTransformer([("lexicon","passthrough", LexMod_Features +Embed_features)])
  Emotion_ct = ColumnTransformer([("emotion","passthrough", Emotion_features )])
  # embed_ct = ColumnTransformer([("embedding","passthrough", embed_features )])

  
 
  model_sementic_clf = GaussianNB()  #ExtraTreesClassifier(n_estimators=n_estimators)
  model_lex_emd_clf =   KNeighborsClassifier() # ExtraTreesClassifier(n_estimators=n_estimators)
  model_emotion_clf =   GaussianNB() #ExtraTreesClassifier(n_estimators=n_estimators)
 
# DecisionTreeClassifier
# KNeighborsClassifier

# create pipeline 
  sementic_pipeline = Pipeline([
        ('trans', sementic_ct),
        ('clf', model_sementic_clf)
        ]
  )
  lex_embd_pipeline = Pipeline([
    ('trans', lex_ct_embed_ct),
    ('clf', model_lex_emd_clf )
    ]
  )
  emotion_pipeline = Pipeline([
    ('trans', Emotion_ct),
    ('clf', model_emotion_clf)
    ]
  )
  
  
  estimators = [
    ('sementic_estimator', sementic_pipeline),
    ('lexP_embd_estimator', lex_embd_pipeline),
    ("emotion_estimator" ,emotion_pipeline )
  ]

  final_classifier =VotingClassifier(estimators=estimators ,voting=voting) 
  final_classifier.fit(X_train, y_train)

  predicted_y= final_classifier.predict(X_val)
  d=compute_metrics(predicted_y,y_val)

  # tn, fp, fn, tp = d["confusiton_mat"].ravel() #correct
  # print ("{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}".format(d['precision'][0],d['precision'][1], d['recall'][0],d['recall'][1],d['f1'][0],d['f1'][1],d['accuracy'],tn, fp, fn, tp,test_key,train_key))
  train_set.append(train_key)
  test_set.append(test_key)
  accuracy.append(round(d['accuracy'],2))
  train_count.append(len(X_train))
  test_count.append(len(X_val))

  print('Accuracy is ')
  print(d['accuracy'])

Kaggle    6000
ISOT      6000
Name: dataset, dtype: int64
Testing Model on test__Kaggle_ISOT

Accuracy is 
0.7515833333333334


In [None]:
# Summary table of the pooled evaluation: one row per train/test split.
results_final = pd.DataFrame({
    "Train_set": train_set,
    'Train_count': train_count,
    "Test_set": test_set,
    'Test_count': test_count,
    "Accuracy": accuracy,
})

In [None]:
results_final

Unnamed: 0,Train_set,Train_count,Test_set,Test_count,Accuracy
0,train__LIAR_FakeNewsNet_Kaggle_Constraint,51338,test__COVID19_test,2871,0.61


## Average Accuracy

In [32]:
def get_dataset_comb_seperate(comb):
  """Build the normalised train frame plus one normalised frame PER test dataset.

  Args:
    comb: [train dataset names, test dataset names]; every name must be a
      key of the global ``all_df``.

  Returns:
    (final_df, train_key, test_key). ``final_df[train_key]`` is the normalised
    concatenation of the train datasets. Each test dataset is stored under its
    own name, scaled with the parameters obtained from the train frame.
    ``test_key`` ('test__A_B...') is only a label; it is not a key of
    ``final_df``.
  """
  scaled_columns = Semantic_features+Emotion_features

  train_key = 'train_' + ''.join('_' + name for name in comb[0])
  train_pool = pd.concat([all_df[name] for name in comb[0]])

  final_df = {}
  final_df[train_key], parameterDict = normalize(train_pool, scaled_columns)

  test_key = 'test_'
  for name in comb[1]:
    test_key += '_' + name
    final_df[name], _ = normalize(all_df[name], scaled_columns, parameterDict)

  return final_df,train_key,test_key

In [33]:
# Train using all the features | Voting claasifier 
n_estimators=300
voting="hard"

train_set = []
test_set = []
accuracy = []
sep_acc = []

for comb in final_comb:

  final_df,train_key,test_key = get_dataset_comb_seperate(comb)

  print(final_df.keys())
  print("========================================")
  print("Training Model on {}".format(train_key))
  print("========================================")
  print()

  X_train = final_df[train_key][All_features]
  y_train = final_df[train_key]['label']

  # column transforemers 
  sementic_ct = ColumnTransformer([("sementic","passthrough", Semantic_features )])
  lex_ct_embed_ct = ColumnTransformer([("lexicon","passthrough", LexMod_Features +Embed_features)])
  Emotion_ct = ColumnTransformer([("emotion","passthrough", Emotion_features )])
  # embed_ct = ColumnTransformer([("embedding","passthrough", embed_features )])

  
 
  model_sementic_clf = GaussianNB()  #ExtraTreesClassifier(n_estimators=n_estimators)
  model_lex_emd_clf =   KNeighborsClassifier() # ExtraTreesClassifier(n_estimators=n_estimators)
  model_emotion_clf =   GaussianNB() #ExtraTreesClassifier(n_estimators=n_estimators)
 
# DecisionTreeClassifier
# KNeighborsClassifier

# create pipeline 
  sementic_pipeline = Pipeline([
        ('trans', sementic_ct),
        ('clf', model_sementic_clf)
        ]
  )
  lex_embd_pipeline = Pipeline([
    ('trans', lex_ct_embed_ct),
    ('clf', model_lex_emd_clf )
    ]
  )
  emotion_pipeline = Pipeline([
    ('trans', Emotion_ct),
    ('clf', model_emotion_clf)
    ]
  )
  
  
  estimators = [
    ('sementic_estimator', sementic_pipeline),
    ('lexP_embd_estimator', lex_embd_pipeline),
    ("emotion_estimator" ,emotion_pipeline )
  ]

  final_classifier =VotingClassifier(estimators=estimators ,voting=voting) 
  final_classifier.fit(X_train, y_train)

  cum_accuracy = 0
  test_result = {}
  for item in comb[1]:
     X_val = final_df[item][All_features]
     y_val = final_df[item]['label']
     predicted_y= final_classifier.predict(X_val)
     d = compute_metrics(predicted_y,y_val)
     print('Accuracy on test_set '+ item + " is ")
     print(round(d['accuracy'],3))
     print()
     test_result[item] = round(d['accuracy'],2)
     cum_accuracy += d['accuracy']

  average_accuracy = round(cum_accuracy / len(comb[1]) , 2)
  print("Average accuracy")
  print(average_accuracy)
  print()
  train_set.append(train_key)
  test_set.append(test_key)
  accuracy.append(average_accuracy)
  sep_acc.append(test_result)

  

dict_keys(['train__LIAR_ISOT_FakeNewsNet_Kaggle', 'Constraint'])
Training Model on train__LIAR_ISOT_FakeNewsNet_Kaggle

Accuracy on test_set Constraint is 
0.482

Average accuracy
0.48

dict_keys(['train__LIAR_ISOT_FakeNewsNet_Constraint', 'Kaggle'])
Training Model on train__LIAR_ISOT_FakeNewsNet_Constraint

Accuracy on test_set Kaggle is 
0.662

Average accuracy
0.66

dict_keys(['train__LIAR_ISOT_Kaggle_Constraint', 'FakeNewsNet'])
Training Model on train__LIAR_ISOT_Kaggle_Constraint

Accuracy on test_set FakeNewsNet is 
0.533

Average accuracy
0.53

dict_keys(['train__LIAR_FakeNewsNet_Kaggle_Constraint', 'ISOT'])
Training Model on train__LIAR_FakeNewsNet_Kaggle_Constraint

Accuracy on test_set ISOT is 
0.89

Average accuracy
0.89

dict_keys(['train__ISOT_FakeNewsNet_Kaggle_Constraint', 'LIAR'])
Training Model on train__ISOT_FakeNewsNet_Kaggle_Constraint

Accuracy on test_set LIAR is 
0.515

Average accuracy
0.52



In [34]:
# Summary table of the per-dataset evaluation: one row per train/test split.
# 'Seperate Accuracy' holds the {test dataset: accuracy} dict of each split.
# (Train/test row counts are not collected by the training cell above.)
results_final = pd.DataFrame({
    "Train_set": train_set,
    'Seperate Accuracy': sep_acc,
    "Test_set": test_set,
    "Accuracy": accuracy,
})

In [35]:
results_final

Unnamed: 0,Train_set,Seperate Accuracy,Test_set,Accuracy
0,train__LIAR_ISOT_FakeNewsNet_Kaggle,{'Constraint': 0.48},test__Constraint,0.48
1,train__LIAR_ISOT_FakeNewsNet_Constraint,{'Kaggle': 0.66},test__Kaggle,0.66
2,train__LIAR_ISOT_Kaggle_Constraint,{'FakeNewsNet': 0.53},test__FakeNewsNet,0.53
3,train__LIAR_FakeNewsNet_Kaggle_Constraint,{'ISOT': 0.89},test__ISOT,0.89
4,train__ISOT_FakeNewsNet_Kaggle_Constraint,{'LIAR': 0.52},test__LIAR,0.52
