In [16]:
!python -m pip install pandas spacy textstat language-tool-python numpy scikit-learn boruta nltk
!python -m spacy download en_core_web_sm
!python -m nltk.downloader wordnet omw-1.4





Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------- ------------------------------ 2.9/12.8 MB 15.2 MB/s eta 0:00:01
     ----------------- ---------------------- 5.5/12.8 MB 14.6 MB/s eta 0:00:01
     ---------------------------- ----------- 9.2/12.8 MB 14.6 MB/s eta 0:00:01
     ------------------------------------- - 12.3/12.8 MB 14.8 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 14.6 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [1]:
import pandas as pd
import json
import spacy
import textstat
import language_tool_python
import numpy as np
import random
import nltk
from spacy.tokens import Doc
from spacy.matcher import PhraseMatcher          
from spacy.lang.en.stop_words import STOP_WORDS 
from difflib import SequenceMatcher
from nltk.corpus import wordnet as wn
from urllib.request import urlopen
from pathlib import Path
from collections import Counter
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report
from boruta import BorutaPy



# Shared, module-level resources used by the feature-extraction functions.
nlp = spacy.load("en_core_web_md")
language_tool = language_tool_python.LanguageTool("en-US")
#loading the nrc_en lexicon manually as the NRCLex 4.0 is very buggy
url = "https://raw.githubusercontent.com/DemetersSon83/NRCLex/refs/heads/master/nrc_en.json" 
# The handle is deliberately not called `re` (that would shadow the name of the
# standard regex module for the rest of the notebook); the timeout keeps an
# unreachable host from blocking the kernel forever.
with urlopen(url, timeout = 30) as response:
    lexicon = json.load(response)

In [2]:
def load_articles (filepath: str) -> pd.DataFrame:

    """
        Load the articles as a pandas DF

        The file is read as JSON Lines: every non-blank line must hold one
        JSON object. Blank or whitespace-only lines are skipped.

        filepath: path of the UTF-8 encoded .jsonl file
        returns:  one row per JSON object, one column per key
        raises:   json.JSONDecodeError if a non-blank line is not valid JSON
    """

    records = []
    with open(filepath, "r", encoding = "utf-8") as file:
        for line in file:
            #str.strip returns a new string, so the result has to be kept;
            #otherwise the trailing newline makes every "empty" line truthy
            line = line.strip()
            if not line:
                continue
            records.append(json.loads(line))

    return pd.DataFrame(records)


In [6]:
def process_texts (articles: list[str]) -> list[Doc]:

    """
        Parse raw strings (e.g. the values of df["Text"] or df["Title"]) with
        the module-level spaCy pipeline `nlp`.

        articles: raw texts
        returns:  one parsed Doc per text, in input order
    """

    parsed_docs = [doc for doc in nlp.pipe(articles)]
    return parsed_docs




def POStagging (articles: list[Doc], df: pd.DataFrame) -> pd.DataFrame:

    """
        Add one "<tag>_perc" column per universal POS tag to df.

        Each value is the number of tokens carrying that tag divided by the
        number of non-whitespace tokens of the document (0 for a document
        without such tokens).

        articles: parsed documents, one per row of df
        df:       frame the new columns are appended to (not modified)
        returns:  df with the POS share columns appended on the right
    """

    #universal POS tags, see
    #https://github.com/explosion/spaCy/blob/master/spacy/glossary.py
    universal_tags = ["ADJ", "ADP", "ADV", "AUX",  "CCONJ", "DET", "INTJ", "NOUN",
                      "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X"]

    #Doc.count_by is keyed by the integer id of the tag, so resolve the ids once
    tag_ids = {}
    for tag in universal_tags:
        tag_ids[tag] = nlp.vocab.strings[tag]

    rows = []
    for doc in articles:

        #denominator: every token that is not pure whitespace
        n_tokens = 0
        for token in doc:
            if not token.is_space:
                n_tokens += 1

        counts_by_id = doc.count_by(spacy.attrs.POS)

        row = {}
        for tag, tag_id in tag_ids.items():
            hits = counts_by_id.get(tag_id, 0)
            row[f"{tag.lower()}_perc"] = hits / n_tokens if n_tokens else 0
        rows.append(row)

    #same index as df so the concat lines the rows up
    pos_features = pd.DataFrame(rows, index = df.index)
    return pd.concat([df, pos_features], axis = 1)


    

def emotions_nrc (articles: list[Doc], df: pd.DataFrame) -> pd.DataFrame:

    """ 
        Add the share of selected NRC emotions per document to df.

        Every alphabetic token is looked up (lower-cased) in the module-level
        `lexicon`; all affect labels found are tallied. Each "<emotion>_perc"
        column is the tally of that emotion divided by the tally of all labels
        found in the document (denominator 1 when nothing was found).

        articles: parsed documents, one per row of df
        df:       frame the new columns are appended to (not modified)
        returns:  df with the emotion share columns appended on the right
    """

    #only these emotions become features
    emotion_tags = ["anger", "anticipation", "fear", "sadness"]

    #the lookup below is inspired by the NRCLex module by metalcorebear
    #(https://pypi.org/project/NRCLex/); only the needed part is re-implemented
    #because the module itself tends to cause problems
    known_words = set(lexicon.keys())

    rows = []
    for doc in articles:

        #tally every affect label attached to a known alphabetic word
        tally = Counter()
        for tok in doc:
            word = tok.lower_
            if tok.is_alpha and word in known_words:
                for label in lexicon[word]:
                    tally[label] += 1

        #"or 1" avoids dividing by zero for documents without any hit
        denominator = sum(tally.values()) or 1

        rows.append({
            f"{emotion.lower()}_perc": tally[emotion] / denominator
            for emotion in emotion_tags
        })

    #same index as df so the concat lines the rows up
    emotion_features = pd.DataFrame(rows, index = df.index)
    return pd.concat([df, emotion_features], axis = 1)




def KB_detection (articles: list[Doc], df: pd.DataFrame, kb: str) -> pd.DataFrame:
    
    """
        Wrong claims detection with our KB

        For every document the KB entities (and aliases) are located with a
        case-insensitive PhraseMatcher. For each entity found, every KB claim
        with at least two content lemmas is compared against every sentence:
        a sentence counts as matching the claim when the number of claim
        lemmas plus WordNet synonyms that fuzzily match a sentence lemma
        (SequenceMatcher ratio >= 0.5) is at least 2.

        articles: parsed documents, one per row of df
        df:       frame the new column is appended to (not modified)
        kb:       path to the KB JSON file, a list of objects with the keys
                  "entity", "aliases" and "claims"
        returns:  df plus the column "fake_match_count", which holds the
                  number of (sentence, claim) matches divided by the number of
                  sentences (0.0 for a document without sentences)
    """

    with open(kb, "r", encoding = "utf-8") as file:
        data_to_be_cleaned = json.load(file)

    #here we clean our KB, to make sure there are no leading or trailing whitespaces
    cleaned_data = [
        {
            "entity": item["entity"].strip(),
            "aliases": [alias.strip() for alias in item["aliases"]],
            "claims":  [claim.strip() for claim in item["claims"]]
        }
        for item in data_to_be_cleaned
    ]

    #create phrases from the KB, we are looking for in the text
    matcher = PhraseMatcher(nlp.vocab, attr = "LOWER")
    for item in cleaned_data:
        pattern = [nlp.make_doc(name) for name in [item["entity"]] + item["aliases"]]
        matcher.add(item["entity"], pattern)

    #both lookups depend only on the KB, not on the article, so their results
    #are computed once and reused across all articles
    entry_cache = {}    #lowered matched text -> position in cleaned_data (or None)
    claim_cache = {}    #claim text -> (content lemmas, synonym set)

    temp_df_list = []
    for article in articles:

        #lemmatized, lowered and stop-word-filtered version of every sentence
        sent_lems = [_kb_content_lemmas(sent) for sent in article.sents]
        sent_claims = [set() for _ in sent_lems]

        #apply the matcher we created and extract the parts in the article
        ents = {article[start:end].text for _, start, end in matcher(article)}

        #resolve every matched text to a KB entry; several texts may resolve to
        #the same entry, which then only has to be checked once
        matched_entries = {}
        for ent_text in ents:
            key = ent_text.lower()
            if key not in entry_cache:
                entry_cache[key] = _kb_find_entry(key, cleaned_data)
            position = entry_cache[key]
            if position is not None:
                matched_entries[position] = cleaned_data[position]

        #now we check the fake claims from our KB
        for item in matched_entries.values():
            for claim in item["claims"]:
                if claim not in claim_cache:
                    claim_cache[claim] = _kb_claim_terms(claim)
                claim_lem_stop_low, syns = claim_cache[claim]

                #a single content word is too weak a signal
                if len(claim_lem_stop_low) <= 1:
                    continue

                for num, sent_lemmas in enumerate(sent_lems):
                    base_hits = sum(1 for term in claim_lem_stop_low
                                    if _kb_fuzzy_hit(term, sent_lemmas))
                    syn_hits = sum(1 for syn in syns
                                   if _kb_fuzzy_hit(syn, sent_lemmas))

                    if syn_hits + base_hits >= 2:
                        sent_claims[num].add((item["entity"], claim))

        total_claims_matched = sum(len(sent) for sent in sent_claims)
        #an empty document has no sentences; report 0.0 instead of dividing by zero
        temp_df_list.append(total_claims_matched / len(sent_lems) if sent_lems else 0.0)

    df_features = pd.DataFrame({"fake_match_count": temp_df_list}, index = df.index)
    return pd.concat([df, df_features], axis = 1)


def _kb_content_lemmas (tokens) -> list[str]:
    """Lowered lemmas of the alphabetic/digit tokens that are not stop words."""
    return [tok.lemma_.lower() for tok in tokens
            if (tok.is_alpha or tok.is_digit) and tok.lemma_.lower() not in STOP_WORDS]


def _kb_find_entry (ent_lower: str, cleaned_data: list[dict]):
    """Position of the first KB entry with a name or alias whose similarity to ent_lower is >= 0.7, else None."""
    #NOTE(review): the first entry above the threshold wins, which need not be
    #the entry whose pattern actually matched; kept as in the original lookup
    for position, entry in enumerate(cleaned_data):
        for name in [entry["entity"]] + entry["aliases"]:
            if SequenceMatcher(None, ent_lower, name.lower()).ratio() >= 0.7:
                return position
    return None


def _kb_claim_terms (claim: str) -> tuple[list[str], set[str]]:
    """Content lemmas of a claim and the lowered WordNet lemma names of all their synsets."""
    terms = _kb_content_lemmas(nlp(claim))
    syns = set()
    #claims with fewer than two content lemmas are skipped by the caller,
    #so the WordNet expansion is not needed for them
    if len(terms) > 1:
        for term in terms:
            for syn in wn.synsets(term):
                for lemma in syn.lemmas():
                    syns.add(lemma.name().lower())
    return terms, syns


def _kb_fuzzy_hit (term: str, sent_lemmas: list[str]) -> bool:
    """True when term is similar (ratio >= 0.5) to at least one lemma of the sentence."""
    return any(SequenceMatcher(None, term, lem).ratio() >= 0.5 for lem in sent_lemmas)
       



def readability_and_difficulty_scores (articles: list[str], df: pd.DataFrame) -> pd.DataFrame:

    """
        Add textstat readability scores and the share of difficult words to df.

        articles: raw texts, one per row of df
        df:       frame the new columns are appended to (not modified)
        returns:  df plus the columns "flesch_reading_ease",
                  "automated_readbility_index" and "difficult_words_perc"
                  (difficult words / total words, 0 for a text without words)
    """

    temp_df_list = []

    for article in articles:

        #denominator of the difficult-word share; 0 for empty texts
        total_words = textstat.lexicon_count(article, removepunct = True)

        inner_temp = {

            #different difficulties
            #https://github.com/kupolak/textstat/blob/master/README.md#spache-readability-formula (for scales)
            "flesch_reading_ease": textstat.flesch_reading_ease(article),
            "automated_readbility_index": textstat.automated_readability_index(article),

            #difficult perc of total words (guarded: an empty text would divide by zero)
            "difficult_words_perc": textstat.difficult_words(article) / total_words if total_words else 0

        }
        temp_df_list.append(inner_temp)

    #create a pd DF with the same index as df
    df_features = pd.DataFrame(temp_df_list, index = df.index)
    return pd.concat([df, df_features], axis = 1)




def Error_check (articles: list[str], df: pd.DataFrame) -> pd.DataFrame:

    """ 
        Add the relative number of spelling and grammar issues to df.

        Every text is checked with the module-level `language_tool`; matches
        whose ruleIssueType is "misspelling" or "grammar" are counted and
        divided by the word count of the text.

        articles: raw texts, one per row of df
        df:       frame the new columns are appended to (not modified)
        returns:  df plus the columns "misspellings_perc" and "grammar_perc"
                  (both 0 for a text without words)
    """

    temp_df_list = []
    for article in articles:

        matches = language_tool.check(article)

        #extract the misspellings and grammatical mistakes
        total_s = sum(1 for entry in matches if entry.ruleIssueType == "misspelling")
        total_g = sum(1 for entry in matches if entry.ruleIssueType == "grammar")
        total_words = textstat.lexicon_count(article, removepunct = True)

        #append the relative amount (guarded: an empty text would divide by zero)
        inner_temp = {

            "misspellings_perc": total_s / total_words if total_words else 0,
            "grammar_perc": total_g / total_words if total_words else 0

        }
        temp_df_list.append(inner_temp)

    #create a pd DF with the same index as df
    df_features = pd.DataFrame(temp_df_list, index = df.index)
    return pd.concat([df, df_features], axis = 1)

In [4]:
def concat_df (df: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:

    """
        Put the text features and the headline features side by side.

        df:      text feature frame, columns kept as they are
        df2:     headline feature frame; every column name gets the prefix
                 "hl_" in the result (df2 itself is left untouched)
        returns: df followed by the prefixed columns of df2
    """

    #prefix on a relabelled copy so the caller's headline frame keeps its names
    prefixed_names = ["hl_" + name for name in df2.columns]
    headline_features = df2.set_axis(prefixed_names, axis = "columns")

    return pd.concat([df, headline_features], axis = 1)

In [None]:

#load the raw articles and the path of the knowledge base
#NOTE(review): file_path is an absolute, machine-specific path - consider a
#configurable data directory so the notebook runs on other machines
file_path = Path("C:/Users/Admin/Desktop/Uni/CSS/NLP/without_assessment.jsonl")
kb = "KB.json"
df = load_articles(str(file_path))
#df2 starts as a copy of the raw articles and collects the headline features
df2 = df.copy()




#--- features of the article text (collected in df) ---


#parse the article texts with spaCy
texts = df["Text"].tolist()
docs  = process_texts(texts)


#POS shares and knowledge-base claim matching
df = POStagging(docs, df)
df = KB_detection(docs, df, kb)


#add emotions
df = emotions_nrc(docs, df)


#readability and difficulty
df = readability_and_difficulty_scores(texts, df)


#grammar and spelling errors
df = Error_check(texts, df)




#--- features of the headlines (collected in df2) ---


#parse the headlines with spaCy; note that `docs` is rebound to the headline docs here
titels = df2["Title"].tolist()
docs  = process_texts(titels)


#POS shares and knowledge-base claim matching
df2 = POStagging(docs, df2)
df2 = KB_detection(docs, df2, kb)


#add emotions
df2 = emotions_nrc(docs, df2)


#readability and difficulty
df2 = readability_and_difficulty_scores(titels, df2)


#grammar and spelling errors
df2 = Error_check(titels, df2)


#join both feature sets; the headline columns get the prefix "hl_"
df = concat_df(df, df2)


In [2]:
def preparation (df: pd.DataFrame, csv_path_labels: str, labelname: str) -> tuple[pd.DataFrame, pd.Series]:

    """ 
        preparing the df for the ML part

        df:              feature frame; id and raw-text columns are removed
        csv_path_labels: path of the CSV file that holds the labels
        labelname:       name of the label column in that CSV
        returns:         (x, y) - the feature frame and the label series,
                         matched by row position
        raises:          KeyError if labelname is not a column of the CSV,
                         ValueError if features and labels differ in length
    """

    #columns that are ids or raw text and therefore no features; "Unnamed: 0"
    #only exists after a CSV round trip, so missing names are ignored
    non_feature_columns = ["Unnamed: 0", "Index", "hl_Index", "Text", "Title", "hl_Title", "hl_Text"]
    x = df.drop(columns = non_feature_columns, errors = "ignore")

    #prepare y
    real_fake = pd.read_csv(csv_path_labels)
    y = pd.Series(real_fake[labelname], name = labelname)

    #rows are paired by position, so a length mismatch means wrong labels
    if len(x) != len(y):
        raise ValueError(f"Got {len(x)} feature rows but {len(y)} labels in '{csv_path_labels}'")

    return x, y




def grid_search_rf(x: pd.DataFrame, y: pd.Series, seed: int) -> RandomForestClassifier: 

    """ 
        Hyper-parameter optimisation of a random forest with an exhaustive grid search.

        x:       feature frame
        y:       labels
        seed:    random state for the forest and for the CV splitter
        returns: the best estimator found by GridSearchCV (best_estimator_)
    """

    #candidate values per hyper-parameter
    search_space = {
        "n_estimators": list(range(100, 200, 10)),
        "max_depth": [None, 5, 10, 15, 20],
        "min_samples_leaf": list(range(2, 7)),
        "max_features": ["sqrt", 0.3, 0.5],
    }

    #5 folds repeated 3 times, scored by accuracy
    folds = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 3, random_state = seed)
    base_forest = RandomForestClassifier(random_state = seed, n_jobs = -1)

    search = GridSearchCV(base_forest, search_space, cv = folds, scoring = "accuracy", n_jobs = -1)
    search.fit(x, y)

    return search.best_estimator_




def data_filter (x: pd.DataFrame, y: pd.Series, seed: int) -> tuple[pd.DataFrame, pd.DataFrame,
                                                                    pd.Series, pd.Series, list[str]]:

    """
        Stratified 80/20 split followed by Boruta feature selection.

        A forest tuned on the training part serves as the Boruta estimator;
        only the columns Boruta confirms are kept in both parts.

        x:       feature frame
        y:       labels
        seed:    random state for the split, the grid search and Boruta
        returns: (x_train, x_test) reduced to the selected columns, y_train,
                 y_test and the list of selected column names
    """

    #hold out 20 % of the rows, keeping the class proportions
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size = 0.2, stratify = y, random_state = seed)

    #forest with tuned hyper-parameters, fitted on the training part only
    tuned_forest = grid_search_rf(x_train, y_train, seed)

    #boruta selection of features
    selector = BorutaPy(estimator = tuned_forest, n_estimators = "auto", perc = 90, alpha = 0.05,
                        max_iter = 70, random_state = seed, verbose = 0)
    selector.fit(x_train, y_train)

    #support_ flags the confirmed features in column order
    selected = []
    for column, keep in zip(x_train.columns, selector.support_):
        if keep:
            selected.append(column)

    print(f"Final features: {selected}")
    print(f"Features len: {len(selected)}")
    return x_train[selected], x_test[selected], y_train, y_test, selected




def final_step (x_train_sel: pd.DataFrame, x_test_sel: pd.DataFrame, y_train: pd.Series,
                y_test: pd.Series, seed:int) -> RandomForestClassifier:

    """ 
        Tune and fit the final forest on the selected features and print its
        classification report for the held-out test part.

        returns: the fitted model
    """

    #tune on the reduced training set, then fit the winner on it
    tuned_forest = grid_search_rf(x_train_sel, y_train, seed = seed)
    final_model = tuned_forest.fit(x_train_sel, y_train)

    #score on the held-out rows
    report = classification_report(y_test, final_model.predict(x_test_sel), output_dict = True)

    print(f"Test performance(initial model): {report}")
    return final_model




def evaluate_model(x: pd.DataFrame, y: pd.Series, features: list[str], rf_model: RandomForestClassifier, seed: int,
                   n_splits: int = 1000) -> None:

    """ 
        Evaluate the model and the importance of the features over many random splits.

        For every split seed a stratified 80/20 split of x[features] is drawn,
        a forest with the hyper-parameters of rf_model and a stratified
        DummyClassifier baseline are fitted on the 80 % part and scored on the
        20 % part. Mean, sample standard deviation and a normal-approximation
        95 % interval of accuracy and the macro-averaged recall, precision and
        F1 are printed for both, followed by the mean feature importances.

        x:        full feature frame
        y:        labels
        features: names of the columns of x to use
        rf_model: forest whose hyper-parameters (except random_state) are reused
        seed:     seeds the global `random` generator that draws the split seeds
        n_splits: number of random splits (default 1000, at most 9951)

        NOTE(review): the splits are drawn from all rows, including those used
        to select the features and hyper-parameters, so the scores are
        presumably optimistic - confirm this is intended.
    """

    #one distinct split seed per evaluation round
    random.seed(seed)
    split_seeds = random.sample(range(50, 10001), n_splits)

    rf_scores = {"Accuracy": [], "Recall": [], "Precision": [], "F1-Score": []}
    base_scores = {"Accuracy": [], "Recall": [], "Precision": [], "F1-Score": []}
    feat_imp = []

    #reuse the tuned hyper-parameters, but give every round its own random_state
    rf_params = rf_model.get_params()
    rf_params.pop("random_state")

    for rs in split_seeds:
        x_tr, x_te, y_tr, y_te = train_test_split(x[features], y, test_size = 0.2, stratify = y,
                                                  random_state = rs)

        rf = RandomForestClassifier(**rf_params, random_state = rs)
        rf.fit(x_tr, y_tr)
        _record_macro_scores(rf_scores, classification_report(y_te, rf.predict(x_te), output_dict = True))
        feat_imp.append(rf.feature_importances_)

        #extra baselinemodel
        dummy_compare = DummyClassifier(strategy = "stratified", random_state = rs)
        dummy_compare.fit(x_tr, y_tr)
        _record_macro_scores(base_scores,
                             classification_report(y_te, dummy_compare.predict(x_te), output_dict = True))

    #mean importance per feature over all rounds, most important first
    feat_imp = pd.Series(np.mean(feat_imp, axis = 0), index = features).sort_values(ascending = False)

    relevant_keys = ["n_estimators", "max_depth", "min_samples_leaf", "max_features", "criterion"]
    shown_params = pd.Series({key: value for key, value in rf_params.items()
                              if key in relevant_keys}).sort_index()

    print("\n\n---Random Forrest Classifier Params---")
    print(shown_params)

    print("\n\n---For the Random Forrest Classifier---")
    for label, values in rf_scores.items():
        print(_summary_line(label, values))

    print("\n\n---For the Baseline---")
    for label, values in base_scores.items():
        print(_summary_line(label, values))

    print("\n\n---Feature importance---")
    print(feat_imp)


def _record_macro_scores (store: dict, report: dict) -> None:
    """Append accuracy and the macro-averaged recall/precision/F1 of one classification report to store."""
    store["Accuracy"].append(report["accuracy"])
    store["Recall"].append(report["macro avg"]["recall"])
    store["Precision"].append(report["macro avg"]["precision"])
    store["F1-Score"].append(report["macro avg"]["f1-score"])


def _summary_line (label: str, values: list) -> str:
    """Format mean, sample std and the 95 % interval (mean +/- 1.96 * std / sqrt(n)) of values."""
    mean = np.mean(values)
    #the sample std (ddof = 1) is used for both the reported spread and the
    #interval; it is undefined for a single value, hence the guard
    std = np.std(values, ddof = 1) if len(values) > 1 else 0.0
    half_width = 1.96 * std / np.sqrt(len(values))
    return (f"{label} = {mean: .3f} ± {std: .3f} "
            f"(95% CI [{mean - half_width: .3f} - {mean + half_width: .3f}])")


In [3]:
#to save some time we load output_features_full.csv, which holds all the features extracted above
#NOTE(review): no cell visible here writes this file - confirm where it is produced

df = pd.read_csv("output_features_full.csv")
csv_path_labels = "group58_stage1.csv"
label = "real_news"
seed = 37


#preparing the data
x, y = preparation(df, csv_path_labels, label)


#splitting the data 80/20, then optimizing the params and selecting the features
x_train_sel, x_test_sel, y_train, y_test, features = data_filter(x, y, seed = seed)


#building the final model and report
final_model= final_step(x_train_sel, x_test_sel, y_train, y_test, seed = seed)


#here we evaluate our model over many random splits
evaluate_model(x, y, features, final_model, seed = seed)

  _data = np.array(data, dtype=dtype, copy=copy,


Final features: ['adj_perc', 'noun_perc', 'pron_perc', 'propn_perc', 'sconj_perc', 'verb_perc', 'fake_match_count', 'anticipation_perc', 'hl_noun_perc', 'hl_propn_perc', 'hl_punct_perc', 'hl_verb_perc', 'hl_automated_readbility_index']
Features len: 13
Test performance(initial model): {'no': {'precision': 0.9285714285714286, 'recall': 0.9285714285714286, 'f1-score': 0.9285714285714286, 'support': 14.0}, 'yes': {'precision': 0.9375, 'recall': 0.9375, 'f1-score': 0.9375, 'support': 16.0}, 'accuracy': 0.9333333333333333, 'macro avg': {'precision': 0.9330357142857143, 'recall': 0.9330357142857143, 'f1-score': 0.9330357142857143, 'support': 30.0}, 'weighted avg': {'precision': 0.9333333333333333, 'recall': 0.9333333333333333, 'f1-score': 0.9333333333333333, 'support': 30.0}}


---Random Forrest Classifier Params---
criterion           gini
max_depth              5
max_features        sqrt
min_samples_leaf       2
n_estimators         190
dtype: object


---For the Random Forrest Classifier-