In [86]:
import sys

# Directories containing the project's preprocessing and notification notebooks
preprocessing_path = 'C:\\Users\\joshh\\Desktop\\Uni\\Soton Uni - Yr 3\\COMP3200\\fake-news-profiling\\classifier\\preprocessing'
notif_path = 'C:\\Users\\joshh\\Desktop\\Uni\\Soton Uni - Yr 3\\COMP3200\\fake-news-profiling\\classifier\\notifications'

# Put each directory on the module search path, skipping any already registered
# so that re-running this cell does not add duplicates
for extra_path in (preprocessing_path, notif_path):
    if extra_path not in sys.path:
        sys.path.insert(1, extra_path)

In [87]:
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf

import ipynb.fs.full.parse_datasets as datasets
import ipynb.fs.full.preprocessing as pp
import ipynb.fs.full.bert_fake_news_classifier as bclf
from ipynb.fs.full.notif_email import send_email

# Dataset

In [88]:
# Parse the "en" dataset found under the "datasets" directory
tweet_data, label_data = datasets.parse_dataset("datasets", "en")

# split_dataset hands back six values: tweets and labels for the train,
# validation and test partitions, in that order
tweet_train, label_train, tweet_val, label_val, tweet_test, label_test = datasets.split_dataset(
    tweet_data, label_data
)

# Data extraction

In [89]:
from sklearn.preprocessing import normalize

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import string
import demoji
import re

In [90]:
# Stats extraction helper functions
# Single vaderSentiment analyzer instance, created once and reused by polarity_scores
analyzer = SentimentIntensityAnalyzer()

# Character classes consulted by the text helpers below
digits = set(string.digits)
printable = set(string.printable)
# '#' is excluded so that it is never counted or stripped as punctuation
# (tag markers such as #USER# are delimited by it)
punctuation = set(string.punctuation) - {'#'}


def clean_text(text, remove_punc=True, remove_non_print=True, remove_emojis=True,
              remove_digits=True, remove_tags=False):
    """
    Strip unwanted characters from text and return the cleaned string.

    Each flag switches one filter on: emojis (via demoji), punctuation (the
    module-level `punctuation` set, which excludes '#'), non-printable
    characters, digits, and finally '#UPPERCASE#' style tags.
    """
    if remove_emojis:
        text = demoji.replace(text, "")

    def keep(char):
        # A character survives only if no enabled filter rejects it
        if remove_punc and char in punctuation:
            return False
        if remove_non_print and char not in printable:
            return False
        if remove_digits and char in digits:
            return False
        return True

    cleaned = "".join(filter(keep, text))
    # Tags are removed last, once the per-character filtering is done
    return re.sub('#[A-Z]+#', "", cleaned) if remove_tags else cleaned


def polarity_scores(text):
    """ Returns the analyzer's positive, neutral and negative scores for text, in that order """
    scores = analyzer.polarity_scores(text)
    return np.asarray([scores[key] for key in ('pos', 'neu', 'neg')])


def tweets_to_words(user_tweets):
    """
    Split each of the user's tweets into words.

    Returns a 1-D object array with one entry per tweet; each entry is an
    array of the whitespace-separated words left after clean_text.
    """
    word_arrays = [
        np.asarray(clean_text(tweet.text).split())
        for tweet in user_tweets
    ]
    # Tweets rarely contain the same number of words, and converting such a
    # ragged list with np.asarray is deprecated/rejected by recent NumPy
    # releases. Fill an object array element by element instead (slice
    # assignment would try to broadcast when all tweets have equal length).
    words = np.empty(len(word_arrays), dtype=object)
    for index, tweet_words in enumerate(word_arrays):
        words[index] = tweet_words
    return words


def std_dev(datapoints, mean, num_datapoints=100):
    """
    Returns the standard deviation of datapoints around mean, computed along axis 0.

    datapoints: array-like of values (one row per datapoint)
    mean: the mean of the datapoints (scalar, or one value per column)
    num_datapoints: the divisor used for the variance; defaults to 100
        (presumably the number of tweets per user in the dataset - confirm)
    """
    # np.asarray lets callers pass plain Python lists as well as arrays
    diff = np.asarray(datapoints) - mean
    # Divide by the requested count rather than a hardcoded 100
    return np.sqrt(np.sum(diff ** 2, axis=0) / num_datapoints)


def average_tweet_lengths(user_tweets):
    """ Returns the mean of len(tweet) over the given tweets """
    lengths = list(map(len, user_tweets))
    return np.mean(lengths)


def std_dev_tweet_lengths(user_tweets):
    """ Returns the standard deviation (via std_dev) of len(tweet) over the given tweets """
    lengths = np.asarray([len(tweet) for tweet in user_tweets])
    mean_length = np.mean(lengths)
    return std_dev(lengths, mean_length)


def cased_chars(user_tweets, cased):
    """
    Returns, for each tweet, the number of uppercase characters in its text
    when cased is truthy, or the number of lowercase characters otherwise.
    """
    matches = str.isupper if cased else str.islower
    counts = []
    for tweet in user_tweets:
        counts.append(sum(1 for char in tweet.text if matches(char)))
    return counts


def emoji_chars(user_tweets):
    """ Returns, for each tweet, the number of emojis demoji finds in its text """
    counts = []
    for tweet in user_tweets:
        found = demoji.findall_list(tweet.text)
        counts.append(len(found))
    return counts


def punctuation_chars(user_tweets):
    """ Returns, for each tweet, how many characters of its text are in the `punctuation` set """
    counts = []
    for tweet in user_tweets:
        counts.append(sum(1 for char in tweet.text if char in punctuation))
    return counts

In [99]:
from collections.abc import Iterable

class TweetStatsExtractor:
    """
    Turns tweet feeds into feature vectors by running a fixed list of
    extractor functions over each feed and joining their outputs.
    """

    def __init__(self, extractors):
        """ extractors: non-empty sequence of functions, each taking one tweet feed """
        if len(extractors) < 1:
            raise Exception("Must pass at least one extracting function")

        self.extractors = extractors

    def transform(self, X, normalize_data=False):
        """
        Returns one feature vector per tweet feed in X, as a list, or the
        result of sklearn's normalize over that list when normalize_data is set.
        """
        features = [self._features_for(tweet_feed) for tweet_feed in X]
        if normalize_data:
            return normalize(features)
        return features

    def _features_for(self, tweet_feed):
        """ Runs every extractor on a single feed and joins their outputs """
        if len(self.extractors) > 1:
            return np.concatenate([self._apply(f, tweet_feed) for f in self.extractors])
        # A lone extractor's output is used directly, without concatenation
        return self._apply(self.extractors[0], tweet_feed)

    def _apply(self, extractor, data):
        """ Calls extractor on data, wrapping a non-iterable result in a one-element array """
        output = extractor(data)
        return output if isinstance(output, Iterable) else np.asarray([output])

# Classifier evaluation

In [101]:
from tqdm import tqdm, trange

import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score, log_loss

In [106]:
# Candidate values shared by more than one of the grids below
_C_VALUES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 768, 1024, 1280]
_N_ESTIMATORS = [25, 50, 100, 200, 400, 800]
_MIN_SAMPLES_LEAF = [1, 2, 4, 6, 8, 10, 12, 14, 16]

# Estimator classes to search over; grid_search_param_grids is aligned with
# this list by position. Keys carry the "Estimator__" prefix because the
# estimator is searched as the 'Estimator' step of a Pipeline.
grid_search_estimators = [
    LogisticRegression,
    SVC,
    RandomForestClassifier,
    GradientBoostingClassifier,
    KNeighborsClassifier,
]
grid_search_param_grids = [
    # LogisticRegression
    # NOTE(review): not every penalty/solver pair is supported by scikit-learn
    # (e.g. "l1" with "lbfgs" or "newton-cg"); those candidates fail to fit.
    {
        "Estimator__penalty": ["l1", "l2"],
        "Estimator__C": list(_C_VALUES),
        "Estimator__solver": ["newton-cg", "lbfgs", "liblinear"],
    },
    # SVC (probability=True so that predict_proba is available for log loss)
    {
        "Estimator__C": list(_C_VALUES),
        "Estimator__kernel": ["linear", "poly", "rbf", "sigmoid"],
        "Estimator__probability": [True],
    },
    # RandomForestClassifier
    {
        "Estimator__n_estimators": list(_N_ESTIMATORS),
        "Estimator__criterion": ["gini", "entropy"],
        "Estimator__min_samples_leaf": list(_MIN_SAMPLES_LEAF),
    },
    # GradientBoostingClassifier
    {
        "Estimator__loss": ["deviance", "exponential"],
        "Estimator__learning_rate": [0.01, 0.05, 0.1, 0.2],
        "Estimator__n_estimators": list(_N_ESTIMATORS),
        "Estimator__min_samples_leaf": list(_MIN_SAMPLES_LEAF),
    },
    # KNeighborsClassifier
    {
        "Estimator__n_neighbors": list(range(2, 11)),
        "Estimator__weights": ["uniform", "distance"],
    },
]

def grid_search(X_train, y_train, X_val, y_val, estimators=grid_search_estimators, param_grids=grid_search_param_grids):
    """ 
    Performs a GridSearchCV on the training data, and then evaluates using the validation data.
    Uses a pipeline to find the best K features to use from the training data.

    X_train, X_val: sequences of equal-length feature vectors; both are standardised
        with a StandardScaler fitted on X_train only.
    y_train, y_val: class labels; the positive class is expected to be the string "1".
    estimators: estimator classes, instantiated with default arguments.
    param_grids: one parameter grid (dict) per estimator, aligned by position.
        The grids are not modified.

    Returns (best_params, best_df):
        best_params: list of (estimator name, best parameter dict) tuples.
        best_df: dataframe with one row per estimator. "Mean CV Loss" is the value of
            the negated log-loss scorer (so it is <= 0), whereas "Val Loss" is the
            plain log loss on the validation data.
    """
    best_df = pd.DataFrame(columns=["Estimator", "K best features", "Mean CV Loss", "Mean CV F1", "Mean CV Accuracy", "Val Loss", "Val Precision", "Val Recall", "Val F1", "Val Accuracy"])
    best_params = []
    ks = list(range(1, len(X_train[0])+1))
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    
    for i, (estimator, param_grid) in tqdm(enumerate(zip(estimators, param_grids)), desc="Estimators"):
        # Work on a copy so the caller's (and the shared default) grids are not
        # mutated when the feature-count candidates are added
        search_grid = dict(param_grid)
        search_grid['SelectKBest__k'] = ks
        search = GridSearchCV(
            Pipeline([('SelectKBest', SelectKBest()), ('Estimator', estimator())]), 
            search_grid, 
            n_jobs=-1, 
            scoring={
                "accuracy": make_scorer(accuracy_score), 
                "f1": make_scorer(f1_score, pos_label="1"),
                # Predefined negated log-loss scorer (higher is better)
                "loss": "neg_log_loss",
            }, 
            refit="loss",
        )
        search.fit(X_train, y_train)
        
        # Collect results; best_index_ is the cv_results_ row of the refit ("loss") winner
        best_index = search.best_index_
        y_val_pred = search.predict(X_val)
        best_df.loc[i] = [estimator.__name__, 
                          search.best_params_['SelectKBest__k'], 
                          search.cv_results_['mean_test_loss'][best_index], 
                          search.cv_results_['mean_test_f1'][best_index],
                          search.cv_results_['mean_test_accuracy'][best_index],
                          # Explicit labels keep the probability columns aligned even
                          # if a class is absent from the validation labels
                          log_loss(y_val, search.predict_proba(X_val), labels=search.classes_), 
                          precision_score(y_val, y_val_pred, pos_label="1"),
                          recall_score(y_val, y_val_pred, pos_label="1"),
                          f1_score(y_val, y_val_pred, pos_label="1"),
                          accuracy_score(y_val, y_val_pred)]
        best_params.append((estimator.__name__, search.best_params_))
    
    return best_params, best_df

# Statistical model
* Extract statistical features from user tweets
* "An Ensemble Model Using N-grams and Statistical Features to Identify Fake News Spreaders on Twitter" paper built a statistical model to support their N-gram model for this task (and won). "FacTweet: Profiling Fake News Twitter Accounts" used statistical information, such as emotions, style and sentiment to profile fake news spreading users.

In [103]:
# Extractor functions (to be used in TweetStatsExtractor)
def average_chars(user_tweets):
    """ Returns the user's mean tweet length, measured in characters of tweet.text """
    tweet_texts = [tweet.text for tweet in user_tweets]
    return average_tweet_lengths(tweet_texts)


def std_dev_chars(user_tweets):
    """ Returns the standard deviation of the user's tweet lengths, measured in characters """
    tweet_texts = [tweet.text for tweet in user_tweets]
    return std_dev_tweet_lengths(tweet_texts)


def average_words(user_tweets):
    """ Returns the user's mean tweet length, measured in words (after cleaning) """
    tweet_words = tweets_to_words(user_tweets)
    return average_tweet_lengths(tweet_words)


def std_dev_words(user_tweets):
    """ Returns the standard deviation of the user's tweet lengths, measured in words (after cleaning) """
    tweet_words = tweets_to_words(user_tweets)
    return std_dev_tweet_lengths(tweet_words)


def average_sentiment(user_tweets):
    """ Returns the user's mean positive, neutral and negative sentiment scores over their tweets """
    per_tweet_scores = np.asarray([polarity_scores(tweet.text) for tweet in user_tweets])
    return per_tweet_scores.mean(axis=0)


def std_dev_sentiment(user_tweets):
    """ Returns the standard deviation of the user's positive, neutral and negative sentiment scores """
    per_tweet_scores = np.asarray([polarity_scores(tweet.text) for tweet in user_tweets])
    mean_scores = per_tweet_scores.mean(axis=0)
    return std_dev(per_tweet_scores, mean_scores)


def average_word_lengths(user_tweets):
    """ Returns the mean length of the words, pooled across all of the user's cleaned tweets """
    word_lengths = []
    for tweet in user_tweets:
        words = clean_text(tweet.text).split()
        word_lengths.extend(len(word) for word in words)
    return np.mean(word_lengths)


def average_tags(user_tweets, tags=['RT', '#USER#', '#HASHTAG#', '#URL#']):
    """
    Returns, for each tag, the mean number of occurrences per tweet for the user.
    Occurrences are plain substring counts in tweet.text; the result has one
    entry per tag, in the order given.
    """
    counts_per_tweet = np.asarray([
        [tweet.text.count(tag) for tag in tags]
        for tweet in user_tweets
    ])
    return np.mean(counts_per_tweet, axis=0)


def average_cased_chars(user_tweets):
    """ Returns the user's mean number of uppercase characters per tweet """
    uppercase_counts = cased_chars(user_tweets, True)
    return np.mean(uppercase_counts)
    

def std_dev_cased_chars(user_tweets):
    """ Returns the standard deviation of the user's uppercase character counts per tweet """
    uppercase_counts = cased_chars(user_tweets, True)
    mean_count = average_cased_chars(user_tweets)
    return std_dev(uppercase_counts, mean_count)
    
    
def average_uncased_chars(user_tweets):
    """ Returns the user's mean number of lowercase characters per tweet """
    lowercase_counts = cased_chars(user_tweets, False)
    return np.mean(lowercase_counts)


def std_dev_uncased_chars(user_tweets):
    """ Returns the standard deviation of the user's lowercase character counts per tweet """
    lowercase_counts = cased_chars(user_tweets, False)
    mean_count = average_uncased_chars(user_tweets)
    return std_dev(lowercase_counts, mean_count)

    
def average_emojis(user_tweets):
    """ Returns the user's mean number of emojis per tweet """
    emoji_counts = emoji_chars(user_tweets)
    return np.mean(emoji_counts)
    
    
def std_dev_emojis(user_tweets):
    """ Returns the standard deviation of the user's emoji counts per tweet """
    emoji_counts = emoji_chars(user_tweets)
    mean_count = average_emojis(user_tweets)
    return std_dev(emoji_counts, mean_count)


def average_punctuation_chars(user_tweets):
    """ Returns the user's mean number of punctuation characters per tweet """
    punctuation_counts = punctuation_chars(user_tweets)
    return np.mean(punctuation_counts)

    
def std_dev_punctuation_chars(user_tweets):
    """ Returns the standard deviation of the user's punctuation character counts per tweet """
    punctuation_counts = punctuation_chars(user_tweets)
    mean_count = average_punctuation_chars(user_tweets)
    return std_dev(punctuation_counts, mean_count)

In [104]:
# Stats data
# Extractors run in this order; their outputs are concatenated into one
# feature vector per user
stats_extractor = TweetStatsExtractor(extractors=[
    # Tweet and word lengths
    average_chars, std_dev_chars,
    average_words, std_dev_words,
    average_word_lengths,
    # Character classes
    average_cased_chars, std_dev_cased_chars,
    average_uncased_chars, std_dev_uncased_chars,
    average_emojis, std_dev_emojis,
    average_punctuation_chars, std_dev_punctuation_chars,
    # Multi-valued extractors: tag counts and sentiment scores
    average_tags,
    average_sentiment, std_dev_sentiment,
])

# Stats data for the train, validation and test partitions
tweet_stats_train, tweet_stats_val, tweet_stats_test = (
    stats_extractor.transform(tweet_feeds)
    for tweet_feeds in (tweet_train, tweet_val, tweet_test)
)

# Human-readable name for each column of the statistical feature vectors,
# listed in the order the extractors passed to stats_extractor emit them.
# average_tags contributes four columns (one per default tag) and each
# sentiment extractor contributes three (positive, neutral, negative).
feature_names = [
    "Average number of characters per tweet",
    "Standard deviation of characters per tweet",
    "Average number of words per tweet",
    "Standard deviation of words per tweet",
    "Average word lengths",
    "Average number of cased characters per tweet",
    "Standard deviation of cased characters per tweet",
    "Average number of uncased characters per tweet",
    "Standard deviation of uncased characters per tweet",
    "Average number of emojis per tweet",
    "Standard deviation of emojis per tweet",
    "Average number of punctuation characters per tweet",
    "Standard deviation number of punctuation characters per tweet",
    "Average number of 'RT' tags per tweet",
    "Average number of '#USER#' tags per tweet",
    "Average number of '#HASHTAG#' tags per tweet",
    "Average number of '#URL#' tags per tweet",
    "Average positive sentiment per tweet",
    "Average neutral sentiment per tweet",
    "Average negative sentiment per tweet",
    "Standard deviation positive sentiment per tweet",
    "Standard deviation neutral sentiment per tweet",
    "Standard deviation negative sentiment per tweet",
]

## Finding the best Classifier
* Classify users based on their (normalized) statistical features
* Performing a grid search on: LogisticRegression, SVC, RandomForestClassifier, GradientBoostingClassifier, KNeighborsClassifier

In [107]:
# Grid search every candidate classifier on the statistical features:
# fitted on the training split, evaluated on the validation split
stats_search_best_params, stats_search_df = grid_search(tweet_stats_train, label_train, tweet_stats_val, label_val)

Estimators: 5it [1:24:46, 1017.25s/it]


In [108]:
# Best parameter set found for each estimator on the statistical features
stats_search_best_params

[('LogisticRegression',
  {'Estimator__C': 0.5,
   'Estimator__penalty': 'l2',
   'Estimator__solver': 'liblinear',
   'SelectKBest__k': 18}),
 ('SVC',
  {'Estimator__C': 2,
   'Estimator__kernel': 'sigmoid',
   'Estimator__probability': True,
   'SelectKBest__k': 23}),
 ('RandomForestClassifier',
  {'Estimator__criterion': 'gini',
   'Estimator__min_samples_leaf': 1,
   'Estimator__n_estimators': 50,
   'SelectKBest__k': 21}),
 ('GradientBoostingClassifier',
  {'Estimator__learning_rate': 0.1,
   'Estimator__loss': 'deviance',
   'Estimator__min_samples_leaf': 14,
   'Estimator__n_estimators': 25,
   'SelectKBest__k': 18}),
 ('KNeighborsClassifier',
  {'Estimator__n_neighbors': 10,
   'Estimator__weights': 'distance',
   'SelectKBest__k': 20})]

In [109]:
# Cross-validation and validation metrics of each estimator's best candidate
stats_search_df

Unnamed: 0,Estimator,K best features,Mean CV Loss,Mean CV F1,Mean CV Accuracy,Val Loss,Val Precision,Val Recall,Val F1,Val Accuracy
0,LogisticRegression,18,-0.636867,0.693448,0.680952,0.569532,0.8,0.695652,0.744186,0.755556
1,SVC,23,-0.602159,0.663741,0.647619,0.619748,0.677419,0.913043,0.777778,0.733333
2,RandomForestClassifier,21,-0.606469,0.677937,0.695238,0.590736,0.6875,0.478261,0.564103,0.622222
3,GradientBoostingClassifier,18,-0.619664,0.636515,0.642857,0.571609,0.777778,0.608696,0.682927,0.711111
4,KNeighborsClassifier,20,-0.810363,0.655348,0.614286,0.559821,0.703704,0.826087,0.76,0.733333


# Named Entity Recognition Model
* Extract user usage of named entities, and create a feature vector from counts of the different named entities
* "TakeLab at SemEval-2019 Task 4: Hyperpartisan News Detection" paper used an NER counter feature to help classify hyperpartisan news. 

In [None]:
import spacy

# spaCy pipeline used to detect named entities
spacy_nlp = spacy.load("en_core_web_sm")

# Entity labels that are counted; the order here fixes the order of the
# entries in each NER feature vector
spacy_ner_labels = [
    "PERSON", "NORP", "FAC", "ORG", "GPE", "LOC",
    "PRODUCT", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE",
    "DATE", "TIME",
    "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL",
]

In [None]:
def named_entities_count_array(user_tweets):
    """
    Counts the named entities spaCy finds in a user's cleaned tweets (tags removed)
    and returns an array with one count per label in spacy_ner_labels, in that order.
    Raises KeyError if an entity carries a label that is not in spacy_ner_labels.
    """
    label_positions = {label: position for position, label in enumerate(spacy_ner_labels)}
    counts = np.zeros(len(spacy_ner_labels), dtype=int)
    for tweet in user_tweets:
        document = spacy_nlp(clean_text(tweet.text, remove_tags=True))
        for entity in document.ents:
            counts[label_positions[entity.label_]] += 1

    return counts

In [None]:
# Extract NER count arrays for the train, validation and test partitions
ner_stats_extractor = TweetStatsExtractor(extractors=[named_entities_count_array])

tweet_ner_stats_train, tweet_ner_stats_val, tweet_ner_stats_test = (
    ner_stats_extractor.transform(tweet_feeds)
    for tweet_feeds in (tweet_train, tweet_val, tweet_test)
)

## Finding the best Classifier
* Classify users based on their (normalized) counts of named entities in their text
* Performing a grid search on: LogisticRegression, SVC, RandomForestClassifier, GradientBoostingClassifier, KNeighborsClassifier

In [None]:
# Grid search every candidate classifier on the NER count features:
# fitted on the training split, evaluated on the validation split
ner_search_best_params, ner_search_df = grid_search(tweet_ner_stats_train, label_train, tweet_ner_stats_val, label_val)

In [None]:
# Best parameter set found for each estimator on the NER count features
ner_search_best_params

In [None]:
# Shape (rows, columns) of the NER evaluation dataframe
# NOTE(review): the other sections display the dataframe itself rather than its shape - confirm this is intended
ner_search_df.shape

# Readability Model
* Determine user readability scores, and create a feature vector of these different scores
* In "A stylometric Inquiry into Hyperpartisan and Fake News" paper, they used 10 readability scores to help classify hyperpartisan news. "Automatic Detection of Fake News" paper also used readability features, such as the number of characters, complex words, long words, number of syllables, word types, and number of paragraphs.

Readability measurements:
* Flesch Reading Ease - Scores how easy something is to read, using the idea that shorter words and sentences are easier to read. 
    * Looks at the ratio of total words to total sentences and the ratio of total syllables to total words. 
    * Range: 0 - 121.22. 
    * Source: https://simple.wikipedia.org/wiki/Flesch_Reading_Ease
* Flesch-Kincaid grade - Similar to the Flesch Reading Ease score, but used to give a grade level to the text.
    * This score is less affected by words with 3+ syllables
    * Its equation is almost identical to that of the Flesch Reading Ease score
    * Source: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
* SMOG Index - Estimates the years of education needed to understand some text.
    * SMOG is the preferred measure of readability for healthcare material. It performed better than the Flesch-Kincaid grade.
    * Multiplies the number of words with 3+ syllables by 30 / total number of sentences.
    * Source: https://en.wikipedia.org/wiki/SMOG
* Coleman-Liau Index - Scores some text by what (US) grade level the reader will have to be in.
    * It calculates the average number of letters per 100 words, and the average number of sentences per 100 words.
    * Source: https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index
* Automated Readability Index - Scores how understandable a text is. It produces an approximate representation of the US grade level needed to comprehend it.
    * It calculates the number of characters per words and number of words per sentences
    * Source: https://en.wikipedia.org/wiki/Automated_readability_index

In [None]:
import textstat

In [None]:
def readability_scores(user_tweets):
    """ 
    Returns the user's readability feature vector: the per-tweet mean of the
    Automated Readability Index, Flesch Reading Ease, Flesch-Kincaid grade and
    Coleman-Liau index, followed by a single SMOG index computed over all of the
    user's tweets joined with ". ".
    """
    # Punctuation and digits are kept; emojis, non-printable characters and tags are removed
    cleaned_tweets = [
        clean_text(tweet.text, remove_punc=False, remove_digits=False, remove_tags=True)
        for tweet in user_tweets
    ]

    per_tweet_metrics = (
        textstat.automated_readability_index,
        textstat.flesch_reading_ease,
        textstat.flesch_kincaid_grade,
        textstat.coleman_liau_index,
    )
    per_tweet_scores = [
        [metric(cleaned_tweet) for metric in per_tweet_metrics]
        for cleaned_tweet in cleaned_tweets
    ]

    mean_scores = np.mean(per_tweet_scores, axis=0)
    smog_score = textstat.smog_index(". ".join(cleaned_tweets))
    return np.append(mean_scores, smog_score)

In [None]:
# Extract readability score arrays for the train, validation and test partitions
read_stats_extractor = TweetStatsExtractor(extractors=[readability_scores])

tweet_read_stats_train, tweet_read_stats_val, tweet_read_stats_test = (
    read_stats_extractor.transform(tweet_feeds)
    for tweet_feeds in (tweet_train, tweet_val, tweet_test)
)

In [None]:
# Grid search every candidate classifier on the readability features:
# fitted on the training split, evaluated on the validation split
read_search_best_params, read_search_df = grid_search(
    tweet_read_stats_train, label_train, tweet_read_stats_val, label_val)

In [None]:
# Best parameter set found for each estimator on the readability features
read_search_best_params

In [None]:
# Cross-validation and validation metrics of each estimator's best candidate
read_search_df

# Combining the Ensemble model

In [None]:
def load_models():
    """ Returns the models making up the ensemble; currently always a new empty list """
    models = list()
    return models

In [None]:
# Models to combine in the ensemble (load_models currently returns an empty list)
models = load_models()