In [1]:
import sys

# Local project directories holding the notebooks imported further down.
# NOTE(review): these are machine-specific absolute paths; the notebook will
# only find its helper modules on a machine with this exact layout.
preprocessing_path = 'C:\\Users\\joshh\\Desktop\\Uni\\Soton Uni - Yr 3\\COMP3200\\fake-news-profiling\\classifier\\preprocessing'
notif_path = 'C:\\Users\\joshh\\Desktop\\Uni\\Soton Uni - Yr 3\\COMP3200\\fake-news-profiling\\classifier\\notifications'

# Register each directory once, just after the first sys.path entry, so that
# re-running the cell does not add duplicates.
for extra_path in (preprocessing_path, notif_path):
    if extra_path not in sys.path:
        sys.path.insert(1, extra_path)

In [3]:
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf

import ipynb.fs.full.parse_datasets as datasets
import ipynb.fs.full.preprocessing as pp
import ipynb.fs.full.bert_fake_news_classifier as bclf
from ipynb.fs.full.notif_email import send_email

# Dataset

In [4]:
# Load the dataset through the project's parse_datasets notebook.
# NOTE(review): "datasets" and "en" are presumably the data directory and the
# language code; tweet_data is later iterated as one tweet feed per user and
# label_data is used as the matching class labels - confirm against parse_dataset.
tweet_data, label_data = datasets.parse_dataset("datasets", "en")

# Statistical model

## Data extractor
* Extract statistical features from tweet text

In [5]:
from sklearn.preprocessing import normalize

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import string
import demoji

In [219]:
# Stats extraction helper functions
# Shared module-level resources used by the helpers below.
analyzer = SentimentIntensityAnalyzer()

# Character classes used by clean_text / punctuation_chars.
digits = set(string.digits)
printable = set(string.printable)
# ASCII punctuation minus '#', so '#' is never treated as punctuation
# (presumably to keep tags such as '#USER#' intact - confirm).
punctuation = set(string.punctuation) - {'#'}


def clean_text(text, remove_punc=True, remove_non_print=True, remove_emojis=True,
               remove_digits=True):
    """
    Strip unwanted characters from `text` and return the remaining string.

    Each flag enables one filter: emojis (via demoji, replaced with the empty
    string), punctuation (the module-level `punctuation` set), characters
    outside the module-level `printable` set, and decimal digits.
    """
    if remove_emojis:
        text = demoji.replace(text, "")

    def _keep(ch):
        # A character survives only if none of the enabled filters rejects it.
        if remove_punc and ch in punctuation:
            return False
        if remove_non_print and ch not in printable:
            return False
        if remove_digits and ch in digits:
            return False
        return True

    return "".join(filter(_keep, text))


def polarity_scores(text):
    """ Returns the analyzer's 'pos', 'neu' and 'neg' scores for text, in that order, as an array """
    scores = analyzer.polarity_scores(text)
    return np.asarray([scores[key] for key in ('pos', 'neu', 'neg')])


def tweets_to_words(user_tweets):
    """
    Returns, for each tweet, the list of whitespace-separated words of its cleaned text.

    The result is a 1-D object array holding one Python list per tweet. Tweets
    rarely have the same number of words, and asking NumPy to infer an array
    from such ragged nested lists raises ValueError on recent NumPy versions,
    so the container is built explicitly instead.
    """
    word_lists = [clean_text(tweet.text).split() for tweet in user_tweets]

    words = np.empty(len(word_lists), dtype=object)
    # Assign element by element: slice assignment would try to broadcast
    # equally sized lists into a 2-D shape.
    for idx, tokens in enumerate(word_lists):
        words[idx] = tokens
    return words


def std_dev(datapoints, mean, num_datapoints=100):
    """
    Returns the standard deviation of datapoints around mean, along axis 0.

    datapoints: array-like of observations (one row per observation).
    mean: the mean to measure deviations from (scalar, or one value per column).
    num_datapoints: divisor used for the variance. Defaults to 100; pass None
        to divide by the number of observations actually supplied.
    """
    # Accept plain lists as well as arrays.
    values = np.asarray(datapoints)
    if num_datapoints is None:
        num_datapoints = values.shape[0]

    diff = values - mean
    # Divide by the requested count rather than a hard-coded constant.
    return np.sqrt(np.sum(diff ** 2, axis=0) / num_datapoints)


def average_tweet_lengths(user_tweets):
    """ Returns the mean of len(item) over the items in user_tweets """
    lengths = list(map(len, user_tweets))
    return np.mean(lengths)


def std_dev_tweet_lengths(user_tweets):
    """ Returns the standard deviation (via std_dev) of len(item) over the items in user_tweets """
    lengths = np.asarray([len(tweet) for tweet in user_tweets])
    return std_dev(lengths, np.mean(lengths))


def cased_chars(user_tweets, cased):
    """
    Returns, per tweet, how many characters of tweet.text are uppercase
    (when cased is truthy) or lowercase (otherwise).
    """
    matches_case = str.isupper if cased else str.islower

    counts = []
    for tweet in user_tweets:
        counts.append(sum(matches_case(ch) for ch in tweet.text))
    return counts


def emoji_chars(user_tweets):
    """ Returns, per tweet, the number of entries demoji.findall_list reports for tweet.text """
    counts = []
    for tweet in user_tweets:
        found = demoji.findall_list(tweet.text)
        counts.append(len(found))
    return counts


def punctuation_chars(user_tweets):
    """ Returns, per tweet, the number of characters of tweet.text found in the module-level punctuation set """
    return [
        sum(1 for ch in tweet.text if ch in punctuation)
        for tweet in user_tweets
    ]

In [238]:
# Extractor functions (to be used in TweetStatsExtractor)
def average_chars(user_tweets):
    """ Returns the user's mean tweet length, measured in characters of tweet.text """
    texts = [tweet.text for tweet in user_tweets]
    return average_tweet_lengths(texts)


def std_dev_chars(user_tweets):
    """ Returns the standard deviation of the user's tweet lengths, measured in characters of tweet.text """
    texts = [tweet.text for tweet in user_tweets]
    return std_dev_tweet_lengths(texts)


def average_words(user_tweets):
    """ Returns the user's mean tweet length, measured in words of the cleaned text """
    words_per_tweet = tweets_to_words(user_tweets)
    return average_tweet_lengths(words_per_tweet)


def std_dev_words(user_tweets):
    """ Returns the standard deviation of the user's tweet lengths, measured in words of the cleaned text """
    words_per_tweet = tweets_to_words(user_tweets)
    return std_dev_tweet_lengths(words_per_tweet)


def average_sentiment(user_tweets):
    """ Returns the user's mean (pos, neu, neg) sentiment scores across their tweets """
    per_tweet_scores = [polarity_scores(tweet.text) for tweet in user_tweets]
    return np.mean(per_tweet_scores, axis=0)


def std_dev_sentiment(user_tweets):
    """ Returns the standard deviation of the user's (pos, neu, neg) sentiment scores across their tweets """
    # One row of polarity scores per tweet; statistics are taken column-wise.
    sentiment = np.asarray([polarity_scores(tweet.text) for tweet in user_tweets])
    return std_dev(sentiment, np.mean(sentiment, axis=0))


def average_word_lengths(user_tweets):
    """ Returns the mean length, in characters, of every word in the user's cleaned tweets """
    lengths = []
    for tweet in user_tweets:
        lengths.extend(len(word) for word in clean_text(tweet.text).split())
    return np.mean(lengths)


def average_tags(user_tweets, tags=('RT', '#USER#', '#HASHTAG#', '#URL#')):
    """
    Returns the average number of occurrences of each tag per tweet, for the user.

    user_tweets: the user's tweets (each exposing a .text string).
    tags: substrings to count; the result has one entry per tag, in this order.

    Occurrences are counted with str.count, i.e. as plain substrings, so 'RT'
    is also counted when it appears inside a longer token.
    """
    # The default is an immutable tuple so it can never be mutated between calls.
    counts = np.asarray([
        [tweet.text.count(tag) for tag in tags]
        for tweet in user_tweets
    ])
    return np.mean(counts, axis=0)


def average_cased_chars(user_tweets):
    """ Returns the user's mean number of uppercase characters per tweet """
    upper_counts = cased_chars(user_tweets, True)
    return np.mean(upper_counts)
    

def std_dev_cased_chars(user_tweets):
    """ Returns the standard deviation of the number of uppercase characters per tweet, for the user """
    upper_counts = cased_chars(user_tweets, True)
    return std_dev(upper_counts, np.mean(upper_counts))
    
    
def average_uncased_chars(user_tweets):
    """ Returns the user's mean number of lowercase characters per tweet """
    lower_counts = cased_chars(user_tweets, False)
    return np.mean(lower_counts)


def std_dev_uncased_chars(user_tweets):
    """ Returns the standard deviation of uncased (lowercase) characters per tweet, for the user """
    # Count once and reuse the counts for both the mean and the deviation.
    lower_counts = np.asarray(cased_chars(user_tweets, False))
    return std_dev(lower_counts, np.mean(lower_counts))

    
def average_emojis(user_tweets):
    """ Returns the user's mean emoji count per tweet """
    emoji_counts = emoji_chars(user_tweets)
    return np.mean(emoji_counts)
    
    
def std_dev_emojis(user_tweets):
    """ Returns the standard deviation of the emoji count per tweet, for the user """
    emoji_counts = emoji_chars(user_tweets)
    return std_dev(emoji_counts, np.mean(emoji_counts))


def average_punctuation_chars(user_tweets):
    """ Returns the user's mean number of punctuation characters per tweet """
    punctuation_counts = punctuation_chars(user_tweets)
    return np.mean(punctuation_counts)

    
def std_dev_punctuation_chars(user_tweets):
    """ Returns the standard deviation of punctuation characters per tweet, for the user """
    # Count once and reuse the counts for both the mean and the deviation.
    punctuation_counts = np.asarray(punctuation_chars(user_tweets))
    return std_dev(punctuation_counts, np.mean(punctuation_counts))

In [239]:
# The ABC aliases were removed from `collections` in Python 3.10;
# `collections.abc` is the supported location.
from collections.abc import Iterable


class TweetStatsExtractor:
    """
    Turns each user's tweet feed into a flat vector of statistical features.

    Every function in `funcs` is called with one tweet feed and may return
    either a scalar or an iterable of values; the results are concatenated,
    in order, into that user's feature vector.
    """

    def __init__(self, funcs=[
        average_chars,
        std_dev_chars,
        average_words,
        std_dev_words,
        average_word_lengths,
        average_cased_chars,
        std_dev_cased_chars,
        average_uncased_chars,
        std_dev_uncased_chars,
        average_emojis,
        std_dev_emojis,
        average_punctuation_chars,
        std_dev_punctuation_chars,
        average_tags,
        average_sentiment,
        std_dev_sentiment,
    ]):
        # Copy the sequence so instances never share (or mutate) the default
        # list object, nor a list owned by the caller.
        self.funcs = list(funcs)

    def transform(self, X, normalize_data=True):
        """
        Build the feature matrix for X (an iterable of tweet feeds).

        Returns one row per feed. When normalize_data is true the matrix is
        passed through `normalize` (imported from sklearn.preprocessing, with
        its default settings) before being returned.
        """
        result = np.asarray([
            np.concatenate([self._apply(f, tweet_feed) for f in self.funcs])
            for tweet_feed in X
        ])

        return normalize(result) if normalize_data else result

    def _apply(self, func, data):
        """ Call func(data); wrap a non-iterable result in a 1-element array so it can be concatenated """
        result = func(data)
        if isinstance(result, Iterable):
            return result
        return np.asarray([result])

# Human-readable label for each column produced by TweetStatsExtractor with its
# default `funcs`. The order must match the extractor's output order: the
# scalar features first, then the four average_tags counts, then the three
# average_sentiment and three std_dev_sentiment scores (pos, neu, neg).
# The labels are paired positionally with feature scores later on, so any
# change to the default `funcs` needs a matching change here.
feature_names = [
    "Average number of characters per tweet",
    "Standard deviation of characters per tweet",
    "Average number of words per tweet",
    "Standard deviation of words per tweet",
    "Average word lengths",
    "Average number of cased characters per tweet",
    "Standard deviation of cased characters per tweet",
    "Average number of uncased characters per tweet",
    "Standard deviation of uncased characters per tweet",
    "Average number of emojis per tweet",
    "Standard deviation of emojis per tweet",
    "Average number of punctuation characters per tweet",
    "Standard deviation number of punctuation characters per tweet",
    "Average number of 'RT' tags per tweet",
    "Average number of '#USER#' tags per tweet",
    "Average number of '#HASHTAG#' tags per tweet",
    "Average number of '#URL#' tags per tweet",
    "Average positive sentiment per tweet",
    "Average neutral sentiment per tweet",
    "Average negative sentiment per tweet",
    "Standard deviation positive sentiment per tweet",
    "Standard deviation neutral sentiment per tweet",
    "Standard deviation negative sentiment per tweet",
]

## Classifier models
* Classify users based on their (normalized) statistical features
* Use a grid search to find the optimal model parameters as well as the optimal TweetStatsExtractor parameters
* We can look at model weights to find the stats which are least and most useful

In [240]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [241]:
# Stats data
# Extract the feature matrix for every tweet feed in tweet_data; transform()
# normalizes the result because normalize_data defaults to True.
stats_extractor = TweetStatsExtractor()
tweet_stats_data = stats_extractor.transform(tweet_data)

# Partition features and labels into train / validation / test sets.
# NOTE(review): split sizes and any shuffling are decided inside
# datasets.split_dataset, which is not visible here - confirm before relying
# on the split being reproducible.
(tweet_train, label_train, 
 tweet_val, label_val, 
 tweet_test, label_test) = datasets.split_dataset(tweet_stats_data, label_data)

In [242]:
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest


# Training models, selecting the best features
# Sweep k = 1..n_features: keep the k best-scoring features according to
# SelectKBest (with its default scoring function), then fit every classifier on
# the reduced training set and report its performance on the validation set.
for i in range(1, len(tweet_train[0])+1):
    select = SelectKBest(k=i)
    X_train = select.fit_transform(tweet_train, label_train)
    X_val = select.transform(tweet_val)

    print("-------------Features =", i)

    # Fresh, default-configured estimators for every k.
    # `penalty` is passed by keyword: scikit-learn estimators take their
    # constructor parameters as keyword-only arguments.
    log_reg_clf = LogisticRegression(penalty="l2")
    svc_clf = SVC()
    forest_clf = RandomForestClassifier()
    grad_boost_clf = GradientBoostingClassifier()

    # Fit and report each model in turn (same order and output as before).
    for clf_name, clf in (
        ("LogisticRegression", log_reg_clf),
        ("SVC", svc_clf),
        ("RandomForestClassifier", forest_clf),
        ("GradientBoostingClassifier", grad_boost_clf),
    ):
        clf.fit(X_train, label_train)
        print(clf_name)
        print(classification_report(label_val, clf.predict(X_val)))

-------------Features = 1
LogisticRegression
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        25
           1       0.44      1.00      0.62        20

    accuracy                           0.44        45
   macro avg       0.22      0.50      0.31        45
weighted avg       0.20      0.44      0.27        45

SVC
              precision    recall  f1-score   support

           0       0.79      0.60      0.68        25
           1       0.62      0.80      0.70        20

    accuracy                           0.69        45
   macro avg       0.70      0.70      0.69        45
weighted avg       0.71      0.69      0.69        45

RandomForestClassifier
              precision    recall  f1-score   support

           0       0.68      0.60      0.64        25
           1       0.57      0.65      0.60        20

    accuracy                           0.62        45
   macro avg       0.62      0.62      0.62        45
we

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


RandomForestClassifier
              precision    recall  f1-score   support

           0       0.62      0.52      0.57        25
           1       0.50      0.60      0.55        20

    accuracy                           0.56        45
   macro avg       0.56      0.56      0.56        45
weighted avg       0.57      0.56      0.56        45

GradientBoostingClassifier
              precision    recall  f1-score   support

           0       0.60      0.48      0.53        25
           1       0.48      0.60      0.53        20

    accuracy                           0.53        45
   macro avg       0.54      0.54      0.53        45
weighted avg       0.55      0.53      0.53        45

-------------Features = 3
LogisticRegression
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        25
           1       0.44      1.00      0.62        20

    accuracy                           0.44        45
   macro avg       0.22      0.50

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


GradientBoostingClassifier
              precision    recall  f1-score   support

           0       0.75      0.60      0.67        25
           1       0.60      0.75      0.67        20

    accuracy                           0.67        45
   macro avg       0.68      0.68      0.67        45
weighted avg       0.68      0.67      0.67        45

-------------Features = 4
LogisticRegression
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        25
           1       0.44      1.00      0.62        20

    accuracy                           0.44        45
   macro avg       0.22      0.50      0.31        45
weighted avg       0.20      0.44      0.27        45

SVC
              precision    recall  f1-score   support

           0       0.82      0.36      0.50        25
           1       0.53      0.90      0.67        20

    accuracy                           0.60        45
   macro avg       0.67      0.63      0.58        4

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


RandomForestClassifier
              precision    recall  f1-score   support

           0       0.65      0.52      0.58        25
           1       0.52      0.65      0.58        20

    accuracy                           0.58        45
   macro avg       0.58      0.58      0.58        45
weighted avg       0.59      0.58      0.58        45

GradientBoostingClassifier
              precision    recall  f1-score   support

           0       0.70      0.56      0.62        25
           1       0.56      0.70      0.62        20

    accuracy                           0.62        45
   macro avg       0.63      0.63      0.62        45
weighted avg       0.64      0.62      0.62        45

-------------Features = 7
LogisticRegression
              precision    recall  f1-score   support

           0       0.88      0.28      0.42        25
           1       0.51      0.95      0.67        20

    accuracy                           0.58        45
   macro avg       0.69      0.61

              precision    recall  f1-score   support

           0       0.64      0.64      0.64        25
           1       0.55      0.55      0.55        20

    accuracy                           0.60        45
   macro avg       0.59      0.59      0.59        45
weighted avg       0.60      0.60      0.60        45

GradientBoostingClassifier
              precision    recall  f1-score   support

           0       0.71      0.60      0.65        25
           1       0.58      0.70      0.64        20

    accuracy                           0.64        45
   macro avg       0.65      0.65      0.64        45
weighted avg       0.66      0.64      0.65        45

-------------Features = 13
LogisticRegression
              precision    recall  f1-score   support

           0       0.67      0.24      0.35        25
           1       0.47      0.85      0.61        20

    accuracy                           0.51        45
   macro avg       0.57      0.54      0.48        45
w

RandomForestClassifier
              precision    recall  f1-score   support

           0       0.79      0.76      0.78        25
           1       0.71      0.75      0.73        20

    accuracy                           0.76        45
   macro avg       0.75      0.76      0.75        45
weighted avg       0.76      0.76      0.76        45

GradientBoostingClassifier
              precision    recall  f1-score   support

           0       0.70      0.56      0.62        25
           1       0.56      0.70      0.62        20

    accuracy                           0.62        45
   macro avg       0.63      0.63      0.62        45
weighted avg       0.64      0.62      0.62        45

-------------Features = 19
LogisticRegression
              precision    recall  f1-score   support

           0       0.64      0.28      0.39        25
           1       0.47      0.80      0.59        20

    accuracy                           0.51        45
   macro avg       0.55      0.5

### Best classifiers
* Select the best performing classifiers
* Do a grid search for their best parameters

In [243]:
# Importance of each feature
# `select` is the SelectKBest instance left over from the last iteration of the
# feature-selection loop; its scores_ are paired positionally with
# feature_names and sorted from highest to lowest score.
# NOTE(review): this relies on that loop having run earlier in the same kernel.
selected_scores = sorted(zip(feature_names, select.scores_), key=lambda a: a[1], reverse=True)
selected_scores

[('Average negative sentiment per tweet', 16.98501592327074),
 ('Average number of punctuation characters per tweet', 9.755842009119732),
 ('Average number of words per tweet', 7.910577870055184),
 ('Standard deviation of emojis per tweet', 7.842691211800912),
 ('Average positive sentiment per tweet', 7.036855751571707),
 ('Standard deviation negative sentiment per tweet', 6.073945243556848),
 ('Average number of uncased characters per tweet', 4.095469612930164),
 ("Average number of '#USER#' tags per tweet", 4.044184545734733),
 ("Average number of 'RT' tags per tweet", 3.5192819489583638),
 ('Standard deviation of words per tweet', 3.517345947563265),
 ('Standard deviation of uncased characters per tweet', 2.840796589938319),
 ('Standard deviation positive sentiment per tweet', 2.757120898898489),
 ('Average number of emojis per tweet', 2.697001810004565),
 ('Standard deviation of characters per tweet', 2.36117832377876),
 ('Standard deviation number of punctuation characters per twe

In [36]:
# Fit each default-configured classifier on the full feature set and report its
# accuracy on the validation split.
# `penalty` is passed by keyword: scikit-learn estimators take their
# constructor parameters as keyword-only arguments.
log_reg_clf = LogisticRegression(penalty="l2")   # Logistic Regression
svc_clf = SVC()                                  # Support Vector Classifier
forest_clf = RandomForestClassifier()            # Random Forest
grad_boost_clf = GradientBoostingClassifier()    # Gradient Boosting

for clf_name, clf in (
    ("LogisticRegression", log_reg_clf),
    ("SVC", svc_clf),
    ("RandomForestClassifier", forest_clf),
    ("GradientBoostingClassifier", grad_boost_clf),
):
    clf.fit(tweet_train, label_train)
    print("Val accuracy (%s):" % clf_name, clf.score(tweet_val, label_val))

Val accuracy (LogisticRegression): 0.5777777777777777
Val accuracy (SVC): 0.5111111111111111
Val accuracy (RandomForestClassifier): 0.7555555555555555
Val accuracy (GradientBoostingClassifier): 0.7111111111111111


# Combining the Ensemble model