In [27]:
import sys

preprocessing_path = 'C:\\Users\\joshh\\Desktop\\Uni\\Soton Uni - Yr 3\\COMP3200\\fake-news-profiling\\classifier\\preprocessing'
if preprocessing_path not in sys.path:
    sys.path.insert(1, preprocessing_path)

notif_path = 'C:\\Users\\joshh\\Desktop\\Uni\\Soton Uni - Yr 3\\COMP3200\\fake-news-profiling\\classifier\\notifications'
if notif_path not in sys.path:
    sys.path.insert(1, notif_path)

In [28]:
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf

import ipynb.fs.full.parse_datasets as datasets
import ipynb.fs.full.preprocessing as pp
import ipynb.fs.full.bert_fake_news_classifier as bclf
from ipynb.fs.full.notif_email import send_email

# Dataset

In [29]:
tweet_data, label_data = datasets.parse_dataset("datasets", "en")

# Statistical model

## Data extractor
* Extract statistical features from tweet text

In [155]:
from sklearn.preprocessing import normalize

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import string
import demoji

In [333]:
# Stats extraction helper functions
analyzer = SentimentIntensityAnalyzer()

digits = set("0123456789")
printable = set(string.printable)
punctuation = set(string.punctuation)
punctuation.remove('#')


def clean_text(text, remove_punc=True, remove_non_print=True, remove_emojis=True, 
              remove_digits=True):
    """ Clean text by removing certain characters (e.g. punctuation) """
    if remove_emojis:
        text = demoji.replace(text, "")
        
    chars = []
    for char in text:
        if not ((remove_punc and char in punctuation) or
            (remove_non_print and char not in printable) or
            (remove_digits and char in digits)):
            chars.append(char)
        
    return "".join(chars)


def polarity_scores(text):
    polarity_dict = analyzer.polarity_scores(text)
    return np.asarray([
        polarity_dict['pos'],
        polarity_dict['neu'],
        polarity_dict['neg'],
    ])


def tweets_to_words(user_tweets):
    return np.asarray([clean_text(tweet.text).split() for tweet in user_tweets])


def std_dev(datapoints, mean, num_datapoints=100):
    diff = datapoints - mean
    return np.sqrt(np.sum(diff ** 2)/100)


def average_tweet_lengths(user_tweets):
    return np.mean([len(tweet) for tweet in user_tweets])


def std_dev_tweet_lengths(user_tweets):
    tweet_lens = [len(tweet) for tweet in user_tweets]
    return std_dev(
        np.asarray(tweet_lens),
        np.mean(tweet_lens),
    )


def cased_chars(user_tweets, cased):
    return [
        sum([c.isupper() if cased else c.islower() for c in tweet.text]) 
        for tweet in user_tweets
    ]


def emoji_chars(user_tweets):
    return [len(demoji.findall_list(tweet.text)) for tweet in user_tweets]


def punctuation_chars(user_tweets):
    return [
        len([c for c in tweet.text if c in punctuation]) 
        for tweet in user_tweets
    ]


def array(data_func):
    return lambda args: np.asarray([data_func(args)])

In [334]:
# Extractor functions (to be used in TweetStatsExtractor)
def average_chars(user_tweets):
    """ Returns the average tweet lengths, in characters, for the user """
    return average_tweet_lengths([tweet.text for tweet in user_tweets])


def std_dev_chars(user_tweets):
    """ Returns the standard deviations of tweet lengths, in characters, for the user """
    return std_dev_tweet_lengths([tweet.text for tweet in user_tweets])


def average_words(user_tweets):
    """ Returns the average tweet lengths, in words, for the user """
    return average_tweet_lengths(tweets_to_words(user_tweets))


def std_dev_words(user_tweets):
    """ Returns the standard deviations of tweet lengths, in words, for the user """
    return std_dev_tweet_lengths(tweets_to_words(user_tweets))


def average_sentiment(user_tweets):
    """ Returns the average sentiment scores of the user """
    return np.mean([polarity_scores(tweet.text) for tweet in user_tweets], axis=0)


def average_word_lengths(user_tweets):
    """ Returns the average length of words used by this user """
    return np.mean([
        len(word) 
        for tweet in user_tweets
        for word in clean_text(tweet.text).split()
    ])


def average_tags(user_tweets, tags=['RT', '#USER#', '#HASHTAG#', '#URL#']):
    """ Returns the average number of tags used by this user """
    return np.mean([
        np.asarray([tweet.text.count(tag) for tag in tags])
                   for tweet in user_tweets
    ], axis=0)


def average_cased_chars(user_tweets):
    """ Returns the average number of cased (uppercase) characters per tweet, for the user """
    return np.mean(cased_chars(user_tweets, True))
    

def std_dev_cased_chars(user_tweets):
    """ Returns the standard deviation of cased characters per tweet, for the user """
    return std_dev(cased_chars(user_tweets, True), average_cased_chars(user_tweets))
    
    
def average_uncased_chars(user_tweets):
    """ Returns the average number of uncased (lowercase) characters per tweet, for the user """
    return np.mean(cased_chars(user_tweets, False))


def std_dev_uncased_chars(user_tweets):
    """ Returns the standard deviation of cased characters per tweet, for the user """
    return std_dev(cased_chars(user_tweets, False), average_uncased_chars(user_tweets))

    
def average_emojis(user_tweets):
    """ Returns the average number of emojis per tweet, for the user """
    return np.mean(emoji_chars(user_tweets))
    
    
def std_dev_emojis(user_tweets):
    """ Returns the standard deviation of emojis per tweet, for the user """
    return std_dev(emoji_chars(user_tweets), average_emojis(user_tweets))


def average_punctuation_chars(user_tweets):
    """ Returns the average number of punctuation characters per tweet, for the user """
    return np.mean(punctuation_chars(user_tweets))

    
def average_emojis(user_tweets):
    """ Returns the standard deviation of punctuation characters emojis per tweet, for the user """
    return std_dev(punctuation_chars(user_tweets), average_punctuation_chars(user_tweets))

In [335]:
class TweetStatsExtractor:
    def __init__(self, funcs=[
        array(average_chars),
        array(std_dev_chars),
        array(average_words),
        array(std_dev_words),
        array(average_word_lengths),
        array(average_cased_chars),
        array(std_dev_cased_chars),
        array(average_uncased_chars),
        array(std_dev_uncased_chars),
        array(average_emojis),
        array(std_dev_emojis),
        array(average_punctuation_chars),
        array(average_emojis),
        average_tags,
        average_sentiment,
    ]):
        self.funcs = funcs
    
    def transform(self, X, normalize_data=True):
        result = np.asarray([
            np.concatenate([np.asarray(f(tweet_feed)) for f in self.funcs])
            for tweet_feed in X
        ])
        
        return normalize(result) if normalize_data else result

## Classifier models
* Classify users based on their (normalized) statistical features
* Used a GridSearch to find model optimal parameters as well as TweetStatsExtractor optimal parameters
* We can look at model weights to find the stats which are least and most useful

In [336]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [337]:
# Stats data
stats_extractor = TweetStatsExtractor()
tweet_stats_data = stats_extractor.transform(tweet_data)

(tweet_train, label_train, 
 tweet_val, label_val, 
 tweet_test, label_test) = datasets.split_dataset(tweet_stats_data, label_data)

In [338]:
# Logistic Regression
log_reg_clf = LogisticRegression("l2")
log_reg_clf.fit(tweet_train, label_train)
print("Val accuracy (LogisticRegression):", log_reg_clf.score(tweet_val, label_val))

# Support Vector Classifier
svc_clf = SVC()
svc_clf.fit(tweet_train, label_train)
print("Val accuracy (SVC):", svc_clf.score(tweet_val, label_val))

# Random Forest
forest_clf = RandomForestClassifier()
forest_clf.fit(tweet_train, label_train)
print("Val accuracy (RandomForestClassifier):", forest_clf.score(tweet_val, label_val))

# Gradient Boosting
grad_boost_clf = GradientBoostingClassifier()
grad_boost_clf.fit(tweet_train, label_train)
print("Val accuracy (GradientBoostingClassifier):", grad_boost_clf.score(tweet_val, label_val))

Val accuracy (LogisticRegression): 0.5777777777777777
Val accuracy (SVC): 0.5111111111111111
Val accuracy (RandomForestClassifier): 0.6666666666666666
Val accuracy (GradientBoostingClassifier): 0.6222222222222222
