In [3]:
import sys

preprocessing_path = 'C:\\Users\\joshh\\Desktop\\Uni\\Soton Uni - Yr 3\\COMP3200\\fake-news-profiling\\classifier\\preprocessing'
if preprocessing_path not in sys.path:
    sys.path.insert(1, preprocessing_path)

notif_path = 'C:\\Users\\joshh\\Desktop\\Uni\\Soton Uni - Yr 3\\COMP3200\\fake-news-profiling\\classifier\\notifications'
if notif_path not in sys.path:
    sys.path.insert(1, notif_path)

In [4]:
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf

from ipynb.fs.full.parse_datasets import parse_dataset, split_dataset
import ipynb.fs.full.preprocessing as process
import ipynb.fs.full.bert_fake_news_classifier as bclf
from ipynb.fs.full.notif_email import send_email

# Dataset

In [187]:
tweet_data, label_data = datasets.parse_dataset("datasets", "en")

(tweet_train, label_train, 
 tweet_val, label_val, 
 tweet_test, label_test) = datasets.split_dataset(tweet_data, label_data)

In [268]:
# Remove HTML/XML tags
preprocessor = process.BertTweetFeedDataPreprocessor(transformers=[process.replace_xml_and_html])
tweet_train_processed = preprocessor.transform(tweet_train)
tweet_val_processed = preprocessor.transform(tweet_val)
tweet_test_processed = preprocessor.transform(tweet_test)

# Data extraction

In [5]:
from sklearn.preprocessing import normalize

import string
import demoji
import re
from pyphen import Pyphen

In [624]:
# Stats extraction helper functions
digits = set("0123456789")
printable = set(string.printable)
punctuation = set(string.punctuation)
punctuation.remove('#')

pyphen = Pyphen(lang='en')


def clean_text(text, remove_punc=True, remove_non_print=True, remove_emojis=True, 
              remove_digits=True, remove_tags=False):
    """ Clean text by removing certain characters (e.g. punctuation) """
    if remove_emojis:
        text = demoji.replace(text, "")
        
    chars = []
    for char in text:
        if not ((remove_punc and char in punctuation) or
            (remove_non_print and char not in printable) or
            (remove_digits and char in digits)):
            chars.append(char)

    cleaned = "".join(chars)
    if remove_tags:
        return re.sub('#[A-Z]+#', "", cleaned)
    
    return cleaned


def tweets_to_words(user_tweets, **kwargs):
    return [clean_text(tweet, **kwargs).split() for tweet in user_tweets]


def std_dev(datapoints, mean, num_datapoints=100):
    diff = datapoints - mean
    return np.sqrt(np.sum(diff ** 2, axis=0)/100)


def average_tweet_lengths(user_tweets):
    return np.mean([len(tweet) for tweet in user_tweets])


def std_dev_tweet_lengths(user_tweets):
    tweet_lens = [len(tweet) for tweet in user_tweets]
    return std_dev(np.asarray(tweet_lens), np.mean(tweet_lens))


def cased_chars(user_tweets, cased):
    return [
        sum([c.isupper() if cased else c.islower() for c in tweet]) 
        for tweet in user_tweets
    ]


def emoji_chars(user_tweets):
    """ Returns an array of lists of emojis used in each of the users tweets"""
    return [demoji.findall_list(tweet) for tweet in user_tweets]


def punctuation_chars(user_tweets):
    return [
        len([c for c in tweet if c in punctuation]) 
        for tweet in user_tweets
    ]


def syllables(word):
    """ Counts the number of syllables in a word """
    return pyphen.inserted(word).count('-') + 1


def flatten(xss):
    return [x for xs in xss for x in xs]

In [625]:
from collections.abc import Iterable

class TweetStatsExtractor:
    def __init__(self, extractors):
        if len(extractors) == 0:
            raise Exception("Must pass at least one extracting function")

        self.extractors = extractors
    
    def transform(self, X, normalize_data=False):
        result = []
        for user_tweets in X:
            if len(self.extractors) > 1:
                result.append(np.concatenate([self._apply(f, user_tweets) for f in self.extractors]))
            else:
                result.append(self._apply(self.extractors[0], user_tweets))
        
        return np.asarray(normalize(result) if normalize_data else result)
    
    def _apply(self, extractor, data):
        result = extractor(data)
        if isinstance(result, Iterable):
            return result
        else:
            return np.asarray([result])

# Classifier evaluation

In [6]:
from tqdm import tqdm

import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score, log_loss

In [627]:
def print_features_by_importance(X, y, feature_names):
    select = SelectKBest(k='all')
    select.fit(X, y)
    ordered_features = sorted(zip(feature_names, select.scores_), key=lambda v: v[1], reverse=True)
    for feature, score in ordered_features:
        print(f"{feature}: {score}")

In [634]:
grid_search_estimators = [LogisticRegression, SVC, RandomForestClassifier, GradientBoostingClassifier, KNeighborsClassifier]
grid_search_param_grids = [
    {"Estimator__penalty": ["l1", "l2"], 
     "Estimator__C": [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 768, 1024, 1280], 
     "Estimator__solver": ["newton-cg", "lbfgs", "liblinear"]},
    {"Estimator__C": [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 768, 1024, 1280], 
     "Estimator__kernel": ["linear", "poly", "rbf", "sigmoid"], 
     "Estimator__probability": [True]},
    {"Estimator__n_estimators": [25, 50, 100, 200, 400, 800], 
     "Estimator__criterion": ["gini", "entropy"], 
     "Estimator__min_samples_leaf": [1, 2, 4, 6, 8, 10, 12, 14, 16]},
    {"Estimator__loss": ["deviance", "exponential"], 
     "Estimator__learning_rate": [0.01, 0.05, 0.1, 0.2], 
     "Estimator__n_estimators": [25, 50, 100, 200, 400, 800], 
     "Estimator__min_samples_leaf": [1, 2, 4, 6, 8, 10, 12, 14, 16]},
    {"Estimator__n_neighbors": [2, 3, 4, 5, 6, 7, 8, 9, 10], 
     "Estimator__weights": ["uniform", "distance", ]},
]

def grid_search(X_train, y_train, X_val, y_val, estimators=grid_search_estimators, param_grids=grid_search_param_grids):
    """ 
    Performs a GridSearchCV on the training data, and then evaluates using the validation data.
    Uses a pipeline to find the best K features to use from the training data.
    Returns a list of each estimator with their best parameters, as well as a dataframe containing 
    evaluation data.
    """
    best_df = pd.DataFrame(columns=["Estimator", "K best features", "Mean CV Loss", "Mean CV F1", "Mean CV Accuracy", "Val Loss", "Val Precision", "Val Recall", "Val F1", "Val Accuracy"])
    best_params = []
    ks = list(range(1, len(X_train[0])+1))
    
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    
    for i, (estimator, param_grid) in tqdm(enumerate(zip(estimators, param_grids)), desc="Estimators", total=len(estimators)):
        # Perform a GridSearchCV
        param_grid['SelectKBest__k'] = ks
        search = GridSearchCV(
            Pipeline([('SelectKBest', SelectKBest()), ('Estimator', estimator())]), 
            param_grid, 
            n_jobs=-1, 
            scoring={
                "accuracy": make_scorer(accuracy_score), 
                "f1": make_scorer(f1_score, pos_label="1"),
                "loss": make_scorer(log_loss, greater_is_better=False, needs_proba=True),
            }, 
            refit="loss",
        )
        search.fit(X_train, y_train)
        
        # Collect results
        best_index = search.cv_results_['params'].index(search.best_params_)
        y_train_pred = search.predict(X_train)
        y_val_pred = search.predict(X_val)
        best_df.loc[i] = [estimator.__name__, 
                          search.best_params_['SelectKBest__k'], 
                          abs(search.cv_results_['mean_test_loss'][best_index]), 
                          search.cv_results_['mean_test_f1'][best_index],
                          search.cv_results_['mean_test_accuracy'][best_index],
                          log_loss(y_val, search.predict_proba(X_val)), 
                          precision_score(y_val, y_val_pred, pos_label="1"),
                          recall_score(y_val, y_val_pred, pos_label="1"),
                          f1_score(y_val, y_val_pred, pos_label="1"),
                          accuracy_score(y_val, y_val_pred)]
        best_params.append((estimator.__name__, search.best_params_))
    
    return best_params, best_df

# Readability model
* Extract statistical readability from user tweets:
    * Number of tags (hashtags, mentions, URLs) (https://www.aclweb.org/anthology/U19-1003.pdf, http://ceur-ws.org/Vol-2696/paper_189.pdf)
    * Number of emojis (https://www.aclweb.org/anthology/U19-1003.pdf, http://ceur-ws.org/Vol-2696/paper_189.pdf)
    * Ratio of words to sentences and syllables to words:
        * Flesch-Kincaid grade level, on corrected (and non-corrected) tweets (stripped of unicode and tags, spelling corrected) (https://www.aclweb.org/anthology/U19-1003.pdf)
        * Flesch Reading Ease, modified for short tweet lengths (https://arxiv.org/ftp/arxiv/papers/1401/1401.6058.pdf, https://www.aclweb.org/anthology/U19-1003.pdf)
        * Both of these measurements use the ratio of words to sentences and syllables to words. Since we are using these in a model which will apply weights to these ratios, we don't need to use these scoring functions. Instead we will just make features using the ratios themselves.
        * Note that https://arxiv.org/ftp/arxiv/papers/1401/1401.6058.pdf found that tweets are poorly structured and so assumed that each tweet was a single sentence. This reduces the ratio to just the total number of words per tweet.
    * Tweet lengths (in words and characters) (https://www.aclweb.org/anthology/U19-1003.pdf, http://ceur-ws.org/Vol-2696/paper_189.pdf)
    * Type-token ratio (num_unique_words/total_num_words) (http://ceur-ws.org/Vol-2696/paper_189.pdf)
    * Retweet ratio (num_retweets/total_num_tweets) (http://ceur-ws.org/Vol-2380/paper_263.pdf, http://ceur-ws.org/Vol-2380/paper_189.pdf)
    * Number of truncated tweets (end with a ...) (http://ceur-ws.org/Vol-2696/paper_189.pdf, http://ceur-ws.org/Vol-2380/paper_189.pdf)
    * Use of punctuation marks (!,?, etc) (http://ceur-ws.org/Vol-2380/paper_263.pdf)
    * Use of numerical values
    * Use of personal pronouns (https://sml.stanford.edu/ml/2008/01/hancock-dp-on-lying.pdf)
    * Automated Readability Index (ratio of chars to words, and words to sentences)


* "An Ensemble Model Using N-grams and Statistical Features to Identify Fake News Spreaders on Twitter" paper (http://ceur-ws.org/Vol-2696/paper_189.pdf) built a statistical model to support their N-gram model for this task (and won). "FacTweet: Profiling Fake News Twitter Accounts" used statistical information, such as emotions, style and sentiment to profile fake news spreading users.
* In "A stylometric Inquiry into Hyperpartisan and Fake News" paper, they used 10 readability scores to help classify hyperpartisan news. "Automatic Detection of Fake News" paper also used readability features, such as the number of characters, complex words, long words, number of syllables, word types, and number of paragraphs.

In [629]:
from collections import Counter
from functools import reduce, partial

In [630]:
# Extractor functions (to be used in TweetStatsExtractor)
def tag_counts(user_tweets, tags=['RT', '#USER#', '#HASHTAG#', '#URL#']):
    """ Returns the average number of tag used, for each tag in tags """
    return np.mean([[tweet.count(tag) for tag in tags] for tweet in user_tweets], axis=0)


def retweet_ratio(user_tweets):
    """ Returns the ratio of retweets to regular tweets """
    retweets = 0
    for tweet in user_tweets:
        if tweet.startswith("RT"):
            retweets += 1
    
    return retweets / len(user_tweets)

def emojis_count(user_tweets):
    """ Returns the following emoji counts for this user: total number of emojis used, average number of emojis used 
    per tweet, type-token ratio of emojis (uniqueness of emojis used) """
    tweet_emojis = emoji_chars(user_tweets)
    flattened_tweet_emojis = flatten(tweet_emojis)
    
    total_num_emojis = len(flattened_tweet_emojis)
    mean_num_emojis = np.mean(list(map(len, tweet_emojis)))
    emoji_type_token_ratio = (len(Counter(flattened_tweet_emojis)) / total_num_emojis) if total_num_emojis > 0 else 0
    return np.asarray([total_num_emojis, mean_num_emojis, emoji_type_token_ratio])


def syllables_to_words_ratios(user_tweets):
    """ Returns the overall, average, min, and max ratios of the number of syllables to the number of words """
    tweet_words = tweets_to_words(user_tweets, remove_tags=True)
    tweet_syllables = [sum(map(syllables, words)) for words in tweet_words]
    per_tweet_ratios = [tweet_syllables[i] / max(1, len(tweet_words[i])) for i in range(len(tweet_words))]
    
    overall_ratio = sum(tweet_syllables) / max(1, sum(map(len, tweet_words)))
    mean_ratio = np.mean(per_tweet_ratios)
    min_ratio = min(per_tweet_ratios)
    max_ratio = max(per_tweet_ratios)
    return np.asarray([overall_ratio, mean_ratio, min_ratio, max_ratio])


def average_tweet_lengths(user_tweets):
    """ Returns the average tweet lengths in words and characters """
    mean_words = np.mean(list(map(len, tweets_to_words(user_tweets, remove_tags=True))))
    mean_chars = np.mean(list(map(len, map(partial(clean_text, remove_tags=True), user_tweets))))
    return np.asarray([mean_words, mean_chars])


def word_type_to_token_ratio(user_tweets):
    """ Returns the ratio of unique words to the total number of words in all of a users tweets """
    words = flatten(tweets_to_words(user_tweets, remove_tags=True))
    return len(Counter(list(words))) / len(words)


def truncated_tweets(user_tweets):
    """ Returns the number of truncated tweets """
    count = 0
    for tweet in user_tweets:
        if re.match(".*\.\.\.(?: #URL#)?$", tweet) is not None:
            count += 1
        
    return count


def punctuation_counts(user_tweets, punctuation_marks = "!?,:"):
    """ Returns the average number of each punctuation character in the users tweets, for each punctuation character 
    in punctuation_marks. Also returns the punctuation type-to-token ratio of all of the users tweets """
    all_punc = [c for tweet in user_tweets 
                for c in clean_text(tweet, remove_punc=False, remove_tags=True) if c in punctuation]
    punc_ttr = len(Counter(all_punc)) / max(1, len(all_punc))
    punc_counts = [[tweet.count(punctuation) for punctuation in punctuation_marks] for tweet in user_tweets]
    mean_punc_counts = np.mean(punc_counts, axis=0)
    return np.concatenate([mean_punc_counts, [punc_ttr]])


def number_counts(user_tweets):
    """ Returns the following counts: average number of numerical values per tweet (e.g. "7,000"), average number of 
    monetary values per tweet (e.g. "$90,000", "£90 Million") """
    number_matcher = "\d+(?:,\d+)*(?:\.\d+)?"
    mean_numbers = np.mean([
        len(re.findall(f"(?:^| )(?<![£$€]){number_matcher}", tweet)) for tweet in user_tweets
    ])
    mean_money = np.mean([
        len(re.findall(f"[£$€]{number_matcher}", tweet)) for tweet in user_tweets
    ])
    return np.asarray([mean_numbers, mean_money])


def average_personal_pronouns(user_tweets):
    """ Returns the average number of personal pronouns per tweets """
    personal_pronouns_count = []
    for tweet_words in tweets_to_words(user_tweets, remove_tags=True):
        count = 0
        for tag in nltk.pos_tag(tweet_words):
            if tag[1] == 'PRP':
                count += 1
        
        personal_pronouns_count.append(count)

    return np.mean(personal_pronouns_count)


def char_to_words_ratio(user_tweets):
    """ Returns the ratio of characters to words in the users tweets """
    chars = 0
    words = 0
    for tweet in user_tweets:
        cleaned_tweet = clean_text(tweet, remove_digits=False, remove_tags=True)
        chars += len(cleaned_tweet)
        words += len(cleaned_tweet.split())
    
    chars -= words # don't want to count spaces in chars
    return chars / max(1, words)


def quote_counts(user_tweets):
    """ Returns the total and average number of quotes used by the user """
    num_quotes = [len(re.findall("(?:^| )(?:“.*?”|‘.*?’|\".*?\"|\'.*?\')", tweet)) for tweet in user_tweets]
    return np.asarray([sum(num_quotes), np.mean(num_quotes)])


def capitalisation_counts(user_tweets):
    """ Returns the following counts: average number of words with a capitalised first letter, 
    average number of fully capitalised words """
    first_capitalised = []
    fully_capitalised = []
    for tweet in user_tweets:
        cleaned_tweet = clean_text(tweet, remove_tags=True)
        first_capitalised.append(len(re.findall("[A-Z][a-z]+", cleaned_tweet)))
        fully_capitalised.append(len(re.findall("[A-Z]{2,}[^\w]", cleaned_tweet)))
    
    return np.asarray([
        np.mean(first_capitalised),
        np.mean(fully_capitalised),
    ])

In [631]:
# Stats data
read_stats_extractor = TweetStatsExtractor(extractors=[
    tag_counts,
    retweet_ratio,
    emojis_count,
    std_dev_words,
    syllables_to_words_ratios,
    average_tweet_lengths,
    type_to_token_ratio,
    truncated_tweets,
    punctuation_counts,
    number_counts,
    average_personal_pronouns,
    char_to_words_ratio,
    quote_counts,
    capitalisation_counts,
])
tweet_read_stats_train = read_stats_extractor.transform(tweet_train_processed)
tweet_read_stats_val = read_stats_extractor.transform(tweet_val_processed)
tweet_read_stats_test = read_stats_extractor.transform(tweet_test_processed)

read_stats_feature_names = [
    "Average number of 'RT' tags per tweet",
    "Average number of '#USER#' tags per tweet",
    "Average number of '#HASHTAG#' tags per tweet",
    "Average number of '#URL#' tags per tweet",
    "Ratio of retweets to tweets",
    "Total number of emojis",
    "Average number of emojis per tweet",
    "Total emoji type-token ratio",
    "Total syllables-words ratio",
    "Mean syllables-words ratio",
    "Min syllables-words ratio",
    "Max syllables-words ratio",
    "Average tweet lengths in words",
    "Average tweet lengths in characters",
    "Total word type-token ratio",
    "Number of truncated tweets",
    "Average number of !",
    "Average number of ?",
    "Average number of ,",
    "Average number of :",
    "Total punctuation type-token ratio",
    "Average number of numerical values",
    "Average number of monetary values",
    "Average number of personal pronouns",
    "Ratio of characters to words",
    "Total number of quotes",
    "Average number of quotes",
    "Average words with first letter capitalised",
    "Average fully capitalised words",
]

In [632]:
print_features_by_importance(tweet_read_stats_train, label_train, read_stats_feature_names)

Number of truncated tweets: 15.697604815533163
Average fully capitalised words: 13.947330904176685
Average number of '#USER#' tags per tweet: 11.875411892869051
Average number of 'RT' tags per tweet: 11.473491491452878
Ratio of retweets to tweets: 10.589829935140598
Min syllables-words ratio: 8.552992870212982
Total number of quotes: 8.520799109338276
Mean syllables-words ratio: 8.363220646357618
Total punctuation type-token ratio: 6.3187580745070155
Total emoji type-token ratio: 6.15790343224915
Average number of ?: 5.801838320589138
Total number of emojis: 4.920762063094781
Average number of emojis per tweet: 4.920762063094781
Average number of :: 3.407586382136467
Ratio of characters to words: 3.233606472319345
Average number of ,: 2.083923284452045
Total syllables-words ratio: 2.0794155038702145
Max syllables-words ratio: 1.5934925431587281
Average number of '#HASHTAG#' tags per tweet: 1.5501199157565246
Average number of numerical values: 1.408375555173822
Total word type-token ra

In [635]:
read_stats_search_best_params, read_stats_search_df = grid_search(
    tweet_read_stats_train, label_train, tweet_read_stats_val, label_val)

Estimators: 100%|████████████████████████████████████████████████████████████████████| 5/5 [1:41:16<00:00, 1215.35s/it]


In [636]:
read_stats_search_best_params

[('LogisticRegression',
  {'Estimator__C': 1,
   'Estimator__penalty': 'l1',
   'Estimator__solver': 'liblinear',
   'SelectKBest__k': 7}),
 ('SVC',
  {'Estimator__C': 1,
   'Estimator__kernel': 'rbf',
   'Estimator__probability': True,
   'SelectKBest__k': 15}),
 ('RandomForestClassifier',
  {'Estimator__criterion': 'gini',
   'Estimator__min_samples_leaf': 2,
   'Estimator__n_estimators': 25,
   'SelectKBest__k': 29}),
 ('GradientBoostingClassifier',
  {'Estimator__learning_rate': 0.05,
   'Estimator__loss': 'exponential',
   'Estimator__min_samples_leaf': 8,
   'Estimator__n_estimators': 25,
   'SelectKBest__k': 23}),
 ('KNeighborsClassifier',
  {'Estimator__n_neighbors': 9,
   'Estimator__weights': 'distance',
   'SelectKBest__k': 22})]

In [637]:
read_stats_search_df

Unnamed: 0,Estimator,K best features,Mean CV Loss,Mean CV F1,Mean CV Accuracy,Val Loss,Val Precision,Val Recall,Val F1,Val Accuracy
0,LogisticRegression,7,0.636388,0.677224,0.657143,0.662335,0.615385,0.695652,0.653061,0.622222
1,SVC,15,0.620323,0.65069,0.638095,0.606595,0.68,0.73913,0.708333,0.688889
2,RandomForestClassifier,29,0.590008,0.707978,0.695238,0.673525,0.590909,0.565217,0.577778,0.577778
3,GradientBoostingClassifier,23,0.607356,0.683506,0.695238,0.693411,0.545455,0.521739,0.533333,0.533333
4,KNeighborsClassifier,22,0.645925,0.665613,0.62381,0.617215,0.576923,0.652174,0.612245,0.577778


# Named Entity Recognition Model
* Extract user usage of named entities, and create a feature vector from counts of the different named entities
* "TakeLab at SemEval-2019 Task 4: Hyperpartisan News Detection" paper used an NER counter feature to help classify hyperpartisan news. 

In [638]:
import spacy

spacy_nlp = spacy.load("en_core_web_sm")
spacy_ner_labels = ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW", 
                    "LANGUAGE", "DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"]

In [639]:
def named_entities_count_array(user_tweets):
    """ Extract the named entities from a users tweets, and return an array of counts for each entity """
    freq = dict.fromkeys(spacy_ner_labels, 0)
    for tweet in user_tweets:
        cleaned_tweet = clean_text(tweet, remove_digits=False, remove_tags=True)
        tweet_ne = spacy_nlp(cleaned_tweet).ents
        for entity in tweet_ne:
            freq[entity.label_] += 1
    
    return np.asarray(list(freq.values()))

In [640]:
# Extract NER count arrays
ner_stats_extractor = TweetStatsExtractor(extractors=[named_entities_count_array])

tweet_ner_stats_train = ner_stats_extractor.transform(tweet_train_processed)
tweet_ner_stats_val = ner_stats_extractor.transform(tweet_val_processed)
tweet_ner_stats_test = ner_stats_extractor.transform(tweet_test_processed)

In [None]:
print_features_by_importance(tweet_ner_stats_train, label_train, spacy_ner_labels)

## Finding the best Classifier
* Classify users based on their (normalized) counts of named entitities in their text
* Performing a grid search on: LogisticRegression, SVC, RandomForestClassifier, GradientBoostingClassifier, KNeighborsClassifier

In [641]:
ner_search_best_params, ner_search_df = grid_search(tweet_ner_stats_train, label_train, tweet_ner_stats_val, label_val)

Estimators: 100%|███████████████████████████████████████████████████████████████████████| 5/5 [33:12<00:00, 398.52s/it]


In [642]:
ner_search_best_params

[('LogisticRegression',
  {'Estimator__C': 1,
   'Estimator__penalty': 'l1',
   'Estimator__solver': 'liblinear',
   'SelectKBest__k': 8}),
 ('SVC',
  {'Estimator__C': 1,
   'Estimator__kernel': 'rbf',
   'Estimator__probability': True,
   'SelectKBest__k': 9}),
 ('RandomForestClassifier',
  {'Estimator__criterion': 'gini',
   'Estimator__min_samples_leaf': 1,
   'Estimator__n_estimators': 25,
   'SelectKBest__k': 17}),
 ('GradientBoostingClassifier',
  {'Estimator__learning_rate': 0.2,
   'Estimator__loss': 'deviance',
   'Estimator__min_samples_leaf': 4,
   'Estimator__n_estimators': 25,
   'SelectKBest__k': 17}),
 ('KNeighborsClassifier',
  {'Estimator__n_neighbors': 10,
   'Estimator__weights': 'distance',
   'SelectKBest__k': 7})]

In [643]:
ner_search_df

Unnamed: 0,Estimator,K best features,Mean CV Loss,Mean CV F1,Mean CV Accuracy,Val Loss,Val Precision,Val Recall,Val F1,Val Accuracy
0,LogisticRegression,8,0.663532,0.662051,0.652381,0.62177,0.695652,0.695652,0.695652,0.688889
1,SVC,9,0.622041,0.645832,0.647619,0.616995,0.708333,0.73913,0.723404,0.711111
2,RandomForestClassifier,17,0.588711,0.667477,0.661905,0.639739,0.73913,0.73913,0.73913,0.733333
3,GradientBoostingClassifier,17,0.597132,0.683889,0.67619,0.718097,0.666667,0.695652,0.680851,0.666667
4,KNeighborsClassifier,7,0.649472,0.635092,0.619048,1.364982,0.625,0.652174,0.638298,0.622222


# Sentiment Model
* Calculate the sentiment of a users tweets, and create a feature vector of these scores

In [614]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

In [615]:
def tweet_sentiment_scores(user_tweets):
    """ Returns the average, standard deviation, and max/min sentiment scores of the user """
    tweet_polarity = np.asarray([analyzer.polarity_scores(tweet)['compound'] for tweet in user_tweets])
    sent_mean = np.mean(tweet_polarity, axis=0)
    sent_std_dev = std_dev(tweet_polarity, sent_mean)
    sent_max = np.max(tweet_polarity, axis=0)
    sent_min = np.min(tweet_polarity, axis=0)
    
    num_pos, num_neu, num_neg = 0, 0, 0
    for score in tweet_polarity:
        if score >= 0.05:
            num_pos += 1
        elif score <= -0.05:
            num_neg += 1
        else:
            num_neu += 1
    
    return np.asarray([sent_mean, sent_std_dev, sent_max, sent_min, num_pos, num_neu, num_neg])

def overall_sentiment(user_tweets):
    """ Returns the overall sentiment when all of the users tweets have been concatenated """
    return analyzer.polarity_scores(". ".join(user_tweets))['compound']

In [616]:
# Extract NER count arrays
sent_stats_extractor = TweetStatsExtractor(extractors=[tweet_sentiment_scores, overall_sentiment])

tweet_sent_stats_train = sent_stats_extractor.transform(tweet_train_processed)
tweet_sent_stats_val = sent_stats_extractor.transform(tweet_val_processed)
tweet_sent_stats_test = sent_stats_extractor.transform(tweet_test_processed)

In [617]:
sent_feature_names = [
    "Average tweet sentiment",
    "Standard deviation of tweet sentiments",
    "Max tweet sentiment",
    "Min tweet sentiment",
    "Number of positive tweets",
    "Number of neutral tweets",
    "Number of negative tweets",
    "Overall sentiment of the user",
]
print_features_by_importance(tweet_sent_stats_train, label_train, sent_feature_names)

Average tweet sentiment: 16.664553393764
Number of negative tweets: 16.397770308202634
Number of positive tweets: 11.078266816031569
Overall sentiment of the user: 6.734827276462316
Min tweet sentiment: 6.641085215697125
Max tweet sentiment: 3.2282076507667554
Standard deviation of tweet sentiments: 2.3355819591396343
Number of neutral tweets: 0.5141600099561452


In [618]:
sent_search_best_params, sent_search_df = grid_search(
    tweet_sent_stats_train, label_train, tweet_sent_stats_val, label_val)

Estimators: 5it [14:04, 168.99s/it]


In [619]:
sent_search_best_params

[('LogisticRegression',
  {'Estimator__C': 1,
   'Estimator__penalty': 'l2',
   'Estimator__solver': 'liblinear',
   'SelectKBest__k': 2}),
 ('SVC',
  {'Estimator__C': 1,
   'Estimator__kernel': 'rbf',
   'Estimator__probability': True,
   'SelectKBest__k': 3}),
 ('RandomForestClassifier',
  {'Estimator__criterion': 'entropy',
   'Estimator__min_samples_leaf': 8,
   'Estimator__n_estimators': 100,
   'SelectKBest__k': 8}),
 ('GradientBoostingClassifier',
  {'Estimator__learning_rate': 0.05,
   'Estimator__loss': 'deviance',
   'Estimator__min_samples_leaf': 8,
   'Estimator__n_estimators': 25,
   'SelectKBest__k': 8}),
 ('KNeighborsClassifier',
  {'Estimator__n_neighbors': 10,
   'Estimator__weights': 'distance',
   'SelectKBest__k': 6})]

In [620]:
sent_search_df

Unnamed: 0,Estimator,K best features,Mean CV Loss,Mean CV F1,Mean CV Accuracy,Val Loss,Val Precision,Val Recall,Val F1,Val Accuracy
0,LogisticRegression,2,0.668755,0.612354,0.614286,0.569454,0.809524,0.73913,0.772727,0.777778
1,SVC,3,0.657962,0.645456,0.62381,0.567114,0.75,0.782609,0.765957,0.755556
2,RandomForestClassifier,8,0.638942,0.636711,0.628571,0.537438,0.8,0.695652,0.744186,0.755556
3,GradientBoostingClassifier,8,0.65327,0.635511,0.619048,0.602828,0.666667,0.782609,0.72,0.688889
4,KNeighborsClassifier,6,0.851699,0.63301,0.604762,0.573512,0.666667,0.695652,0.680851,0.666667


# All Statistics Model
* Concatenate all of the statistical features and input them into a single model

In [None]:
# Stats data
all_stats_extractor = TweetStatsExtractor(extractors=[
    tag_counts,
    retweet_ratio,
    emojis_count,
    std_dev_words,
    syllables_to_words_ratios,
    average_tweet_lengths,
    type_to_token_ratio,
    truncated_tweets,
    punctuation_counts,
    number_counts,
    average_personal_pronouns,
    char_to_words_ratio,
    quote_counts,
    capitalisation_counts,
    named_entities_count_array,
    tweet_sentiment_scores,
    overall_sentiment,
])
tweet_all_stats_train = all_stats_extractor.transform(tweet_train_processed)
tweet_all_stats_val = all_stats_extractor.transform(tweet_val_processed)
tweet_all_stats_test = all_stats_extractor.transform(tweet_test_processed)

In [None]:
all_search_best_params, all_search_df = grid_search(
    tweet_all_stats_train, label_train, tweet_all_stats_val, label_val)

In [None]:
all_search_best_params

In [None]:
all_search_df

# Combining the Ensemble model

## Training the best performing models

In [7]:
# Readability Model - SVC
readability_model = Pipeline([
    ('SelectKBest', SelectKBest()), 
    ('Estimator', SVC())
])
readability_model.set_params({
    'Estimator__C': 1, 
    'Estimator__kernel': 'rbf',
    'Estimator__probability': True,
    'SelectKBest__k': 15,
})

readability_model.fit(tweet_read_stats_train, label_train)

In [None]:
# NER Model - RandomForestClassifier
ner_model = Pipeline([
    ('SelectKBest', SelectKBest()), 
    ('Estimator', RandomForestClassifier())
])
ner_model.set_params({
    'Estimator__criterion': 'gini',
    'Estimator__min_samples_leaf': 1,
    'Estimator__n_estimators': 25,
    'SelectKBest__k': 17,
})

ner_model.fit(tweet_ner_stats_train, label_train)

In [None]:
# Sentiment Model - RandomForestClassifier
sentiment_model = Pipeline([
    ('SelectKBest', SelectKBest()), 
    ('Estimator', RandomForestClassifier())
])
sentiment_model.set_params({
    'Estimator__criterion': 'entropy',
    'Estimator__min_samples_leaf': 8,
    'Estimator__n_estimators': 100,
    'SelectKBest__k': 8
})

sentiment_model.fit(tweet_sent_stats_train, label_train)

In [None]:
# All Statistics Model - 
all_stats_model = Pipeline([
    ('SelectKBest', SelectKBest()), 
    ('Estimator', ?())
])
all_stats_model.set_params({

})

all_stats_model.fit(tweet_all_stats_train, label_train)