In [1]:
import sys

# Directories containing the project's helper notebooks (preprocessing and
# notification utilities) that must be importable from this notebook.
preprocessing_path = 'C:\\Users\\joshh\\Desktop\\Uni\\Soton Uni - Yr 3\\COMP3200\\fake-news-profiling\\classifier\\preprocessing'
notif_path = 'C:\\Users\\joshh\\Desktop\\Uni\\Soton Uni - Yr 3\\COMP3200\\fake-news-profiling\\classifier\\notifications'

# Put each directory at position 1 of sys.path (directly after the first
# entry), skipping any that is already registered so that re-running the cell
# does not add duplicates.
for extra_import_dir in (preprocessing_path, notif_path):
    if extra_import_dir not in sys.path:
        sys.path.insert(1, extra_import_dir)

In [2]:
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf

import ipynb.fs.full.parse_datasets as datasets
import ipynb.fs.full.preprocessing as pp
import ipynb.fs.full.bert_fake_news_classifier as bclf
from ipynb.fs.full.notif_email import send_email

# Dataset

In [4]:
# Load the tweets and their labels through the project's dataset parser.
# NOTE(review): "datasets" looks like a directory resolved relative to the
# kernel's working directory and "en" like a language code -- confirm against
# parse_dataset.
tweet_data, label_data = datasets.parse_dataset("datasets", "en")

In [5]:
# Preprocessing functions handed to the tweet preprocessor, listed in the
# order they are supplied.
preprocessing_steps = [
    pp.tag_indicators,
    pp.replace_xml_and_html,
    pp.replace_emojis,
    pp.remove_punctuation,
    pp.replace_tags,
    pp.remove_hashtag_chars,
    pp.replace_accented_chars,
    pp.tag_numbers,
    pp.remove_extra_spacing,
]

# Build the preprocessor and run it over the parsed tweets
tweet_preprocessor = pp.TweetPreprocessor(preprocess_funcs=preprocessing_steps)
tweet_preprocessor.preprocess(tweet_data)

# Dataset of individual tweets, as exposed by the preprocessor
tweet_data_individual = tweet_preprocessor.get_individual_tweets_dataset()

In [7]:
# Split the individual-tweet dataset and its labels into train, validation
# and test portions using the project's splitting helper.
tweet_train, label_train, tweet_val, label_val, tweet_test, label_test = (
    datasets.split_dataset(tweet_data_individual, label_data)
)

## Sentiment

In [14]:
# VADER sentiment analyser: a single module-level instance that the sentiment
# helper functions defined later in this notebook call into.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sentiment_analyser = SentimentIntensityAnalyzer()

In [67]:
def compound_sentiment(text):
    """Return the 'compound' entry of the analyser's polarity scores for `text`."""
    polarity = sentiment_analyser.polarity_scores(text)
    return polarity['compound']

def sentiments(text):
    """Return the analyser's 'neg', 'neu' and 'pos' scores for `text`.

    The three values are packed, in that order, into a NumPy array.
    """
    polarity = sentiment_analyser.polarity_scores(text)
    return np.asarray([polarity[key] for key in ('neg', 'neu', 'pos')])

### Sentiment model 1
Measures the compound sentiment score of each of a user's tweets.
These 100 data points (one per tweet, sorted in ascending order) are then used to profile the user.

In [79]:
def _sorted_compound_scores(users_tweets):
    """Build one row per user holding the compound score of each of their tweets.

    Every row is sorted in ascending order (np.sort along axis 1).
    """
    per_user_scores = [
        [compound_sentiment(tweet) for tweet in user_tweets]
        for user_tweets in users_tweets
    ]
    return np.sort(np.asarray(per_user_scores), axis=1)


# Training features and labels
X_train = _sorted_compound_scores(tweet_train)
y_train = label_train

# Validation features and labels
X_val = _sorted_compound_scores(tweet_val)
y_val = label_val

In [80]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Fit both classifiers on the training features with their default settings.
# `fit` returns the estimator itself, so the logistic regression can be built
# and fitted in one expression.
log_reg_clf = LogisticRegression().fit(X_train, y_train)

# The SVC fit is kept as the cell's final expression so the fitted estimator
# is still displayed as the cell output.
svc_clf = SVC()
svc_clf.fit(X_train, y_train)

SVC()

In [81]:
# Report each classifier's mean accuracy on the training data
for score_label, scored_clf in (
    ("Train accuracy (LogisticRegression):", log_reg_clf),
    ("Train accuracy (SVC):", svc_clf),
):
    print(score_label, scored_clf.score(X_train, y_train))

Train accuracy (LogisticRegression): 0.6857142857142857
Train accuracy (SVC): 0.6619047619047619


In [82]:
# Report each classifier's mean accuracy on the validation data
for score_label, scored_clf in (
    ("Validation score (LogisticRegression):", log_reg_clf),
    ("Validation score (SVC):", svc_clf),
):
    print(score_label, scored_clf.score(X_val, y_val))

Validation score (LogisticRegression): 0.6444444444444445
Validation score (SVC): 0.6222222222222222


### Sentiment model 2
Measures the overall sentiment of the user by taking the mean of the negative, neutral, and positive sentiment scores across their tweets, so each data point has shape (3,). This is then used to profile the user.

In [83]:
def format_X_data(data):
    """Average the (neg, neu, pos) sentiment scores over each user's tweets.

    `data` is iterated as a collection of users, each of which is iterated as
    a collection of tweets. The result is an array with one row per user,
    where each row is the element-wise mean of `sentiments(tweet)` over that
    user's tweets.
    """
    user_means = []
    for user_tweets in data:
        tweet_scores = [sentiments(tweet) for tweet in user_tweets]
        user_means.append(np.mean(tweet_scores, axis=0))
    return np.asarray(user_means)

# Mean-sentiment features for the training and validation users
X_train, X_val = format_X_data(tweet_train), format_X_data(tweet_val)

# Labels are used unchanged
y_train, y_val = label_train, label_val

In [84]:
# Re-fit fresh classifiers on the mean-sentiment features. `fit` returns the
# estimator itself, so the logistic regression is built and fitted in one go.
log_reg_clf = LogisticRegression().fit(X_train, y_train)

# The SVC fit is kept as the cell's final expression so the fitted estimator
# is still displayed as the cell output.
svc_clf = SVC()
svc_clf.fit(X_train, y_train)

SVC()

In [85]:
# Report each classifier's mean accuracy on the training data
for score_label, scored_clf in (
    ("Train accuracy (LogisticRegression):", log_reg_clf),
    ("Train accuracy (SVC):", svc_clf),
):
    print(score_label, scored_clf.score(X_train, y_train))

Train accuracy (LogisticRegression): 0.6476190476190476
Train accuracy (SVC): 0.6619047619047619


In [86]:
# Report each classifier's mean accuracy on the validation data
for score_label, scored_clf in (
    ("Validation accuracy (LogisticRegression):", log_reg_clf),
    ("Validation accuracy (SVC):", svc_clf),
):
    print(score_label, scored_clf.score(X_val, y_val))

Validation accuracy (LogisticRegression): 0.7111111111111111
Validation accuracy (SVC): 0.6444444444444445
