In [1]:
# TODO: Replace with your Student NET ID
_NAME = "Jason Lee Jia Xuan"
_STUDENT_NUM = 'E0957670'

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import math
import time

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import f1_score
# for tokenizing and extracting bag-of-words vectors
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.exceptions import NotFittedError

# tokenizer
import spacy
nlp = spacy.load("en_core_web_sm")
print("spaCy pipeline: ", nlp.pipe_names)

# multilayer perceptron
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

spaCy pipeline:  ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
/kaggle/input/cs-4248-fact-checking-2420/train.csv
/kaggle/input/cs-4248-fact-checking-2420/test.csv
/kaggle/input/glove-6b/glove.6B.200d.txt
/kaggle/input/glove-6b/glove.6B.50d.txt
/kaggle/input/glove-6b/glove.6B.300d.txt
/kaggle/input/glove-6b/glove.6B.100d.txt


# Import Data

In [3]:
# import data
train_data = pd.read_csv("../input/cs-4248-fact-checking-2420/train.csv")
test_data = pd.read_csv("../input/cs-4248-fact-checking-2420/test.csv")
train_data.head()

Unnamed: 0,Sentence_id,Text,Verdict
0,1,I think we've seen a deterioration of values.,-1
1,2,I think for a while as a nation we condoned th...,-1
2,3,"For a while, as I recall, it even seems to me ...",-1
3,4,"So we've seen a deterioration in values, and o...",-1
4,5,"We got away, we got into this feeling that val...",-1


In [4]:
# import GloVe word embeddings
glove_word_embeddings = {}
word_embedding_dim = 300 # adjust as necessary
with open("/kaggle/input/glove-6b/glove.6B.300d.txt", 'r', encoding="utf-8") as file:
    start = time.time()
    for line in file:
        spl = line.split()
        word = spl[0]
        embedding = spl[1:]
        glove_word_embeddings[word] = np.array(embedding, dtype=np.float64)
    end = time.time()
    print(f"{len(glove_word_embeddings)} words loaded!")
    print(f"time taken: {end - start}")

400000 words loaded!
time taken: 32.65091943740845


# Data Preprocessing
Do some data preprocessing so that the data is of a good quality
- Clean data
- Resolve imbalances
    - Sampling
    - Data augmentation (?)
- Tokenization

## Clean Data
Obtain a standardized set of data
- Data should not contain missing values
- Data should not have duplicates. If there are any duplicates, remove them.

In [5]:
# remove missing values and remove duplicates
def clean_data(data):
    # count missing data, I think kaggle tells us the data does not have missing values
    print("Rows with null Sentence_id: ", sum(data["Sentence_id"].isnull()))
    print("Rows with null Text: ", sum(data["Text"].isnull()))
    print("Rows with null Verdict: ", sum(data["Verdict"].isnull()))

    # remove duplicates from the data
    # set keep=False because we have no idea which label is actually correct
    data_cleaned = data.drop_duplicates(["Text"], keep=False)
    return data_cleaned

## Resolve Class Imbalance
In order to train the model properly, we need to resolve the class imbalance.
We can either upsample or downsample.
- For simplicity, we try downsampling here.

In [6]:
def balance_classes(data):
    # show how many data points there are for each verdict in the training data
    print("Old counts:\n", data.groupby("Verdict").count())        
    # sample from all classes this amount
    min_count = data.groupby("Verdict").count()['Text'].min()
    class1 = data[data['Verdict'] == -1].sample(min_count, random_state=42)
    class2 = data[data['Verdict'] == 0].sample(min_count, random_state=42)
    class3 = data[data['Verdict'] == 1].sample(min_count, random_state=42)
    # combine
    data_balanced = pd.concat([class1, class2, class3], ignore_index=True)
    # verify counts
    print("New counts:\n", data_balanced.groupby("Verdict").count())
    return data_balanced

# Tokenization, Case Folding, Stopword and Punctuation Removal
Perform tokenization on text data:
- make lowercase
- remove stopwords
- remove punctuation
- possible to lemmatize but it is not done here.

If any sentences only contain stopwords, then remove the whole row.

In [7]:
def tokenize(data):
    result = data.copy()
    text = result["Text"]
    tokens = []
    pos = []
    remove = [] # if no tokens are generated, remove it later
    content = []
    for doc in nlp.pipe(text, batch_size=50):
        # remove tokens that we don't want
        doc_clean = [token for token in doc]
        # tokenize
        t = np.array([token.lower_ for token in doc_clean])
        p = np.array([token.pos_ for token in doc_clean])
        x = np.array([token.lemma_.lower() for token in doc_clean if not token.is_stop]) # lemmatized, stopword removed token list
        remove.append(t.shape[0] == 0)
        tokens.append(t)
        content.append(x)
        pos.append(p)

    result["Tokens"] = tokens
    result["Pos"] = pos
    result["Remove"] = remove
    result["Content"] = content
    result = result.drop(result[result["Remove"]].index)
    return result

# Data Split
Split data into training, validation, and test sets for training a model.
We will use a 80-10-10 split.

In [8]:
def split_data(tokenized_data):
    # split by class first
    class1 = tokenized_data[tokenized_data["Verdict"] == -1]
    class2 = tokenized_data[tokenized_data["Verdict"] == 1]
    class3 = tokenized_data[tokenized_data["Verdict"] == 0]
    
    # split tokenized data into 80-20 split before engineering features
    count80 = math.floor(class1.shape[0] * 0.8)
    count20 = class1.shape[0] - count80
    sample1a = class1.sample(count80, random_state=10)
    sample1b = class1.drop(sample1a.index)
    count80 = math.floor(class2.shape[0] * 0.8)
    count20 = class2.shape[0] - count80
    sample2a = class2.sample(count80, random_state=10)
    sample2b = class2.drop(sample2a.index)
    count80 = math.floor(class3.shape[0] * 0.8)
    count20 = class3.shape[0] - count80
    sample3a = class3.sample(count80, random_state=10)
    sample3b = class3.drop(sample3a.index)
    tokenized_train = pd.concat((sample1a, sample2a, sample3a), axis=0)
    tokenized_test = pd.concat((sample1b, sample2b, sample3b), axis=0)
    
    y_train = tokenized_train["Verdict"]
    y_test = tokenized_test["Verdict"]
    return tokenized_train, tokenized_test, y_train, y_test

# Feature Engineering
After processing the Text into tokens, we have to derive features from the tokens. A few approaches available:
- Bag-of-Words representation
- Document term matrix with tf-idf weights
- PPMI term context matrix (?)
- Dense word embedding (Word2Vec)
- Can also apply PCA

In [9]:
def count(doc, words):
    count = 0
    for token in doc:
        if token in words:
            count += 1
    return count

def count_ref_other_people(doc):
    words = set(["you", "others", "people", "yourself", "yourselves", "your", ""])
    return count(doc, words)

def count_ref_self(doc):
    words = set(["i", "self", "me", "myself", "mine", "friends", "friend", "family", "buddy", "mate"])
    return count(doc, words)

def count_causal(doc):
    words = set(["cause", "because", "effect", "hence", "therefore", "thus", "since", "reason", "due"])
    return count(doc, words)

def count_negation(doc):
    words = set(["no", "not", "neither", "none", "nobody", "nothing", "nowhere", "hardly", "seldom", "little"])
    return count(doc, words)

def count_promise(doc):
    words = set(["\'ll", "will", "should", "future", "believe", "think", "consider", "propose", "want", "suspect", "suppose", "time", "come", "upcoming"])
    return count(doc, words)

def count_all_or_nothing(doc):
    words = set(["everything", "nothing", "everyone", "all", "no", "never", "always", ])
    return count(doc, words)

def count_prosocial(doc):
    words = set(["care", "help", "please", "thank", "thanks", "support", "trust", "faith"])
    return count(doc, words)

def count_together(doc):
    words = set(["we", "us", "our", "ours", "together"])
    return count(doc, words)

def count_accusation(doc):
    words = set(["lie", "steal", "cheat", "betray", "deceive", "manipulate", "harm", "ruin", "destroy", "fake", "liar", "cheater", "fraud", "backstabber", "traitor", "thief", "hypocrite", "coward", "fool", "idiot", "moron"])
    return count(doc, words)


In [10]:
# obtain an embedding vector representing each sentence by taking sum over all word embeddings in each sentence
def compute_embeddings(corpus):
    doc_embeddings = []
    for doc in corpus:
        doc_embedding = np.zeros(word_embedding_dim, dtype=np.float64)
        for token in doc:
            if token in glove_word_embeddings:
                doc_embedding = np.add(doc_embedding, glove_word_embeddings[token])
        doc_embeddings.append(doc_embedding)
    return np.array(doc_embeddings)

# def compute_tfidf(tokens, vectorizer):
#     try:
#         vectorizer.transform(tokens)
#     except NotFittedError as e:
#         print("fitting vectorizer!")
#         vectorizer.fit(tokens)

#     return vectorizer.transform(tokens).toarray()

# compute counts of certain words
def get_additional_features(corpus):
    result = []
    for doc in corpus:
        counts = [
            count_ref_other_people(doc),
            count_ref_self(doc),
            count_causal(doc),
            count_negation(doc),
            count_promise(doc),
            count_all_or_nothing(doc),
            count_prosocial(doc),
            count_together(doc),
        ]
        result.append(counts)
    return np.array(result)

# some counts need lemmatized vocabulary
def get_additional_features2(lemmas):
    result = []
    for doc in lemmas:
        counts = [
            count_accusation(doc)
        ]
        result.append(counts)
    return np.array(result)

# def conduct_pca(features, pca):
#     try:
#         pca.transform(features)
#     except NotFittedError as e:
#         print("fitting pca!")
#         pca.fit(features)
#         print("PCA cumulative variance: ", np.cumsum(pca.explained_variance_ratio_))
#     return pca.transform(features)
    

# pipeline which outputs all features, run with train data first
def get_features(tokenized_data):
    features = [
        compute_embeddings(tokenized_data["Tokens"]),
        get_additional_features(tokenized_data["Tokens"]),
        get_additional_features2(tokenized_data["Content"]),
    ]
    features = np.concatenate(features, axis=1)
    return features

# Modelling
For the model, we can choose from these 3 approaches:
- Naive Bayes (generative classifier)
- Logistic Regression (discriminative classifier)
- Multi-Layer Perceptron Neural Network (discriminative classifier)

To obtain a baseline model, we will only do this for now:
- Features: Bag-of-Words, one-hot encoding of documents
- Model: Naive Bayes


In [11]:
# Neural Network works on word embeddings of sentence and other features
class Model(nn.Module):
    def __init__(self, feature_dim):
        super().__init__()
        # define multilayer perceptron layers
        self.fc1 = nn.Linear(feature_dim, 128)
        self.fc2 = nn.Linear(128, 32)
        self.fc3 = nn.Linear(32, 3)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        # x = F.dropout(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        output = F.log_softmax(x, dim=1)
        return output

## Results
Predict results and compute performance of the model

In [12]:
def compute_performance_per_class(model, X_test, y_test):
    # y_pred = model.predict(X_test)
    pred_model = model(X_test)
    _, y_pred = pred_model.max(1)
    # need to convert values of 2 in y_pred back to -1
    y_pred = y_pred.numpy()
    y_test = y_test.numpy()
    y_pred[y_pred == 2] = -1
    y_test[y_test == 2] = -1
    print(y_pred)
    print(y_pred.shape)
    print(y_test.shape)
    print(y_pred[:10])
    # compute separately for each class
    result = []
    for c in [-1, 0, 1]:
        TP = np.sum((y_pred == c) & (y_test == c))
        FP = np.sum((y_pred == c) & (y_test != c))
        FN = np.sum((y_pred != c) & (y_test == c))
        TN = np.sum((y_pred != c) & (y_test != c))
        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        F1 = 2 * (precision * recall) / (precision + recall)
        result.append([c, precision, recall, F1])
    return pd.DataFrame(data=np.array(result), columns=["Class", "Precision", "Recall", "F1"])

In [13]:
def compute_macro_f1(f1_scores):
    return np.mean(f1_scores)

In [14]:
def compute_performance_per_class_2(model, X_test, y_test):
    y_pred = model.predict(X_test)
    # compute separately for each class
    result = []
    for c in [-1, 0, 1]:
        TP = np.sum((y_pred == c) & (y_test == c))
        FP = np.sum((y_pred == c) & (y_test != c))
        FN = np.sum((y_pred != c) & (y_test == c))
        TN = np.sum((y_pred != c) & (y_test != c))
        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        F1 = 2 * (precision * recall) / (precision + recall)
        result.append([c, precision, recall, F1])
    return pd.DataFrame(data=np.array(result), columns=["Class", "Precision", "Recall", "F1"])

# Run Code

In [15]:
# clean data
cleaned_data = clean_data(train_data)
# balanced data
balanced_data = balance_classes(cleaned_data)

Rows with null Sentence_id:  0
Rows with null Text:  0
Rows with null Verdict:  0
Old counts:
          Sentence_id   Text
Verdict                    
-1             14542  14542
 0              2388   2388
 1              5386   5386
New counts:
          Sentence_id  Text
Verdict                   
-1              2388  2388
 0              2388  2388
 1              2388  2388


In [16]:
# tokenize data
tokenized_data = tokenize(balanced_data)
print("Tokenized data columns: ", tokenized_data.columns)
# split data
tokenized_train, tokenized_test, y_train, y_test = split_data(tokenized_data)
# engineer features
X_train = get_features(tokenized_train)
print("X_train: ", X_train.shape)
X_test = get_features(tokenized_test)
# set feature size
feature_dim = X_train.shape[1]
model = Model(feature_dim)


# convert data to tensors
y_train_mlp = y_train.replace(to_replace=-1, value=2)
y_test_mlp = y_test.replace(to_replace=-1, value=2)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_mlp.to_numpy(), dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_mlp.to_numpy(), dtype=torch.long)

Tokenized data columns:  Index(['Sentence_id', 'Text', 'Verdict', 'Tokens', 'Pos', 'Remove', 'Content'], dtype='object')
X_train:  (5730, 309)


In [17]:
# train neural network

# define model parameters
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

num_epochs = 7000
for n in range(num_epochs):
    model.train()
    y_pred = model(X_train_tensor)
    loss = loss_fn(y_pred, y_train_tensor)
    if n % 1000 == 0:
        print(loss)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

tensor(1.3168, grad_fn=<NllLossBackward0>)
tensor(0.8989, grad_fn=<NllLossBackward0>)
tensor(0.8397, grad_fn=<NllLossBackward0>)
tensor(0.8116, grad_fn=<NllLossBackward0>)
tensor(0.7928, grad_fn=<NllLossBackward0>)
tensor(0.7775, grad_fn=<NllLossBackward0>)
tensor(0.7651, grad_fn=<NllLossBackward0>)


In [18]:
# compute results
results = compute_performance_per_class(model, X_test_tensor, y_test_tensor)
print(results)
macro_f1 = compute_macro_f1(results['F1'])
print("Macro F1: ", macro_f1)

[-1 -1 -1 ...  1  0  0]
(1434,)
(1434,)
[-1 -1 -1 -1 -1  0 -1  0  0 -1]
   Class  Precision    Recall        F1
0   -1.0   0.661710  0.744770  0.700787
1    0.0   0.659389  0.631799  0.645299
2    1.0   0.703196  0.644351  0.672489
Macro F1:  0.6728585432811269


# Export Results

In [19]:
def generate_result(test, y_pred, filename):
    print(np.array(test).shape)
    tokenized_data = tokenize(test)
    print("Tokenized data columns: ", tokenized_data.columns)
    X_test = get_features(tokenized_data)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

    model.eval()
    pred_model = model(X_test_tensor)
    _, y_pred = pred_model.max(1)
    y_pred = y_pred.numpy()
    y_pred[y_pred == 2] = -1
    
    ''' generate csv file base on the y_pred '''
    test['Verdict'] = pd.Series(y_pred)
    test.drop(columns=['Text'], inplace=True)
    test.to_csv(filename, index=False)

output_filename = f"A2_{_NAME}_{_STUDENT_NUM}.csv"
generate_result(test_data, y_pred, output_filename)

(1032, 2)
Tokenized data columns:  Index(['Sentence_id', 'Text', 'Tokens', 'Pos', 'Remove', 'Content'], dtype='object')
