In [1]:
# Author identification (name and student number); the commented-out export at the
# end of the notebook uses both values to build the output filename.
# NOTE(review): the original TODO asked for the Student NET ID -- confirm these are
# the values the submission expects.
_NAME = "Jason Lee Jia Xuan"
_STUDENT_NUM = 'E0957670'

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import time

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import f1_score
# for tokenizing and extracting bag-of-words vectors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# tokenizer
import spacy
# Load the small English spaCy pipeline; it supplies the tokens, lemmas and POS tags
# used by the tokenization step.
nlp = spacy.load("en_core_web_sm")
nlp.select_pipes(disable=["parser", "ner"]) # disable some components for performance
print("spaCy pipeline: ", nlp.pipe_names)
# vectorizer
# Documents reach the vectorizer already tokenized and lowercased, so the tokenizer is
# the identity function and the vectorizer's own lowercasing is switched off.
# NOTE(review): token_pattern=None presumably silences scikit-learn's warning about an
# unused token_pattern when a custom tokenizer is supplied -- confirm.
vectorizer = TfidfVectorizer(lowercase=False, tokenizer=lambda x:x, token_pattern=None)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

spaCy pipeline:  ['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer']
/kaggle/input/cs-4248-fact-checking-2420/train.csv
/kaggle/input/cs-4248-fact-checking-2420/test.csv
/kaggle/input/glove-twitter-word-embeddings/glove.twitter.27B.200d.txt
/kaggle/input/glove-twitter-word-embeddings/glove.twitter.27B.25d.txt
/kaggle/input/glove-twitter-word-embeddings/glove.twitter.27B.50d.txt
/kaggle/input/glove-twitter-word-embeddings/glove.twitter.27B.100d.txt


# Import Data

In [3]:
# import data
# Read the training and test CSV files from the competition input directory.
train_data = pd.read_csv("../input/cs-4248-fact-checking-2420/train.csv")
test_data = pd.read_csv("../input/cs-4248-fact-checking-2420/test.csv")
# Preview the first rows of the training data.
train_data.head()

Unnamed: 0,Sentence_id,Text,Verdict
0,1,I think we've seen a deterioration of values.,-1
1,2,I think for a while as a nation we condoned th...,-1
2,3,"For a while, as I recall, it even seems to me ...",-1
3,4,"So we've seen a deterioration in values, and o...",-1
4,5,"We got away, we got into this feeling that val...",-1


# Data Preprocessing
Preprocess the data so that it is of good quality:
- Clean data
- Resolve imbalances
    - Sampling
    - Data augmentation (?)
- Tokenization

## Clean Data
Obtain a standardized set of data
- Data should not contain missing values
- Data should not have duplicates. If there are any duplicates, remove them.

In [4]:
# remove missing values and remove duplicates
def clean_data(data):
    """Report missing values, then drop incomplete rows and ambiguous duplicates.

    Args:
        data: DataFrame with ``Sentence_id``, ``Text`` and ``Verdict`` columns.

    Returns:
        A new DataFrame without any row that has a missing value in one of the
        three columns, and without any row whose ``Text`` occurs more than once.
        The input frame is left unmodified.
    """
    required = ["Sentence_id", "Text", "Verdict"]

    # count missing data, I think kaggle tells us the data does not have missing values
    for column in required:
        print(f"Rows with null {column}: ", data[column].isnull().sum())

    # Actually remove the incomplete rows instead of only counting them.
    data_complete = data.dropna(subset=required)

    # remove duplicates from the data
    # set keep=False because we have no idea which label is actually correct
    data_cleaned = data_complete.drop_duplicates(["Text"], keep=False)
    return data_cleaned

# Rebinds train_data to the cleaned frame; the raw frame is no longer reachable under this name.
train_data = clean_data(train_data)

Rows with null Sentence_id:  0
Rows with null Text:  0
Rows with null Verdict:  0


## Resolve Class Imbalance
In order to train the model properly, we need to resolve the class imbalance.
We can either upsample or downsample.
- For simplicity, we try downsampling here.

In [5]:
def balance_classes(data, random_state=42):
    """Downsample every class to the size of the smallest class.

    Args:
        data: DataFrame with a ``Verdict`` label column and a ``Text`` column.
        random_state: Seed forwarded to ``DataFrame.sample`` so that the same
            rows are drawn on every run. Pass ``None`` for unseeded sampling.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding the same number of
        rows for every label present in ``data``, grouped by label in sorted
        label order.
    """
    # show how many data points there are for each verdict in the training data
    counts = data.groupby("Verdict").count()
    print("Old counts:\n", counts)
    if counts.empty:
        # Nothing to balance; hand back an (empty) copy with a clean index.
        return data.reset_index(drop=True)

    # obtain number of samples for smallest class
    min_count = counts['Text'].min()

    # sample this amount from every class that actually occurs in the data,
    # rather than assuming the labels -1, 0 and 1 are all present
    samples = [
        data[data['Verdict'] == label].sample(min_count, random_state=random_state)
        for label in counts.index
    ]

    # combine
    data_balanced = pd.concat(samples, ignore_index=True)
    # verify counts
    print("New counts:\n", data_balanced.groupby("Verdict").count())
    return data_balanced

# Rebind train_data to the class-balanced sample, then display it.
train_data = balance_classes(train_data)
train_data

Old counts:
          Sentence_id   Text
Verdict                    
-1             14542  14542
 0              2388   2388
 1              5386   5386
New counts:
          Sentence_id  Text
Verdict                   
-1              2388  2388
 0              2388  2388
 1              2388  2388


Unnamed: 0,Sentence_id,Text,Verdict
0,20082,"As soon as she releases them, I will release.",-1
1,17364,We have to respect one another.,-1
2,2094,If we turn to Helsinki - I'm glad you raised i...,-1
3,8663,We have to be able to compete in the world mar...,-1
4,21552,And you look at our miners.,-1
...,...,...,...
7159,18245,That was something I concurred with...,1
7160,17623,"We have in the Dole-Kemp economic plan, unless...",1
7161,16520,"Many of the countries below that used to say, ...",1
7162,9886,"Of course, it doesn't come out of the payroll ...",1


# Tokenization, Case Folding, Stopword and Punctuation Removal
Perform tokenization on text data:
- make lowercase
- remove stopwords
- remove punctuation
- lemmatize (each token is replaced by its lemma before lowercasing)

If a sentence contains only stopwords and punctuation (i.e. no tokens remain), the whole row is removed.

In [6]:
def tokenize(data):
    """Add lemma and POS-tag columns and drop rows that yield no tokens.

    Each ``Text`` entry is run through the module-level spaCy pipeline ``nlp``.
    Punctuation and stopword tokens are discarded; for the remaining tokens the
    lowercased lemma and the coarse POS tag are kept.

    Args:
        data: DataFrame with a ``Text`` column.

    Returns:
        A new DataFrame with the extra columns ``Tokens`` (array of lowercased
        lemmas), ``Pos`` (array of POS tags, parallel to ``Tokens``) and
        ``Remove`` (always False in the returned rows). Rows for which no
        tokens remain are left out. The input frame is not modified.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    data = data.copy()
    tokens = []
    pos = []
    remove = [] # if no tokens are generated, remove it later
    for doc in nlp.pipe(data["Text"], batch_size=50):
        # Filter once and derive both parallel arrays from the same tokens.
        kept = [token for token in doc if not token.is_punct and not token.is_stop]
        t = np.array([token.lemma_.lower() for token in kept])
        p = np.array([token.pos_ for token in kept])
        remove.append(t.shape[0] == 0)
        tokens.append(t)
        pos.append(p)
    data["Tokens"], data["Pos"], data["Remove"] = tokens, pos, remove
    # Select rows positionally so duplicate index labels cannot drop extra rows.
    keep_mask = ~np.array(remove, dtype=bool)
    return data[keep_mask]

tokenized_data = tokenize(train_data)
tokenized_data

Unnamed: 0,Sentence_id,Text,Verdict,Tokens,Pos,Remove
0,20082,"As soon as she releases them, I will release.",-1,"[soon, release, release]","[ADV, VERB, VERB]",False
1,17364,We have to respect one another.,-1,[respect],[VERB],False
2,2094,If we turn to Helsinki - I'm glad you raised i...,-1,"[turn, helsinki, glad, raise, mr., uh, frankel]","[VERB, PROPN, ADJ, VERB, PROPN, INTJ, PROPN]",False
3,8663,We have to be able to compete in the world mar...,-1,"[able, compete, world, market]","[ADJ, VERB, NOUN, NOUN]",False
4,21552,And you look at our miners.,-1,"[look, miner]","[VERB, NOUN]",False
...,...,...,...,...,...,...
7159,18245,That was something I concurred with...,1,[concur],[VERB],False
7160,17623,"We have in the Dole-Kemp economic plan, unless...",1,"[dole, kemp, economic, plan, home, worth, 500,...","[PROPN, PROPN, ADJ, NOUN, NOUN, NOUN, NUM, NOUN]",False
7161,16520,"Many of the countries below that used to say, ...",1,"[country, problem, demand, problem, work, coop...","[NOUN, NOUN, NOUN, NOUN, VERB, ADV, PROPN, NOUN]",False
7162,9886,"Of course, it doesn't come out of the payroll ...",1,"[course, come, payroll, tax]","[ADV, VERB, NOUN, NOUN]",False


# Feature Engineering
After processing the Text into tokens, we have to derive features from the tokens. A few approaches available:
- Bag-of-Words representation
- Document term matrix with tf-idf weights
- PPMI term context matrix (?)
- Dense word embedding (Word2Vec)
- Can also apply PCA

In [7]:
# obtain a document vector representing each sentence
def fit_transform(tokens):
    """Fit the module-level ``vectorizer`` on ``tokens`` and return the result as a dense array."""
    sparse_matrix = vectorizer.fit_transform(tokens)
    return sparse_matrix.toarray()

# obtain the share of adjectives, adverbs and numbers as additional features
def compute_adj_adv_num_counts(pos):
    """Compute the proportion of ADJ, ADV and NUM tags in each tag sequence.

    Args:
        pos: Iterable of POS-tag sequences (arrays or lists of strings), one
            per document.

    Returns:
        DataFrame with one row per document and the columns ``ADJ``, ``ADV``
        and ``NUM``, each holding that tag's count divided by the number of
        tags in the document. A document without any tags gets 0.0 in all
        three columns instead of NaN from a division by zero.
    """
    tag_names = ("ADJ", "ADV", "NUM")
    result = []
    for p in pos:
        tags = np.asarray(p)
        total_count = tags.size
        if total_count == 0:
            # No tags at all: avoid 0 / 0 and report zero shares.
            result.append(np.zeros(len(tag_names)))
            continue
        tag_counts = np.array([np.count_nonzero(tags == name) for name in tag_names])
        result.append(tag_counts / total_count)

    return pd.DataFrame(result, columns=["ADJ", "ADV", "NUM"])

# Build the tf-idf word features and the POS-based features, then join them column-wise.
# NOTE(review): the vectorizer is fitted on every row before the train/validation/test
# split, so held-out rows contribute to the vocabulary and idf weights.
word_features = fit_transform(tokenized_data["Tokens"])
pos_features = compute_adj_adv_num_counts(tokenized_data["Pos"])
features = np.concatenate((word_features, pos_features), axis=1)
print(features.shape)

(7135, 5939)


## Data Split
Split data into training, validation, and test sets for training a model.
We will use an 80-10-10 split.

In [8]:
# Features and labels must have one row per sentence each.
X, y = features, tokenized_data["Verdict"]
assert features.shape[0] == tokenized_data["Verdict"].shape[0]

# First set aside 20% of the rows, then halve that hold-out into validation and test
# sets, which gives the 80-10-10 split. Both splits are seeded.
X_train, X_a, y_train, y_a = train_test_split(X, y, test_size=0.2, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_a, y_a, test_size=0.5, random_state=24)
print("Number of rows")
print("X_train: ", X_train.shape[0], "y_train: ", y_train.shape[0])
print("X_valid: ", X_valid.shape[0], "y_valid: ", y_valid.shape[0])
print("X_test: ", X_test.shape[0], "y_test: ", y_test.shape[0])

Number of rows
X_train:  5708 y_train:  5708
X_valid:  713 y_valid:  713
X_test:  714 y_test:  714


# Modelling
For the model, we can choose from these 3 approaches:
- Naive Bayes (generative classifier)
- Logistic Regression (discriminative classifier)
- Multi-Layer Perceptron Neural Network (discriminative classifier)

A simpler baseline would be Bag-of-Words (one-hot) document features with Naive Bayes. The cells below instead use:
- Features: tf-idf weighted document vectors plus the proportions of ADJ/ADV/NUM tags
- Model: Logistic Regression


In [9]:
# Logistic Regression Model with tf-idf encoding of sentences.
class Model:
    """Thin train/predict wrapper around a ``LogisticRegression`` classifier."""

    def __init__(self):
        # The solver may run for up to 1000 iterations.
        self.classifier = LogisticRegression(max_iter=1000)

    def train(self, X_train, y_train):
        """Fit the wrapped classifier on the given features and labels."""
        estimator = self.classifier
        estimator.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the wrapped classifier's predictions for ``X_test``."""
        predictions = self.classifier.predict(X_test)
        return predictions

In [10]:
# train model
model = Model()
model.train(X_train, y_train)

## Results
Predict results and compute performance of the model

In [11]:
def compute_performance_per_class(model, X_test, y_test, classes=(-1, 0, 1)):
    """Compute one-vs-rest precision, recall and F1 for each class.

    Args:
        model: Object with a ``predict(X)`` method returning one label per row.
        X_test: Feature rows passed to ``model.predict``.
        y_test: True labels, in the same row order as ``X_test``.
        classes: Labels to report on, in output row order.

    Returns:
        DataFrame with the columns ``Class``, ``Precision``, ``Recall`` and
        ``F1`` and one row per entry of ``classes``. A ratio whose denominator
        is zero (class never predicted, or class absent from ``y_test``) is
        reported as 0.0 rather than NaN.
    """
    # Compare plain arrays so the result never depends on a pandas index.
    y_pred = np.asarray(model.predict(X_test))
    y_true = np.asarray(y_test)

    def _ratio(numerator, denominator):
        # An undefined 0 / 0 ratio counts as 0.0.
        return numerator / denominator if denominator else 0.0

    # compute separately for each class
    result = []
    for c in classes:
        predicted_c = y_pred == c
        actual_c = y_true == c
        TP = np.sum(predicted_c & actual_c)
        FP = np.sum(predicted_c & ~actual_c)
        FN = np.sum(~predicted_c & actual_c)
        precision = _ratio(TP, TP + FP)
        recall = _ratio(TP, TP + FN)
        F1 = _ratio(2 * (precision * recall), precision + recall)
        result.append([c, precision, recall, F1])
    return pd.DataFrame(data=np.array(result), columns=["Class", "Precision", "Recall", "F1"])
# Evaluate on the held-out test split and display the per-class table.
results = compute_performance_per_class(model, X_test, y_test)
results

Unnamed: 0,Class,Precision,Recall,F1
0,-1.0,0.652174,0.632184,0.642023
1,0.0,0.567901,0.592275,0.579832
2,1.0,0.568807,0.563636,0.56621


In [12]:
def compute_macro_f1(f1_scores):
    """Return the unweighted mean of the given per-class F1 scores."""
    macro_average = np.mean(f1_scores)
    return macro_average

# Average the per-class F1 column of the results table into a single macro F1 score.
macro_f1 = compute_macro_f1(results['F1'])
print("Macro F1: ", macro_f1)

Macro F1:  0.5960217749129039


# Export Results

In [13]:
def generate_result(test, y_pred, filename):
    """Write the predictions for ``test`` to a CSV file.

    Args:
        test: DataFrame with a ``Text`` column; it is not modified.
        y_pred: One predicted label per row of ``test``, in row order.
        filename: Path of the CSV file to write.

    Raises:
        ValueError: If ``y_pred`` does not have exactly one entry per row.
    """
    # Use a plain array so predictions are attached by position, not matched
    # against the index labels of ``test``.
    predictions = np.asarray(y_pred)
    if len(predictions) != len(test):
        raise ValueError(
            f"y_pred has {len(predictions)} entries but test has {len(test)} rows"
        )
    # Build a new frame instead of adding/dropping columns on the caller's frame.
    submission = test.drop(columns=['Text']).assign(Verdict=predictions)
    submission.to_csv(filename, index=False)

# output_filename = f"A2_{_NAME}_{_STUDENT_NUM}.csv"
# generate_result(test, y_pred, output_filename)