In [1]:
# TODO: Replace with your Student NET ID
_NAME = "Jason Lee Jia Xuan"
_STUDENT_NUM = 'E0957670'

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import f1_score
# for tokenizing and extracting bag-of-words vectors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cs-4248-fact-checking-2420/train.csv
/kaggle/input/cs-4248-fact-checking-2420/test.csv


# Import Data

In [3]:
# import data
train_data = pd.read_csv("../input/cs-4248-fact-checking-2420/train.csv")
test_data = pd.read_csv("../input/cs-4248-fact-checking-2420/test.csv")
train_data.head()

Unnamed: 0,Sentence_id,Text,Verdict
0,1,I think we've seen a deterioration of values.,-1
1,2,I think for a while as a nation we condoned th...,-1
2,3,"For a while, as I recall, it even seems to me ...",-1
3,4,"So we've seen a deterioration in values, and o...",-1
4,5,"We got away, we got into this feeling that val...",-1


# Data Preprocessing
Do some data preprocessing so that the data is of a good quality
- Clean data
- Resolve imbalances
    - Sampling
    - Data augmentation (?)
- Tokenization

## Clean Data
Obtain a standardized set of data
- Data should not contain missing values
- Data should not have duplicates. If there are any duplicates, remove them.

In [4]:
# remove missing values and remove duplicates
def clean_data(data):
    # count missing data, I think kaggle tells us the data does not have missing values
    print("Rows with null Sentence_id: ", sum(data["Sentence_id"].isnull()))
    print("Rows with null Text: ", sum(data["Text"].isnull()))
    print("Rows with null Verdict: ", sum(data["Verdict"].isnull()))

    # remove duplicates from the data
    # set keep=False because we have no idea which label is actually correct
    data_cleaned = data.drop_duplicates(["Text"], keep=False)
    return data_cleaned

train_data = clean_data(train_data)

Rows with null Sentence_id:  0
Rows with null Text:  0
Rows with null Verdict:  0


## Resolve Class Imbalance
In order to train the model properly, we need to resolve the class imbalance.
We can either upsample or downsample.
- For simplicity, we try downsampling here.

In [5]:
def balance_classes(data):
    # show how many data points there are for each verdict in the training data
    print("Old counts:\n", data.groupby("Verdict").count())
    # obtain number of samples for smallest class
    min_count = data.groupby("Verdict").count()['Text'].min()
    # sample from all classes this amount
    class1 = data[data['Verdict'] == -1].sample(min_count)
    class2 = data[data['Verdict'] == 0].sample(min_count)
    class3 = data[data['Verdict'] == 1].sample(min_count)
    # combine
    data_balanced = pd.concat([class1, class2, class3], ignore_index=True)
    # verify counts
    print("New counts:\n", data_balanced.groupby("Verdict").count())
    return data_balanced

train_data = balance_classes(train_data)
train_data

Old counts:
          Sentence_id   Text
Verdict                    
-1             14542  14542
 0              2388   2388
 1              5386   5386
New counts:
          Sentence_id  Text
Verdict                   
-1              2388  2388
 0              2388  2388
 1              2388  2388


Unnamed: 0,Sentence_id,Text,Verdict
0,5150,His experience has been different from mine.,-1
1,4868,"Now, there are people who go to bed hungry in ...",-1
2,2155,It's my strong feeling that we ought to sell a...,-1
3,4220,"Sure, there's more work to do.",-1
4,9963,And if a few more people had gone to the polls...,-1
...,...,...,...
7159,5566,And that state we controlled spending.,1
7160,9924,"While I was Governor, more than eight years ag...",1
7161,7993,And yet even though Ambassador Smith and Ambas...,1
7162,18521,It's not driven by politics.,1


## Data Split
Split data into training, validation, and test sets for training a model.
We will use a 80-10-10 split.

In [6]:
X, y = train_data["Text"], train_data["Verdict"]
X_train, X_a, y_train, y_a = train_test_split(X, y, test_size=0.2, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_a, y_a, test_size=0.5, random_state=24)
print("Number of rows")
print("X_train: ", X_train.shape[0], "y_train: ", y_train.shape[0])
print("X_valid: ", X_valid.shape[0], "y_valid: ", y_valid.shape[0])
print("X_test: ", X_test.shape[0], "y_test: ", y_test.shape[0])

Number of rows
X_train:  5731 y_train:  5731
X_valid:  716 y_valid:  716
X_test:  717 y_test:  717


In [7]:
# Naive Bayes Model with one-hot encoding vectors of words (Bag of Words)
class Model:
    def __init__(self):
        self.vectorizer = CountVectorizer(lowercase=True)
        self.classifier = MultinomialNB()
    
    def train(self, X_train, y_train):
        # fit the vectorizer and learn the vocabulary
        X_train_features = self.vectorizer.fit_transform(X_train).toarray()
        # fit the classifier to learn from the extracted vectors
        self.classifier.fit(X_train_features, y_train)

    def predict(self, X_test):
        X_test_features = self.vectorizer.transform(X_test)
        return self.classifier.predict(X_test_features)

# Feature Engineering and Modelling
To engineer features, we need to firstly process the Text strings via:
- Tokenization (compulsory)
- Case Folding
- Stemming
- Lemmatization
- Segmentation

To obtain a baseline model, we will only do:
- Tokenization

After processing the Text into tokens, we have to derive features from the tokens. A few approaches available:
- Bag-of-Words representation
- Document term matrix with tf-idf weights
- PPMI term context matrix (?)
- Dense word embedding (Word2Vec)
- Can also apply PCA

For the model, we can choose from these 3 approaches:
- Naive Bayes (generative classifier)
- Logistic Regression (discriminative classifier)
- Multi-Layer Perceptron Neural Network (discriminative classifier)

To obtain a baseline model, we will only do this for now:
- Features: Bag-of-Words, one-hot encoding of documents
- Model: Naive Bayes


In [8]:
# train model
model = Model()
model.train(X_train, y_train)

## Results
Predict results and compute performance of the model

In [9]:
def compute_performance_per_class(model, X_test, y_test):
    y_pred = model.predict(X_test)
    # compute separately for each class
    result = []
    for c in [-1, 0, 1]:
        TP = np.sum((y_pred == c) & (y_test == c))
        FP = np.sum((y_pred == c) & (y_test != c))
        FN = np.sum((y_pred != c) & (y_test == c))
        TN = np.sum((y_pred != c) & (y_test != c))
        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        F1 = 2 * (precision * recall) / (precision + recall)
        result.append([c, precision, recall, F1])
    return pd.DataFrame(data=np.array(result), columns=["Class", "Precision", "Recall", "F1"])
results = compute_performance_per_class(model, X_test, y_test)
results

Unnamed: 0,Class,Precision,Recall,F1
0,-1.0,0.645522,0.670543,0.657795
1,0.0,0.660465,0.565737,0.609442
2,1.0,0.57265,0.644231,0.606335


In [10]:
def compute_macro_f1(f1_scores):
    return np.mean(f1_scores)

macro_f1 = compute_macro_f1(results['F1'])
print("Macro F1: ", macro_f1)

Macro F1:  0.6245238595069599


# Export Results

In [11]:
def generate_result(test, y_pred, filename):
    ''' generate csv file base on the y_pred '''
    test['Verdict'] = pd.Series(y_pred)
    test.drop(columns=['Text'], inplace=True)
    test.to_csv(filename, index=False)

# output_filename = f"A2_{_NAME}_{_STUDENT_NUM}.csv"
# generate_result(test, y_pred, output_filename)