In [1]:
from base import *
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
import time
import scipy

In [2]:
# Build the Dataset over ./liar_dataset, then derive one preprocessed Series per
# field. Each pair is unpacked as (train, test), in that order.
LIAR = Dataset('./liar_dataset')

# Statement text.
TEXT_TRAIN, TEXT_TEST = (
    split['statement'].apply(text_preprocess)
    for split in (LIAR.train, LIAR.test)
)

# Fields that go through a dedicated preprocessing helper.
CT_TRAIN, CT_TEST = (
    split['context'].apply(context_preprocess)
    for split in (LIAR.train, LIAR.test)
)
SJ_TRAIN, SJ_TEST = (
    split['subject'].apply(subject_preprocess)
    for split in (LIAR.train, LIAR.test)
)
JOB_TRAIN, JOB_TEST = (
    split['job'].apply(job_preprocess)
    for split in (LIAR.train, LIAR.test)
)

# Fields that are only lower-cased.
ST_TRAIN, ST_TEST = (
    split['state'].str.lower()
    for split in (LIAR.train, LIAR.test)
)
AF_TRAIN, AF_TEST = (
    split['affiliation'].str.lower()
    for split in (LIAR.train, LIAR.test)
)

In [3]:
def _fit_transform_pair(vectorizer, train, test):
    """Fit ``vectorizer`` on ``train`` only and return the (train, test) matrices."""
    return vectorizer.fit_transform(train), vectorizer.transform(test)


# One vectorizer per field. Refitting a single shared instance overwrites the
# previous vocabulary, which makes the earlier fields' feature names
# unrecoverable and makes correctness depend on each transform(test) call
# sitting directly after its fit_transform(train).
ct_vec = TfidfVectorizer()
sj_vec = CountVectorizer(binary=True)
job_vec = CountVectorizer(binary=True)
st_vec = CountVectorizer(binary=True)
af_vec = CountVectorizer(binary=True)
text_vec = TfidfVectorizer()

ct_train, ct_test = _fit_transform_pair(ct_vec, CT_TRAIN, CT_TEST)
sj_train, sj_test = _fit_transform_pair(sj_vec, SJ_TRAIN, SJ_TEST)
job_train, job_test = _fit_transform_pair(job_vec, JOB_TRAIN, JOB_TEST)
st_train, st_test = _fit_transform_pair(st_vec, ST_TRAIN, ST_TEST)
af_train, af_test = _fit_transform_pair(af_vec, AF_TRAIN, AF_TEST)
text_train, text_test = _fit_transform_pair(text_vec, TEXT_TRAIN, TEXT_TEST)

# Backward-compatible aliases: these names previously ended up holding the
# statement TF-IDF model and the affiliation count model respectively.
tfidf = text_vec
cvtr = af_vec

# Column order is: statement text, then context, subject, job, state, affiliation.
# CSR is requested explicitly because hstack's default output format is not
# guaranteed, and CSR supports the row slicing needed for splitting or CV.
context_train = scipy.sparse.hstack(
    (ct_train, sj_train, job_train, st_train, af_train), format='csr')
context_test = scipy.sparse.hstack(
    (ct_test, sj_test, job_test, st_test, af_test), format='csr')

data_train = scipy.sparse.hstack((text_train, context_train), format='csr')
data_test = scipy.sparse.hstack((text_test, context_test), format='csr')

In [4]:
class XGB():
    """Small wrapper around ``XGBClassifier`` that works on string labels.

    Labels are encoded to integer class ids through ``self.map`` before they
    reach XGBoost. ``fit`` and ``predict`` report accuracy and elapsed time on
    stdout and return ``None``; the latest predictions (as returned by the
    underlying classifier) are stored in ``self.prediction``.
    """

    def __init__(self):
        self.clf = XGBClassifier(
            n_estimators=100,
            objective='multi:softmax',
            learning_rate=.1,
            max_depth=6)

        # Label string -> integer class id handed to the classifier.
        self.map = {
            'pants-fire'  : 0,
            'false'       : 1,
            'barely-true' : 2,
            'half-true'   : 3,
            'mostly-true' : 4,
            'true'        : 5
        }
        self.prediction = None

    def _encode(self, y):
        """Encode a pandas Series of label strings using ``self.map``.

        Raises:
            ValueError: if any entry of ``y`` (including missing values) has no
                entry in ``self.map``. Without this check ``Series.map`` would
                silently turn such entries into NaN.
        """
        encoded = y.map(self.map)
        unknown = encoded.isna().to_numpy()
        if unknown.any():
            raise ValueError(
                f'unknown label(s): {y[unknown].unique().tolist()}; '
                f'expected one of {list(self.map)}')
        return encoded

    def fit(self, X, y):
        """Train the classifier and print training accuracy and fit time.

        Args:
            X: feature matrix accepted by ``XGBClassifier.fit``.
            y: pandas Series of label strings; every value must be a key of
                ``self.map``.

        Raises:
            ValueError: if ``y`` contains a label missing from ``self.map``.
        """
        # Validate before the (slow) fit so bad labels fail fast and clearly.
        y_ = self._encode(y)
        start = time.perf_counter()
        self.clf.fit(X, y_)
        execution_time = time.perf_counter() - start
        print(f'accuracy: {accuracy_score(y_, self.clf.predict(X))*100:.2f} %')
        print(f'training time: {execution_time:.2f}s')

    def predict(self, X, y=None):
        """Predict class ids for ``X`` and store them in ``self.prediction``.

        Args:
            X: feature matrix accepted by ``XGBClassifier.predict``.
            y: optional pandas Series of true label strings. When given,
                accuracy against it is printed; when omitted, only the
                prediction time is printed.

        Raises:
            ValueError: if ``y`` is given and contains a label missing from
                ``self.map``.
        """
        y_ = None if y is None else self._encode(y)
        start = time.perf_counter()
        self.prediction = self.clf.predict(X)
        execution_time = time.perf_counter() - start
        if y_ is not None:
            print(f'accuracy: {accuracy_score(y_, self.prediction)*100:.2f} %')
        print(f'prediction time: {execution_time:.2f}s')

In [5]:
# Fit on the training features, then evaluate on the test split. Both calls
# print their own accuracy and timing.
clf = XGB()

print('Training phase:')
clf.fit(X=data_train, y=LIAR.y_train)

print('\nTesting phase:')
clf.predict(X=data_test, y=LIAR.y_test)

Training phase:
accuracy: 55.57 %
training time: 29.55s

Testing phase:
accuracy: 27.98 %
prediction time: 0.01s
