In [1]:
from base import *
import numpy as np
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
# Build the LIAR dataset wrapper from the local ./liar_dataset path.
LIAR = Dataset('./liar_dataset')

# Raw statement text of each split.
TEXT_TRAIN, TEXT_TEST = LIAR.train['statement'], LIAR.test['statement']

# Run the shared text preprocessing over both splits (train first, then test).
text_train, text_test = (
    raw_text.apply(text_preprocess) for raw_text in (TEXT_TRAIN, TEXT_TEST)
)

# The TF-IDF vocabulary is learned on the training text only and then
# reused unchanged to vectorise the test text.
tfidf = TfidfVectorizer()
tfidf_train = tfidf.fit_transform(text_train)
tfidf_test = tfidf.transform(text_test)

# Feature matrices handed to the models below.
data_train, data_test = tfidf_train, tfidf_test

## Proposed method

In [43]:
class ProposedMethod():
    """Two-level stacked classifier for six-way truthfulness labels.

    Level one is a list of binary ``XGBClassifier`` models; model ``i`` is
    trained on the labels re-coded through ``self.maps[i]``.  The class-0
    probability of every base model becomes one column of the level-two
    feature matrix, which is min-max scaled and fed to ``self.final_model``,
    itself trained on the labels encoded through ``self.final_map``.

    ``fit``/``predict`` are silent; ``fit_``/``predict_`` do the same work and
    additionally print per-model accuracy and wall-clock time.

    Labels are expected as a pandas ``Series`` of strings (``Series.map`` is
    used to encode them).

    NOTE(review): the level-two model is trained on base-model probabilities
    computed on the very rows the base models were fitted on, so its training
    features are optimistically biased.  Out-of-fold predictions (e.g.
    ``sklearn.model_selection.cross_val_predict``) would avoid this; it is
    left unchanged here because it would alter the reported results.
    """

    def __init__(self):

        # Last array returned by predict()/predict_(); None until then.
        self.prediction = None
        self.final_model = XGBClassifier(
                                n_estimators = 100,
                                objective = 'multi:softmax',
                                learning_rate = .05,
                                max_leaves = 12,
                                max_depth = 3)
        # Integer encoding of the six labels used by the final model.
        self.final_map = {
            'pants-fire'  : 0, 
            'false'       : 1, 
            'barely-true' : 2, 
            'half-true'   : 3, 
            'mostly-true' : 4, 
            'true'        : 5
        }
        
        self.base_models = [
            XGBClassifier(
                n_estimators = 100,
                learning_rate = .3,
                max_depth = 2),

            XGBClassifier(
                n_estimators = 50,
                learning_rate = .3,
                max_depth = 2),

            XGBClassifier(
                n_estimators = 50,
                learning_rate = .1,
                max_depth = 2),

            XGBClassifier(
                n_estimators = 100,
                learning_rate = .3,
                max_depth = 2),
        ]
        
        # maps[i] is the binary re-coding of the six labels for base_models[i].
        self.maps = [
            {
                'pants-fire'  : 0, 
                'false'       : 0, 
                'barely-true' : 1, 
                'half-true'   : 1, 
                'mostly-true' : 1, 
                'true'        : 1
            },
            {
                'pants-fire'  : 0, 
                'false'       : 0, 
                'barely-true' : 1, 
                'half-true'   : 1, 
                'mostly-true' : 0, 
                'true'        : 0
            },
            {
                'pants-fire'  : 0, 
                'false'       : 0, 
                'barely-true' : 0, 
                'half-true'   : 0, 
                'mostly-true' : 1, 
                'true'        : 1
            },
            {
                'pants-fire'  : 0, 
                'false'       : 0, 
                'barely-true' : 0, 
                'half-true'   : 1, 
                'mostly-true' : 1, 
                'true'        : 1
            }
        ]
        self.scaler = MinMaxScaler()

    @staticmethod
    def _encode_labels(y, mapping):
        """Encode the label Series ``y`` through ``mapping``.

        Raises:
            ValueError: if ``y`` contains a label that ``mapping`` does not
                cover (``Series.map`` would otherwise turn it into NaN and the
                failure would surface much later inside the estimator).
        """
        encoded = y.map(mapping)
        missing = encoded.isna().to_numpy()
        if missing.any():
            unknown = sorted(set(y[missing].astype(str)))
            raise ValueError(f'labels not covered by the mapping: {unknown}')
        return encoded

    def _base_features(self, X, y=None, train=False, verbose=False):
        """Return the (n_samples, n_base_models) matrix of class-0 probabilities.

        Args:
            X: feature matrix exposing ``shape[0]`` (dense or sparse).
            y: label Series; required when ``train`` or ``verbose`` is set,
                ignored otherwise.
            train: fit each base model on its binary re-coding of ``y`` first.
            verbose: print accuracy and elapsed time for each base model.
        """
        base_preds = np.empty((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            start = time.time()
            y_ = None
            if train or verbose:
                y_ = self._encode_labels(y, self.maps[i])
            if train:
                model.fit(X, y_)
            base_preds[:, i] = model.predict_proba(X)[:, 0]
            execution_time = time.time() - start
            if verbose:
                print(f'Model {i+1}:')
                print(f'accuracy: {accuracy_score(y_, model.predict(X))*100:.2f} %')
                print(f'execution time: {execution_time:.2f}s')
        return base_preds

    @staticmethod
    def _report_final(y_true, y_pred, execution_time):
        """Print the accuracy/time summary of the final model."""
        print('-'*60)
        print('Final model:')
        print(f'accuracy: {accuracy_score(y_true, y_pred)*100:.2f} %')
        print(f'execution time: {execution_time:.2f}s')

    def fit_(self, X, y):
        """Verbose ``fit``: train every model and print accuracy and timing.

        The accuracies printed here are measured on the training data itself.
        """
        base_preds = self._base_features(X, y, train=True, verbose=True)
        start = time.time()
        y_ = self._encode_labels(y, self.final_map)
        X_ = self.scaler.fit_transform(base_preds)
        self.final_model.fit(X_, y_)
        execution_time = time.time() - start
        self._report_final(y_, self.final_model.predict(X_), execution_time)

    def predict_(self, X, y):
        """Verbose ``predict``: also print accuracy against ``y`` and timing.

        Args:
            X: feature matrix to classify.
            y: true labels for ``X`` (needed for the printed accuracies).

        Returns:
            The final model's predictions (also stored in ``self.prediction``).
        """
        base_preds = self._base_features(X, y, verbose=True)
        start = time.time()
        y_ = self._encode_labels(y, self.final_map)
        self.prediction = self.final_model.predict(self.scaler.transform(base_preds))
        execution_time = time.time() - start
        self._report_final(y_, self.prediction, execution_time)
        return self.prediction

    def fit(self, X, y):
        """Train the base models, the scaler and the final model on ``(X, y)``."""
        base_preds = self._base_features(X, y, train=True)
        y_ = self._encode_labels(y, self.final_map)
        self.final_model.fit(self.scaler.fit_transform(base_preds), y_)

    def predict(self, X, y=None):
        """Predict encoded labels for ``X``.

        Args:
            X: feature matrix to classify.
            y: ignored; accepted only so that existing ``predict(X, y)``
                calls keep working.  Prediction never needs the true labels.

        Returns:
            The final model's predictions (also stored in ``self.prediction``).
        """
        base_preds = self._base_features(X)
        self.prediction = self.final_model.predict(self.scaler.transform(base_preds))
        return self.prediction

In [44]:
# Train the whole stack (base models, scaler, final model) silently and time it.
clf = ProposedMethod()
# Only the fit() call sits between the two timestamps; constructing the
# estimator above is not included in the measurement.
start = time.time()
clf.fit(data_train, LIAR.y_train)
end = time.time()
execution_time = end - start
print(f'Model training time: {execution_time:.2f}s')

Model training time: 4.54s


In [45]:
# Verbose pass: fit_ calls fit on every base model and on the final model
# again, so the training from the previous cell is repeated here. The
# accuracies it prints are measured on the training split itself.
print('Details on training phase')
clf.fit_(data_train, LIAR.y_train)
print()
print('Details on testing phase')
print('='*60)
# predict_ uses the test labels to print accuracies; the returned array holds
# the final model's predictions for the test split.
pred = clf.predict_(data_test, LIAR.y_test)

Details on training phase
Model 1:
accuracy: 73.38 %
execution time: 1.41s
Model 2:
accuracy: 65.05 %
execution time: 0.70s
Model 3:
accuracy: 65.10 %
execution time: 0.78s
Model 4:
accuracy: 65.57 %
execution time: 1.35s
------------------------------------------------------------
Final model:
accuracy: 34.05 %
execution time: 0.25s

Details on testing phase
Model 0:
accuracy: 73.81 %
execution time: 0.00s
Model 1:
accuracy: 63.21 %
execution time: 0.00s
Model 2:
accuracy: 64.46 %
execution time: 0.00s
Model 3:
accuracy: 59.94 %
execution time: 0.00s
------------------------------------------------------------
Final model:
accuracy: 24.40 %
execution time: 0.00s
