In [1]:
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from xgboost import XGBClassifier
import time
import scipy
from scipy.spatial import distance

import joblib

In [2]:
STOPWORDS = set(stopwords.words('english'))
PUNCT_TRANS = str.maketrans('', '', string.punctuation)

# Functions
## Dataset

In [3]:
def read_data(path='./liar_dataset'):
    features = ['ID', 'label', 'statement', 'subject', 'speaker', 'job', 'state', 'affiliation',
         'barely-true', 'false', 'half-true', 'mostly-true', 'pant-on-fire', 'context']
    
    # merge train.tsv + valid.tsv
    TRAIN = pd.concat([pd.read_csv(path+'/train.tsv', delimiter='\t', names=features, quoting=3),
                        pd.read_csv(path+'/valid.tsv', delimiter='\t', names=features, quoting=3)],
                        ignore_index=True)

    TEST = pd.read_csv(path+'/test.tsv', delimiter='\t', names=features, quoting=3)
    
    return TRAIN, TEST

## Preprocessing

In [4]:
def subtract_current_credit(row):
    """
    
    Subtract the current label from the credit history of current statement.
        
    """
    label = row['label'] 
    try:
        row[label] -= 1  
    except:
        pass
    return row
    
def fill_na(data):
    """

    Preproces steps of LIAR dataset which is include:
        * Define which are text features
        * Define which are numeric features (credit history)
        * Fill the blanks (missing cell) of text features with word [unknow]
        * Prevent data leakage from numeric features
            According to dataset author: 
                "Credit history include the count of the current statement, 
                it is important to subtract the current label from the credit history when using this 
                meta data vector in prediction experiments."

    """
    text_features = ['statement', 'subject', 'speaker', 'job', 'state', 'affiliation', 'context']
    num_features = ['barely-true', 'false', 'half-true', 'mostly-true', 'pant-on-fire']
    data[text_features] = data[text_features].fillna("unknown").astype(str)
    data[num_features] = data[num_features].fillna(0).astype(int)
    
    # subtract current_credit
    data = data.apply(subtract_current_credit, axis=1)
    return data

In [5]:
def TXT_preprocess(text):
    """
    
    Clean statement feature.
    
    """
    text = text.lower()
    text = text.translate(PUNCT_TRANS)
    text = ' '.join([word for word in word_tokenize(text) if word not in STOPWORDS])
    return text

def context_preprocess(text):
    """
    
    Clean context feature.
    
    """
    text = text.lower()
    text = re.sub('e mail|e-mail|email|mailer','mail', text)
    text = re.sub('television','tv', text)
    text = re.sub('website','web', text)
    text = text.translate(PUNCT_TRANS)
    text = ' '.join([word for word in word_tokenize(text) if word not in STOPWORDS])
    return text
    
def subject_preprocess(text):
    """
    
    Clean subject feature.
    
    """
    text = text.lower()
    text = ' '.join(text.split(','))
    return text

def job_preprocess(text):
    """
    
    Clean job feature.
    
    """
    text = text.lower()
    text = text.translate(PUNCT_TRANS)
    return text


In [6]:
class TXT_Transformer(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X['statement'].apply(TXT_preprocess)

class CT_Transformer(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        output = pd.DataFrame()
        output['context'] = X['context'].apply(context_preprocess)
        output['subject'] = X['subject'].apply(subject_preprocess)
        output['job'] = X['job'].apply(job_preprocess)
        output['state'] = X['state'].str.lower()
        output['affiliation'] = X['affiliation'].str.lower()
        return output
        
class CT_Vectorizer(TransformerMixin, BaseEstimator):
    def __init__(self):
        self.context_vtr = TfidfVectorizer()
        self.subject_vtr = CountVectorizer(binary=True)
        self.job_vtr = CountVectorizer(binary=True)
        self.state_vtr = CountVectorizer(binary=True)
        self.affiliation_vtr = CountVectorizer(binary=True)
        
    def fit(self, X, y=None):
        self.context_vtr.fit(X['context'])
        self.subject_vtr.fit(X['subject'])
        self.job_vtr.fit(X['job'])
        self.state_vtr.fit(X['state'])
        self.affiliation_vtr.fit(X['affiliation'])
        return self
    def transform(self, X):
        return scipy.sparse.hstack((
            self.context_vtr.transform(X['context']),
            self.subject_vtr.transform(X['subject']),
            self.job_vtr.transform(X['job']),
            self.state_vtr.transform(X['state']),
            self.affiliation_vtr.transform(X['affiliation'])
        ))
        
class CH_Transformer(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[['barely-true', 'false', 'half-true', 'mostly-true', 'pant-on-fire']].values


## Modeling

In [7]:
class Binary_models(TransformerMixin, BaseEstimator):
    def __init__(self, *, param_grid=None):
        self.models = []
        self.param_grid = param_grid
        self.maps = [
                {
                    'pants-fire'  : 0, 
                    'false'       : 0, 
                    'barely-true' : 1, 
                    'half-true'   : 1, 
                    'mostly-true' : 1, 
                    'true'        : 1
                },
                {
                    'pants-fire'  : 0, 
                    'false'       : 0, 
                    'barely-true' : 1, 
                    'half-true'   : 1, 
                    'mostly-true' : 0, 
                    'true'        : 0
                },
                {
                    'pants-fire'  : 0, 
                    'false'       : 0, 
                    'barely-true' : 0, 
                    'half-true'   : 0, 
                    'mostly-true' : 1, 
                    'true'        : 1
                },
                {
                    'pants-fire'  : 0, 
                    'false'       : 0, 
                    'barely-true' : 0, 
                    'half-true'   : 1, 
                    'mostly-true' : 1, 
                    'true'        : 1
                }
            ]
    def fit(self, X, y=None):
        for i in range(4):
            y_ = np.vectorize(self.maps[i].get)(y)
            search = GridSearchCV(
                estimator=XGBClassifier(),
                param_grid=self.param_grid,
                scoring= 'neg_log_loss',
                cv=3
            )
            search.fit(X, y_)
            self.models.append(search)
        return self

    def transform(self, X):
        pred = np.empty((X.shape[0], 4))
        for i in range(4):
            pred[:, i] = self.models[i].predict_proba(X)[:, 0]
        return pred

class MultiClass_model(ClassifierMixin, BaseEstimator):
    
    def __init__(self, *, param_grid=None):
        self.model = None
        self.param_grid = param_grid
        self.map = {
            'pants-fire'  : 0, 
            'false'       : 1, 
            'barely-true' : 2, 
            'half-true'   : 3, 
            'mostly-true' : 4, 
            'true'        : 5
        }
        self.truth_class = {
            0 : 'pants-fire', 
            1 : 'false'     ,
            2 : 'barely-true',
            3 : 'half-true'  ,
            4 : 'mostly-true',
            5 : 'true'
        }
    def fit(self, X, y=None):
        y_ = np.vectorize(self.map.get)(y)
        search = GridSearchCV(
            estimator=XGBClassifier(),
            param_grid=self.param_grid,
            scoring= 'neg_log_loss',
            cv=5
        )
        self.model = search.fit(X, y_)
        return self
        
    def predict(self, X):
        pred = self.model.predict(X)
        return np.vectorize(self.truth_class.get)(pred)


In [8]:
TRAIN, TEST = read_data()

y_train = TRAIN['label'].values
y_test = TEST['label'].values

X_train = fill_na(TRAIN).drop(columns=['ID', 'label', 'speaker'])
X_test = fill_na(TEST).drop(columns=['ID', 'label', 'speaker'])

In [9]:
bin_param_grid = {
        'n_estimators': [50, 100],
        'learning_rate': [.1, .2, .3, .4, .5],
        'max_depth': [2, 3, 4]
    }
multi_param_grid = {
    'objective' : ['multi:softmax'],
    'n_estimators': [50, 100, 150],
    'learning_rate': [.01, .1, .5],
    'max_depth': [3, 6, 9]
}
k_params = [500, 200, 100, 50]

In [10]:
for k in k_params:
    TXT_pipeline = Pipeline([
            ('TXT_clean', TXT_Transformer()),
            ('TXT_vector', TfidfVectorizer()),
            ('TXT_chi2', SelectKBest(score_func=chi2)),
            ('TXT_binary', Binary_models(param_grid=bin_param_grid))
        ])
    CT_pipeline = Pipeline([
            ('CT_clean', CT_Transformer()),
            ('CT_vector', CT_Vectorizer()),
            ('CT_binary', Binary_models(param_grid=bin_param_grid))
        ])
    CH_pipeline = Pipeline([('CH_clean', CH_Transformer())])
    
    model_pipeline = Pipeline(
        [
            ('Merge', FeatureUnion(
                [
                    ('TXT', TXT_pipeline),
                    ('CT', CT_pipeline),
                    ('CH', CH_pipeline)
                ]
                ,n_jobs=-1
            )),
            ('Scaler', MinMaxScaler()),
            ('Classifier', MultiClass_model(param_grid=multi_param_grid))
        ])
    model_pipeline.set_params(Merge__TXT__TXT_chi2__k= k)
    model_pipeline.fit(X_train, y_train)
    
    pred_train = model_pipeline.predict(X_train)
    pred_test = model_pipeline.predict(X_test)
    
    score_train = accuracy_score(y_train, pred_train)
    score_test = accuracy_score(y_test, pred_test)
    print(f'K={k}')
    print(f'TRAINING accuracy: {score_train*100:.2f}%')
    print(f'TEST accuracy: {score_test*100:.2f}%\n','-'*20)
    joblib.dump(model_pipeline, f'models/M_chi2_{k}.pkl')

K=500
TRAINING accuracy: 68.74%
TEST accuracy: 46.53%
 --------------------
K=200
TRAINING accuracy: 67.70%
TEST accuracy: 48.01%
 --------------------
K=100
TRAINING accuracy: 66.67%
TEST accuracy: 49.49%
 --------------------
K=50
TRAINING accuracy: 88.85%
TEST accuracy: 54.79%
 --------------------
