In [1]:
    import sklearn
    import numpy as np
import pandas as pd

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import jaccard_similarity_score, hamming_loss, f1_score, accuracy_score

import pandas as pd


In [2]:
class Selector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that pulls one column out of a dataframe.

    Parameters
    ----------
    key : hashable
        Label used to index the dataframe in ``transform`` (``df[key]``).
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, *_ignored):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, df):
        """Return ``df[self.key]``."""
        column = self.key
        return df[column]


In [3]:
class LabelTransformer(BaseEstimator, TransformerMixin):
    """
    Join the five stance-label columns into one list of labels per row.

    Two label errors observed in the data are corrected on the way:
    'concession/contrarines' is discarded and 'hypotheticallity' is
    rewritten as 'hypotheticality'.
    """

    # Columns holding the first to fifth stance label of an utterance.
    LABEL_COLUMNS = ['Stance category', 'second stance category',
                     'third', 'fourth', 'fifth']

    def __init__(self):
        pass

    def fit(self, df, *_):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, df, *_):
        """
        Build the per-row label lists.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame containing (some of) ``LABEL_COLUMNS``; it is not modified.

        Returns
        -------
        pandas.Series
            One list of labels per input row, indexed like ``df``. Labels
            keep their column order; missing values are skipped. A row
            without any label yields an empty list instead of being
            dropped, so the result always lines up with the feature rows.
        """
        # ``np.nan`` rather than the ``np.NaN`` alias, which NumPy 2.0 removed.
        labels = (
            df.filter(self.LABEL_COLUMNS)
            .replace('concession/contrarines', np.nan)
            .replace('hypotheticallity', 'hypotheticality')
        )
        rows = labels.itertuples(index=False, name=None)
        return pd.Series(
            [[label for label in row if pd.notna(label)] for row in rows],
            index=labels.index,
            dtype=object,
        )

In [4]:
class MyMultiLabelBinarizer(TransformerMixin):
    """
    Wrap MultiLabelBinarizer so it can be used in pipeline.
    See https://stackoverflow.com/questions/46162855/fit-transform-takes-2-positional-arguments-but-3-were-given-with-labelbinarize
     for problem explanation.

    Parameters
    ----------
    *args, **kwargs
        Forwarded to ``MultiLabelBinarizer``. ``classes`` may be given as a
        keyword to override the default stance classes; when it is omitted
        (or ``None``) ``DEFAULT_CLASSES`` is used.

    Attributes
    ----------
    classes : list
        Class names handed to the wrapped encoder, in column order.
    encoder : MultiLabelBinarizer
        The wrapped binarizer.
    """

    # Stance classes; their order fixes the column order of the encoding.
    DEFAULT_CLASSES = ('volition',
                       'prediction',
                       'tact/rudeness',
                       'necessity',
                       'hypotheticality',
                       'certainty',
                       'agreement/disagreement',
                       'contrariety',
                       'source of knowledge',
                       'uncertainty')

    def __init__(self, *args, **kwargs):
        # Take ``classes`` out of kwargs so it is passed to the encoder only
        # once; forwarding it twice raises a duplicate-keyword TypeError.
        classes = kwargs.pop('classes', None)
        if classes is None:
            classes = self.DEFAULT_CLASSES
        self.classes = list(classes)

        self.encoder = MultiLabelBinarizer(*args, classes=self.classes, **kwargs)

    def fit(self, x, y=None):
        """Fit the wrapped encoder on ``x``; ``y`` is accepted and ignored."""
        self.encoder.fit(x)
        return self

    def transform(self, x, y=None):
        """Return the wrapped encoder's transform of ``x``; ``y`` is ignored."""
        return self.encoder.transform(x)

    def inverse_transform(self, xt):
        """Return the wrapped encoder's inverse transform of ``xt``."""
        return self.encoder.inverse_transform(xt)

In [5]:
from collections import defaultdict, Counter
from nltk.tokenize.nist import NISTTokenizer
import nltk
import re

class SentenceFeatures(BaseEstimator, TransformerMixin):
    """
    Extract sentence features in format supporting Pipelines.

    Uses the top 10 discriminating features from the Simaki (2018) paper:
    'Evaluating stance-annotated sentences from the Brexit
    Blog Corpus: A quantitative linguistic analysis'

    These are:
    1. Average word length
    2. Conjunction frequency
    3. Sentence length in words
    4. Comma frequency
    5. Full stop frequency
    6. Hapax legomena (number of words appearing in utterance only once)
    7. Number of different words used
    8. Sentence length in characters
    9. Punctuation frequency
    10. Hapax dislegomena (number of words appearing in utterance only twice)

    ``transform`` returns one dict of feature name -> value per sentence.
    """

    def __init__(self):
        self.TK = NISTTokenizer()
        # Matches a token that consists of exactly one non-word character.
        self.punct = re.compile('^[^a-zA-Z0-9_]$')

    def fit(self, *_):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, *_):
        """
        Compute the feature dict of every sentence in ``X``.

        Parameters
        ----------
        X : iterable of str
            Sentences to describe.

        Returns
        -------
        list of collections.defaultdict
            One mapping per sentence, in input order.
        """
        return [self._sentence_features(sent) for sent in X]

    def _sentence_features(self, sent):
        """Build the feature dict of a single sentence."""
        features = defaultdict(int)
        # Word count is taken from whitespace splitting, not from the tokenizer.
        num_words = len(sent.split())
        tokens = self.TK.tokenize(sent, lowercase=True)
        tags = nltk.pos_tag(tokens)
        features['sent length/words'] = num_words

        counts = Counter()
        for token, tagged in zip(tokens, tags):
            if self.punct.match(token):
                features['punctuation'] += 1
                if token == ',':
                    features['comma'] += 1
                if token == '.':
                    features['period'] += 1
            else:
                if tagged[1] == 'CC':
                    features['conjunctions'] += 1

                num_chars = len(re.sub(r'\W', '', token))
                # Accumulated as a sum here; turned into a mean below.
                features['mean word length'] += num_chars
                features['sent length/chars'] += num_chars
                counts.update([token])

        # An empty or whitespace-only sentence has no words: report a mean
        # of 0.0 instead of dividing by zero.
        if num_words:
            features['mean word length'] = features['mean word length'] / num_words
        else:
            features['mean word length'] = 0.0
        features['hapax legomera'] = sum(1 for v in counts.values() if v == 1)
        features['hapax dislegomera'] = sum(1 for v in counts.values() if v == 2)
        features['different words'] = len(counts)
        return features


class HapaxLegomera(BaseEstimator, TransformerMixin):
    """
    Count, per sentence, the tokens that occur exactly once
    ('hapax_legomera') or exactly twice ('hapax_dislegomera') in the whole
    collection of sentences passed to ``transform``.

    NOTE(review): the collection-wide counts are rebuilt from whatever is
    passed to ``transform``; nothing is learned in ``fit``. Features of a
    test set therefore depend on that test set's own vocabulary. Confirm
    this is intended.
    """

    def __init__(self):
        self.TK = NISTTokenizer()
        # Matches a token that consists of exactly one non-word character.
        self.punct = re.compile('^[^a-zA-Z0-9_]$')

    def _word_tokens(self, sent):
        """Lower-cased tokens of ``sent`` minus single-character punctuation."""
        return [token
                for token in self.TK.tokenize(sent, lowercase=True)
                if not self.punct.match(token)]

    def compile_counts(self, X, *_):
        """Return a Counter of word-token frequencies over all sentences in ``X``."""
        word_counts = Counter()
        for sent in X:
            word_counts.update(self._word_tokens(sent))
        return word_counts

    def fit(self, X, *_):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, *_):
        """
        Compute the hapax feature dict of every sentence in ``X``.

        Parameters
        ----------
        X : iterable of str
            Sentences to describe.

        Returns
        -------
        list of collections.defaultdict
            One mapping per sentence, in input order; a key is present only
            when its count is non-zero.
        """
        # Tokenize each sentence once and reuse the tokens for both passes;
        # this also lets ``X`` be a one-shot iterable.
        tokenized = [self._word_tokens(sent) for sent in X]

        word_counts = Counter()
        for words in tokenized:
            word_counts.update(words)

        result = []
        for words in tokenized:
            features = defaultdict(int)
            for token in words:
                occurrences = word_counts[token]
                if occurrences == 1:
                    features['hapax_legomera'] += 1
                elif occurrences == 2:
                    features['hapax_dislegomera'] += 1
            result.append(features)
        return result
        
    
    

In [6]:

class Model(object):
    '''
    Multi label classifier model.

    Loads a train and a test CSV, turns the 'Utterance' column into features
    (sentence statistics, hapax counts and n-gram counts), binarizes the
    stance-label columns and fits a one-vs-rest logistic regression.

    Parameters
    ----------
    train_path, test_path : str
        Locations of the train and test CSV files.
    '''

    # Columns holding the first to fifth stance label of an utterance.
    LABEL_COLUMNS = ['Stance category', 'second stance category',
                     'third', 'fourth', 'fifth']

    def __init__(self, train_path="../data/train_set.csv",
                 test_path="../data/test_set.csv"):
        self.trainset = pd.read_csv(train_path)
        self.testset = pd.read_csv(test_path)
        # NOTE(review): this vectorizer is not used by build_pipe, which
        # creates its own; kept because it is a public attribute.
        self.cv = CountVectorizer(ngram_range=(0,2))
        self.model = OneVsRestClassifier(LogisticRegression())
        self.build_pipe()

    def build_pipe(self):
        """Create the feature pipeline (``self.pipe``) and label pipeline (``self.mlb``)."""
        sent_features = Pipeline([
            ('select', Selector(key='Utterance')),
            ('extract', SentenceFeatures()),
            ('vectorize', DictVectorizer())])

        hapax = Pipeline([
            ('select', Selector(key='Utterance')),
            ('extract', HapaxLegomera()),
            ('vectorize', DictVectorizer())])

        # NOTE(review): ngram_range starts at 0; if unigrams + bigrams were
        # intended this should be (1, 2). Left unchanged so results stay
        # comparable - confirm.
        CV = Pipeline([
            ('select', Selector(key='Utterance')),
            ('cv', CountVectorizer(ngram_range=(0,2)))])

        union = FeatureUnion(transformer_list=[('features', sent_features),
                                               ('hapax', hapax),
                                               ('Ngrams', CV)])
        self.pipe = Pipeline([('union', union)])

        self.mlb = Pipeline([('lt', LabelTransformer()),
                             ('mmlb', MyMultiLabelBinarizer())])

    def train(self):
        """Fit the feature and label pipelines on the train set, then the classifier."""
        X = self.pipe.fit_transform(self.trainset)
        y = self.mlb.fit_transform(self.trainset)
        self.model.fit(X, y)

    def test(self):
        """Predict on the test set, store ``y_test`` / ``y_test_pred`` and print scores."""
        X = self.pipe.transform(self.testset)
        self.y_test = self.mlb.transform(self.testset)
        self.y_test_pred = self.model.predict(X)
        self.print_scores(self.y_test, self.y_test_pred)

    def print_scores(self, y, y_pred):
        """
        Print Hamming loss, per-class / micro / macro F1 and accuracy.

        Parameters
        ----------
        y : array-like
            True binarized labels.
        y_pred : array-like
            Predicted binarized labels.
        """
        # Score the arguments, not self.y_test / self.y_test_pred, so the
        # method works for any pair of label matrices.
        hamm = hamming_loss(y, y_pred)
        print('\n{:25s}{:>10.3f}\n'.format('Hamming Loss:', hamm))

        # Read the class names from the binarizer step so they cannot drift
        # out of step with the columns of y.
        classes = self.mlb.named_steps['mmlb'].classes

        print('f1 scores \n -----------')
        f1 = f1_score(y, y_pred, average = None)
        scores = zip(classes, f1)
        for sc in sorted(scores, key = lambda s: s[1], reverse = True):
            print('{:25s}{:>10.3f}'.format(sc[0].capitalize() +':', sc[1]))

        f1_macro = f1_score(y, y_pred, average = 'macro')
        f1_micro = f1_score(y, y_pred, average = 'micro')

        print('\n{:25s}{:10.3f}'.format('Micro-f1 score:',f1_micro))
        print('{:25s}{:>10.3f}'.format('Macro-f1 score:',f1_macro))

        accuracy = accuracy_score(y, y_pred)
        print('\n{:25s}{:10.3f}'.format('Accuracy', accuracy))

    def run_model(self):
        """Train on the train set, then evaluate on the test set."""
        self.train()
        self.test()

    def distribution(self, which):
        """
        Print the relative frequency of every raw label in one data set.

        Parameters
        ----------
        which : {'train', 'test'}
            Data set to summarise.

        Raises
        ------
        ValueError
            If ``which`` is neither 'train' nor 'test'.
        """
        datasets = {'train': self.trainset, 'test': self.testset}
        if which not in datasets:
            raise ValueError(
                "which must be 'train' or 'test', got {!r}".format(which))

        labels = datasets[which].filter(self.LABEL_COLUMNS).stack()
        print(labels.value_counts(normalize=True))

    def unique_labels(self):
        """Placeholder; not implemented yet."""
        pass
        
    
        
    
        
    
        

In [7]:
    # Build the model from the train/test CSV files, fit it on the train set,
    # then print the test-set scores.
    model = Model()
    model.run_model()


Hamming Loss:                 0.065

f1 scores 
 -----------
Contrariety:                  0.848
Uncertainty:                  0.797
Certainty:                    0.786
Prediction:                   0.777
Necessity:                    0.765
Source of knowledge:          0.732
Hypotheticality:              0.703
Volition:                     0.696
Tact/rudeness:                0.667
Agreement/disagreement:       0.667

Micro-f1 score:               0.774
Macro-f1 score:               0.744

Accuracy                      0.504


In [14]:
# Print the relative frequency of each raw stance label, first for the
# train set and then for the test set, separated by blank lines.
model.distribution('train')
print('\n')
model.distribution('test')

contrariety               0.202923
prediction                0.159501
source of knowledge       0.153912
uncertainty               0.148323
necessity                 0.108340
hypotheticality           0.086844
certainty                 0.054600
agreement/disagreement    0.033964
tact/rudeness             0.025365
volition                  0.024936
concession/contrarines    0.000860
hypotheticallity          0.000430
dtype: float64


contrariety               0.220615
prediction                0.166365
source of knowledge       0.151899
uncertainty               0.119349
necessity                 0.106691
hypotheticality           0.095841
certainty                 0.061483
volition                  0.027125
agreement/disagreement    0.025316
tact/rudeness             0.025316
dtype: float64


In [15]:
"""
To Do list

fix hapax done
investigate hamming loss 
investigate multitask learning
look at number of unique labels - harnessing output space or just relations
label embeddings
explanation of predictions

keras, allen AI, tensorflow


1. decide what we will start with - embeddings/ multi task/ etc 
2. 

"""

'\nTo Do list\n\nfix hapax\ninvestigate hamming loss \ninvestigate multitask learning\nlook at number of unique labels - harnessing output space or just relatins\nlabel embeddings\nexplanation of predictions\n\nkeras, allen AI, tensorflow\n\n\n1. decide what we will start with - embeddings/ multi task/ etc \n2. \n\n'