In [236]:
import nltk
import numpy as np
import pandas as pd
import sklearn

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, MultiLabelBinarizer


In [237]:
class Selector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts a single entry from its input by key.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    simply indexes whatever object it is given with ``self.key``.
    """

    def __init__(self, key):
        # Key used to index the object handed to transform().
        self.key = key

    def fit(self, X, *_):
        """Do nothing and return the transformer itself.

        Any extra positional arguments (e.g. a target vector) are ignored.
        """
        return self

    def transform(self, df):
        """Return ``df[self.key]``."""
        selected = df[self.key]
        return selected


In [238]:
from collections import defaultdict, Counter
from nltk.tokenize.nist import NISTTokenizer
import re

class SentenceFeatures(BaseEstimator, TransformerMixin):
    """Map each input sentence to a dict of stylometric features.

    Let's use the top 10 discriminating features from the Simaki paper.
    Each output dict may contain the following keys (a key is only present
    once the corresponding counter has been touched for that sentence):

    * ``'mean word length'``   - characters in non-punctuation tokens divided
      by the number of whitespace-separated words
    * ``'conjunctions'``       - tokens tagged ``CC`` by the POS tagger
    * ``'sent length/words'``  - number of whitespace-separated words
    * ``'comma'``              - number of ``,`` tokens
    * ``'period'``             - number of ``.`` tokens
    * ``'hapax legomera'``     - distinct tokens occurring exactly once
    * ``'different words'``    - number of distinct non-punctuation tokens
    * ``'sent length/chars'``  - characters in non-punctuation tokens
    * ``'punctuation'``        - number of single-character punctuation tokens
    * ``'hapax dislegomera'``  - distinct tokens occurring exactly twice

    The list-of-dicts output is meant to be fed to a ``DictVectorizer``.
    """

    def __init__(self):
        self.TK = NISTTokenizer()
        # A "punctuation" token is exactly one character that is not a
        # letter, digit or underscore.
        self.punct = re.compile('^[^a-zA-Z0-9_]$')

    def fit(self, *_):
        """Stateless transformer: nothing to learn, return self."""
        return self

    def transform(self, X, *_):
        """Compute the feature dict for every sentence in ``X``.

        Parameters
        ----------
        X : iterable of str
            Sentences to featurise.

        Returns
        -------
        list of collections.defaultdict
            One feature dict per input sentence, in input order.
        """
        result = []

        for sent in X:
            features = defaultdict(int)
            num_words = len(sent.split())
            tokens = self.TK.tokenize(sent, lowercase=True)
            tags = nltk.pos_tag(tokens)
            features['sent length/words'] = num_words
            counts = Counter()

            for token, (_tagged_token, tag) in zip(tokens, tags):
                if self.punct.match(token):
                    features['punctuation'] += 1
                    if token == ',':
                        features['comma'] += 1
                    if token == '.':
                        features['period'] += 1
                else:
                    if tag == 'CC':
                        features['conjunctions'] += 1

                    # Count only word characters so that punctuation glued
                    # to a token does not inflate the length.
                    num_chars = len(re.sub(r'\W', '', token))
                    # 'mean word length' holds the running character total
                    # here; it is turned into a mean after the loop.
                    features['mean word length'] += num_chars
                    features['sent length/chars'] += num_chars
                    counts[token] += 1

            # An empty or whitespace-only sentence has no words; report a
            # mean of 0.0 instead of raising ZeroDivisionError.
            if num_words:
                features['mean word length'] /= num_words
            else:
                features['mean word length'] = 0.0

            features['hapax legomera'] = sum(1 for v in counts.values() if v == 1)
            features['hapax dislegomera'] = sum(1 for v in counts.values() if v == 2)
            features['different words'] = len(counts)
            result.append(features)

        return result

In [239]:
class Model(object):
    """Stance-category classifier: sentence features + n-gram counts fed to
    a multinomial logistic regression.

    Parameters
    ----------
    train_path : str, optional
        CSV file with the training data. Must provide the ``'Utterance'``
        and ``'Stance category'`` columns used below.
    test_path : str, optional
        CSV file with the held-out data, same columns as ``train_path``.
    """

    def __init__(self, train_path="../data/train_set.csv",
                 test_path="../data/test_set.csv"):
        self.trainset = pd.read_csv(train_path)
        self.testset = pd.read_csv(test_path)
        # NOTE(review): this vectorizer is not used by build_pipe(), which
        # creates its own instance; kept so the attribute remains available.
        self.cv = CountVectorizer(ngram_range=(0,2))
        self.lb = LabelEncoder()
        self.model = LogisticRegression(multi_class='multinomial',
                                        solver='newton-cg')
        self.build_pipe()

    def build_pipe(self):
        """Assemble the feature-extraction pipeline and store it in ``self.pipe``.

        Two branches read the ``'Utterance'`` column and are concatenated by a
        ``FeatureUnion``: hand-crafted sentence features (vectorised with a
        ``DictVectorizer``) and n-gram counts.
        """
        sent_features = Pipeline([('select', Selector(key='Utterance')),
                                  ('SF', SentenceFeatures()),
                                  ('dv', DictVectorizer())])

        # NOTE(review): a lower n-gram bound of 0 is unusual (n-gram lengths
        # normally start at 1) - confirm this is intended.
        CV = Pipeline([('select', Selector(key='Utterance')),
                       ('cv', CountVectorizer(ngram_range=(0,2)))])

        union = Pipeline([('union', FeatureUnion(transformer_list=[('features', sent_features),
                                                                   ('Ngrams', CV)]))])
        self.pipe = union

    def train(self):
        """Fit the feature pipeline, the label encoder and the classifier on
        the training set."""
        X = self.pipe.fit_transform(self.trainset)
        y = self.lb.fit_transform(self.trainset['Stance category'])
        self.model.fit(X, y)

    def test(self):
        """Predict the test set, print the micro-averaged F1 and return the
        predicted labels.

        The predictions are also stored in ``self.y_pred`` for
        ``compare_distributions``. ``train`` must have been called first.
        """
        X = self.pipe.transform(self.testset)
        y = self.model.predict(X)
        self.y_pred = self.lb.inverse_transform(y)
        score = f1_score(self.testset['Stance category'], self.y_pred, average='micro')
        print("f1-Micro score: {:.3}\n".format(score))
        return self.y_pred

    def compare_distributions(self):
        """Print, per label, the relative difference between predicted and
        true label counts: ``(predicted - true) / true``.

        Requires ``test`` to have been run so that ``self.y_pred`` exists.
        """
        pred_counts = pd.Series(self.y_pred).value_counts()
        true_counts = self.testset['Stance category'].value_counts()

        # Align both counts on the union of labels, filling gaps with 0.
        # Without this, a label that is never predicted is missing from
        # pred_counts and the aligned subtraction yields NaN instead of -1.0.
        # A label that is predicted but absent from the test set divides by
        # zero and is reported as inf.
        labels = true_counts.index.union(pred_counts.index)
        pred_counts = pred_counts.reindex(labels, fill_value=0)
        true_counts = true_counts.reindex(labels, fill_value=0)

        diff = (pred_counts - true_counts) / true_counts
        print(diff)
        

In [240]:
# Build the model: loads the train/test CSVs and assembles the feature pipeline.
model = Model()
# Order matters below: train() fits the pipeline, label encoder and classifier
# that test() reuses, and test() stores the predictions that
# compare_distributions() reads.
model.train()
model.test()
model.compare_distributions()

f1-Micro score: 0.307

agreement/disagreement   -0.700000
certainty                -0.352941
contrariety               0.157143
hypotheticality           0.058824
necessity                 0.219512
prediction                0.000000
source of knowledge       0.280702
tact/rudeness            -0.555556
uncertainty              -0.384615
volition                 -0.625000
dtype: float64


  if diff:
