In [101]:
import sklearn
import numpy as np
import pandas as pd

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import f1_score



In [102]:
class Selector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, *_):
        return self

    def transform(self, df):
        return df[self.key]


In [182]:
class Model(object):
    
    def __init__(self):
        self.trainset = pd.read_csv("../data/train_set.csv")
        self.testset = pd.read_csv("../data/test_set.csv")
        self.cv = CountVectorizer(ngram_range=(0,2))
        self.lb = LabelEncoder()
        #self.y_pipe = Pipeline
        #self.X_pipe = Pipeline([('select', Selector(key='Utterance'))] + [('cv', CountVectorizer())])
        self.model = LogisticRegression(multi_class='multinomial', solver='newton-cg' )
        
        
    def train(self):
        X = self.cv.fit_transform(self.trainset['Utterance'])
        y = self.lb.fit_transform(self.trainset['Stance category'])
        #print(y)
        self.model.fit(X,y)
        
    def test(self):
        X = self.cv.transform(self.testset['Utterance'])
        y = self.model.predict(X)
        self.y_pred = self.lb.inverse_transform(y)
        score = f1_score(self.testset['Stance category'], self.y_pred, average = 'micro')
        print("f1-Micro score: {:.3}\n".format(score))
        return self.y_pred
    
    def compare_distributions(self):
        y_pred = pd.Series(self.y_pred)
        y = self.testset['Stance category']
        diff = (y_pred.value_counts()-y.value_counts())/y.value_counts()
        print(diff)
        

In [183]:
model = Model()
model.train()
model.test()
model.compare_distributions()

f1-Micro score: 0.296

agreement/disagreement   -0.900000
certainty                -0.411765
contrariety               0.242857
hypotheticality           0.147059
necessity                 0.195122
prediction                0.060000
source of knowledge       0.192982
tact/rudeness            -0.666667
uncertainty              -0.435897
volition                 -0.625000
dtype: float64


  if diff:
