In [8]:
from sklearn.feature_extraction import DictVectorizer
import pandas as pd
import sys
from sklearn.svm import SVC
import csv
from csv import writer
from csv import reader
import gensim
import numpy as np
import gensim.downloader as api
from sklearn.metrics import classification_report


def extract_features_and_labels(trainingfile):
    """Extract features from trainingdata
    Extract labels from trainingdata"""
    
    data = []
    targets = []
    with open(trainingfile, 'r', encoding='utf8') as infile:
        for line in infile:
            components = line.rstrip('\n').split()
            if len(components) > 0:
                offsetword = components[0]
                prev_tokens_one = components[1]
                prev_tokens_two = components[2]
                prev_tokens_three = components[3]
                prev_tokens_four = components[4]
                next_tokens_one = components[5]
                next_tokens_two = components[6]
                next_tokens_three = components[7]
                pos_prev_one = components[7]
                pos_prev_two = components[8]
                pos_prev_three = components[9]
                pos_prev_four = components[10]
                pos_next_one = components[11]
                pos_next_two = components[12]
                pos_next_three = components[13]
                neg_prev_one = components[14]
                neg_prev_two = components[15]
                neg_prev_three = components[16]
                neg_prev_four = components[17]
                neg_next_one = components[18]
                neg_next_two = components[19]
                neg_next_three = components[20]
                neg = components[21]
                
                feature_dict = {'Event': offsetword,
                                'Previous token one': prev_tokens_one[0],
                                'Previous token two': prev_tokens_two[0],
                                'Previous token three': prev_tokens_three[0],
                                'Previous token four': prev_tokens_four[0],
                                'Next token one': next_tokens_one[0],
                                'Next token two': next_tokens_two[0],
                                'Next token three': next_tokens_three[0],
                                'POS previous token one': pos_prev_one[0],
                                'POS previous token two': pos_prev_two[0],
                                'POS previous token three': pos_prev_three[0],
                                'POS previous token four': pos_prev_four[0],
                                'POS next token one': pos_next_one[0],
                                'POS next token two': pos_next_two[0],
                                'POS next token three': pos_next_three[0],
                                'Negcue prev token one': neg_prev_one[0],
                                'Negcue prev token two': neg_prev_two[0],
                                'Negcue prev token three': neg_prev_three[0],
                                'Negcue prev token four': neg_prev_four[0],
                                'Negcue next token one': neg_next_one[0],
                                'Negcue next token two': neg_next_two[0],
                                'Negcue next token three': neg_next_three[0],
                                'Negation': neg}
                data.append(feature_dict)
                targets.append(components[-1])
    return data, targets

def create_classifier(train_features, train_targets, modelname):
    """Create classifier, feed it with the training features and labels"""   
    
    modelname == 'SVM'
    model = SVC()
    vec = DictVectorizer()
    features_vectorized = vec.fit_transform(train_features)#.toarray()
    model.fit(features_vectorized, train_targets)
    
    return model, vec

def get_predicted_and_gold_labels(model, vec, inputdata): #outputfile):
    """Make predictions from the test data and write it to an outputfile"""  

    features, gold_labels = extract_features_and_labels(inputdata)
    features = vec.transform(features)
    predictions = model.predict(features)

    return gold_labels, predictions

def print_confusion_matrix(predictions, goldlabels):
    '''
    Function that prints out a confusion matrix
    :param predictions: predicted labels
    :param goldlabels: gold standard labels
    :type predictions, goldlabels: list of strings
    :returns: confusion matrix
    '''

    # based on example from https://datatofish.com/confusion-matrix-python/
    data = {'Gold': goldlabels, 'Predicted': predictions}
    df = pd.DataFrame(data, columns=['Gold', 'Predicted'])

    confusion_matrix = pd.crosstab(df['Gold'], df['Predicted'], rownames=['Gold'], colnames=['Predicted'])
    print(confusion_matrix)
    
    return confusion_matrix


def print_precision_recall_fscore(predictions, goldlabels):
    '''
    Function that prints out precision, recall and f-score in a complete report
    :param predictions: predicted output by classifier
    :param goldlabels: original gold labels
    :type predictions, goldlabels: list of strings
    '''

    report = classification_report(goldlabels,predictions,digits = 3)

    print('METRICS: ')
    print()
    print(report)

    
def main(argv=None):
    """Run all the above functions"""    

    trainingfile = 'train.tsv'
    inputfile = 'test.tsv'
    outputfile = 'outputfile' + '.tsv'
    
    training_features, gold_labels = extract_features_and_labels(trainingfile)
    for modelname in ['SVM']:
        ml_model, vec = create_classifier(training_features, gold_labels, modelname)
        predictions, goldlabels = get_predicted_and_gold_labels(ml_model, vec, inputfile)
        #classify_data(ml_model, vec, inputfile)#, outputfile.replace('.tsv','.' + modelname + '.tsv'), data[-1])
        print()
        #print('---->'+ modelname + ' with ' + ' and '.join(selected_features) + ' as features <----')
        print_precision_recall_fscore(predictions, goldlabels)
        print('------')
        
        outfile = open(outputfile, 'w')
        counter = 0
        for line in open(inputfile, 'r'):
            if len(line.rstrip('\n').split()) > 0:
                outfile.write(line.rstrip('\n') + '\t' + predictions[counter] + '\n')
                counter += 1
        outfile.close()
    
#    return predictions

if __name__ == '__main__':
    main()


METRICS: 

              precision    recall  f1-score   support

     Negated      0.383     0.756     0.508        41
    Negation      1.000     1.000     1.000         1
  NotNegated      0.982     0.918     0.949       609

    accuracy                          0.908       651
   macro avg      0.788     0.891     0.819       651
weighted avg      0.945     0.908     0.921       651

------
