# Imports

In [5]:
import pandas as pd
import numpy as np
import csv
import nltk
import sklearn
import pyphen
import random
import time
import string
import itertools
import time
random.seed(42)

# Scoring

In [6]:
def report_score(gold_labels, predicted_labels, detailed=False):
    macro_F1 = sklearn.metrics.f1_score(gold_labels, predicted_labels, average='macro')
    print("macro-F1: {:.2f}".format(macro_F1))
    if detailed:
        scores = sklearn.metrics.precision_recall_fscore_support(gold_labels, predicted_labels)
        print("{:^10}{:^10}{:^10}{:^10}{:^10}".format("Label", "Precision", "Recall", "F1", "Support"))
        print('-' * 50)
        print("{:^10}{:^10.2f}{:^10.2f}{:^10.2f}{:^10}".format(0, scores[0][0], scores[1][0], scores[2][0], scores[3][0]))
        print("{:^10}{:^10.2f}{:^10.2f}{:^10.2f}{:^10}".format(1, scores[0][1], scores[1][1], scores[2][1], scores[3][1]))
    print()
    
    
def get_score(gold_labels, predicted_labels): 
    macro_F1 = sklearn.metrics.f1_score(gold_labels, predicted_labels, average='macro')
    return macro_F1


# DataSet

In [7]:
import pandas as pd
class Dataset(object):

    def __init__(self, language):
        self.language = language

        trainset_path = "../datasets/{}/{}_Train.tsv".format(language, language.capitalize())
        devset_path = "../datasets/{}/{}_Dev.tsv".format(language, language.capitalize())

        self.trainset = self.read_dataset(trainset_path)
        self.devset = self.read_dataset(devset_path)

    def read_dataset(self, file_path):
        with open(file_path) as file:
            fieldnames = ['hit_id', 'sentence', 'start_offset', 'end_offset', 'target_word', 'native_annots',
                          'nonnative_annots', 'native_complex', 'nonnative_complex', 'gold_label', 'gold_prob']
            
            dataset = pd.read_csv(file, names = fieldnames, sep = "\t")


        return dataset

# Model


In [59]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer
import scipy

class Baseline(object):

    def __init__(self, language, model):
        self.language = language
        # from 'Multilingual and Cross-Lingual Complex Word Identification' (Yimam et. al, 2017)
        if language == 'english':
            self.avg_word_length = 5.3
        else:  # spanish
            self.avg_word_length = 6.2
            
        self.d = pyphen.Pyphen(lang='en')
        #self.dv = DictVectorizer(sparse = False)

        if model == "dtc":
            self.model = DecisionTreeClassifier(random_state=0)
        elif model == "nb":
            self.model = GaussianNB()
            
        elif model == "lr":
            self.model = LogisticRegression()
            
        else:
            print("Error: choose model")
            
    def extract_word_features(self, word, *args):
        # here are my basic features:
        # - len chars = word length
        # - len tokens = phrase length
        # - len uniq =  ratio of unique characters in word
        # - len vowels = ratio of vowels in word
        # - len const = ratio of constonants in word
        # - len syl = number of syllables
        
        # - final baseline system uses tokens, uniq, and const based on feature analyis
        
        len_chars = len(word) / self.avg_word_length
        len_tokens = len(word.split(' '))
        len_uniq = len(set(word))/len(word)
        len_vowels = len([letter for letter in word.split() if letter in set("aeiou")])/len(word)
        len_const = len([letter for letter in word.split() if letter not in set("aeiou")])/len(word)
        len_syl = len(self.d.inserted(word).split("-"))
   
        # dictionary to store the features in, in order to access later when testing individual features
        
        features_dict = {"chars":len_chars,"tokens": len_tokens, "unique": len_uniq,
                    "vowels": len_vowels, "const":len_const, "syl": len_syl,}
            
        if args:
            features = []
            for f in args:
                features.append(features_dict[f])
            return features
        else:
            features = [len_chars, len_tokens, len_uniq,
                    len_vowels, len_const,  len_syl,]
            return features
    

        
    def train(self, trainset, *args):
        X = []
        y = []
        
        for idx,sent in trainset.iterrows():
            X.append(self.extract_word_features(sent['target_word'], *args))
            y.append(sent['gold_label'])
            
        #X = self.dv.fit_transform(X)
            
        return self.model.fit(X, y)
        
    def test(self, testset, *args):
        X = []

        y = []
        
        for idx,sent in testset.iterrows():
            X.append(self.extract_word_features(sent['target_word'], *args))
            y.append(sent['gold_label'])
        
        #X = self.dv.transform(X)

        return self.model.predict(X)
    
    def feature_importances(self):
        return self.model.feature_importances_
    

In [57]:
def run_model(language, model, feature_analysis = False, *args):
    model = Baseline(language, model)
    data = Dataset(language)
        
    if feature_analysis == True:
        features= ["chars", "tokens", "unique", 
                    "vowels", "const", "syl"]
        f1 = []
        for f in features:
            model.train(data.trainset, f)
            predictions = model.test(data.devset, f)
            gold_labels = data.devset['gold_label']
            f1.append(get_score(gold_labels, predictions))
        return f1

            
            
        
    else:
        
        print("{}: {} training - {} dev".format(language, len(data.trainset), len(data.devset)))

        model.train(data.trainset, *args)

        predictions = model.test(data.devset, *args)

        gold_labels = data.devset['gold_label']

        report_score(gold_labels, predictions)


In [None]:

start = time.time()   
stats = []
for model in ["dtc", "nb", "lr"]:
    print("------ %s ------" %model)
    stats.append(run_model('english', model = model, feature_analysis = True))

fin = time.time()

print(fin-start)

------ dtc ------
------ nb ------
------ lr ------


In [52]:
df = pd.DataFrame(stats, index = ["Decision Tree Classifier", "Naive Bayes", "Logistic Regression"],
                  columns = ["chars","tokens", "unique","vowels", "const", "syl"]).T
df.head(10)

Unnamed: 0,Decision Tree Classifier,Naive Bayes,Logistic Regression
chars,0.708899,0.588843,0.666493
tokens,0.574645,0.574992,0.574992
unique,0.662108,0.615873,0.621597
vowels,0.36881,0.369113,0.368261
const,0.697302,0.670847,0.671383
syl,0.660097,0.571551,0.660327


In [56]:
start = time.time()   
3
for model in ["dtc", "nb", "lr"]:
    features = []
    print("------ %s ------" %model)
    run_model('english', model = model)
    run_model('spanish', model = model)
    

fin = time.time()

print(fin-start)

------ dtc ------
english: 27299 training - 3328 dev
macro-F1: 0.71

spanish: 13750 training - 1622 dev
macro-F1: 0.72

------ nb ------
english: 27299 training - 3328 dev
macro-F1: 0.66

spanish: 13750 training - 1622 dev
macro-F1: 0.65

------ lr ------
english: 27299 training - 3328 dev
macro-F1: 0.69

spanish: 13750 training - 1622 dev
macro-F1: 0.72

10.550925016403198
