# Imports

In [6]:
import pandas as pd
import numpy as np
import csv
import nltk
import sklearn
import pyphen
import random
import time
import string
import itertools
import time
random.seed(42)

# Scoring

In [7]:
def report_score(gold_labels, predicted_labels, detailed=False):
    macro_F1 = sklearn.metrics.f1_score(gold_labels, predicted_labels, average='macro')
    print("macro-F1: {:.2f}".format(macro_F1))
    if detailed:
        scores = sklearn.metrics.precision_recall_fscore_support(gold_labels, predicted_labels)
        print("{:^10}{:^10}{:^10}{:^10}{:^10}".format("Label", "Precision", "Recall", "F1", "Support"))
        print('-' * 50)
        print("{:^10}{:^10.2f}{:^10.2f}{:^10.2f}{:^10}".format(0, scores[0][0], scores[1][0], scores[2][0], scores[3][0]))
        print("{:^10}{:^10.2f}{:^10.2f}{:^10.2f}{:^10}".format(1, scores[0][1], scores[1][1], scores[2][1], scores[3][1]))
    print()
    
    
def get_score(gold_labels, predicted_labels): 
    macro_F1 = sklearn.metrics.f1_score(gold_labels, predicted_labels, average='macro')
    return macro_F1


# DataSet

In [207]:
import pandas as pd
class Dataset(object):

    def __init__(self, language):
        self.language = language

        trainset_path = "../datasets/{}/{}_Train.tsv".format(language, language.capitalize())
        devset_path = "../datasets/{}/{}_Dev.tsv".format(language, language.capitalize())

        self.trainset = self.read_dataset(trainset_path)
        self.devset = self.read_dataset(devset_path)

    def read_dataset(self, file_path):
        with open(file_path) as file:
            fieldnames = ['hit_id', 'sentence', 'start_offset', 'end_offset', 'target_word', 'native_annots',
                          'nonnative_annots', 'native_complex', 'nonnative_complex', 'gold_label', 'gold_prob']
            
            dataset = pd.read_csv(file, names = fieldnames, sep = "\t")

            #dataset = [sent for sent
        return dataset

# Custom Transformers

In [216]:
# adapted from 
# https://opendevincode.wordpress.com/2015/08/01/building-a-custom-python-scikit-learn-transformer-for-machine-learning/
# and http://michelleful.github.io/code-blog/2015/06/20/pipelines/
# http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html#sphx-glr-auto-examples-hetero-feature-union-py

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

class Selector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    
    def fit(self, X, *_):
        return self
    
    def transform(self, df):
        return df[self.key]
    
class Suffix_Extractor(BaseEstimator, TransformerMixin):
    def _init__(self):
        pass
    
    def transform(self, X, *_):
        result = []
        for word in X:
            row_dict = {word:1}
            result.append(row_dict)
        return result
            
    def fit(self, *_):
        return self


class WordFeatureExtractor(BaseEstimator, TransformerMixin):
    # here are my basic features:
        # - len chars = word length
        # - len tokens = phrase length
        # - len uniq =  ratio of unique characters in word
        # - len vowels = ratio of vowels in word
        # - len const = ratio of constonants in word
        # - len syl = number of syllables
        
        # - final baseline system uses tokens, uniq, and const based on feature analyis
        
    def __init__(self,language):
        language = language
        # from 'Multilingual and Cross-Lingual Complex Word Identification' (Yimam et. al, 2017)
        if language == 'english':
            self.avg_word_length = 5.3
        else:  # spanish
            self.avg_word_length = 6.2
            
        self.d = pyphen.Pyphen(lang='en')
    
    def transform(self, X, *_):
        result = []
        for word in X:
            len_chars = len(word) / self.avg_word_length
            len_tokens = len(word.split(' '))
            len_uniq = len(set(word))/len(word)
            len_vowels = len([letter for letter in word.split() if letter in set("aeiou")])/len(word)
            len_const = len([letter for letter in word.split() if letter not in set("aeiou")])/len(word)
            len_syl = len(self.d.inserted(word).split("-"))

            # dictionary to store the features in, in order to access later when testing individual features
            row_dict = {"chars":len_chars,"tokens": len_tokens, "unique": len_uniq,
                        "vowels": len_vowels, "const":len_const, "syl": len_syl,}

            result.append(row_dict)
        return result
    
    def fit(self, *_):
        return self
    

# Model


In [217]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, LabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer

import scipy

class DTC_Model(object):

    def __init__(self, language):
        self.language = language
        # from 'Multilingual and Cross-Lingual Complex Word Identification' (Yimam et. al, 2017)
        if language == 'english':
            self.avg_word_length = 5.3
        else:  # spanish
            self.avg_word_length = 6.2
            
        self.d = pyphen.Pyphen(lang='en')
        self.dv = DictVectorizer()
        self.cv = CountVectorizer(ngram_range = (1,3))
        #self.cv = TfidfVectorizer(ngram_range = (2,2))
        self.model = DecisionTreeClassifier(class_weight = "balanced", random_state=0)
        self.build_pipe()
        
    def build_pipe(self):
        word_features = Pipeline([('select', Selector(key="target_word"))] +
            [( 'wfe', WordFeatureExtractor(language) )] +
            [( 'dv', DictVectorizer() )])

        Ngrams = Pipeline([('select', Selector(key = "sentence"))]+
            [('cv', CountVectorizer())])
        
        suffix = Pipeline([('select', Selector(key='target_word'))]+
                          [('suf',Suffix_Extractor())]+
                          [( 'dv', DictVectorizer())])


        f_union = Pipeline([('union', FeatureUnion(transformer_list = [
                            ('words', word_features),('ngrams', Ngrams),('sffx', suffix)]))])
        self.pipe = f_union
        

    def extract_word_features(self, word, *args):
        # here are my basic features:
        # - len chars = word length
        # - len tokens = phrase length
        # - len uniq =  ratio of unique characters in word
        # - len vowels = ratio of vowels in word
        # - len const = ratio of constonants in word
        # - len syl = number of syllables
        
        # - final baseline system uses tokens, uniq, and const based on feature analyis
        
        len_chars = len(word) / self.avg_word_length
        len_tokens = len(word.split(' '))
        len_uniq = len(set(word))/len(word)
        len_vowels = len([letter for letter in word.split() if letter in set("aeiou")])/len(word)
        len_const = len([letter for letter in word.split() if letter not in set("aeiou")])/len(word)
        len_syl = len(self.d.inserted(word).split("-"))
   
        # dictionary to store the features in, in order to access later when testing individual features
        features_dict = {"chars":len_chars,"tokens": len_tokens, "unique": len_uniq,
                    "vowels": len_vowels, "const":len_const, "syl": len_syl,}
            
        return features_dict
    
    def extract_context_features(self, sent):
        
        ## update this feature properly
        sentence = sent['sentence']
        word = sent['target_word']
        sentence = [sentence.replace(word,"")]
        return(self.cv.transform(sentence))


    def get_features(self, trainset):
        pass
        
        
    def train(self, trainset, *args):
        X = self.pipe.fit_transform(trainset)
        y = trainset['gold_label']
       
        return self.model.fit(X, y)
        
    def test(self, testset, *args):
        X = self.pipe.transform(testset)
        return self.model.predict(X)
    
    def feature_importances(self):
        return self.model.feature_importances_
    

In [218]:
def run_model(language, *args):
    data = Dataset(language)
    print("{}: {} training - {} dev".format(language, len(data.trainset), len(data.devset)))

    model = DTC_Model(language)

    model.train(data.trainset, *args)


    predictions = model.test(data.devset, *args)
    
    gold_labels = data.devset['gold_label']

    #gold_labels = [sent['gold_label'] for sent in data.devset]

    report_score(gold_labels, predictions)
    
    #print(model.feature_importances())
    
    

In [219]:

start = time.time()   

run_model('english',)
run_model('spanish',)

fin = time.time()

print(fin-start)

english: 27299 training - 3328 dev
macro-F1: 0.77

spanish: 13750 training - 1622 dev
macro-F1: 0.70

20.55146098136902


In [28]:
import string
import itertools
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer

alphabet = "abcde"
char_bigrams = [ x[0]+x[1] for x in itertools.product(alphabet, repeat = 2)]
print(char_bigrams[:5])


cv = CountVectorizer()
cv.fit(char_bigrams)
cv.get_params()

x = cv.transform(["aa ab"])
x.toarray()

['aa', 'ab', 'ac', 'ad', 'ae']


array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0]])

In [46]:
dv = DictVectorizer()
x = [{"chars":6,"syl":3},{"chars":3, "syl": 1}]
dv.fit(x)

dv.transform(x)


<2x2 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

'hi my  is dan'

In [119]:
CV = CountVectorizer()
words_1 = ["hello world"]
words_2 = ["Hi dan"]
CV.fit(words_1)
CV.fit(words_2)
words = words_1 + words_2

sent = ["The barren islands, reefs and coral outcrops are believed to be in rich in oil and gas and the overlapping claims have long been feared as Asia's next flashpoint for armed conflict."]

y = CV.transform(words).toarray()
y = CV.fit_transform(sent).toarray()
y

array([[3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
        1, 1, 1, 1, 2, 1]], dtype=int64)

['hello', 'my', 'name', 'is', 'dan']

In [121]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

class WordFeatureExtractor(BaseEstimator, TransformerMixin):
    # here are my basic features:
        # - len chars = word length
        # - len tokens = phrase length
        # - len uniq =  ratio of unique characters in word
        # - len vowels = ratio of vowels in word
        # - len const = ratio of constonants in word
        # - len syl = number of syllables
        
        # - final baseline system uses tokens, uniq, and const based on feature analyis
        
    def __init__(self,language):
        language = language
        # from 'Multilingual and Cross-Lingual Complex Word Identification' (Yimam et. al, 2017)
        if language == 'english':
            self.avg_word_length = 5.3
        else:  # spanish
            self.avg_word_length = 6.2
            
        self.d = pyphen.Pyphen(lang='en')
    
    def transform(self, X, *_):
        result = []
        for word in X:
            len_chars = len(word) / self.avg_word_length
            len_tokens = len(word.split(' '))
            len_uniq = len(set(word))/len(word)
            len_vowels = len([letter for letter in word.split() if letter in set("aeiou")])/len(word)
            len_const = len([letter for letter in word.split() if letter not in set("aeiou")])/len(word)
            len_syl = len(self.d.inserted(word).split("-"))

            # dictionary to store the features in, in order to access later when testing individual features
            row_dict = {"chars":len_chars,"tokens": len_tokens, "unique": len_uniq,
                        "vowels": len_vowels, "const":len_const, "syl": len_syl,}

            result.append(row_dict)
        return result
    
    def fit(self, *_):
        return self

In [165]:

language = "english"
data = Dataset(language)

#data.trainset['gold_label']


In [194]:
suffix = Pipeline([('select', Selector(key='target_word'))]+
                  [('suf',Suffix_Extractor())])

SFX = suffix.fit_transform(data.trainset)
print(SFX[:10])
LB = CountVectorizer()
#LB = LabelBinarizer()
print(len(SFX))


LB.fit_transform(SFX).shape

['ren', 'nds', 'efs', 'nds', 'ral', 'ops', 'ops', 'ing', 'ved', 'ich']
27299


(27299, 1460)

In [205]:
class My_One_Hot(BaseEstimator, TransformerMixin):
    def __init(self):
        pass
    def fit(self, X, *_):
        return self
    def transform(self,X,*_):
        X = X.apply((lambda x: x[-3:]))
        result = pd.get_dummies(X)
        return result

In [206]:
MOH = My_One_Hot()
moh = MOH.fit_transform(data.trainset['target_word'])
moh

Unnamed: 0,GD,II,MP,TV,ad,al,as,at,by,go,...,zow,Âğâ,âr,çuz,ène,íly,üle,نان,央大學,子大学
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [132]:
pipeline = Pipeline([('select', Selector(key="target_word"))] +
    [( 'wfe', WordFeatureExtractor(language) )] +
    [( 'dv', DictVectorizer() )])

pipeline2 = Pipeline([('select', Selector(key = "sentence"))]+
    [('cv', CountVectorizer())])


union_pipeline = Pipeline([('union', FeatureUnion(transformer_list = [('words', pipeline),('contexts', pipeline2)]))])
X = union_pipeline.fit_transform(data.trainset)

X

<27299x7494 sparse matrix of type '<class 'numpy.float64'>'
	with 798522 stored elements in Compressed Sparse Row format>

In [110]:
class Selector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, df):
        return df[self.key]

In [114]:
S = Selector(key = 'target_word')
selected = S.fit_transform(data.trainset)
for i in range(10):
    print(selected[i])

barren
barren islands
reefs
islands
coral
coral outcrops
outcrops
overlapping
believed
rich
