In [8]:
import nltk
import random

In [9]:
from nltk.corpus import names
nltk.download('names')

[nltk_data] Downloading package names to
[nltk_data]     /Users/danielyakubov/nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [69]:
male_names = names.words('male.txt')
female_names = names.words('female.txt')
labeled_names = []

def name_feature_extraction(name):
    return {'last_letter': name[-1], 'last_two_letters': name[-2:]}

# build a list of names labeled by gender
for m_name, f_name in zip(male_names, female_names):
    labeled_names.append((m_name, 'M'))
    labeled_names.append((f_name, 'F'))

random.shuffle(labeled_names)

#adding features
labeled_names = [(name_feature_extraction(name), label) for name, label in labeled_names]

# dividing into train, test
tenth = len(labeled_names)//10
train = labeled_names[:tenth*9]
test = labeled_names[tenth*9:]

# training naive bayes classifier
classifier = nltk.NaiveBayesClassifier.train(train)

In [70]:
# some small tests to see effects
for name in ['Neo', 'Trinity', 'Morpheus', 'Daniel', 'Isabela']:
    name_features = name_feature_extraction(name)
    print(name, classifier.classify(name_features))

Neo M
Trinity F
Morpheus M
Daniel M
Isabela F


In [71]:
# actual accuracy
print(nltk.classify.accuracy(classifier, test))

0.7828282828282829


In [72]:
# most informative
classifier.show_most_informative_features()

Most Informative Features
        last_two_letters = 'na'                F : M      =     89.2 : 1.0
        last_two_letters = 'la'                F : M      =     59.8 : 1.0
        last_two_letters = 'us'                M : F      =     36.9 : 1.0
        last_two_letters = 'ia'                F : M      =     35.4 : 1.0
             last_letter = 'a'                 F : M      =     33.1 : 1.0
        last_two_letters = 'ta'                F : M      =     32.8 : 1.0
        last_two_letters = 'rt'                M : F      =     32.3 : 1.0
        last_two_letters = 'ra'                F : M      =     25.9 : 1.0
        last_two_letters = 'sa'                F : M      =     25.7 : 1.0
             last_letter = 'k'                 M : F      =     18.7 : 1.0


In [44]:
print(len(labeled_names))

5886


In [73]:
# different task
# POS tagging

In [92]:
from nltk.corpus import brown

In [114]:
def pos_features(prev_word, word, suc_word):
    return {'suf1': word[-1],
           'suf2': word[-2:],
           'pre1': word[0],
           'pre2': word[:2],
           'previous_word': prev_word,
           'succeeding_word': suc_word}

tagged_data = [(w.lower(), tag) for w, tag in brown.tagged_words(categories='news')]
# first item built in
tagged_features = [(pos_features('NA', tagged_data[0][0], tagged_data[1][0]), tagged_data[0][1])]

for i, (word, tag) in enumerate(list(tagged_data)[1:-1]):
    prev_word, _ = tagged_data[i-1]
    suc_word, _ = tagged_data[i+1]
    tagged_features.append((pos_features(prev_word, word, suc_word), tag))
# last item
tagged_features.append((pos_features(tagged_data[-2][0], tagged_data[-1][0], 'NA'), tagged_data[-1][1]))

random.shuffle(tagged_features)

tenth = len(tagged_features)//10
test_set = tagged_features[:tenth]
dev_set = tagged_features[tenth:tenth*2]
train_set = tagged_features[tenth*2:]

# training the classifier
pos_classifier = nltk.NaiveBayesClassifier.train(train_set)

# sanity check
QA = ['Becky', 'said', 'the', 'car', 'is', 'red']
QA = [w.lower() for w in QA]

QA_features = [pos_features('NA', QA[0], QA[1])]

for i, word in enumerate(QA[1:-1]):
    QA_features.append(pos_features(QA[i-1], word, QA[i+1]))
QA_features.append(pos_features(QA[-2], QA[-1], 'NA'))

for item in QA_features:
    print(pos_classifier.classify(item))

BE
VBD
AT
NN
BEZ
VBN


In [116]:
# most informative features
pos_classifier.show_most_informative_features()

Most Informative Features
                    suf1 = '.'                 . : NN     =   6304.0 : 1.0
                    suf2 = 'he'               AT : NN     =   4946.1 : 1.0
                    pre2 = 'an'               CC : IN     =   4215.1 : 1.0
                    pre1 = 'c'               MD* : IN     =   2765.7 : 1.0
                    suf2 = 'ho'              WPS : NN     =   2688.2 : 1.0
                    pre1 = "'"                '' : NNS    =   2615.6 : 1.0
                    suf2 = 'to'               TO : JJ     =   2007.1 : 1.0
                    pre1 = 'l'               RBR : IN     =   1958.3 : 1.0
                    suf1 = 'h'               ABX : NNS    =   1951.8 : 1.0
                    pre2 = 'it'              PPS : NN     =   1779.8 : 1.0


In [117]:
# accuracy
print(nltk.classify.accuracy(pos_classifier, dev_set))
print(nltk.classify.accuracy(pos_classifier, test_set))

0.8189955246146196
0.8203878667329687


In [25]:
# Sklearn
from sklearn import feature_extraction
from sklearn import metrics
from sklearn import model_selection
from sklearn import naive_bayes

# corpus
import nltk
from nltk.corpus import udhr
nltk.download('udhr')

import random

from collections import Counter

[nltk_data] Downloading package udhr to
[nltk_data]     /Users/danielyakubov/nltk_data...
[nltk_data]   Package udhr is already up-to-date!


In [8]:
# what do we have available here?
udhr.fileids()

['Abkhaz-Cyrillic+Abkh',
 'Abkhaz-UTF8',
 'Achehnese-Latin1',
 'Achuar-Shiwiar-Latin1',
 'Adja-UTF8',
 'Afaan_Oromo_Oromiffa-Latin1',
 'Afrikaans-Latin1',
 'Aguaruna-Latin1',
 'Akuapem_Twi-UTF8',
 'Albanian_Shqip-Latin1',
 'Amahuaca',
 'Amahuaca-Latin1',
 'Amarakaeri-Latin1',
 'Amuesha-Yanesha-UTF8',
 'Arabela-Latin1',
 'Arabic_Alarabia-Arabic',
 'Asante-UTF8',
 'Ashaninca-Latin1',
 'Asheninca-Latin1',
 'Asturian_Bable-Latin1',
 'Aymara-Latin1',
 'Balinese-Latin1',
 'Bambara-UTF8',
 'Baoule-UTF8',
 'Basque_Euskara-Latin1',
 'Batonu_Bariba-UTF8',
 'Belorus_Belaruski-Cyrillic',
 'Belorus_Belaruski-UTF8',
 'Bemba-Latin1',
 'Bengali-UTF8',
 'Beti-UTF8',
 'Bichelamar-Latin1',
 'Bikol_Bicolano-Latin1',
 'Bora-Latin1',
 'Bosnian_Bosanski-Cyrillic',
 'Bosnian_Bosanski-Latin2',
 'Bosnian_Bosanski-UTF8',
 'Breton-Latin1',
 'Bugisnese-Latin1',
 'Bulgarian_Balgarski-Cyrillic',
 'Bulgarian_Balgarski-UTF8',
 'Cakchiquel-Latin1',
 'Campa_Pajonalino-Latin1',
 'Candoshi-Shapra-Latin1',
 'Caquinte-Latin

In [23]:
# the data
eng = udhr.sents(fileids='English-Latin1')
ger = udhr.sents(fileids='German_Deutsch-Latin1')
data = [(sent , 'English') for sent in eng] + [(sent, 'German') for sent in ger]
random.shuffle(data)
X, Y = zip(*data)
print(X[0], Y[0])

['Parents', 'have', 'a', 'prior', 'right', 'to', 'choose', 'the', 'kind', 'of', 'education', 'that', 'shall', 'be', 'given', 'to', 'their', 'children', '.'] English


In [37]:
ger_chars = set(''.join([''.join(s).lower() for s in ger]))
eng_chars = set(''.join([''.join(s).lower() for s in eng]))

ger_only_chars = ger_chars - eng_chars # want to find out some obvious giveaways that a text is german
print(ger_only_chars)
eng_only_chars = eng_chars - ger_chars # the letter 'q' apparently
print(eng_only_chars)

{'ü', 'ä', '(', ')', 'ö', 'ß'}
{'q'}


In [65]:
ger_only_chars = ['ü', 'ä', 'ö', 'ß'] # manually removed symbols
eng_only_chars = ['q']
chars_to_check = ger_only_chars + eng_only_chars

def feature_extraction_ge(sent_toks):
    """takes a list and returns a dict of features
    the features I care about for this are 
    1) has certain characters (obvious giveaway)
    2) how many words are title cased, German title cases all nouns
    3) Average word length """
    C = Counter()
    
    # we don't really need counts of these, 0,1 work fine, so using a set is okay
    chars_in_sent = set(''.join(sent_toks).lower())
    for char in chars_to_check:
        if char in chars_in_sent:
            C[f'has_let_{char}'] += 1
        else:
            C[f'has_let_{char}'] = 0
    
    for tok in sent_toks:
        if tok.istitle():
            C['title_case_cnt'] += 1
            
    avg_word_len = sum([len(tok) for tok in sent_toks])/len(sent_toks)
    C["avg_word_length"] += avg_word_len
    
    return C

In [66]:
# now to convert our X to features
X_features = [feature_extraction_ge(sent) for sent in X]

#dict vectorize
vectorizer = feature_extraction.DictVectorizer(sparse=False)
vect_X = vectorizer.fit_transform(X_features)

In [98]:
# train test split
X_train, X_test, y_train, y_test = model_selection.train_test_split(vect_X, Y, test_size=0.2)
# classifier
classifier = naive_bayes.MultinomialNB(alpha=1)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print(accuracy)
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)

0.9230769230769231
[[ 9  0]
 [ 2 15]]


In [99]:
# what was the mistake in classification?
for x, pred, gold in zip(X_test, y_pred, y_test):
    if pred != gold:
        print(f"""{x}
            pred = {pred}
            gold = {gold}""")

[8. 0. 0. 0. 0. 0. 2.]
            pred = English
            gold = German
[6.28571429 0.         0.         0.         0.         0.
 4.        ]
            pred = English
            gold = German


In [102]:
fun_text = ['Hello my name is daniel how do you feel today? I feel fine its actually awesome',
           'There is nothing in the world so irresistibly contagious as laughter and good humor.',
           'Der Ball ist rund. Das Spiel dauert 90 Minuten',
            'Shall I compare thee to a summer\'s day?',
           'Alles hat ein Ende, nur die Wurst hat zwei',
           'Übung macht den Meister',
           'Sweet are the uses of adversity which, like the toad, ugly and venomous, wears yet a precious jewel in his head.']

gold_y = ['English', 'English', 'German', 'English', 'German', 'German', 'English']
split_text = [s.split() for s in fun_text]
features = [feature_extraction_ge(s) for s in split_text]
fun_vects = vectorizer.fit_transform(features)
pred_y = classifier.predict(fun_vects)

for t, g, p in zip(fun_text, gold_y, pred_y):
    print(f"""text = {t}
    gold = {g}
    pred = {p}""")

text = Hello my name is daniel how do you feel today? I feel fine its actually awesome
    gold = English
    pred = English
text = There is nothing in the world so irresistibly contagious as laughter and good humor.
    gold = English
    pred = English
text = Der Ball ist rund. Das Spiel dauert 90 Minuten
    gold = German
    pred = German
text = Shall I compare thee to a summer's day?
    gold = English
    pred = English
text = Alles hat ein Ende, nur die Wurst hat zwei
    gold = German
    pred = English
text = Übung macht den Meister
    gold = German
    pred = German
text = Sweet are the uses of adversity which, like the toad, ugly and venomous, wears yet a precious jewel in his head.
    gold = English
    pred = English
