In [53]:
import re
import pandas as pd
import numpy as np
import math
import random

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict

from nltk.corpus import stopwords
from nltk.corpus import genesis
from nltk.corpus import udhr

# Creating corpus

In [54]:
english = genesis.words("english-kjv.txt")
english_web = genesis.words("english-web.txt")
finnish = genesis.words("finnish.txt")
french = genesis.words("french.txt")
portuguese = genesis.words("portuguese.txt")

languages = ['English-Latin1', 'German_Deutsch-Latin1', 'French_Francais-Latin1', 'Spanish-Latin1']

d = {'genesis_corpus': [english, english_web, french, finnish,portuguese], 'language': [1,1, 0,0,0]}
for language in languages:
    if language != 'English-Latin1':
        d['genesis_corpus'].append(udhr.words(language))
        d['language'].append(0)
    else:
        d['genesis_corpus'].append(udhr.words(language))
        d['language'].append(1)
df = pd.DataFrame(data=d)

# Pre-processing

In [55]:
def remove_nonalpha(string):
    results = [word for word in string if re.match(r'[a-zA-Z]+',word)]
    return results

In [56]:
df['genesis_corpus']=df['genesis_corpus'].apply(lambda cw : remove_nonalpha(cw))

In [57]:
def remove_stopwords(string):
    stop_words = set(stopwords.words('english') + stopwords.words('french') + 
    stopwords.words('finnish') + stopwords.words('portuguese') + 
    stopwords.words('german')+ stopwords.words('spanish'))
    results = []
    for word in string:
        if word not in stop_words: results.append(word.lower())
    return ' '.join(results)

In [58]:
df['genesis_corpus']=df['genesis_corpus'].apply(lambda cw : remove_stopwords(cw))

# Tokenization and creation of the BoW

In [59]:
import nltk
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
#lemmatizer = nltk.stem.WordNetLemmatizer()

In [60]:
def bagofwords(corpus):
    wordfreq = {}
    tokens = w_tokenizer.tokenize(corpus)
    for token in tokens:
        if token not in wordfreq.keys():
            wordfreq[token] = 1
        else:
            wordfreq[token] += 1
    return wordfreq

In [61]:
df['genesis_corpus'] = df['genesis_corpus'].apply(lambda cw : bagofwords(cw))

# Creating labeled corpus in the correct format

In [62]:
no_eng = list(np.where(df['language'] == 0)[0])
eng = list(np.where(df['language'] == 1)[0])

In [63]:
labeled_corpusno = [
    ({word:freq}, 'non-english') 
    for corp in df.iloc[no_eng]['genesis_corpus']
    for word,freq in corp.items()
]
labeled_corpuseng = [
    ({word:freq}, 'english') 
    for corp in df.iloc[eng]['genesis_corpus']
    for word,freq in corp.items()
    ]
featureset = labeled_corpuseng + labeled_corpusno
random.shuffle(featureset)

## Creating training and testing set

In [64]:
train = featureset[:math.ceil(2*(len(featureset)/3))]
test = featureset[math.ceil(2*(len(featureset)/3)):]

# Creating the classifier

In [65]:
classifier = nltk.NaiveBayesClassifier.train(train)

In [66]:
classifier.show_most_informative_features(15)

Most Informative Features
                 abimael = 1              englis : non-en =      4.5 : 1.0
                   altar = 13             englis : non-en =      4.5 : 1.0
                   alvan = 1              englis : non-en =      4.5 : 1.0
                 archers = 1              englis : non-en =      4.5 : 1.0
                   bedad = 1              englis : non-en =      4.5 : 1.0
              concubines = 1              englis : non-en =      4.5 : 1.0
                 dodanim = 1              englis : non-en =      4.5 : 1.0
                    ebal = 1              englis : non-en =      4.5 : 1.0
                 ellasar = 2              englis : non-en =      4.5 : 1.0
                    gaza = 1              englis : non-en =      4.5 : 1.0
                  gopher = 1              englis : non-en =      4.5 : 1.0
                   hamul = 1              englis : non-en =      4.5 : 1.0
                   heber = 1              englis : non-en =      4.5 : 1.0

In [67]:
print(nltk.classify.accuracy(classifier, test))

0.7861942577886377


In [68]:
test_wolabes = [item[0] for item in test]

In [69]:
test_classified = classifier.classify_many(test_wolabes)

In [70]:
reference = [elem[1]
    for elem in test 
]

In [71]:
from nltk.metrics import ConfusionMatrix
cm = ConfusionMatrix(reference,test_classified)

In [72]:
print(cm)

            |         n |
            |         o |
            |         n |
            |         - |
            |    e    e |
            |    n    n |
            |    g    g |
            |    l    l |
            |    i    i |
            |    s    s |
            |    h    h |
------------+-----------+
    english | <458>1325 |
non-english |   75<4690>|
------------+-----------+
(row = reference; col = test)



In [73]:
cm.precision('english')

0.8592870544090057

In [74]:
cm.recall('english')

0.2568704430734717

In [75]:
cm.precision('non-english')

0.7797173732335827

In [76]:
cm.recall('non-english')

0.9842602308499475

In [77]:
cm.f_measure('english')

0.3955094991364422

In [78]:
cm.f_measure('non-english')

0.8701298701298701