In [1]:
import math
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy
import pandas as pd
import scipy
import scipy.stats
from sklearn.metrics import classification_report

In [2]:
# Training data; the first CSV column becomes the row index.
train = pd.read_csv(
    '../data/processed/df_train.csv',
    index_col=0,
)


In [3]:
# Test data; the first CSV column becomes the row index.
test = pd.read_csv(
    '../data/processed/df_test.csv',
    index_col=0,
)

In [4]:
# Labels for the test set; the first CSV column becomes the row index.
test_set_labels = pd.read_csv(
    '../data/processed/test_set_labels.csv',
    index_col=0,
)

## Create a function that performs language identification on a single test input

In [5]:
# Pull six columns of the test frame out as plain lists, one per column label.
test_case1, test_case2, test_case3, test_case4, test_case5, test_case6 = (
    list(test[label]) for label in ('cs', 'en', 'sv', 'de', 'es', 'fr')
)


In [6]:
def identifyLanguage(traindf, testInput):
    """Return the label of the training column closest to ``testInput``.

    For every column of ``traindf`` the relative entropy (Kullback-Leibler
    divergence) of ``testInput`` with respect to that column is computed with
    ``scipy.stats.entropy(testInput, qk=column)``. The label of the column
    with the smallest value is returned.

    Parameters
    ----------
    traindf : pandas.DataFrame
        One column per candidate language.
        NOTE(review): presumably rows are trigrams and values are
        frequencies -- confirm against the preprocessing notebook.
    testInput : array-like
        Values for the text to classify, positionally aligned with the rows
        of ``traindf``.

    Returns
    -------
    The column label of ``traindf`` with the lowest relative entropy.

    Raises
    ------
    ValueError
        If ``traindf`` has no columns (``min`` of an empty mapping).
    """
    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
    entropy_dict = {
        columnName: scipy.stats.entropy(testInput, qk=columnData)
        for columnName, columnData in traindf.items()
    }
    # The smallest divergence marks the most likely language.
    return min(entropy_dict, key=entropy_dict.get)


In [7]:
# Sanity check: print the predicted language for each of the six sample columns.
print(
    *(
        identifyLanguage(train, case)
        for case in (test_case1, test_case2, test_case3,
                     test_case4, test_case5, test_case6)
    ),
    sep='\n',
)

cs
en
sv
de
es
fr


# Demo for Aurelien

#### Some functions created to preprocess new text, taken from the preprocessing notebook

In [8]:
def word2ngrams(text, n, exact=True):
    """Split ``text`` into its overlapping character n-grams.

    ``exact`` is accepted for interface compatibility but is not used.
    Returns a list of strings of length ``n``; the list is empty when
    ``text`` is shorter than ``n`` or ``n`` is not positive.
    """
    # n copies of the text, each shifted one character further to the left;
    # zipping them lines up the characters of every n-gram.
    shifted = [text[offset:] for offset in range(n)]
    ngrams = []
    for chars in zip(*shifted):
        ngrams.append("".join(chars))
    return ngrams

In [9]:
def preprocessInput(text):
    """Normalise raw text and split it into word tokens.

    Steps, in order: lowercase the text, replace every run of digits with a
    single space, delete every character in ``string.punctuation`` (ASCII
    punctuation only), strip leading/trailing whitespace, and tokenize with
    ``nltk.word_tokenize``.

    Parameters
    ----------
    text : str
        The raw input text.

    Returns
    -------
    The tokens produced by ``nltk.word_tokenize`` for the cleaned text.
    """
    # lower case
    processed = text.lower()
    # remove numbers (raw string: "\d" in a plain literal is an invalid escape)
    processed = re.sub(r"\d+", " ", processed)
    # remove punctuation; re.escape keeps characters such as ']', '\', '^'
    # and '-' literal inside the character class
    processed = re.sub('[' + re.escape(string.punctuation) + ']', '', processed)
    # remove surrounding whitespace
    processed = processed.strip()
    # word tokenize
    return nltk.word_tokenize(processed)

In [10]:
def createTrigrams(wordTokens, traindf):
    """Build a relative trigram-frequency vector aligned with ``traindf``'s index.

    Every token is split into character trigrams with ``word2ngrams``; the
    trigrams are counted, the counts are aligned with ``traindf.index`` and
    divided by the total count of the trigrams that occur in that index.

    Parameters
    ----------
    wordTokens : iterable of str
        Tokens of the text to classify. Tokens shorter than three characters
        contribute no trigrams.
    traindf : pandas.DataFrame
        Training frame whose index defines the order and the set of entries
        in the result.

    Returns
    -------
    list
        One value per row of ``traindf``. Entries absent from the input are
        0; if nothing in the input matches the index, every value is 0.
    """
    # Flatten the trigrams of all tokens in one pass.
    trigrams_joined = [
        gram for token in wordTokens for gram in word2ngrams(token, 3)
    ]
    # dtype=object keeps an empty input from defaulting to a float Series.
    trigram_counts = pd.Series(trigrams_joined, dtype=object).value_counts()
    # Align with the frame that was passed in rather than a notebook global;
    # entries of the index not seen in the input become NaN.
    aligned = trigram_counts.reindex(traindf.index)
    # Normalise by the total of the matched counts (NaN is skipped by sum),
    # then turn NaN (unseen entries, or 0/0 when nothing matched) into 0.
    frequencies = aligned.div(aligned.sum()).fillna(0)
    return list(frequencies)

## Aurelien, please key in new input text here:

In [11]:
# Demo text to classify; replace this string to try another sample.
inputText = 'Lors de ses brillantes études de droit à luniversité Brown, Joe Paterno joue au football américain et est entraîné par Rip Engle. Ce dernier,'

### Run the following code to see if it works

In [12]:
# Lowercase the demo text, drop digits and punctuation, and tokenize it.
processed = preprocessInput(inputText)

In [13]:
# Turn the tokens into a trigram-frequency list ordered like the training index.
test_case_final = createTrigrams(processed,train)

In [14]:
# Print the language predicted for the demo text.
print(identifyLanguage(train, test_case_final))

fr


# Using the Train Set to Perform Language Identification on the Test Set

In [16]:
predictionList = []
# Loop through the columns of the test frame (each column is one test sample).
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
for (columnNamei, columnDatai) in test.items():
    # use the identifyLanguage function written earlier
    prediction = identifyLanguage(train, columnDatai)
    # collect the predictions in column order
    predictionList.append(prediction)

In [17]:
# Ground-truth labels (y_test): the 'language' column of the test-set labels.
testList = test_set_labels['language']

In [20]:
# Use the training columns (one per language) as the label list, so the
# classification report lists its rows in that order.
target_label = train.columns

In [21]:
# Per-language precision, recall, F1 and support of the predictions against the true labels.
print(classification_report(testList,predictionList,labels=target_label))

              precision    recall  f1-score   support

          bg       1.00      0.99      1.00       394
          cs       0.99      1.00      1.00       315
          da       1.00      1.00      1.00       693
          de       1.00      1.00      1.00       643
          el       1.00      1.00      1.00       383
          en       1.00      1.00      1.00       599
          es       1.00      1.00      1.00       607
          et       1.00      1.00      1.00       306
          fi       1.00      1.00      1.00       597
          fr       1.00      1.00      1.00       607
          hu       1.00      1.00      1.00       309
          it       1.00      1.00      1.00       551
          lt       1.00      1.00      1.00       405
          lv       1.00      1.00      1.00       392
          nl       1.00      1.00      1.00       731
          pl       1.00      1.00      1.00       314
          pt       1.00      1.00      1.00       586
          ro       1.00    

#### Slight confusion between Bulgarian and Czech, both of which belong to the Slavic family.