In [505]:
import numpy as np
import re
import math
import string
import random
from random import randrange
from pprint import pprint

In [506]:
n = [2,3,4]  # bigrams, trigrams and quadgrams
k = 40 # top 40 n-grams to calculate distance metric

In [507]:
def generate_n_grams(words, k, num_gram): # top k frequent n-grams
    ngrams = {}
    
    sentence = re.sub('['+string.punctuation+']', '', words)
    sentence = sentence.lower()
    sentence = re.sub('\s+', ' ', sentence).strip()
    sentence = sentence.replace(' ','_')

    for i in range(len(sentence) - num_gram):
        idx = sentence[i : i + num_gram]
        if idx in ngrams:
            ngrams[idx] += 1
        else:
            ngrams[idx] = 1
            
    total_freq = len(sentence) - num_gram + 1
    
    for key in ngrams.keys():
        reduction_factor = 1 
        if '_' in key: reduction_factor = 2
        ngrams[key] = round(math.log(ngrams[key] / (reduction_factor * total_freq)), 3)                             
        
    ngrams = sorted(ngrams.items(), key = lambda item : item[1], reverse = True) 

    final_ngrams = []
    log_probability = [] 
    
    for i in range(len(ngrams)):
        final_ngrams.append(ngrams[i][0])
        log_probability.append(ngrams[i][1])
        
    return final_ngrams, log_probability

In [508]:
bigrams = {}
trigrams = {}
quadgrams = {}

languages = ['assamese', 'odiya', 'punjabi', 'tamil']

for language in languages:
    with open ('input_' + language + '.txt', 'r') as filename:
        file = filename.read()
        bigrams[language] = generate_n_grams(file, k, n[0])
        trigrams[language] = generate_n_grams(file, k, n[1])
        quadgrams[language] = generate_n_grams(file, k, n[2])       
ngrams = {2 : bigrams, 3 : trigrams, 4 : quadgrams}

In [509]:
def calculate_distance(ngrams_test, ngrams_list, n):
    dist = {language: 0 for language in languages} 
    for ngram in ngrams_test[0]:
        for language in ngrams_list.keys():
            idx2 = ngrams_test[0].index(ngram)
            if ngram in ngrams[n][language][0] : 
                idx = ngrams[n][language][0].index(ngram)
                dist[language] += abs(ngrams[n][language][1][idx] - ngrams_test[1][idx2])
            else: 
                dist[language] += abs(ngrams_test[1][idx2])
    return dist

In [510]:
def identify_language(input_file, filetype):
    test_file = []  
    bigrams_test = []
    trigrams_test = []
    quadgrams_test = []
    total_distance = {}
    
    if filetype == 'file': 
        tempfile = input_file
        with open(tempfile, 'r', errors = 'ignore') as filename: 
            input_file = filename.read()

    bigrams_test = generate_n_grams(input_file, k, 2)
    trigrams_test = generate_n_grams(input_file, k, 3)
    quadgrams_test = generate_n_grams(input_file, k, 4)
    
    bigrams_distance = calculate_distance(bigrams_test, bigrams, 2) 
    trigrams_distance = calculate_distance(trigrams_test, trigrams, 3)
    quadgrams_distance = calculate_distance(quadgrams_test, quadgrams, 4) 

    for language in bigrams_distance.keys():
        total_distance[language] = bigrams_distance[language] + trigrams_distance[language] + quadgrams_distance[language]
    distance_sum = 0
    for distance in total_distance.values():
        distance_sum += distance
    for language in total_distance.keys():
        total_distance[language] /= distance_sum
    distance_list = sorted(total_distance.items(), key= lambda x:x[1])     

    print ('Predicted language :' + distance_list[0][0] + '\n')
    return distance_list[0][0]


In [511]:
inputfile = 'input_assamese.txt'
identify_language(inputfile, 'file')

Predicted language :assamese



'assamese'

In [512]:
inputfile = 'input_odiya.txt'
identify_language(inputfile, 'file')

Predicted language :odiya



'odiya'

In [513]:
inputfile = 'input_punjabi.txt'
identify_language(inputfile, 'file')

Predicted language :punjabi



'punjabi'

In [514]:
inputfile = 'input_tamil.txt'
identify_language(inputfile, 'file')

Predicted language :tamil



'tamil'

In [515]:
count = 0
accuracy = 0
misclassified = [] 

for i in range(1000):
    idx = randrange(0,4)
    language = languages[idx]
    file = "test_"+language+".txt"
    lines = random.choice(open(file, errors = "ignore").readlines())
    if len(lines) <= 20: 
        continue
    count += 1    
    print (lines)
    print ("Actual language : " + language )
    predicted = identify_language(lines, 'test_sentences')

    if (predicted == language): 
        accuracy += 1
    else:
        misclassified.append((lines, language, predicted))


here you all o of he a her n

Actual language : tamil
Predicted language :tamil

a om who bar at an oit line oo

Actual language : tamil
Predicted language :punjabi

o is yui fas se  a e e e ane ane aas 

Actual language : punjabi
Predicted language :punjabi

av moe to caofle  toval bush her ar ta hales ar

Actual language : odiya
Predicted language :punjabi

ge en her oo er o fir mos er monvers

Actual language : assamese
Predicted language :assamese

boa of or her te shot evers e ton a

Actual language : odiya
Predicted language :punjabi

hor rou or er bo for er orhor mo for hur whom o is i

Actual language : assamese
Predicted language :assamese

wor ho morer lon preve raher

Actual language : tamil
Predicted language :assamese

oa e sor o so wo mo ol or o wofor olon

Actual language : assamese
Predicted language :assamese

i bote ash wot  ho tor caveawosh 

Actual language : odiya
Predicted language :punjabi

ma as e ately o you e

Actual language : punjabi
Predicted language :punj

Predicted language :punjabi

temar  canish ol yet hav e

Actual language : punjabi
Predicted language :punjabi

ther her msasse has reso

Actual language : assamese
Predicted language :punjabi

eomo and o or to her

Actual language : assamese
Predicted language :assamese

blu ate slaro wo pa cravel shel a herwo ater a aten a

Actual language : odiya
Predicted language :punjabi

ge en her oo er o fir mos er monvers

Actual language : assamese
Predicted language :assamese

oh who ro ahe i iver ata a

Actual language : tamil
Predicted language :punjabi

ho glar i o clas o to in ye ar tat e tle or

Actual language : odiya
Predicted language :punjabi

opar rove l heer te  tear aa terpefoco e ler or

Actual language : odiya
Predicted language :punjabi

goere er e wor as if has is
Actual language : punjabi
Predicted language :punjabi

widen  a  e odor ha e sees er

Actual language : punjabi
Predicted language :punjabi

rer consiner i eres ane i nin met

Actual language : punjabi
Predicted lan


widen  a  e odor ha e sees er

Actual language : punjabi
Predicted language :punjabi

howm nos had ein an  an e i as 

Actual language : punjabi
Predicted language :punjabi

ge en her oo er o fir mos er monvers

Actual language : assamese
Predicted language :assamese

wole a plocpa co cower coa ar

Actual language : odiya
Predicted language :punjabi

wo re oh ooh wo a alo lin ara

Actual language : tamil
Predicted language :punjabi

wo omer infaconye to to

Actual language : assamese
Predicted language :punjabi

ree ter at e er sir a is

Actual language : punjabi
Predicted language :punjabi

you or on true o ere

Actual language : tamil
Predicted language :assamese

mo ol he e ol pes ad ser sho setat

Actual language : punjabi
Predicted language :punjabi

howm nos had ein an  an e i as 

Actual language : punjabi
Predicted language :punjabi

o is yui fas se  a e e e ane ane aas 

Actual language : punjabi
Predicted language :punjabi

rom  who ao wo  wo more o worble o who ano o

Actua


hownis kast hern l at ser hode dor te 

Actual language : punjabi
Predicted language :punjabi

the wo wer ar oe  ol er on all e

Actual language : tamil
Predicted language :tamil

oho mo ies o horan ho ero in wo

Actual language : assamese
Predicted language :assamese

a women non e bor ter on  terter

Actual language : tamil
Predicted language :punjabi

ha par ho go at a pa ca foe  pa  bluo ol a a

Actual language : odiya
Predicted language :punjabi

o erbl u are han aongo ith e  none eron

Actual language : tamil
Predicted language :punjabi

got can ar e more e cara o bo to ave i porola ers far bote a

Actual language : odiya
Predicted language :odiya

worer has a no all her ee

Actual language : tamil
Predicted language :tamil

i aall moru bat gorhola o ato a  soshe hor la o

Actual language : odiya
Predicted language :punjabi

mohare wittle go hor er

Actual language : tamil
Predicted language :assamese

emoo who forho we or ser ofnfir eher e a her e wos or

Actual language : assa


o who wo did  sul boto heera tran e he trar  ar pass o no qi aper

Actual language : odiya
Predicted language :punjabi

a mon or a wol wel if your ae

Actual language : tamil
Predicted language :assamese

group ar har though ata her er a y fer ru mera as sidn asuh

Actual language : punjabi
Predicted language :punjabi

tat hor book a gru pepoprable er

Actual language : odiya
Predicted language :assamese

om d mein a cal ha tas ha oi at roa at o ta lush

Actual language : odiya
Predicted language :punjabi

wo omo mofol  o on who

Actual language : assamese
Predicted language :assamese

l are that trouble boles si a ol to for the the te blo o ar go  a

Actual language : odiya
Predicted language :odiya

ou  mo her mir o your

Actual language : assamese
Predicted language :assamese

or herer as the hom i o

Actual language : assamese
Predicted language :punjabi

goger who ro lot moover fermn earl o

Actual language : tamil
Predicted language :assamese

oes a hayo e rur  el so

Actual lan


  wo per in ter wish tac to war

Actual language : punjabi
Predicted language :punjabi

tat hor book a gru pepoprable er

Actual language : odiya
Predicted language :assamese

he to oor tal a e  a child an bre ea tear tolea

Actual language : odiya
Predicted language :punjabi

goere er e wor as if has is
Actual language : punjabi
Predicted language :punjabi

not co betoa ta aol ra bo yo a

Actual language : odiya
Predicted language :punjabi

hownis kast hern l at ser hode dor te 

Actual language : punjabi
Predicted language :punjabi

lac as hope i go a ton

Actual language : odiya
Predicted language :punjabi

oo gro or wot fo te es er her hor e

Actual language : assamese
Predicted language :assamese

wo wo wo om o more an

Actual language : assamese
Predicted language :assamese

bok cavl ac al  ol to co col toco taa

Actual language : odiya
Predicted language :punjabi

me hower youter hot to ter wet smooth s 

Actual language : punjabi
Predicted language :punjabi

wowho her ar fo he


rol o mo i her o o co ofher e

Actual language : tamil
Predicted language :assamese

hah o lottebo wol e ar

Actual language : odiya
Predicted language :punjabi

ee et t e t t t i      i i i i i i i e

Actual language : tamil
Predicted language :punjabi

ter own we o who of fie

Actual language : tamil
Predicted language :assamese

ala a ltl e ar atat bo wo ar trar yo ber ar cacter oro dele shata i a ar a

Actual language : odiya
Predicted language :odiya

but who hor tatoo far o

Actual language : odiya
Predicted language :punjabi

i aall moru bat gorhola o ato a  soshe hor la o

Actual language : odiya
Predicted language :punjabi

yhae or i a te o to erer

Actual language : tamil
Predicted language :punjabi

ah thro wol er il wo a ee as

Actual language : punjabi
Predicted language :punjabi

i you on a tho o on a

Actual language : tamil
Predicted language :tamil

ooo o o ofor tomer vordof mo povers e afom teon cns e

Actual language : assamese
Predicted language :assamese

her r th

Predicted language :punjabi

to wom ooeras ril er wo ofo fhoh

Actual language : assamese
Predicted language :assamese

frome e a te o to ar paa hat  over fur otos o te do oshul bo or

Actual language : odiya
Predicted language :odiya

ah thro wol er il wo a ee as

Actual language : punjabi
Predicted language :punjabi

o mo no wo o mo o hor o

Actual language : assamese
Predicted language :assamese

or love te eatom thera

Actual language : odiya
Predicted language :punjabi

wutene how an atol far ei i cas

Actual language : punjabi
Predicted language :punjabi

  wo per in ter wish tac to war

Actual language : punjabi
Predicted language :punjabi

oa ran o on o to ong  lo can yote her tan er the wor

Actual language : tamil
Predicted language :tamil

rofer hom mor fo or oore mor ars e har emoer mor

Actual language : assamese
Predicted language :assamese

hemo how eron eror romer haron

Actual language : tamil
Predicted language :punjabi

hownis kast hern l at ser hode dor te 

Actual 

In [516]:
print ('Accuracy :', accuracy/count*100)
print (accuracy, count)

Accuracy : 50.67750677506775
374 738


In [517]:
pprint(set(misclassified))

{('  e to cel he e wosh you\n', 'tamil', 'punjabi'),
 ('  er at a an er as verer wo hes aer\n', 'tamil', 'punjabi'),
 (' a ov hol er ivl totes shop o  ho e he the a tavel a or year o sho how\n',
  'odiya',
  'punjabi'),
 (' eer a e her who her ties senoos\n', 'assamese', 'punjabi'),
 (' hor wer on not non hoor an more terepor it \n', 'tamil', 'assamese'),
 (' pe o maser e ofol te\n', 'tamil', 'punjabi'),
 ('a ah ol ot have long\n', 'tamil', 'odiya'),
 ('a cal blo cup o pato a to eat blo ar\n', 'odiya', 'punjabi'),
 ('a cala lou a o ho ol eri\n', 'tamil', 'assamese'),
 ('a glota a ad cate toble goher a\n', 'odiya', 'punjabi'),
 ('a goo shan ar  at to par tey wush\n', 'odiya', 'punjabi'),
 ('a groupn n an and  women a en ande ena\n', 'tamil', 'punjabi'),
 ('a man tatco tarte as posh ea me\n', 'odiya', 'punjabi'),
 ('a mirl hon worve mo wor ol ol o han a all\n', 'tamil', 'assamese'),
 ('a mon or a wol wel if your ae\n', 'tamil', 'assamese'),
 ('a mora ate a cau de li goere ta aro\n', 'odi

In [539]:
count = 0
accuracy = 0
misclassified = [] 

for i in range(100):
    file = "test_assamese.txt"
    lines = random.choice(open(file, errors = "ignore").readlines())
    if len(lines) <= 20: 
        continue
    count += 1    
    print (lines)
    print ("Actual language : assamese")
    predicted = identify_language(lines, 'test_sentences')

    if (predicted == language): 
        accuracy += 1
    else:
        misclassified.append((lines, language, predicted))

moo o he ro foer who to ov her so tie e wosmoter e o oe

Actual language : assamese
Predicted language :assamese

hero o o go ofer o wo to yo hom

Actual language : assamese
Predicted language :assamese

oome or er asterte co er ser

Actual language : assamese
Predicted language :punjabi

o mo no wo o mo o hor o

Actual language : assamese
Predicted language :assamese

hero o o go ofer o wo to yo hom

Actual language : assamese
Predicted language :assamese

mo woo ors mo as he o ser gro

Actual language : assamese
Predicted language :assamese

morker cermot wor on wo vers e e ver e

Actual language : assamese
Predicted language :assamese

ther her msasse has reso

Actual language : assamese
Predicted language :punjabi

morker cermot wor on wo vers e e ver e

Actual language : assamese
Predicted language :assamese

oois o who herfor ho wor on motel for mon

Actual language : assamese
Predicted language :assamese

ro more or es sil first ton o

Actual language : assamese
Predicted langua

In [540]:
print ('Accuracy :', accuracy/count*100)
print (accuracy, count)

Accuracy : 30.18867924528302
16 53


In [547]:
count = 0
accuracy = 0
misclassified = [] 

for i in range(80):
    file = "test_odiya.txt"
    lines = random.choice(open(file, errors = "ignore").readlines())
    if len(lines) <= 20: 
        continue
    count += 1    
    print (lines)
    print ("Actual language : odiya")
    predicted = identify_language(lines, 'test_sentences')

    if (predicted == language): 
        accuracy += 1
    else:
        misclassified.append((lines, language, predicted))

e mo bron artar to wer of sot tave palo boer to aa

Actual language : odiya
Predicted language :punjabi

a mora ate a cau de li goere ta aro

Actual language : odiya
Predicted language :punjabi

go o cole lo a ca alee bro be ar a a

Actual language : odiya
Predicted language :odiya

but who hor tatoo far o

Actual language : odiya
Predicted language :punjabi

om d mein a cal ha tas ha oi at roa at o ta lush

Actual language : odiya
Predicted language :punjabi

l gruper pash soshoot we a ro tover ine other si ora

Actual language : odiya
Predicted language :punjabi

col co n ar ar home ar 

Actual language : odiya
Predicted language :punjabi

a man tatco tarte as posh ea me

Actual language : odiya
Predicted language :punjabi

sha ave toable gro o e to e apa

Actual language : odiya
Predicted language :punjabi

the par i o bor on eo o i e wal  a a

Actual language : odiya
Predicted language :tamil

buti how for go par at o le i a al ete to are

Actual language : odiya
Predicted language

In [548]:
print ('Accuracy :', accuracy/count*100)
print (accuracy, count)

Accuracy : 77.02702702702703
57 74


In [551]:
count = 0
accuracy = 0
misclassified = [] 

for i in range(30):
    file = "test_punjabi.txt"
    lines = random.choice(open(file, errors = "ignore").readlines())
    if len(lines) <= 20: 
        continue
    count += 1    
    print (lines)
    print ("Actual language : punjabi")
    predicted = identify_language(lines, 'test_sentences')

    if (predicted == language): 
        accuracy += 1
    else:
        misclassified.append((lines, language, predicted))

ree ter at e er sir a is

Actual language : punjabi
Predicted language :punjabi

mo ol he e ol pes ad ser sho setat

Actual language : punjabi
Predicted language :punjabi

goere er e wor as if has is
Actual language : punjabi
Predicted language :punjabi

ash ean er it ter havie car

Actual language : punjabi
Predicted language :punjabi

ah thro wol er il wo a ee as

Actual language : punjabi
Predicted language :punjabi

mortur a hap ne amee is ash i tat a easit

Actual language : punjabi
Predicted language :punjabi

hownis kast hern l at ser hode dor te 

Actual language : punjabi
Predicted language :punjabi

group ar har though ata her er a y fer ru mera as sidn asuh

Actual language : punjabi
Predicted language :punjabi

ree ter at e er sir a is

Actual language : punjabi
Predicted language :punjabi

oe sar er as mors ar  

Actual language : punjabi
Predicted language :punjabi

ah andwee rop yr a for lottle on  herm of tro

Actual language : punjabi
Predicted language :assamese

as a

In [552]:
print ('Accuracy :', accuracy/count*100)
print (accuracy, count)

Accuracy : 90.0
18 20


In [555]:
count = 0
accuracy = 0
misclassified = [] 

for i in range(200):
    file = "test_tamil.txt"
    lines = random.choice(open(file, errors = "ignore").readlines())
    if len(lines) <= 20: 
        continue
    count += 1    
    print (lines)
    print ("Actual language : tamil")
    predicted = identify_language(lines, 'test_sentences')

    if (predicted == language): 
        accuracy += 1
    else:
        misclassified.append((lines, language, predicted))

throm gormor o om o your wol

Actual language : tamil
Predicted language :assamese

her on er wol movig had a er a no e

Actual language : tamil
Predicted language :tamil

o gol revo anebl ter ao

Actual language : tamil
Predicted language :punjabi

you are ono on o e li or ferver rol  o

Actual language : tamil
Predicted language :assamese

ooe ho and oor al is

Actual language : tamil
Predicted language :punjabi

the an e fen ar ou ho wor ae
Actual language : tamil
Predicted language :punjabi

mo son ron to in o o

Actual language : tamil
Predicted language :punjabi

her her rer fon a len an wo od for anas he her fore

Actual language : tamil
Predicted language :assamese

ol me on he ate aon ao

Actual language : tamil
Predicted language :punjabi

h ayou ar hao the ferverle hery he tur

Actual language : tamil
Predicted language :punjabi

on who hor  me thon o o pobe 

Actual language : tamil
Predicted language :assamese

a warer ors ter an hof won et

Actual language : tamil
Predict

 pe o maser e ofol te

Actual language : tamil
Predicted language :punjabi

ol wo trew ua ter of a ro ba t who ye

Actual language : tamil
Predicted language :punjabi

here you all o of he a her n

Actual language : tamil
Predicted language :tamil

a mirl hon worve mo wor ol ol o han a all

Actual language : tamil
Predicted language :assamese

o all no you yor o or 

Actual language : tamil
Predicted language :assamese

ihero a her her ow her eohor

Actual language : tamil
Predicted language :assamese

a her ho heever hor be a o ov a bon an an en an

Actual language : tamil
Predicted language :tamil

a groupn n an and  women a en ande ena

Actual language : tamil
Predicted language :punjabi

ah a a ye o how wor her o fam so

Actual language : tamil
Predicted language :assamese

ran canon to ter robler o wor oo

Actual language : tamil
Predicted language :assamese

wo whoare a o  woer trew of the i of  her ho feree

Actual language : tamil
Predicted language :assamese

a on ado e  i mae

In [556]:
print ('Accuracy :', accuracy/count*100)
print (accuracy, count)

Accuracy : 54.74452554744526
75 137
