In [1]:
import pandas as pd
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

#### Funzione per transformare una frase in una lista ngrams

In [2]:
def my_ngram_function(sentence, n):
    ngrams_sentence = ngrams(sentence, n)
    return ["".join(grams) for grams in ngrams_sentence]


#### Funzione Bag of word e costruzione dizionario

In [3]:
def bow_count(dataset, count_vectorizer):
    if count_vectorizer == None:
        count_vectorizer = CountVectorizer()
        X = count_vectorizer.fit_transform(dataset)
    else:
        X = count_vectorizer.transform(dataset)
        
    return X.toarray(), count_vectorizer

## Import dataset

In [4]:
URL = "https://raw.githubusercontent.com/ProfAI/natural-language-processing/main/datasets/Lezione_4-language_detection/"

In [5]:
dataset = pd.read_csv(URL+"dataset.csv")
dataset

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch
...,...,...
21995,hors du terrain les années et sont des année...,French
21996,ใน พศ หลักจากที่เสด็จประพาสแหลมมลายู ชวา อินเ...,Thai
21997,con motivo de la celebración del septuagésimoq...,Spanish
21998,年月，當時還只有歲的她在美國出道，以mai-k名義推出首張英文《baby i like》，由...,Chinese


In [6]:
set(dataset["language"])

{'Arabic',
 'Chinese',
 'Dutch',
 'English',
 'Estonian',
 'French',
 'Hindi',
 'Indonesian',
 'Japanese',
 'Korean',
 'Latin',
 'Persian',
 'Portugese',
 'Pushto',
 'Romanian',
 'Russian',
 'Spanish',
 'Swedish',
 'Tamil',
 'Thai',
 'Turkish',
 'Urdu'}

In [7]:
languages = set(dataset["language"])

for l in languages:
    print(l+" : "+ str(len(dataset[dataset["language"] == l])))

Thai : 1000
Swedish : 1000
Hindi : 1000
French : 1000
Pushto : 1000
English : 1000
Dutch : 1000
Japanese : 1000
Spanish : 1000
Tamil : 1000
Turkish : 1000
Indonesian : 1000
Portugese : 1000
Korean : 1000
Estonian : 1000
Romanian : 1000
Persian : 1000
Latin : 1000
Arabic : 1000
Russian : 1000
Urdu : 1000
Chinese : 1000


#### New dataset with specific language only

In [8]:
new_dataset = dataset[dataset["language"].isin(["English","Spanish", "Portugese"])]

In [9]:
new_dataset

Unnamed: 0,Text,language
11,barocco pt escândalo de ª página é um filme fr...,Portugese
17,diante destes gerenciamentos podendo ser feito...,Portugese
18,besemer s van der geest v murray j bijleveld c...,Portugese
21,en navidad de poco después de que interpretó ...,Spanish
37,in johnson was awarded an american institute ...,English
...,...,...
21959,para colmo las tropas albanesas atacaban conti...,Spanish
21970,mc ^ o kashiwa reysol na edição de foi o time...,Portugese
21975,fue fundado el de octubre de el día de ese ...,Spanish
21983,el investigador ha recibido varios reconocimie...,Spanish


In [10]:
labels = new_dataset["language"]

In [11]:
set(labels)

{'English', 'Portugese', 'Spanish'}

In [12]:
text = new_dataset["Text"]

In [13]:
text

11       barocco pt escândalo de ª página é um filme fr...
17       diante destes gerenciamentos podendo ser feito...
18       besemer s van der geest v murray j bijleveld c...
21       en navidad de  poco después de que interpretó ...
37       in  johnson was awarded an american institute ...
                               ...                        
21959    para colmo las tropas albanesas atacaban conti...
21970    mc ^ o kashiwa reysol na edição de  foi o time...
21975    fue fundado el  de octubre de  el día  de ese ...
21983    el investigador ha recibido varios reconocimie...
21997    con motivo de la celebración del septuagésimoq...
Name: Text, Length: 3000, dtype: object

## Data Cleaning

In [14]:
ngram_datatset = [" ".join(my_ngram_function(sentence,2)) for sentence in text]

In [15]:
ngram_datatset

['ba ar ro oc cc co o   p pt t   e es sc câ ân nd da al lo o   d de e   ª ª   p pá ág gi in na a   é é   u um m   f fi il lm me e   f fr ra an nc cê ês s   d di ir ri ig gi id do o   p po or r   a an nd dr ré é   t té éc ch hi in né é   e es st tr re ea ad do o   e em m      e e   p pr ro ot ta ag go on ni iz za ad do o   p po or r   g gé ér ra ar rd d   d de ep pa ar rd di ie eu u   e e   i is sa ab be el ll le e   a ad dj ja an ni',
 'di ia an nt te e   d de es st te es s   g ge er re en nc ci ia am me en nt to os s   p po od de en nd do o   s se er r   f fe ei it to os s   p pe el la a   p pr ró óp pr ri ia a   e em mp pr re es sa a   m ma ai is s   c co om mu um m   e em m   m mé éd di ia a   e e   g gr ra an nd de es s   e em mp pr re es sa as s   o ou u   a at té é   m me es sm mo o   p po or r   e em mp pr re es sa as s   t te er rc ce ei ir ra as s   e es sp pe ec ci ia al li iz za ad da as s   e em m   m mo on ni it to or ra am me en nt to o   d de e   r re ed de es s   n no o

In [16]:
len(ngram_datatset)

3000

In [17]:
ngram_dataset_bow, count_vectorizer = bow_count(ngram_datatset, None)

In [18]:
len(ngram_dataset_bow[0])

1400

In [19]:
count_vectorizer.vocabulary_

{'ba': 40,
 'ar': 17,
 'ro': 626,
 'oc': 528,
 'cc': 75,
 'co': 86,
 'pt': 581,
 'es': 171,
 'sc': 661,
 'câ': 98,
 'ân': 939,
 'nd': 488,
 'da': 110,
 'al': 11,
 'lo': 421,
 'de': 114,
 'pá': 586,
 'ág': 920,
 'gi': 241,
 'in': 320,
 'na': 485,
 'um': 749,
 'fi': 206,
 'il': 318,
 'lm': 419,
 'me': 449,
 'fr': 212,
 'ra': 612,
 'an': 13,
 'nc': 487,
 'cê': 102,
 'ês': 995,
 'di': 118,
 'ir': 324,
 'ri': 620,
 'ig': 313,
 'id': 310,
 'do': 124,
 'po': 577,
 'or': 543,
 'dr': 127,
 'ré': 644,
 'té': 728,
 'éc': 970,
 'ch': 80,
 'hi': 279,
 'né': 517,
 'st': 678,
 'tr': 714,
 're': 616,
 'ea': 153,
 'ad': 3,
 'em': 165,
 'pr': 579,
 'ot': 545,
 'ta': 698,
 'ag': 6,
 'go': 247,
 'on': 539,
 'ni': 493,
 'iz': 332,
 'za': 880,
 'gé': 262,
 'ér': 983,
 'rd': 615,
 'ep': 168,
 'pa': 563,
 'ie': 311,
 'eu': 173,
 'is': 325,
 'sa': 659,
 'ab': 1,
 'be': 44,
 'el': 164,
 'll': 418,
 'le': 411,
 'dj': 119,
 'ja': 350,
 'ia': 307,
 'nt': 504,
 'te': 702,
 'ge': 237,
 'er': 170,
 'en': 166,
 'ci': 

### Train Test Split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(ngram_dataset_bow, 
                                                   labels, 
                                                   test_size = 0.25,
                                                   random_state=2)

In [21]:
print(X_train)
len(X_train)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 2 2 ... 0 0 0]
 ...
 [0 2 0 ... 0 0 0]
 [0 1 2 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


2250

In [22]:
print(X_test)
len(X_test)

[[0 0 6 ... 0 0 0]
 [0 2 1 ... 0 0 0]
 [0 1 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 1 3 ... 0 0 0]
 [0 0 1 ... 0 0 0]]


750

In [23]:
print(y_train)
len(y_train)

18972    Portugese
17758    Portugese
6522     Portugese
6127     Portugese
17620    Portugese
           ...    
18519      English
17509      Spanish
11792      English
18765      Spanish
19009    Portugese
Name: language, Length: 2250, dtype: object


2250

In [24]:
print(y_test)
len(y_test)

9553       English
4393       Spanish
1795       Spanish
20705      Spanish
9552       Spanish
           ...    
4452     Portugese
9520       Spanish
15283    Portugese
4388     Portugese
9802       English
Name: language, Length: 750, dtype: object


750

## Model training - MLPClassifier 

In [25]:
clf = MLPClassifier(activation='logistic',
                    hidden_layer_sizes=(100,),
                    max_iter=100,
                    solver='adam',
                    tol=0.005,
                    verbose=True)
clf.fit(X_train, y_train)

Iteration 1, loss = 0.77324744
Iteration 2, loss = 0.38787182
Iteration 3, loss = 0.24443763
Iteration 4, loss = 0.17719905
Iteration 5, loss = 0.13870486
Iteration 6, loss = 0.11420993
Iteration 7, loss = 0.09869607
Iteration 8, loss = 0.08253616
Iteration 9, loss = 0.07101350
Iteration 10, loss = 0.06199991
Iteration 11, loss = 0.05471968
Iteration 12, loss = 0.04821019
Iteration 13, loss = 0.04295020
Iteration 14, loss = 0.03839179
Iteration 15, loss = 0.03462902
Iteration 16, loss = 0.03111042
Iteration 17, loss = 0.02794939
Iteration 18, loss = 0.02533963
Iteration 19, loss = 0.02316605
Iteration 20, loss = 0.02120209
Iteration 21, loss = 0.01939347
Iteration 22, loss = 0.01783862
Iteration 23, loss = 0.01640346
Iteration 24, loss = 0.01514420
Training loss did not improve more than tol=0.005000 for 10 consecutive epochs. Stopping.


MLPClassifier(activation='logistic', max_iter=100, tol=0.005, verbose=True)

### Model evaluation

In [26]:
clf.score(X_test, y_test)

0.98

In [27]:
frase_eng = "Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data. The goal is a computer capable of understanding the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves."

frase_eng = [" ".join(my_ngram_function(frase_eng,2))]
print(bow_count(frase_eng, count_vectorizer))

clf.predict(bow_count(frase_eng, count_vectorizer)[0])

(array([[0, 1, 3, ..., 0, 0, 0]], dtype=int64), CountVectorizer())


array(['English'], dtype='<U9')

In [28]:
frase_es = "Erase una vez un huevo de donde una oruga se eclosionò. Su forma inicial se volviò en crisálida hasta que llegò a su metàmorfosis adulta. Una mariposa guapìsima."

frase_es = [" ".join(my_ngram_function(frase_es,2))]
print(bow_count(frase_es, count_vectorizer))

clf.predict(bow_count(frase_es, count_vectorizer)[0])

(array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64), CountVectorizer())


array(['Spanish'], dtype='<U9')

In [29]:
frase_pt = "Estava à toa na vida O meu amor me chamou Pra ver a banda passar Cantando coisas de amor A minha gente sofrida Despediu-se da dor Pra ver a banda passar Cantando coisas de amor O homem sério que contava dinheiro"

frase_pt = [" ".join(my_ngram_function(frase_pt,2))]
print(bow_count(frase_pt, count_vectorizer))

clf.predict(bow_count(frase_pt, count_vectorizer)[0])

(array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64), CountVectorizer())


array(['Portugese'], dtype='<U9')