In [1]:
import pandas as pd
import numpy as np

In [2]:
lang = ['Arabic', 'French', 'English', 'Spanish', 'Portuguese', 'German', 'Dutch', 'Italian', 
        'Japanese', 'Hindi', 'Swedish',  'Russian', 'Korean', 'Thai', 'Turkish',
        'South Azerbaijani', 'Bulgarian',  'Persian', 'Modern Greek', 'Finnish', 'Armenian']

In [3]:
lb = 'labels.csv'
X_1_text = 'x_train.txt'
y_1_text = 'y_train.txt'
X_2_text = 'x_test.txt'
y_2_text = 'y_test.txt'

In [4]:
lb_df = pd.read_csv(lb, sep=';')
lb_df.head()

Unnamed: 0,Label,English,Wiki Code,ISO 369-3,German,Language family,Writing system,Remarks,Synonyms
0,ace,Achinese,ace,ace,Achinesisch,Austronesian,,,
1,afr,Afrikaans,af,afr,Afrikaans,Indo-European,,,
2,als,Alemannic German,als,gsw,Alemannisch,Indo-European,,(ursprünglich nur Elsässisch),
3,amh,Amharic,am,amh,Amharisch,Afro-Asiatic,,,
4,ang,Old English,ang,ang,Altenglisch,Indo-European,,(ca. 450-1100),Angelsächsisch


In [5]:
labels = list(lb_df[lb_df['English'].isin(lang)]['Label'])
labels

['ara',
 'azb',
 'bul',
 'deu',
 'ell',
 'eng',
 'fas',
 'fin',
 'fra',
 'hin',
 'hye',
 'ita',
 'jpn',
 'kor',
 'nld',
 'por',
 'rus',
 'spa',
 'swe',
 'tha',
 'tur']

In [6]:
labels_names = (list(lb_df[lb_df['English'].isin(lang)]['English']))
labels_names

['Arabic',
 'South Azerbaijani',
 'Bulgarian',
 'German',
 'Modern Greek',
 'English',
 'Persian',
 'Finnish',
 'French',
 'Hindi',
 'Armenian',
 'Italian',
 'Japanese',
 'Korean',
 'Dutch',
 'Portuguese',
 'Russian',
 'Spanish',
 'Swedish',
 'Thai',
 'Turkish']

In [7]:
def read_text(X_text, y_text):
    y_df = pd.read_csv(y_text, header=None)
    y_df.columns = ['Label']
    with open(X_text, encoding='utf8') as file :
        X_pars = file.readlines()
    X_pars = [t.strip() for t in X_pars]
    X_df = pd.DataFrame(X_pars, columns=['Paragraph'])
    X_df = X_df[y_df['Label'].isin(labels)]
    y_df = y_df[y_df['Label'].isin(labels)]
    return (X_df, y_df)

In [8]:
X_1, y_1 = read_text(X_1_text, y_1_text)
X_2, y_2 = read_text(X_2_text, y_2_text)

In [9]:
X= pd.concat([X_1, X_2])
y= pd.concat([y_1, y_2])
X.head()

Unnamed: 0,Paragraph
1,"Sebes, Joseph; Pereira Thomas (1961) (på eng)...."
4,ถนนเจริญกรุง (อักษรโรมัน: Thanon Charoen Krung...
26,De spons behoort tot het geslacht Haliclona en...
29,エノが行きがかりでバスに乗ってしまい、気分が悪くなった際に助けるが、今すぐバスを降りたいと運...
38,Tsutinalar (İngilizce: Tsuut'ina): Kanada'da A...


In [10]:
y.head()

Unnamed: 0,Label
1,swe
4,tha
26,nld
29,jpn
38,tur


In [11]:
X.shape

(21000, 1)

In [12]:
y.shape

(21000, 1)

In [13]:
y['Label'].value_counts()

swe    1000
fas    1000
tha    1000
jpn    1000
ara    1000
spa    1000
kor    1000
ita    1000
azb    1000
rus    1000
nld    1000
deu    1000
ell    1000
fin    1000
eng    1000
tur    1000
por    1000
hye    1000
fra    1000
bul    1000
hin    1000
Name: Label, dtype: int64

# Text preprocessing

In [14]:
import nltk
from nltk import sent_tokenize, word_tokenize
import re

In [15]:
X['Paragraph'] = X['Paragraph'].str.lower() #lower case
X['Paragraph'] = X['Paragraph'].str.replace('[^\w\s]','') #Remove Punctuation
X['Paragraph'] = X['Paragraph'].str.replace('\d+','') #Remove numbers
X['Paragraph']

1         sebes joseph pereira thomas  på eng the jesuit...
4         ถนนเจรญกรง อกษรโรมน thanon charoen krung เรมตง...
26        de spons behoort tot het geslacht haliclona en...
29        エノが行きがかりでバスに乗ってしまい気分が悪くなった際に助けるが今すぐバスを降りたいと運転手...
38        tsutinalar ingilizce tsuutina kanadada alberta...
                                ...                        
117446    ο οτορίνο ρεσπίγκι ottorino respighi μπολόνια ...
117450    hors du terrain les années  et  sont des année...
117463    ใน พศ  หลกจากทเสดจประพาสแหลมมลาย ชวา อนเดยทรงไ...
117464    con motivo de la celebración del septuagésimoq...
117472    چاپ و شدت قالخانا قدر و دین تماما یالنیز اللها...
Name: Paragraph, Length: 21000, dtype: object

In [16]:
from sklearn.model_selection import train_test_split
text = X.Paragraph
language = y.Label
X_train, X_test, y_train, y_test = train_test_split(text, language, test_size=0.30, random_state=5)

In [17]:
X_train.shape

(14700,)

In [18]:
X_test.shape

(6300,)

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [20]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(14700, 217095)

In [24]:
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [29]:
doc = ['Hello everybody it is me', 'Le ciel est beau']
X_doc_counts = count_vect.transform(doc)
X_doc_tfidf = tfidf_transformer.transform(X_doc_counts)

predicted = clf.predict(X_doc_tfidf)

for doc, language in zip(doc, predicted):
    print ('%r => %s' % (doc, language))

'Hello everybody it is me' => eng
'Le ciel est beau' => fra


In [32]:
lang_NB = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()), ])
lang_NB.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [34]:
predictions = lang_NB.predict(X_test)
accuracy_score(y_test,predictions)

0.9252380952380952

In [25]:
lang_svm = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', SVC()), ])
lang_svm.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', SVC())])

In [36]:
from sklearn.linear_model import SGDClassifier
lang_svm = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                           alpha=1e-3, random_state=42,
                           max_iter=5, tol=None)), ])
lang_svm.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
                               tol=None))])

In [37]:
predictions_1 = lang_svm.predict(X_test)
accuracy_score(y_test,predictions_1)

0.9634920634920635

In [38]:
from sklearn import metrics
print(metrics.classification_report(y_test, predictions_1,
    target_names=lang))

                   precision    recall  f1-score   support

           Arabic       0.99      0.99      0.99       323
           French       1.00      0.99      1.00       294
          English       0.89      0.96      0.92       295
          Spanish       0.94      0.98      0.96       290
       Portuguese       1.00      0.99      1.00       312
           German       0.78      0.99      0.87       277
            Dutch       1.00      1.00      1.00       294
          Italian       1.00      0.99      1.00       323
         Japanese       0.96      1.00      0.98       301
            Hindi       1.00      0.98      0.99       305
          Swedish       1.00      0.95      0.97       286
          Russian       0.98      0.98      0.98       309
           Korean       0.84      0.91      0.87       305
             Thai       1.00      0.99      0.99       311
          Turkish       0.98      0.97      0.98       262
South Azerbaijani       0.98      0.94      0.96       

In [40]:
metrics.confusion_matrix(y_test, predictions_1)

array([[321,   0,   0,   0,   0,   2,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0],
       [  0, 292,   0,   0,   0,   2,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0, 282,   5,   0,   6,   0,   0,   0,   0,   0,   1,   0,
          0,   0,   1,   0,   0,   0,   0,   0],
       [  0,   0,   0, 284,   0,   2,   0,   0,   3,   0,   0,   0,   0,
          0,   1,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   2, 309,   1,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0, 273,   0,   0,   1,   0,   0,   0,   1,
          0,   2,   0,   0,   0,   0,   0,   0],
       [  1,   0,   0,   0,   0,   0, 293,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   1,   0,   0,   0, 321,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   1,   0,   0],
       [  0,   0,   0,  

In [43]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}
gs_lang_svm = GridSearchCV(lang_svm, parameters, cv=5, n_jobs=-1)
gs_lang_svm = gs_clf.fit(X_train, y_train)

In [44]:
gs_clf.best_score_

0.9671428571428571

In [46]:
for param in sorted(parameters.keys()):
    print("%s: %r" % (param, gs_lang_svm.best_params_[param]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [53]:
doc = ['Hello everybody it is me', 'Le ciel est beau', 'hello je suis man morocco']


predicted = gs_lang_svm.predict(doc)

for doc, language in zip(doc, predicted):
    print ('%r => %s' % (doc, language))

'Hello everybody it is me' => eng
'Le ciel est beau' => fra
'hello je suis man morocco' => tha
