In [1]:
import pandas as pd
import numpy as np

In [2]:
# English names of the languages this notebook keeps; they are matched against
# the 'English' column of the label table further down.
lang = [
    'Arabic',
    'French',
    'English',
    'Spanish',
    'Portuguese',
    'German',
    'Dutch',
    'Italian',
    'Japanese',
    'Hindi',
    'Swedish',
    'Russian',
    'Korean',
    'Thai',
    'Turkish',
    'South Azerbaijani',
    'Bulgarian',
    'Persian',
    'Modern Greek',
    'Finnish',
    'Armenian',
]

In [3]:
# Label table (read as a semicolon-separated CSV in the next cell).
lb = 'labels.csv'
# Paragraph / label file pairs for the two provided partitions.
X_1_text, y_1_text = 'x_train.txt', 'y_train.txt'
X_2_text, y_2_text = 'x_test.txt', 'y_test.txt'

In [4]:
# The label table is semicolon-delimited; preview its first five rows.
lb_df = pd.read_csv(lb, sep=";")
lb_df.head(n=5)

Unnamed: 0,Label,English,Wiki Code,ISO 369-3,German,Language family,Writing system,Remarks,Synonyms
0,ace,Achinese,ace,ace,Achinesisch,Austronesian,,,
1,afr,Afrikaans,af,afr,Afrikaans,Indo-European,,,
2,als,Alemannic German,als,gsw,Alemannisch,Indo-European,,(ursprünglich nur Elsässisch),
3,amh,Amharic,am,amh,Amharisch,Afro-Asiatic,,,
4,ang,Old English,ang,ang,Altenglisch,Indo-European,,(ca. 450-1100),Angelsächsisch


In [5]:
# Label codes of the selected languages, in label-table row order
# (not in the order of `lang`).
labels = lb_df.loc[lb_df['English'].isin(lang), 'Label'].tolist()
labels

['ara',
 'azb',
 'bul',
 'deu',
 'ell',
 'eng',
 'fas',
 'fin',
 'fra',
 'hin',
 'hye',
 'ita',
 'jpn',
 'kor',
 'nld',
 'por',
 'rus',
 'spa',
 'swe',
 'tha',
 'tur']

In [6]:
# English names of the selected languages, taken with the same row filter as
# `labels`, so the two lists line up position by position.
labels_names = lb_df.loc[lb_df['English'].isin(lang), 'English'].tolist()
labels_names

['Arabic',
 'South Azerbaijani',
 'Bulgarian',
 'German',
 'Modern Greek',
 'English',
 'Persian',
 'Finnish',
 'French',
 'Hindi',
 'Armenian',
 'Italian',
 'Japanese',
 'Korean',
 'Dutch',
 'Portuguese',
 'Russian',
 'Spanish',
 'Swedish',
 'Thai',
 'Turkish']

In [7]:
def read_text(X_text, y_text, allowed_labels=None):
    """Load aligned paragraph/label files and keep only the selected languages.

    Parameters
    ----------
    X_text : str or path-like
        UTF-8 text file holding one paragraph per line.
    y_text : str or path-like
        Headerless single-column file holding one label per line, in the same
        order as the paragraphs in ``X_text``.
    allowed_labels : iterable of str, optional
        Labels to keep. When omitted, the notebook-level ``labels`` list is
        used, which is what the existing two-argument calls rely on.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(X_df, y_df)`` with the columns ``'Paragraph'`` and ``'Label'``.
        Both frames are filtered with the same mask and keep the original row
        positions as their index, so they stay aligned with each other.

    Raises
    ------
    ValueError
        If the two files do not contain the same number of rows.
    """
    if allowed_labels is None:
        allowed_labels = labels  # notebook-level list of selected label codes
    wanted = list(allowed_labels)

    # Read every label verbatim as a string. With the pandas defaults, codes
    # such as 'nan' or 'null' would be parsed as missing values.
    y_df = pd.read_csv(y_text, header=None, dtype=str, keep_default_na=False)
    y_df.columns = ['Label']

    with open(X_text, encoding='utf8') as file:
        paragraphs = [line.strip() for line in file]

    # Fail loudly on misaligned inputs: a boolean mask that is longer than the
    # paragraph frame would otherwise be reindexed silently, leaving X and y
    # with different numbers of rows.
    if len(paragraphs) != len(y_df):
        raise ValueError(
            f"{X_text!r} has {len(paragraphs)} paragraphs but "
            f"{y_text!r} has {len(y_df)} labels"
        )

    X_df = pd.DataFrame(paragraphs, columns=['Paragraph'])
    keep = y_df['Label'].isin(wanted)  # one mask, applied to both frames
    return (X_df[keep], y_df[keep])

In [8]:
# Load both provided partitions, each already restricted to the selected languages.
X_1, y_1 = read_text(X_text=X_1_text, y_text=y_1_text)
X_2, y_2 = read_text(X_text=X_2_text, y_text=y_2_text)

In [9]:
# Stack the two partitions into one paragraph frame and one label frame.
# NOTE(review): each partition keeps its own row labels, so index values can
# repeat across the two halves; later cells only rely on row position.
X = pd.concat((X_1, X_2), axis=0)
y = pd.concat((y_1, y_2), axis=0)
X.head()

Unnamed: 0,Paragraph
1,"Sebes, Joseph; Pereira Thomas (1961) (på eng)...."
4,ถนนเจริญกรุง (อักษรโรมัน: Thanon Charoen Krung...
26,De spons behoort tot het geslacht Haliclona en...
29,エノが行きがかりでバスに乗ってしまい、気分が悪くなった際に助けるが、今すぐバスを降りたいと運...
38,Tsutinalar (İngilizce: Tsuut'ina): Kanada'da A...


In [10]:
# First rows of the combined label frame.
y.head()

Unnamed: 0,Label
1,swe
4,tha
26,nld
29,jpn
38,tur


In [11]:
# (rows, columns) of the combined paragraph frame.
X.shape

(21000, 1)

In [12]:
# (rows, columns) of the combined label frame; the row count should equal X's.
y.shape

(21000, 1)

In [13]:
# Number of paragraphs per language label (class-balance check).
y['Label'].value_counts()

ara    1000
deu    1000
fas    1000
ita    1000
bul    1000
hye    1000
jpn    1000
ell    1000
swe    1000
fin    1000
por    1000
azb    1000
kor    1000
fra    1000
tur    1000
tha    1000
spa    1000
rus    1000
hin    1000
eng    1000
nld    1000
Name: Label, dtype: int64

# Text preprocessing

In [14]:
import nltk
from nltk import sent_tokenize, word_tokenize
import re

In [15]:
# Normalise the raw paragraphs in one chain:
#   1. lower-case,
#   2. drop every character that is neither a word character nor whitespace,
#   3. drop digit runs.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would turn steps 2 and 3 into silent no-ops.
# Raw strings avoid Python's invalid-escape warnings for "\w", "\s" and "\d".
# NOTE(review): step 2 also removes combining marks (e.g. Thai vowel signs), as
# the Thai row in the output below shows -- confirm this is intended.
X['Paragraph'] = (
    X['Paragraph']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(r'\d+', '', regex=True)
)
X['Paragraph']

1         sebes joseph pereira thomas  på eng the jesuit...
4         ถนนเจรญกรง อกษรโรมน thanon charoen krung เรมตง...
26        de spons behoort tot het geslacht haliclona en...
29        エノが行きがかりでバスに乗ってしまい気分が悪くなった際に助けるが今すぐバスを降りたいと運転手...
38        tsutinalar ingilizce tsuutina kanadada alberta...
                                ...                        
117446    ο οτορίνο ρεσπίγκι ottorino respighi μπολόνια ...
117450    hors du terrain les années  et  sont des année...
117463    ใน พศ  หลกจากทเสดจประพาสแหลมมลาย ชวา อนเดยทรงไ...
117464    con motivo de la celebración del septuagésimoq...
117472    چاپ و شدت قالخانا قدر و دین تماما یالنیز اللها...
Name: Paragraph, Length: 21000, dtype: object

In [16]:
def tokenization(row):
    """Split one row's paragraph into word tokens.

    ``row`` is indexed with the key ``'Paragraph'``; that value is handed to
    ``nltk.word_tokenize`` and the resulting tokens are returned as a new list.
    """
    return list(nltk.word_tokenize(row['Paragraph']))

# Row-wise pass: replace each paragraph string with its list of tokens.
paragraph_tokens = X.apply(tokenization, axis=1)
X['Paragraph'] = paragraph_tokens
X['Paragraph']

1         [sebes, joseph, pereira, thomas, på, eng, the,...
4         [ถนนเจรญกรง, อกษรโรมน, thanon, charoen, krung,...
26        [de, spons, behoort, tot, het, geslacht, halic...
29        [エノが行きがかりでバスに乗ってしまい気分が悪くなった際に助けるが今すぐバスを降りたいと運転...
38        [tsutinalar, ingilizce, tsuutina, kanadada, al...
                                ...                        
117446    [ο, οτορίνο, ρεσπίγκι, ottorino, respighi, μπο...
117450    [hors, du, terrain, les, années, et, sont, des...
117463    [ใน, พศ, หลกจากทเสดจประพาสแหลมมลาย, ชวา, อนเดย...
117464    [con, motivo, de, la, celebración, del, septua...
117472    [چاپ, و, شدت, قالخانا, قدر, و, دین, تماما, یال...
Name: Paragraph, Length: 21000, dtype: object

In [17]:
def rejoin_words(row):
    """Join one row's token list back into a single space-separated string.

    ``row`` is indexed with the key ``'Paragraph'``, which must hold an
    iterable of strings.
    """
    return " ".join(row['Paragraph'])

# Row-wise pass: turn each token list back into one space-separated string.
rejoined_paragraphs = X.apply(rejoin_words, axis=1)
X['Paragraph'] = rejoined_paragraphs
X['Paragraph']

1         sebes joseph pereira thomas på eng the jesuits...
4         ถนนเจรญกรง อกษรโรมน thanon charoen krung เรมตง...
26        de spons behoort tot het geslacht haliclona en...
29        エノが行きがかりでバスに乗ってしまい気分が悪くなった際に助けるが今すぐバスを降りたいと運転手...
38        tsutinalar ingilizce tsuutina kanadada alberta...
                                ...                        
117446    ο οτορίνο ρεσπίγκι ottorino respighi μπολόνια ...
117450    hors du terrain les années et sont des années ...
117463    ใน พศ หลกจากทเสดจประพาสแหลมมลาย ชวา อนเดยทรงได...
117464    con motivo de la celebración del septuagésimoq...
117472    چاپ و شدت قالخانا قدر و دین تماما یالنیز اللها...
Name: Paragraph, Length: 21000, dtype: object

In [18]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the paragraphs for evaluation; the fixed seed keeps the
# split reproducible across runs.
text, language = X.Paragraph, y.Label
X_train, X_test, y_train, y_test = train_test_split(
    text,
    language,
    test_size=0.30,
    random_state=5,
)

In [19]:
# Number of training paragraphs (70% of the rows).
X_train.shape

(14700,)

In [20]:
# Number of held-out paragraphs (30% of the rows).
X_test.shape

(6300,)

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [22]:
# Manual walk through the feature-extraction steps on the training split.
# The pipelines further down build their own vectorizer/transformer, so the
# objects created here are for inspection only.

# Bag-of-words counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Term-frequency scaling only (IDF disabled).
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

# Full TF-IDF weighting; the shape is (training paragraphs, vocabulary size).
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(14700, 217094)

In [23]:
# Naive Bayes classifier: token counts -> TF-IDF weights -> multinomial NB.
lang_NB = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)
lang_NB.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [24]:
# Accuracy of the Naive Bayes pipeline on the held-out split.
predictions = lang_NB.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

0.9252380952380952

In [25]:
# Support-vector classifier on the same features: token counts -> TF-IDF -> SVC
# with scikit-learn's default settings.
lang_svm = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SVC()),
    ]
)
lang_svm.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', SVC())])

In [26]:
# Accuracy of the SVM pipeline on the held-out split.
predictions_1 = lang_svm.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions_1)

0.943968253968254