In [1]:
SENTENCE_LIMIT = 100000
import pandas as pd 
import numpy as np
from spacy.lang.en import English
import re
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [2]:
# import stopwords
gnr_stp = []
with open('./Data/stopwords/gnr.txt', encoding="utf8") as f:
        for line in f.readlines():
            gnr_stp.append(line.strip())

ar_stp = pd.read_csv('./Data/stopwords/ar.txt',sep='\n').values.reshape(-1)
de_stp = pd.read_csv('./Data/stopwords/de.txt',sep='\n').values.reshape(-1)
en_stp = pd.read_csv('./Data/stopwords/en.txt',sep='\n').values.reshape(-1)
fa_stp = pd.read_csv('./Data/stopwords/fa.txt',sep='\n').values.reshape(-1)
se_stp = pd.read_csv('./Data/stopwords/se.txt',sep='\n').values.reshape(-1)
tr_stp = pd.read_csv('./Data/stopwords/tr.txt',sep='\n').values.reshape(-1)

In [3]:
# preprocessor
def pre_df(sentence,stopwords):
    stopwords = stopwords.tolist()
    sentence = str(sentence).replace('\n','')
    sentence = str(sentence).replace('\t','')
    sentence = str(sentence).replace('\r','')
    for sym in gnr_stp:
        sentence = str(sentence).replace(sym,'')
    for stp in stopwords:
        sentence = re.sub(r'\b'+str(stp)+r'\b', '', str(sentence))
    
    return sentence
    

In [4]:
# import ar dataset
ar_df = pd.read_csv('./Data/ar/ar.csv')
ar_df = ar_df.sample(frac = 1)
ar_df = ar_df[:SENTENCE_LIMIT]
ar_df.drop(columns=['targe'],inplace=True)
ar_df.columns = ['txt']
ar_df.dropna(inplace=True)
# stopwords and special characters
ar_df['final'] = ar_df.apply(lambda x: pre_df(x['txt'],stopwords=ar_stp) ,axis=1)
ar_df.dropna(inplace=True)
ar_df.head()

Unnamed: 0,txt,final
37465,قالت الحكومة الإسبانية السبت إنه تم إطلاق سراح...,قالت الحكومة الإسبانية السبت إنه إطلاق سراح ...
78751,أبرز لاعب وسط ريال مدريد البرازيلي كاسيميرو أن...,أبرز لاعب وسط ريال مدريد البرازيلي كاسيميرو أن...
95502,احتضنت قاعة مندوبية الشباب والرياضة باكادير حف...,احتضنت قاعة مندوبية الشباب والرياضة باكادير حف...
91688,ساعات في الجحيم بسبب اختناق المرور بعد مباراة ...,ساعات الجحيم اختناق المرور مباراة الأسود ت...
84752,حصدت بطولات الفئات الصغرى للعصبة في الموسم الم...,حصدت بطولات الفئات الصغرى للعصبة الموسم تشار...


In [5]:
# import de dataset
de_df = pd.read_csv('./Data/de/de.txt',sep='\t')
de_df = de_df.sample(frac = 1)
de_df = de_df[:SENTENCE_LIMIT]
de_df.drop(columns=["1"],inplace=True)
de_df.columns = ['txt']
# stopwords and special characters
de_df['final'] = de_df.apply(lambda x: pre_df(x['txt'],stopwords=de_stp),axis=1) 
de_df.dropna(inplace=True)
de_df.head()

Unnamed: 0,txt,final
2065186,"Schon gar nicht in der Ukraine, wo die Nachric...",Schon Ukraine Nachrichten fast täglich ...
1779085,Man muss sich mit Augenblicken zufrieden geben...,Man Augenblicken zufrieden geben heimis...
762206,Der Wagen gehört zum Fuhrpark der Bösen und ra...,Der Wagen gehört Fuhrpark Bösen rast meist ...
547602,"Dass Demmel ihn jedoch im Amt ""überfordert"" ne...",Dass Demmel Amt überfordert nennt weist en...
444795,Dann so wie Lotus in der F1 Also lediglich als...,Dann Lotus F Also lediglich Sponsor w...


In [6]:
# import en dataset
books = ['The Adventures of Sherlock Holmes.txt','Pride and Prejudice.txt','Life And Adventures Of Martin Chuzzlewit.txt','Frankenstein.txt','Alice’s Adventures in Wonderland.txt']
# for i in range(36):
#     books.append("en-art{:0>2}".format(i))
en_df = pd.DataFrame
for book in books:
    with open('./Data/en/'+book, encoding="utf8") as f:
        str_out = f.read()

    nlp = English()
    nlp.add_pipe("sentencizer")
    tmp_df = pd.DataFrame([sent.text for sent in nlp(str_out).sents])
    if en_df.empty:
        en_df=tmp_df
    else:
        en_df = pd.concat([en_df,tmp_df],axis=0)
en_df = en_df.sample(frac = 1)
en_df = en_df[:SENTENCE_LIMIT]
en_df.columns = ['txt']
# stopwords and special characters
en_df['final'] = en_df.apply(lambda x: pre_df(x['txt'],stopwords=en_stp),axis=1) 
en_df.dropna(inplace=True)
en_df.head()

Unnamed: 0,txt,final
928,Always lay the\nblame on others!”,Always lay theblame
154,\nShe was the most arch and at the same time t...,She arch time artless creaturewas M...
3974,"I say, young ladies, I’m a-going to leave.",I ladies Im agoing leave
3443,said Mr Pecksniff. ‘,Mr Pecksniff
3535,"Mr\nJinkins is hard upon him sometimes, but no...",MrJinkins hard hard deserves


In [7]:
# import fa dataset
droped_columns = ['Id', 'Title', 'Type', 'Rank', 'Namespace', 'RedirectList',
       'IsDisambiguationPage', 'TargetLinksCount', 'InfoBox', 'Links',
       'Parents']
fa_df = pd.read_json('./Data/fa/fawiki.json',lines=True)
fa_df.drop(columns=droped_columns,inplace=True)
fa_df = fa_df.sample(frac=1)
fa_df = fa_df[:SENTENCE_LIMIT]
fa_df.columns = ['txt']
# stopwords and special characters
fa_df['final'] = fa_df.apply(lambda x: pre_df(x['txt'],stopwords=fa_stp),axis=1) 
fa_df.dropna(inplace=True)
fa_df.head()

Unnamed: 0,txt,final
188472,سان را (; – ) آهنگساز، تنظیم‌کننده، پیانیست و ...,سان آهنگساز تنظیم‌کننده پیانیست و کیبوردنو...
198736,مهره‌داران توصیف‌شده در ۱۹۵۵ (میلادی),مهره‌داران توصیف‌ میلادی
265149,تانزانیا در ۱۹۷۴ (میلادی),تانزانیا میلادی
198754,مهره‌داران توصیف‌شده در ۱۹۷۹ (میلادی),مهره‌داران توصیف‌ میلادی
107518,گروه‌های موسیقی اوماها، نبراسکا,گروه‌های موسیقی اوماها نبراسکا


In [8]:
# import se dataset
se_df = pd.read_csv('./Data/se/se-lite.txt', sep='\n')
se_df = se_df.sample(frac=1)
se_df = se_df[:SENTENCE_LIMIT]
se_df.columns = ['txt']
# stopwords and special characters
se_df['final'] = se_df.apply(lambda x: pre_df(x['txt'],stopwords=se_stp),axis=1) 
se_df.dropna(inplace=True)
se_df.head()

Unnamed: 0,txt,final
86820,"Jag sa att vi är två , men menar naturligtvis ...",Jag sa är menar naturligtvis JAG är ...
55739,Nu får han ligga på sofflocket och ta det så v...,Nu sofflocket ta varligt
9982,Kanske skulle Mats rentav uppskatta gåvan ?,Kanske Mats rentav uppskatta gåvan
132045,Sen tog Tina vid :,Sen tog Tina
38046,Strax nedanför höftbenet hade ett första liggs...,Strax nedanför höftbenet liggsår öppnat gl...


In [9]:
# import tr dataset
tr_df = pd.read_csv('./Data/tr/turkish.csv')
tmp_df = pd.read_csv('./Data/tr/dunya-nz.txt',sep='\n')

tr_df = pd.concat([tr_df,tmp_df],axis=0)
tr_df.drop(columns=['Unnamed: 0'],inplace=True)
tr_df = tr_df.sample(frac=1)
tr_df = tr_df[:SENTENCE_LIMIT]
tr_df.columns = ['txt']
# stopwords and special characters
tr_df['final'] = tr_df.apply(lambda x: pre_df(x['txt'],stopwords=tr_stp),axis=1) 
tr_df.dropna(inplace=True)
tr_df.head()

Unnamed: 0,txt,final
77085,rivlin ™in iddiasına göre baskı sonuç verdi ve...,rivlin ™in iddiasına göre baskı sonuç verdi a...
321991,yunan başbakanı na türkiye ziyaretini sığınmac...,yunan başbakanı na türkiye ziyaretini sığınmac...
68393,artık son karar parti kurullarında,artık son karar parti kurullarında
197092,çavuşoğlu dışişleri bakanı klimkin ile görüşme...,çavuşoğlu dışişleri bakanı klimkin görüşmesin...
282203,bu karara itiraz eden kuzen rami makhlouf bir ...,karara itiraz eden kuzen rami makhlouf üst m...


# Feature Extraction

In [10]:
# labeling
ar_df['label'] = 'ar'
de_df['label'] = 'de'
en_df['label'] = 'en'
fa_df['label'] = 'fa'
se_df['label'] = 'se'
tr_df['label'] = 'tr'
# cleaning
ar_df.drop(columns=['txt'],inplace=True)
de_df.drop(columns=['txt'],inplace=True)
en_df.drop(columns=['txt'],inplace=True)
fa_df.drop(columns=['txt'],inplace=True)
se_df.drop(columns=['txt'],inplace=True)
tr_df.drop(columns=['txt'],inplace=True)

In [11]:
# concat
df = pd.concat([ar_df,de_df,en_df,fa_df,se_df,tr_df],axis=0)
df.to_csv('pr_100k_data.csv')
df.tail()

Unnamed: 0,final,label
221926,son gelişmelerle ilgili olarak yazık ciddi ...,tr
51040,araştırmalardan birine göre çocukluk fotoğrafl...,tr
102349,tamaulipas eyaleti uyuşturucu kaçakçılığına ba...,tr
49610,sendika başkanı christophe regnard tarafsız ha...,tr
208126,cunta lideri prayut un iddialarına ülke geneli...,tr


In [None]:
df = pd.read_csv('pr_data.csv')

In [12]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(df['final'].to_numpy(), df.label.to_numpy(), test_size=0.33, random_state=404)
# vectorize by 2 characters
cnt = CountVectorizer(analyzer = 'char',ngram_range=(2,3))

In [13]:
# training Naive Bayes
pipeline = Pipeline([
   ('vectorizer',cnt),  
   ('model',MultinomialNB())
])
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)

In [38]:
# training Linear Classifier
pipeline = Pipeline([
   ('vectorizer',cnt),  
   ('model',LogisticRegression(max_iter=500))
])
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)

In [52]:
# training Support Vector
pipeline = Pipeline([
   ('vectorizer',cnt),
   ('standardscaler', StandardScaler(with_mean=False)),  
   ('model',SVC())
])
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)

In [14]:
# results
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[32062     0     2     3     0     0]
 [    0 32792   135     0    83    11]
 [    0   161  6768    61    82    83]
 [   25    15    45 32915    11     0]
 [    0   640   226   249 31842   105]
 [    0    53   128    14    29 32788]]
              precision    recall  f1-score   support

          ar       1.00      1.00      1.00     32067
          de       0.97      0.99      0.98     33021
          en       0.93      0.95      0.94      7155
          fa       0.99      1.00      0.99     33011
          se       0.99      0.96      0.98     33062
          tr       0.99      0.99      0.99     33012

    accuracy                           0.99    171328
   macro avg       0.98      0.98      0.98    171328
weighted avg       0.99      0.99      0.99    171328



In [15]:
# test
print(pipeline.predict(['سلام بر شما دوست عزیز']))
print(pipeline.predict(['to night we will rock the hell out of the place']))
print(pipeline.predict(['Heute Nacht werden wir die Hölle auf dem Platz rocken']))
print(pipeline.predict(['إلى الليل سنهزّ الجحيم في المكان']))
print(pipeline.predict(['geceye cehennemi oralarda sallayacağız']))
print(pipeline.predict(['i natt kommer vi att rocka helvetet på platsen']))

['fa']
['en']
['de']
['ar']
['tr']
['se']
