In [1]:
SENTENCE_LIMIT = 100
import pandas as pd 
import numpy as np
from spacy.lang.en import English
import re
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix,classification_report


In [45]:
# import stopwords
def _read_stopwords(path):
    """Return the non-blank lines of the UTF-8 text file ``path``, stripped.

    Every line of the file is treated as one entry; surrounding whitespace
    is removed and empty lines are skipped.
    """
    with open(path, encoding="utf8") as f:
        return [entry for entry in (line.strip() for line in f) if entry]


# General symbols/tokens; pre_df removes them by plain substring replacement.
gnr_stp = _read_stopwords('./Data/stopwords/gnr.txt')

# Per-language stopword lists, kept as 1-D numpy arrays as before.
#
# These used to be loaded with pd.read_csv(path, sep='\n'). That call
#   * is rejected by newer pandas releases ('\n' is not allowed as separator),
#   * treats the first line of the file as a header by default, so the first
#     stopword never reached the array, and
#   * parses tokens such as "null" or "NA" as NaN instead of text.
# Reading the files line by line avoids all three.
# NOTE(review): the first line of every file is now used as a stopword --
# confirm that the files do not start with a header line.
ar_stp = np.array(_read_stopwords('./Data/stopwords/ar.txt'), dtype=object)
de_stp = np.array(_read_stopwords('./Data/stopwords/de.txt'), dtype=object)
en_stp = np.array(_read_stopwords('./Data/stopwords/en.txt'), dtype=object)
fa_stp = np.array(_read_stopwords('./Data/stopwords/fa.txt'), dtype=object)
se_stp = np.array(_read_stopwords('./Data/stopwords/se.txt'), dtype=object)
tr_stp = np.array(_read_stopwords('./Data/stopwords/tr.txt'), dtype=object)

In [46]:
# preprocessor
def pre_df(sentence,stopwords):
    stopwords = stopwords.tolist()
    sentence = str(sentence).replace('\n','')
    sentence = str(sentence).replace('\t','')
    sentence = str(sentence).replace('\r','')
    for sym in gnr_stp:
        sentence = str(sentence).replace(sym,'')
    for stp in stopwords:
        sentence = re.sub(r'\b'+str(stp)+r'\b', '', str(sentence))
    
    return sentence
    

In [13]:
# import ar dataset: shuffle every row, keep the first SENTENCE_LIMIT of
# them and discard the 'targe' column
ar_df = (
    pd.read_csv('./Data/ar/ar.csv')
    .sample(frac=1)[:SENTENCE_LIMIT]
    .drop(columns=['targe'])
)
ar_df.columns = ['txt']
ar_df = ar_df.dropna()
# stopwords and special characters
ar_df['final'] = ar_df.apply(lambda row: pre_df(row['txt'], stopwords=ar_stp), axis=1)
ar_df = ar_df.dropna()
ar_df.head()

Unnamed: 0,txt
39925,أفاد المكتب الوطني للسلامة الصحية للمنتجات الغ...
33616,أعلنت غرفة التجارة والصناعة الإندونيسية بجاكرت...
101358,الفريق الرباطي يبحث تعويض مدافعه فضال ويجرب ل...
29316,لقي شخص مغرب الخميس مصرعه بعدما دهسه قطار مس...
42406,أكد وزير السياحة والصناعة التقليدية السيد ياسر...


In [6]:
# import de dataset: tab-separated file; shuffle, keep the first
# SENTENCE_LIMIT rows and discard the column labelled "1"
# NOTE(review): a column named "1" suggests the file has no header row and
# its first data line is being used as the header -- confirm.
de_df = (
    pd.read_csv('./Data/de/de.txt', sep='\t')
    .sample(frac=1)[:SENTENCE_LIMIT]
    .drop(columns=["1"])
)
de_df.columns = ['txt']
# stopwords and special characters
de_df['final'] = de_df.apply(lambda row: pre_df(row['txt'], stopwords=de_stp), axis=1)
de_df = de_df.dropna()
de_df.head()

Unnamed: 0,txt
1028685,Die Umstellung Generation Transporters A...
2176051,Sonst Gisdol Stielike Es Südkorea besti...
323578,Beschädigungen Alten Oper Geschäften Ei...
2469549,Wellmann Ja Platz Flieger saß
1434362,Grosse Organisationen gefährlicher Einzeltä...


In [36]:
# import en dataset
books = ['The Adventures of Sherlock Holmes.txt','Pride and Prejudice.txt','Life And Adventures Of Martin Chuzzlewit.txt','Frankenstein.txt','Alice’s Adventures in Wonderland.txt']

# Build the sentence splitter once; it does not depend on the book.
nlp = English()
nlp.add_pipe("sentencizer")

# One single-column frame of sentences per book. They are concatenated once
# after the loop instead of growing a frame inside it. (The previous version
# started from `pd.DataFrame` -- the class, not an instance -- and only worked
# because the class attribute `.empty` is a truthy property object.)
book_frames = []
for book in books:
    with open('./Data/en/' + book, encoding="utf8") as f:
        str_out = f.read()
    book_frames.append(pd.DataFrame([sent.text for sent in nlp(str_out).sents]))

# Row labels restart at 0 for every book, exactly as before.
en_df = pd.concat(book_frames, axis=0)
en_df = en_df.sample(frac=1)[:SENTENCE_LIMIT]
en_df.columns = ['txt']
# stopwords and special characters
en_df['final'] = en_df.apply(lambda row: pre_df(row['txt'], stopwords=en_stp), axis=1)
en_df = en_df.dropna()
en_df.head()

Unnamed: 0,txt,final
1122,"\n\n“Let’s go on with the game,” the Queen sai...",Lets game Queen Alice Alice toomuch f...
1222,But she was innocent.,But innocent
3040,"\n\n“Yes, to-day.”",Yes
2953,Mrs Todgers rises; the two Miss Pecksniffs ris...,Mrs Todgers rises Miss Pecksniffs rise allrise
2284,"As we entered this city, our minds\nwere fille...",As entered city mindswere filled remembra...


In [47]:
# import fa dataset: JSON-lines dump; every column listed below is
# discarded and the single remaining one is renamed to 'txt'
droped_columns = [
    'Id', 'Title', 'Type', 'Rank', 'Namespace', 'RedirectList',
    'IsDisambiguationPage', 'TargetLinksCount', 'InfoBox', 'Links',
    'Parents',
]
fa_df = (
    pd.read_json('./Data/fa/fawiki.json', lines=True)
    .drop(columns=droped_columns)
    .sample(frac=1)[:SENTENCE_LIMIT]
)
fa_df.columns = ['txt']
# stopwords and special characters
fa_df['final'] = fa_df.apply(lambda row: pre_df(row['txt'], stopwords=fa_stp), axis=1)
fa_df = fa_df.dropna()
fa_df.head()

Unnamed: 0,txt,final
252083,هشام حافظ (؛ ۲۸ آوریل ۱۹۳۱ – ۲۶ فوریه ۲۰۰۶) رو...,هشام حافظ آوریل فوریه روزنامه‌نگار اهل ع...
115414,ورزشگاه‌های کره جنوبی بر پایه استان,ورزشگاه‌های کره جنوبی پایه استان
164360,دوربین‌های معرفی‌شده در ۲۰۱۰ (میلادی),دوربین‌های معرفی‌ میلادی
187655,مناطق حفاظت شده سان‌فرانسیسکو,مناطق حفاظت سان‌فرانسیسکو
232758,مسابقات بین‌المللی باشگاهی فوتبال,مسابقات بین‌المللی باشگاهی فوتبال


In [52]:
# import se dataset
# The file is read line by line (one entry per non-blank line) instead of
# pd.read_csv(sep='\n'): newer pandas releases reject '\n' as a separator,
# and read_csv used the first line as the column header by default.
# NOTE(review): the first line is now kept as a sentence -- confirm the file
# has no header line.
with open('./Data/se/se-lite.txt', encoding="utf8") as f:
    se_sentences = [line.rstrip('\n') for line in f if line.strip()]

se_df = pd.DataFrame({'txt': se_sentences})
se_df = se_df.sample(frac=1)[:SENTENCE_LIMIT]
# stopwords and special characters
se_df['final'] = se_df.apply(lambda row: pre_df(row['txt'], stopwords=se_stp), axis=1)
se_df = se_df.dropna()
se_df.head()

Unnamed: 0,txt,final
9818,"-- Det här berättar jag bara för dig , Clara ,...",Det berättar Clara fortsatte
75090,Det gick en del rykten om att potatisar kunde ...,Det rykten potatisar giftiga fel g...
104890,Nej !,Nej
58251,Då stördes hon inte av Mats hummanden och enst...,Då stördes Mats hummanden enstaka dunk tr...
40106,Om kvinnorna glömmer sina män så ska vi män gl...,Om kvinnorna glömmer män män glömma kvinno...


In [55]:
# import tr dataset
# Source 1: CSV. After removing the 'Unnamed: 0' column the one remaining
# column is renamed to 'txt' (the assignment raises if more than one is left).
tr_df = pd.read_csv('./Data/tr/turkish.csv').drop(columns=['Unnamed: 0'])
tr_df.columns = ['txt']

# Source 2: plain text, one entry per non-blank line. It is read line by line
# instead of pd.read_csv(sep='\n'): newer pandas releases reject '\n' as a
# separator, and read_csv named the column after the first line of the file,
# so the concat below only produced a single text column when that line
# happened to equal the CSV's column name.
# NOTE(review): if dunya-nz.txt starts with a header line, that line is now
# kept as a sentence -- confirm.
with open('./Data/tr/dunya-nz.txt', encoding="utf8") as f:
    tmp_df = pd.DataFrame({'txt': [line.rstrip('\n') for line in f if line.strip()]})

# Both frames now share the single column 'txt', so they stack cleanly.
tr_df = pd.concat([tr_df, tmp_df], axis=0)
tr_df = tr_df.sample(frac=1)[:SENTENCE_LIMIT]
# stopwords and special characters
tr_df['final'] = tr_df.apply(lambda row: pre_df(row['txt'], stopwords=tr_stp), axis=1)
tr_df = tr_df.dropna()
tr_df.head()

Index(['Unnamed: 0', 'text'], dtype='object')


Unnamed: 0,txt,final
236497,yaşındaki pilot muaz el kesasibe ışid in sana ...,yaşındaki pilot muaz el kesasibe ışid in sana ...
89062,bu arada afp nin almanya merkezli recm karşıtl...,arada afp nin almanya merkezli recm karşıtlar...
256585,bibi nasıl başardı,bibi nasıl başardı
199072,davutoğlu katar da ateşkes boyunca israil ve h...,davutoğlu katar ateşkes boyunca israil hamas...
50358,bu da kurtarılma ihtimallerini azaltıyor,kurtarılma ihtimallerini azaltıyor
