### Data Preprocess

This notebook is used to apply pre-process techniques to word and etymology data.

#### Word Process

In [57]:
import unicodedata
import pandas as pd
import numpy as np
import re
import nltk
from nltk import word_tokenize
from itertools import islice
import glob
import os
from pathlib import Path

In [58]:
# custom character alphabet for word
tr = re.compile(r"[abcçdefgğhıijklmnoöprsştuüvyzqxw]+", re.IGNORECASE|re.UNICODE) # Turkish filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n0123456789'
en = re.compile(r"[abcdefghıijklmnopqrstxuvwyz]+", re.IGNORECASE|re.UNICODE) # English
nl = re.compile(r"[abcdefghıijklmnopqrstxuvwyzāăēĕīĭōŏūŭ]+", re.IGNORECASE|re.UNICODE) # Dutch (Flemenk)
fr = re.compile(r"[abcçdefghıijklmnopqrstxuvwyzàâæèéêëîïôœùûüÿ]+", re.IGNORECASE|re.UNICODE) # French
de = re.compile(r"[abcdefghıijklmnopqrstxuvwyzäöüß]+", re.IGNORECASE|re.UNICODE) # German
es = re.compile(r"[abcdefghıijklmnopqrstxuvwyzñáéíóú]+", re.IGNORECASE|re.UNICODE) # Spanish (¿¡)
pt = re.compile(r"[abcçdefghıijklmnopqrstxuvwyzàáâãéêíóôõú]+", re.IGNORECASE|re.UNICODE) # Portuguese
it = re.compile(r"[abcdefghıijklmnopqrstxuvwyzàéèìòùî]+", re.IGNORECASE|re.UNICODE) # Italian
ar = re.compile(r"[ٿصؼۤڳڲؿڎػڠجڿ٬ٸؽؒؓطۄڀۂؘؔتٚڛےٝڜؖڦ٫ډ۰زۇٖۀ،لۓعٮێڔ۶ؚۧۜڤۏإٞٷؗۖ؈ژۣؕؑٴأۻڸۺگاڴڹۯ؉ْڌ؍ي؟ـٟړۅؐڶُىڽېًۢؠضۚڄٛڏٱۦ٩س٦ڼڂٔۘ٠ښٌٍ۬ٳ۾ٲږذۋٵٜ٘ڞڅںٗهڣۿپڒۥۗڋیؙم؞ثۨٹڵڪظٶۭ١ڭەڨحٕ؎ٺڷٰ۪۫ڻڥۛڑڟټآڡغګ؊ّٯڧڮ؏ۮ؋ؤ٪ؾڗۼق۟دکوِڰڐۃ۽ہفرڇچڝ۴بۈٽڕۡھةٓڃئ؛ڬٙڙڢڱۊَۆۉځ۠ۍۑۙڊنءڈٻشڍ؇۵كخ\ا]+", re.IGNORECASE|re.UNICODE) # Arabic

In [59]:
folder_name = "Turkish"  # English, Turkish, German, French, Spanish, Arabic, Portuguese, Dutch, Italian
 
if folder_name == "English":
    custom_alp = en
elif folder_name == "Turkish":
    custom_alp = tr
elif folder_name == "German":
    custom_alp = de
elif folder_name == "French":
    custom_alp = fr
elif folder_name == "Spanish":
    custom_alp = es
elif folder_name == "Arabic":
    custom_alp = ar
elif folder_name == "Portuguese":
    custom_alp = pt
elif folder_name == "Dutch":
    custom_alp = nl
elif folder_name == "Italian":
    custom_alp = it
else:
    pass
#custom_alp = tr  # en, tr, de, fr, es, ar, pt, nl, it
print(f"{folder_name}\n{custom_alp}") 

Turkish
re.compile('[abcçdefgğhıijklmnoöprsştuüvyzqxw]+', re.IGNORECASE)


In [60]:
def convert_one_character_letter(text):
    '''
    This function converts two byte occupy of letter to one byte unicode character without any visual change \n
    like as Turkish character ç,ş,ö,ğ. \n
    convert_one_character_letter(text): text is any string word or sentence.
    '''
    new_text = unicodedata.normalize('NFC', f"{text}")
    return new_text

In [61]:
def clean_text(text, custom_alp):  # def clean_text(text)
    '''
    clean_text(text, custom_alp) text is any string word or sentence.\n
    function clean string according to specific language alphabet\n
    '''
    #text_clean = re.findall(custom_alp, text)
    text_one = convert_one_character_letter(text) 
    text_clean = re.findall(custom_alp, str(text_one))
    text_result = "".join(text_clean)
    return text_result

In [62]:
def string_length(string):
    len_out = len(string)
    return len_out

In [63]:
def lower_strip_func(x):
    try:
        var_low = x.lower()
        var_out = var_low.strip()
    except:
        var_out = x
    return var_out 

In [64]:
df_word = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{folder_name.lower().capitalize()}/Result/Word/Merge/Word_Merge2.xlsx")
#df_word = pd.read_csv("")
df_word.drop_duplicates(inplace=True)
df_word

Unnamed: 0,word,frequency
0,bir,18835735
1,bu,11062659
2,fikret,9285954
3,ne,8025880
4,ve,7766036
...,...,...
1006293,karnaya,5
1006294,dörtlümüzün,5
1006295,karnavalınız,5
1006296,hurmanın,5


In [65]:
df_word.iloc[:,0] = df_word.iloc[:,0].apply(lambda x: clean_text(x, custom_alp))
df_word.dropna(inplace=True)
df_word.drop(df_word[df_word.iloc[:,0] == ""].index, inplace=True)
#df_word.drop(df_word[df_word.iloc[:,0] == "fikret"].index, inplace=True) # paris, fikret
df_word["word"] = df_word["word"].apply(lambda x: lower_strip_func(x))
df_word.drop_duplicates(subset="word", inplace=True)
df_word.reset_index(drop=True, inplace=True)
df_word

Unnamed: 0,word,frequency
0,bir,18835735
1,bu,11062659
2,fikret,9285954
3,ne,8025880
4,ve,7766036
...,...,...
989327,karneleme,5
989328,karnaya,5
989329,dörtlümüzün,5
989330,karnavalınız,5


In [64]:
df_word["length"] = df_word["word"].apply(lambda x: string_length(x))
df_word

Unnamed: 0,word,frequency,length
0,que,37853284,3
1,de,37809537,2
2,no,33043466,2
3,a,25439588,1
4,la,24024343,2
...,...,...,...
446460,bibe,5,4
446461,dejugo,5,6
446462,bibbi,5,5
446463,bibberman,5,9


In [65]:
df_word.head(200)

Unnamed: 0,word,frequency,length
0,que,37853284,3
1,de,37809537,2
2,no,33043466,2
3,a,25439588,1
4,la,24024343,2
...,...,...,...
195,papá,644137,4
196,dice,640271,4
197,dije,639354,4
198,tres,638318,4


In [66]:
df_word["length"].mean()

8.123456485950747

In [67]:
df_word["length"].std()

2.6919036131380745

In [68]:
length_mean = df_word["length"].mean()
#length_std = df_word["length"].std()

In [69]:
df_word = df_word[df_word["length"]<=(1.6*length_mean)]
df_word.drop("length", axis=1, inplace=True)
df_word

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,word,frequency
0,que,37853284
1,de,37809537
2,no,33043466
3,a,25439588
4,la,24024343
...,...,...
446460,bibe,5
446461,dejugo,5
446462,bibbi,5
446463,bibberman,5


In [70]:
df_word.head(200)

Unnamed: 0,word,frequency
0,que,37853284
1,de,37809537
2,no,33043466
3,a,25439588
4,la,24024343
...,...,...
195,papá,644137
196,dice,640271
197,dije,639354
198,tres,638318


In [71]:
remove_word_list = ["fikret","paris"]

In [72]:
set_remove = set(remove_word_list)
set_word = set(df_word["word"])

In [73]:
df_word_clean = pd.DataFrame(set_word.difference(set_remove), columns=["word"])

In [74]:
df_word = pd.merge(df_word_clean, df_word, how="left", on="word")
df_word.sort_values(by="frequency", ascending=False, inplace=True)
df_word.reset_index(drop=True, inplace=True)
df_word

Unnamed: 0,word,frequency
0,que,37853284
1,de,37809537
2,no,33043466
3,a,25439588
4,la,24024343
...,...,...
420724,snowflakes,5
420725,cazzone,5
420726,tonmawr,5
420727,halstedter,5


In [75]:
# finding the longest string
lengths = df_word["word"].str.len()
argmax = np.where(lengths == lengths.max())[0]
df_word.iloc[argmax]

Unnamed: 0,word,frequency
992,departamento,99192
1492,conversación,61973
1504,sentimientos,61363
1813,desaparecido,49715
1818,directamente,49692
...,...,...
420628,desparejados,5
420649,zigzagueador,5
420694,mortificadas,5
420699,impertérrita,5


In [76]:
Path(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{folder_name}/Deployment/Data/Word").mkdir(parents=True, exist_ok=True)  # create path

In [77]:
df_word.to_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{folder_name}/Deployment/Data/Word/Word_Merge_Preprocess.xlsx", index=False)  # For Data/Word

#### Etymology Result Pre-Process

In [None]:
lang_folder = "Turkish"  # English, German, French, Spanish, Portuguese, Italian, Arabic ...
lang_pair = "French"  # English, German, French, Spanish, Portuguese, Italian, Arabic ...

In [None]:
def lower_strip_func(x):
    try:
        var_low = x.lower()
        var_out = var_low.strip()
    except:
        var_out = x
    return var_out 

In [None]:
df_ety_result = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/{lang_folder.capitalize()}/{lang_folder.capitalize()} {lang_pair.capitalize()}/{lang_folder.capitalize()}_{lang_pair.capitalize()}_Shared_Vocabulary.xlsx")
df_ety_result

In [None]:
df_ety_result["dict_entry_main"] = df_ety_result["dict_entry_main"].apply(lambda x: lower_strip_func(x))
df_ety_result[f"{lang_pair.lower()}_word"] = df_ety_result[f"{lang_pair.lower()}_word"].apply(lambda x: lower_strip_func(x))

In [None]:
df_ety_result.to_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/{lang_folder.capitalize()}/{lang_folder.capitalize()} {lang_pair.capitalize()}/{lang_folder.capitalize()}_{lang_pair.capitalize()}_Shared_Vocabulary.xlsx", index=False)

#### Export Adjust Word Cover Result

In [None]:
import os
import multiprocessing
#import multiprocessing as mp
from multiprocessing import Process, Manager, Pool, Queue
from itertools import islice
import re
import pandas as pd
import numpy as np

In [None]:
nprocs = multiprocessing.cpu_count()
print(f"Number of CPU cores: {nprocs}")

In [None]:
df_word = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{folder_name}/Deployment/Data/Word/Word_Merge_Preprocess.xlsx")
df_word

In [None]:
word_num = 206
df_select = df_word.head(word_num)
df_select

In [None]:
word_list = df_select["word"].values.tolist()

In [None]:
data_kind = "sentence" # sentence, twogram, threegram, fourgram, fivegram
spec_folder = "Sentence" # Sentence, N Gram

In [None]:
df = pd.read_csv(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{folder_name.capitalize()}/Result/{spec_folder.capitalize()}/Merge/{data_kind.capitalize()}_Merge.csv") # sentence, n gram dataframe
df

In [None]:
d_list  = df.iloc[:,0].values.tolist()

In [None]:
resultlist = []

In [None]:
#import os
#from multiprocessing import Process, Manager, Pool, Queue
manager = multiprocessing.Manager()
resultlist = manager.list()

def word_in_wordgroup(d_list):
    mergelist = []
    try:
        word = d_list.split()
    except:
        pass
    var1 = range(len(word))
    for j in var1:
        if word[j] in word_list:
            mergelist.append(word[j])
            if len(mergelist) == len(word):
                    resultlist.append(d_list)
                        
if __name__ == '__main__':
    # with Pool(16) as p:
    with Pool(nprocs) as p: # Pool number CPU sayısına eşit olursa tüm CPU lar çalışır
        p.map(word_in_wordgroup, d_list) # string_word liste

In [None]:
result_list = list(resultlist)
df_result = pd.DataFrame(result_list)
df_result = df_result.rename(columns = {0: f"{data_kind}"})
df_merge = pd.merge(df_result, df, how="left", on=f"{data_kind}")
df_merge_result = df_merge.sort_values(by="frequency", ascending=False)
df_merge_result.drop_duplicates(inplace=True)
df_merge_result.reset_index(drop=True, inplace=True)
df_merge_result

In [None]:
data_len = len(df_merge_result)
#data_len

In [None]:
df_merge_result = df_merge_result.head(1000000)

In [None]:
df_merge_result.to_excel(f"{data_kind.capitalize()}_{data_len}_With_{word_num}_Word.xlsx", index=False)

In [None]:
#df_select.to_excel(f"Word_{word_num}.xlsx", sheet_name='Word', index=False)

#### Temp

In [14]:
df_file = pd.read_excel("/home/kurubal/Downloads/Visual_Genome_Select_Image.xlsx")
df_file

Unnamed: 0,select1,select2,select3,Unnamed: 3
0,aç1.jpg,aç3.jpg,aç2.jpg,
1,,,,
2,acele1.jpg,,,
3,acı5.jpg,acı2.jpg,acı6.jpg,
4,açık7.jpg,açık4.jpg,açık1.jpg,
...,...,...,...,...
995,zaten7.jpg,,,
996,zavallı1.jpg,,,
997,zor6.jpg,,,
998,zorunda2.jpg,,,


In [15]:
df_file[["word","ext"]] = df_file["select1"].apply(lambda x: pd.Series(str(x).split(".")))
df_file

Unnamed: 0,select1,select2,select3,Unnamed: 3,word,ext
0,aç1.jpg,aç3.jpg,aç2.jpg,,aç1,jpg
1,,,,,,
2,acele1.jpg,,,,acele1,jpg
3,acı5.jpg,acı2.jpg,acı6.jpg,,acı5,jpg
4,açık7.jpg,açık4.jpg,açık1.jpg,,açık7,jpg
...,...,...,...,...,...,...
995,zaten7.jpg,,,,zaten7,jpg
996,zavallı1.jpg,,,,zavallı1,jpg
997,zor6.jpg,,,,zor6,jpg
998,zorunda2.jpg,,,,zorunda2,jpg


In [17]:
df_file["word"] = df_file["word"].apply(lambda x: clean_text(x, custom_alp))
df_file.drop("ext", axis=1, inplace=True)
df_file

Unnamed: 0,select1,select2,select3,Unnamed: 3,word
0,aç1.jpg,aç3.jpg,aç2.jpg,,aç
1,,,,,
2,acele1.jpg,,,,acele
3,acı5.jpg,acı2.jpg,acı6.jpg,,acı
4,açık7.jpg,açık4.jpg,açık1.jpg,,açık
...,...,...,...,...,...
995,zaten7.jpg,,,,zaten
996,zavallı1.jpg,,,,zavallı
997,zor6.jpg,,,,zor
998,zorunda2.jpg,,,,zorunda


In [18]:
df_file.to_excel("/home/kurubal/Downloads/Visual_Genome_Select_Image2.xlsx", index=False)