### Word Prefix Suffix Analysis

In [1]:
import os
import pandas as pd
import numpy as np
import glob
import re
from kneed import KneeLocator
from pathlib import Path
import shutil

In [3]:
# language pair
lang_folder = "Turkish"  # target language: Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian
# Native language of the learner. The copy/move cells further down build file
# names and destination folders from lang_pair, so it must exist before they run.
lang_pair = "English"  # Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian, Intersect

# pre-suffix select
# These flags are handed to detect_prefix_suffix_word as prefix_word / suffix_word:
#   prefix = True -> keep words that END with the search word (extra text sits in front of it)
#   suffix = True -> keep words that START with the search word (extra text follows it)
prefix = False  # True, False
suffix = True  # True, False

# native word select
file_ext = 1000
word_start = 0  # 0  # native word start index
word_end = 1000  # 28  # native word end index


In [4]:
# Result folder of this analysis step; it is also the copy destination used further down.
path = (
    f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/"
    "Lemma Stem POS/Result/2-1-Word Prefix Suffix Analysis"
)

#Path(path).mkdir(parents=True, exist_ok=True)

In [5]:
def detect_prefix_suffix_word(df, df_column, word_list, prefix_word=True, suffix_word=False):
    '''
    Collect the rows of df whose df_column value ends and/or starts with a search word.

    Example:
    detect_prefix_suffix_word(df, "word", ["abacus", "aba", "su"], prefix_word=True, suffix_word=True)

    Parameters:
    df: DataFrame that is searched. It is not modified.
    df_column: name of the text column to search in.
    word_list: iterable of search words. Each one is matched literally
        (characters such as "." or "+" have no special meaning).
    prefix_word: if True, keep rows whose value ENDS with the search word,
        i.e. the extra text is a prefix in front of it.
    suffix_word: if True, keep rows whose value STARTS with the search word,
        i.e. the extra text is a suffix behind it.

    Returns:
    A new DataFrame with a "search_word" column inserted in first position,
    followed by the columns of df. Exact duplicate rows are dropped and the
    index is renumbered from 0. Rows whose df_column value is missing never
    match. An empty DataFrame without columns is returned when word_list is
    empty or both flags are False.
    '''
    column = df[df_column]

    # Per search word the prefix check runs before the suffix check, so rows
    # keep the order: word 1 prefix hits, word 1 suffix hits, word 2 ...
    matchers = []
    if prefix_word:
        matchers.append(column.str.endswith)
    if suffix_word:
        matchers.append(column.str.startswith)

    frames = []
    for search_word in word_list:
        for matches in matchers:
            # na=False: a missing value does not contain the search word.
            # copy(): insert() must not write into a slice of the caller's frame.
            hits = df[matches(str(search_word), na=False)].copy()
            hits.insert(0, "search_word", search_word)
            frames.append(hits)

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop. Empty hit
    # frames are skipped; if every frame is empty, one is kept so the result
    # still carries the expected columns.
    non_empty = [frame for frame in frames if not frame.empty]
    result = pd.concat(non_empty or frames[:1], axis=0)

    #df_ety_suffix_word_result = df_word_result.sort_values(by="frequency", ascending=False)
    return result.drop_duplicates().reset_index(drop=True)

In [6]:
def _split_affix(source_word, target_word):
    '''Return "prefix+" or "+suffix" for source_word inside target_word, or None if it cannot be determined.'''
    if not isinstance(source_word, str) or not isinstance(target_word, str):
        # Missing or non-text cells cannot be searched.
        return None
    # re.escape: the search word is literal text, not a regular expression.
    found = re.search(re.escape(source_word), target_word, re.UNICODE | re.IGNORECASE)
    if found is None:
        return None
    start, end = found.span()
    if start > 0:
        # Something precedes the search word: report that leading part only.
        return f"{target_word[:start]}+"
    # The search word opens the target word: report whatever follows it.
    return f"+{target_word[end:]}"


def exract_prefix_suffix(df, source_column, target_column):
    '''
    Fill the "prefix_suffix" column of df with the affix around each search word.

    Example:
    exract_prefix_suffix(df, "search_word", "word")

    For every row the word in source_column is looked up (case-insensitively,
    as literal text) in the word of target_column:
    - found after position 0: the leading part is stored as "<prefix>+"
    - found at position 0: the trailing part is stored as "+<suffix>"
      ("+" alone when both words are identical)
    - not found, or one of the two cells is not a string: the row keeps its
      previous "prefix_suffix" value, or NaN if the column did not exist.

    Parameters:
    df: DataFrame to annotate. It is modified in place; any index is accepted.
    source_column: name of the column holding the search words.
    target_column: name of the column holding the full words.

    Returns:
    The same df object, with the "prefix_suffix" column added or updated.
    '''
    if "prefix_suffix" in df.columns:
        values = df["prefix_suffix"].tolist()
    else:
        values = [float("nan")] * len(df)

    # Iterate by position so the result does not depend on the index labels.
    word_pairs = zip(df[source_column], df[target_column])
    for position, (source_word, target_word) in enumerate(word_pairs):
        affix = _split_affix(source_word, target_word)
        if affix is not None:
            values[position] = affix

    df["prefix_suffix"] = values
    return df

#### Word Data

In [7]:
# POS tag for the POS1 filter; in the cells visible here it is only referenced by a commented-out line.
Pos_Tag = "VERB" # NOUN, VERB (the stems "ol" and "var" are separate), ADJ, ADV, NUM, PRON, CCONJ, ADP, AUX
file_ext = 1000  # number that appears in the input file name Turkish_{file_ext}_Process.xlsx

In [8]:
# Load the processed word table from the working directory. The "Turkish_" part of the
# file name is fixed; only the number comes from file_ext.
df_word_raw = pd.read_excel(f"Turkish_{file_ext}_Process.xlsx")
df_word_raw

Unnamed: 0,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency
0,NUM,,bir,bir,bir,a,a,18835735
1,PRON,,bu,bu,bu,this,this,11062659
2,PRON,Q,ne,ne,ne,what,what,8025880
3,CCONJ,,ve,ve,ve,and,and,7766036
4,ADP,,için,için,için,for,for,5484109
...,...,...,...,...,...,...,...,...
995,ADJ,,resmi,resmi,resmi,formal,formal,68287
996,VERB,,veriyor,ver,ver,giving,give,68163
997,NOUN,,okul,okul,oku,school,school,68160
998,NOUN,,suçlu,suç,suç,guilty,crime,68124


In [9]:
#df_word_raw_select = df_word_raw[df_word_raw["POS1"] == Pos_Tag]
# Work on a copy of the raw table and keep one row per distinct "word" value.
df_word_raw_select = df_word_raw.copy()
df_word_raw_select = df_word_raw_select.drop_duplicates(subset=["word"])
df_word_raw_select

Unnamed: 0,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency
0,NUM,,bir,bir,bir,a,a,18835735
1,PRON,,bu,bu,bu,this,this,11062659
2,PRON,Q,ne,ne,ne,what,what,8025880
3,CCONJ,,ve,ve,ve,and,and,7766036
4,ADP,,için,için,için,for,for,5484109
...,...,...,...,...,...,...,...,...
995,ADJ,,resmi,resmi,resmi,formal,formal,68287
996,VERB,,veriyor,ver,ver,giving,give,68163
997,NOUN,,okul,okul,oku,school,school,68160
998,NOUN,,suçlu,suç,suç,guilty,crime,68124


In [10]:
# Every remaining word doubles as a search word for the affix detection.
word_list = df_word_raw_select["word"].to_numpy().tolist()
#word_list

In [11]:
# Pair every search word with the words that end/start with it, as selected by the prefix/suffix flags.
df_prefix_suffix_word = detect_prefix_suffix_word(
    df_word_raw_select,
    "word",
    word_list,
    prefix_word=prefix,
    suffix_word=suffix,
)
df_prefix_suffix_word

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency
0,bir,NUM,,bir,bir,bir,a,a,18835735
1,bir,ADV,,biraz,biraz,bir,a little,a little,1269641
2,bir,PRON,,biri,biri,bir,somebody,somebody,837400
3,bir,ADV,,birlikte,birlikte,bir,together,together,409940
4,bir,ADJ,,birkaç,birkaç,birkaç,a few,a few,404462
...,...,...,...,...,...,...,...,...,...
1878,veriyor,VERB,,veriyorum,ver,ver,i give,give,126290
1879,veriyor,VERB,,veriyor,ver,ver,giving,give,68163
1880,okul,NOUN,,okul,okul,oku,school,school,68160
1881,suçlu,NOUN,,suçlu,suç,suç,guilty,crime,68124


In [12]:
# Adds the "prefix_suffix" column. exract_prefix_suffix writes into the frame it receives and
# returns that same object, so df_prefix_suffix and df_prefix_suffix_word are one DataFrame.
df_prefix_suffix = exract_prefix_suffix(df_prefix_suffix_word, "search_word", "word")
df_prefix_suffix

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
0,bir,NUM,,bir,bir,bir,a,a,18835735,+
1,bir,ADV,,biraz,biraz,bir,a little,a little,1269641,+az
2,bir,PRON,,biri,biri,bir,somebody,somebody,837400,+i
3,bir,ADV,,birlikte,birlikte,bir,together,together,409940,+likte
4,bir,ADJ,,birkaç,birkaç,birkaç,a few,a few,404462,+kaç
...,...,...,...,...,...,...,...,...,...,...
1878,veriyor,VERB,,veriyorum,ver,ver,i give,give,126290,+um
1879,veriyor,VERB,,veriyor,ver,ver,giving,give,68163,+
1880,okul,NOUN,,okul,okul,oku,school,school,68160,+
1881,suçlu,NOUN,,suçlu,suç,suç,guilty,crime,68124,+


In [24]:
# Sort by the character length of search_word, then by the character length of word
# (the key replaces each sort column with its string lengths); shortest first.
df_test = df_prefix_suffix.sort_values(["search_word","word"],key=lambda x:x.str.len())
df_test

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
74,o,PRON,,o,o,o,he,he,5013838,+
86,o,VERB,,ol,ol,ol,be,be,600750,+l
110,o,NUM,,on,on,on,ten,ten,168880,+n
75,o,PRON,,onu,o,o,him-her-it,he,2486889,+nu
78,o,PRON,,ona,o,o,him,he,1124548,+na
...,...,...,...,...,...,...,...,...,...,...
1598,inanamıyorum,VERB,NEG,inanamıyorum,inan,inan,i cant believe,believe,90208,+
1744,çalışıyorsun,VERB,,çalışıyorsun,çalış,çalış,you work,work,77707,+
1814,hatırlıyorum,VERB,,hatırlıyorum,hatırla,hatır,i remember,remember,72290,+
1830,istiyorsunuz,VERB,,istiyorsunuz,iste,iste,you want,want,71208,+


In [25]:
# Write the length-sorted table to the working directory, without the index column.
df_test.to_excel("test_result.xlsx", index=False)

In [21]:
# Rows whose extracted affix is one of the four vowel variants of "+iyor".
df_prefix_suffix[df_prefix_suffix["prefix_suffix"].isin(["+iyor", "+ıyor", "+uyor", "+üyor"])]

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
486,gerek,VERB,,gerekiyor,gerek,gerek,needed,necessary,324783,+iyor
500,ol,VERB,,oluyor,ol,ol,is happening,be,503064,+uyor
550,gel,VERB,,geliyor,gel,gel,coming,come,416067,+iyor
580,ver,VERB,,veriyor,ver,ver,giving,give,68163,+iyor
987,yap,VERB,,yapıyor,yap,yap,doing,do it,179712,+ıyor
1579,düşün,VERB,,düşünüyor,düşün,düş,thinking,think,93634,+üyor


In [18]:
# Rows whose extracted affix is "+ecek" or "+acak".
df_prefix_suffix[df_prefix_suffix["prefix_suffix"].isin(["+ecek", "+acak"])]

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
496,ol,VERB,,olacak,ol,ol,will be,be,786194,+acak
551,gel,NOUN,TIME,gelecek,gelecek,gel,the future,the future,189299,+ecek
993,yap,VERB,,yapacak,yap,yap,will make,do it,115620,+acak


In [23]:
# Rows whose extracted affix is one of the "+mış", "+dı" or "+tı" vowel variants.
df_prefix_suffix[df_prefix_suffix["prefix_suffix"].isin([
    "+mış", "+miş", "+muş", "+müş",
    "+dı", "+di", "+du", "+dü",
    "+tı", "+ti", "+tu", "+tü",
])]

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
160,de,VERB,,dedi,de,de,said,also,182523,+di
175,var,VERB,,vardı,var,var,there was,there is,451341,+dı
178,var,VERB,,varmış,var,var,there was,there is,118304,+mış
186,değil,VERB,NEG,değildi,değil,değil,it wasn't,not,235756,+di
220,yok,VERB,NEG,yoktu,yok,yok,there was not,there is not,181801,+tu
403,olur,VERB,,olurdu,ol,ol,would be,be,124365,+du
457,et,VERB,,etti,et,et,he,meat,245330,+ti
492,ol,VERB,,oldu,ol,ol,happened,be,1141161,+du
505,ol,VERB,,olmuş,ol,ol,it is ok,be,213607,+muş
549,gel,VERB,,geldi,gel,gel,came,come,464529,+di


In [20]:
# Number of distinct search words in the result.
df_prefix_suffix["search_word"].nunique()

283

In [21]:
# Number of distinct matched words in the result.
df_prefix_suffix["word"].nunique()

283

##### Copy Move And Delete

In [39]:
# Result files in the working directory whose names begin with "Prefix_Suffix_".
# NOTE(review): the pattern has no leading wildcard, so a name such as
# "Turkish_English_..._Prefix_Suffix_Custom_Result.xlsx" would not match - confirm the intended pattern.
output_file1 = glob.glob("Prefix_Suffix_*Result.*")
output_file1

['Turkish_English_5000_Word_Prefix_Suffix_Custom_Result.xlsx']

In [40]:
# Copy every matched result file into the result folder `path`.
for k in output_file1:
    source = k # source file
    destination = path
    shutil.copy2(source, destination)

In [41]:
# Delete the local result files that were matched by the glob above.
for i in output_file1:
    try:
        os.remove(i)
    except OSError as err:
        # Best-effort cleanup: keep going, but report what could not be deleted.
        print(f"Could not remove {i}: {err}")

In [42]:
# Remaining prefix/suffix files for this language pair in the working directory.
# Both lang_folder and lang_pair must be set before this cell runs.
output_file2 = glob.glob(f"{lang_folder.capitalize()}_{lang_pair.capitalize()}_*_Prefix_Suffix_*.*")
output_file2

['Turkish_English_5000_Word_Prefix_Suffix_All.csv',
 'Turkish_English_5000_Word_Prefix_Suffix_Select.xlsx']

In [43]:
# Copy the matched files into the Web Scrapping data folder of this language pair.
for l in output_file2:
    source = l # source file
    destination = f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Data/{lang_folder.capitalize()}/{lang_folder.capitalize()} {lang_pair.capitalize()}"
    shutil.copy2(source, destination)

In [44]:
# Delete the local files that were matched by the second glob.
for j in output_file2:
    try:
        os.remove(j)
    except OSError as err:
        # Best-effort cleanup: keep going, but report what could not be deleted.
        print(f"Could not remove {j}: {err}")

### Concat Native And Etymology Prefix Suffix Result

In [3]:
import os
import pandas as pd
import numpy as np
import glob

In [4]:
# language pair
lang_folder = "Turkish"  # Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian ==> target language for learner
lang_pair = "English"  # Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian, Intersect ==> native language

# file extention
file_ext = 1000  # native word number

In [5]:
def lower_strip(x):
    '''
    Return x lower-cased and stripped of surrounding whitespace.

    Values without lower()/strip() methods (NaN, None, numbers, ...) are
    returned unchanged, so the function can be applied to a column that
    contains missing cells.
    '''
    # NOTE(review): str.lower() is not locale-aware, so "I" becomes "i" (not the
    # Turkish dotless "ı") - confirm this is acceptable for Turkish input.
    try:
        return x.lower().strip()
    except AttributeError:
        # Not a text value: hand it back untouched.
        return x

In [6]:
# Locate the native-word workbook (..._Word_Prefix_Suffix_Custom_Result_Manuel.xlsx) for the
# configured languages and file_ext. glob returns a list, which is empty if the file is absent.
native_file = glob.glob(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/{lang_folder.capitalize()}/{lang_folder.capitalize()} {lang_pair.capitalize()}/{lang_folder.capitalize()}_{lang_pair.capitalize()}_{file_ext}_Word_Prefix_Suffix_Custom_Result_Manuel.xlsx")
native_file

['/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/Turkish English/Turkish_English_200_Word_Prefix_Suffix_Custom_Result_Manuel.xlsx']

In [7]:
# Read the first match and keep only the two columns needed for the concat.
# native_file[0] raises IndexError when the glob above found nothing.
df_native = pd.read_excel(native_file[0])
df_native = df_native[["search_word","word"]]
df_native

Unnamed: 0,search_word,word
0,adam,adam
1,adam,adama
2,adam,adamdan
3,adam,adamdı
4,adam,adamdır
...,...,...
1618,şey,şeyler
1619,şey,şeylerden
1620,şey,şeylere
1621,şey,şeyleri


In [8]:
# Locate the shared-word workbook (..._Shared_Word_Prefix_Suffix_Custom_Result.xlsx) for the
# configured languages. glob returns a list, which is empty if the file is absent.
etymology_file = glob.glob(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/{lang_folder.capitalize()}/{lang_folder.capitalize()} {lang_pair.capitalize()}/{lang_folder.capitalize()}_{lang_pair.capitalize()}_Shared_Word_Prefix_Suffix_Custom_Result.xlsx")
etymology_file

['/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/Turkish English/Turkish_English_Shared_Word_Prefix_Suffix_Custom_Result.xlsx']

In [9]:
# Read the first match and keep only the two columns needed for the concat.
# etymology_file[0] raises IndexError when the glob above found nothing.
df_etmology = pd.read_excel(etymology_file[0])
df_etmology = df_etmology[["search_word","word"]]
df_etmology

Unnamed: 0,search_word,word
0,abaküs,abaküs
1,abandone,abandoned
2,abanoz,abanoz
3,abdomen,abdomende
4,abluka,abluka
...,...,...
6188,şut,şutu
6189,şut,şutunu
6190,şırınga,şırınga
6191,şırınga,şırıngayla


In [10]:
# Stack the native rows on top of the shared-word rows. ignore_index is not set,
# so each part keeps its own row labels at this point.
df_native_etymology_concat = pd.concat([df_native,df_etmology], axis=0)
df_native_etymology_concat

Unnamed: 0,search_word,word
0,adam,adam
1,adam,adama
2,adam,adamdan
3,adam,adamdı
4,adam,adamdır
...,...,...
6188,şut,şutu
6189,şut,şutunu
6190,şırınga,şırınga
6191,şırınga,şırıngayla


In [11]:
# Normalise both text columns with lower_strip, then drop rows that became identical
# and renumber the index from 0.
for column in ("search_word", "word"):
    df_native_etymology_concat[column] = df_native_etymology_concat[column].apply(lower_strip)
df_native_etymology_concat = df_native_etymology_concat.drop_duplicates().reset_index(drop=True)
df_native_etymology_concat

Unnamed: 0,search_word,word
0,adam,adam
1,adam,adama
2,adam,adamdan
3,adam,adamdı
4,adam,adamdır
...,...,...
7811,şut,şutu
7812,şut,şutunu
7813,şırınga,şırınga
7814,şırınga,şırıngayla


In [12]:
# Save the combined native + shared table into the result folder of this language pair.
concat_file = (
    f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/{lang_folder.capitalize()}/{lang_folder.capitalize()} "
    f"{lang_pair.capitalize()}/{lang_folder.capitalize()}_{lang_pair.capitalize()}_{file_ext}_Native_And_Shared_Word_Prefix_Suffix_Custom_Concat.xlsx"
)
df_native_etymology_concat.to_excel(concat_file, index=False)

### Temp

In [None]:
# English, French, German, Spanish, Portuguese, Dutch, Italian

In [29]:
#df_pair1 = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/\
#Turkish English/Turkish_English_Shared_Vocabulary.xlsx")
#df_pair1

In [31]:
#df_pair2 = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/\
#Turkish French/Turkish_French_Shared_Vocabulary.xlsx")
#df_pair2

In [32]:
#df_pair3 = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/\
#Turkish German/Turkish_German_Shared_Vocabulary.xlsx")
#df_pair3

In [33]:
#df_pair4 = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/\
#Turkish Spanish/Turkish_Spanish_Shared_Vocabulary.xlsx")
#df_pair4

In [34]:
#df_pair5 = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/\
#Turkish Portuguese/Turkish_Portuguese_Shared_Vocabulary.xlsx")
#df_pair5

In [35]:
#df_pair6 = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/\
#Turkish Dutch/Turkish_Dutch_Shared_Vocabulary.xlsx")
#df_pair6

In [36]:
#df_pair7 = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/\
#Turkish Italian/Turkish_Italian_Shared_Vocabulary.xlsx")
#df_pair7

In [None]:
#set1 = set(df_pair1.dict_entry_main)
#set2 = set(df_pair2.dict_entry_main)
#set3 = set(df_pair3.dict_entry_main)
#set4 = set(df_pair4.dict_entry_main)
#set5 = set(df_pair5.dict_entry_main)
#set6 = set(df_pair6.dict_entry_main)
#set7 = set(df_pair7.dict_entry_main)

In [None]:
#df_ety_intersect = pd.DataFrame((((((set7.intersection(set6)).intersection(set5)).intersection(set4)).intersection(set3)).intersection(set2)).intersection(set1), columns=["dict_entry_main"])
#df_ety_intersect

In [None]:
#df_ety_intersect.to_excel("Turkish_Intersect_Shared_Vocabulary.xlsx", index=False)

In [19]:
#df_intersect = pd.read_excel("/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/Turkish Intersect/Turkish_Intersect_Shared_Vocabulary.xlsx")
#df_intersect  # English, French, German, Spanish, Portuguese, Dutch, Italian

In [20]:
#df_pair1_merge = pd.merge(df_intersect,df_pair1, how="left", on="dict_entry_main")
#df_pair1_merge.drop_duplicates(inplace=True)
#df_pair1_merge

In [21]:
#df_pair2_merge = pd.merge(df_pair1_merge,df_pair2, how="left", on="dict_entry_main")
#df_pair2_merge.drop_duplicates(inplace=True)
#df_pair2_merge

In [22]:
#df_pair3_merge = pd.merge(df_pair2_merge,df_pair3, how="left", on="dict_entry_main")
#df_pair3_merge.drop_duplicates(inplace=True)
#df_pair3_merge

In [23]:
#df_pair4_merge = pd.merge(df_pair3_merge,df_pair4, how="left", on="dict_entry_main")
#df_pair4_merge.drop_duplicates(inplace=True)
#df_pair4_merge

In [24]:
#df_pair5_merge = pd.merge(df_pair4_merge,df_pair5, how="left", on="dict_entry_main")
#df_pair5_merge.drop_duplicates(inplace=True)
#df_pair5_merge

In [25]:
#df_pair6_merge = pd.merge(df_pair5_merge,df_pair6, how="left", on="dict_entry_main")
#df_pair6_merge.drop_duplicates(inplace=True)
#df_pair6_merge

In [26]:
#df_pair7_merge = pd.merge(df_pair6_merge,df_pair7, how="left", on="dict_entry_main")
#df_pair7_merge.drop_duplicates(inplace=True)
#df_pair7_merge

In [27]:
#df_pair7_merge.rename(columns={"dict_entry_main":"turkish_word_intersect"}, inplace=True)
#df_pair7_merge

In [28]:
#df_pair7_merge.to_excel("Turkish_Intersect_Shared_Vocabulary_With_Other_Languages.xlsx", index=False)