### Word Prefix Suffix Analysis

In [2]:
import os
import pandas as pd
import numpy as np
import glob
import re
from kneed import KneeLocator
from pathlib import Path
import shutil

In [3]:
# Language whose word list is analysed; it is interpolated into the result path below.
lang_folder = "Turkish"  # Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian

# Affix selection. Both flags are forwarded to detect_prefix_suffix_word():
#   prefix=True keeps words that END with a listed word (the extra part precedes it),
#   suffix=True keeps words that START with a listed word (the extra part follows it).
prefix = False  # True or False
suffix = True # True or False

# Native word selection.
# file_ext is the number embedded in the input workbook's file name.
# NOTE(review): word_start / word_end are not referenced in the visible part of
# the notebook - confirm whether they are still needed.
file_ext = 1000
word_start = 0  # native word start index
word_end = 1000  # native word end index


In [4]:
# Result directory for this analysis step, built from the configured language.
# The trailing backslash continues the string literal on the next line, so no
# newline ends up inside the path.
# NOTE(review): this is an absolute, machine-specific location - consider deriving
# it from a configurable base directory.
path = f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/\
Lemma Stem POS/Result/2-1-Word Prefix Suffix Analysis"

# Directory creation is currently disabled:
# Path(path).mkdir(parents=True, exist_ok=True)

In [5]:
def detect_prefix_suffix_word(df, df_column, word_list, prefix_word=True, suffix_word=False):
    '''
    Find the rows of ``df`` whose ``df_column`` value ends and/or starts with a search word.

    Example::

        detect_prefix_suffix_word(df, "word", ["abacus", "aba", "su"], prefix_word=True, suffix_word=True)

    Parameters
    ----------
    df : pandas.DataFrame
        Table that holds the words to scan.
    df_column : str
        Name of the column in ``df`` that is searched.
    word_list : iterable
        Search words. Each one is matched literally (regex metacharacters are
        escaped); missing values are skipped.
    prefix_word : bool, default True
        When True, keep rows whose value ENDS with the search word, i.e. any
        extra characters sit in front of it.
    suffix_word : bool, default False
        When True, keep rows whose value STARTS with the search word, i.e. any
        extra characters sit behind it.

    Returns
    -------
    pandas.DataFrame
        The matching rows with the search word inserted as first column
        ``search_word``, de-duplicated and re-indexed from 0. For each search
        word the "ends with" matches come before the "starts with" matches.
        An empty, column-less frame is returned when ``word_list`` yields no
        search at all.
    '''
    column_values = df[df_column]
    matched_frames = []

    for search_word in word_list:
        # A missing entry would otherwise be searched as the literal text "nan".
        if pd.isna(search_word):
            continue

        # Escape so that characters such as "." or "(" are matched literally.
        literal = re.escape(str(search_word))

        patterns = []
        if prefix_word:
            patterns.append(f"{literal}$")  # value ends with the search word
        if suffix_word:
            patterns.append(f"^{literal}")  # value starts with the search word

        for pattern in patterns:
            # na=False: rows without a value never count as a match.
            matches = df[column_values.str.contains(pattern, na=False)].copy()
            matches.insert(0, "search_word", search_word)
            matched_frames.append(matches)

    if not matched_frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    result = pd.concat(matched_frames, axis=0)
    return result.drop_duplicates().reset_index(drop=True)

In [6]:
def exract_prefix_suffix(df, source_column, target_column):
    '''
    Work out which part of the target word lies outside the source word.

    Example::

        exract_prefix_suffix(df, "search_word", "word")

    For every row the ``source_column`` value is searched, literally and
    case-insensitively, inside the ``target_column`` value. The result is
    written to the column ``prefix_suffix``:

    * match not at the start -> the text in front of the match followed by
      ``+`` (for example ``"gel+"``),
    * match at the start -> ``+`` followed by the text after the match
      (for example ``"+az"``; a bare ``"+"`` when both words are equal).

    Rows with a missing source word, a non-string target word or no match keep
    their existing ``prefix_suffix`` value, or a missing value when the column
    did not exist yet.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to annotate. It is modified in place; any index is supported.
    source_column : str
        Column holding the word to look for.
    target_column : str
        Column holding the word to look in.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``prefix_suffix`` column.
    '''
    if "prefix_suffix" in df.columns:
        affixes = df["prefix_suffix"].tolist()
    else:
        affixes = [float("nan")] * len(df)

    word_pairs = zip(df[source_column].tolist(), df[target_column].tolist())
    for position, (source_word, target_word) in enumerate(word_pairs):
        # Nothing meaningful can be extracted from missing or non-text cells.
        if pd.isna(source_word) or not isinstance(target_word, str):
            continue

        # Escape so the source word is matched literally, not as a regex.
        found = re.search(re.escape(str(source_word)), target_word, re.UNICODE | re.IGNORECASE)
        if found is None:
            continue

        start, end = found.span()
        if start > 0:
            affixes[position] = f"{target_word[:start]}+"
        else:
            affixes[position] = f"+{target_word[end:]}"

    # Object dtype keeps the column usable with the .str accessor even when
    # no row produced a value.
    df["prefix_suffix"] = pd.Series(affixes, index=df.index, dtype=object)
    return df

#### Word Data

In [7]:
# Optional POS filter (currently disabled). Possible tags: NOUN, VERB (the "ol" and
# "var" stems are handled separately), ADJ, ADV, NUM, PRON, CCONJ, ADP, AUX
# Pos_Tag = "VERB"
# Number embedded in the input workbook's file name; this repeats the value
# already assigned in the configuration cell.
file_ext = 1000

In [8]:
# Load the processed word table from the current working directory.
# NOTE(review): "Turkish" is hard-coded in the file name although the language is
# configured through lang_folder - confirm whether the name should follow it.
df_word_raw = pd.read_excel(f"Turkish_{file_ext}_Process.xlsx")
df_word_raw

Unnamed: 0,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency
0,NUM,,bir,bir,bir,a,a,18835735
1,PRON,,bu,bu,bu,this,this,11062659
2,PRON,Q,ne,ne,ne,what,what,8025880
3,CCONJ,,ve,ve,ve,and,and,7766036
4,ADP,,için,için,için,for,for,5484109
...,...,...,...,...,...,...,...,...
995,ADJ,,resmi,resmi,resmi,formal,formal,68287
996,VERB,,veriyor,ver,ver,giving,give,68163
997,NOUN,,okul,okul,oku,school,school,68160
998,NOUN,,suçlu,suç,suç,guilty,crime,68124


In [9]:
# Every distinct word is kept once (first occurrence wins). The optional POS
# filter is not applied here; drop_duplicates() already returns a new frame, so
# df_word_raw itself stays untouched.
df_word_raw_select = df_word_raw.drop_duplicates(subset=["word"])
df_word_raw_select

Unnamed: 0,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency
0,NUM,,bir,bir,bir,a,a,18835735
1,PRON,,bu,bu,bu,this,this,11062659
2,PRON,Q,ne,ne,ne,what,what,8025880
3,CCONJ,,ve,ve,ve,and,and,7766036
4,ADP,,için,için,için,for,for,5484109
...,...,...,...,...,...,...,...,...
995,ADJ,,resmi,resmi,resmi,formal,formal,68287
996,VERB,,veriyor,ver,ver,giving,give,68163
997,NOUN,,okul,okul,oku,school,school,68160
998,NOUN,,suçlu,suç,suç,guilty,crime,68124


In [10]:
# Plain Python list of the distinct words; each one is used as a search word below.
word_list = df_word_raw_select["word"].tolist()

In [11]:
# Pair every word with each listed word it ends with (prefix=True) and/or
# starts with (suffix=True); the matched list entry is stored in "search_word".
df_prefix_suffix_word = detect_prefix_suffix_word(df_word_raw_select, "word", word_list, prefix_word=prefix, suffix_word=suffix)
df_prefix_suffix_word

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency
0,bir,NUM,,bir,bir,bir,a,a,18835735
1,bir,ADV,,biraz,biraz,bir,a little,a little,1269641
2,bir,PRON,,biri,biri,bir,somebody,somebody,837400
3,bir,ADV,,birlikte,birlikte,bir,together,together,409940
4,bir,ADJ,,birkaç,birkaç,birkaç,a few,a few,404462
...,...,...,...,...,...,...,...,...,...
1878,veriyor,VERB,,veriyorum,ver,ver,i give,give,126290
1879,veriyor,VERB,,veriyor,ver,ver,giving,give,68163
1880,okul,NOUN,,okul,okul,oku,school,school,68160
1881,suçlu,NOUN,,suçlu,suç,suç,guilty,crime,68124


In [12]:
# Add the "prefix_suffix" column: the part of "word" that lies outside "search_word".
# The helper writes into the frame it receives and returns that same frame, so
# df_prefix_suffix and df_prefix_suffix_word refer to one object.
df_prefix_suffix = exract_prefix_suffix(df_prefix_suffix_word, "search_word", "word")
df_prefix_suffix

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
0,bir,NUM,,bir,bir,bir,a,a,18835735,+
1,bir,ADV,,biraz,biraz,bir,a little,a little,1269641,+az
2,bir,PRON,,biri,biri,bir,somebody,somebody,837400,+i
3,bir,ADV,,birlikte,birlikte,bir,together,together,409940,+likte
4,bir,ADJ,,birkaç,birkaç,birkaç,a few,a few,404462,+kaç
...,...,...,...,...,...,...,...,...,...,...
1878,veriyor,VERB,,veriyorum,ver,ver,i give,give,126290,+um
1879,veriyor,VERB,,veriyor,ver,ver,giving,give,68163,+
1880,okul,NOUN,,okul,okul,oku,school,school,68160,+
1881,suçlu,NOUN,,suçlu,suç,suç,guilty,crime,68124,+


In [13]:
# Keep only the pairs whose search word is at least as long as the stem of the
# matched word. The comparison is vectorised instead of concatenating one row
# at a time; rows with a missing stem or search word compare as False and are
# dropped rather than raising.
df_prefix_suffix_select = df_prefix_suffix[
    df_prefix_suffix["search_word"].str.len() >= df_prefix_suffix["stem"].str.len()
].reset_index(drop=True)
df_prefix_suffix_select

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
0,bir,NUM,,bir,bir,bir,a,a,18835735,+
1,bir,ADV,,biraz,biraz,bir,a little,a little,1269641,+az
2,bir,PRON,,biri,biri,bir,somebody,somebody,837400,+i
3,bir,ADV,,birlikte,birlikte,bir,together,together,409940,+likte
4,bir,NOUN,,birini,biri,bir,one,somebody,259916,+ini
...,...,...,...,...,...,...,...,...,...,...
1619,veriyor,VERB,,veriyorum,ver,ver,i give,give,126290,+um
1620,veriyor,VERB,,veriyor,ver,ver,giving,give,68163,+
1621,okul,NOUN,,okul,okul,oku,school,school,68160,+
1622,suçlu,NOUN,,suçlu,suç,suç,guilty,crime,68124,+


#### Noun

In [14]:
# Rows tagged as nouns; the index labels of df_prefix_suffix_select are kept.
df_noun = df_prefix_suffix_select.loc[df_prefix_suffix_select["POS1"].eq("NOUN")]
df_noun

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
4,bir,NOUN,,birini,biri,bir,one,somebody,259916,+ini
5,bir,NOUN,,birisi,biri,bir,someone,somebody,182370,+isi
6,bir,NOUN,,birinin,biri,bir,someone,somebody,135905,+inin
7,bir,NOUN,,birine,biri,bir,to someone,somebody,111455,+ine
9,bir,NOUN,,biriyle,biri,bir,with someone,somebody,82104,+iyle
...,...,...,...,...,...,...,...,...,...,...
1609,güneş,NOUN,,güneş,güneş,güneş,sun,sun,68622,+
1616,önceki,NOUN,,önceki,önce,önce,previous,before,68345,+
1617,ifade,NOUN,,ifade,ifade,ifade,expression,expression,68298,+
1621,okul,NOUN,,okul,okul,oku,school,school,68160,+


##### Noun Plural

In [15]:
# Plural candidates: "lar" or "ler" occurs anywhere in the attached part or in
# the word itself. Missing values are treated as "no match" (na=False), in line
# with the other suffix filters in this notebook.
df_noun_plural = df_noun[
    df_noun["prefix_suffix"].str.contains(r"(?:lar|ler)", na=False)
    | df_noun["word"].str.contains(r"(?:lar|ler)", na=False)
]
df_noun_plural

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
20,ne,NOUN,Q,neler,ne,ne,what,what,397377,+ler
46,o,NOUN,,onlardan,onlar,o,from them,they,82961,+nlardan
84,şey,NOUN,,şeyler,şey,şey,things,thing,649627,+ler
87,şey,NOUN,,şeyleri,şey,şey,things,thing,137646,+leri
134,zaman,NOUN,TIME,zamanlar,zaman,zaman,times,time,89599,+lar
224,adam,NOUN,,adamlar,adam,adam,men,man,85319,+lar
252,gün,NOUN,,günler,gün,gün,days,day,79133,+ler
259,teşekkürler,NOUN,GRE,teşekkürler,teşekkür,teşekkür,thanks,thanks,680203,+
277,gece,NOUN,TIME,geceler,gece,gece,nights,night,129031,+ler
278,şeyler,NOUN,,şeyler,şey,şey,things,thing,649627,+


In [16]:
# Distinct plural noun forms; going through set() makes the list order arbitrary.
noun_plural_list = list(set(df_noun_plural["word"]))

##### Noun Persons

In [17]:
# First-person-singular candidates: the attached part is exactly +ım/+im/+um/+üm,
# or the word itself ends in one of those endings.
# NOTE(review): the ending test is purely textual, so words that merely end in
# these letters are selected as well - check the result before relying on it.
df_noun_first_person = df_noun[
    df_noun["prefix_suffix"].isin(["+ım", "+im", "+um", "+üm"])
    | df_noun["word"].str.contains(r"(?:ım|im|um|üm)$", na=False)
]
df_noun_first_person

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
223,adam,NOUN,,adamım,adam,adam,my man,man,146760,+ım
230,efendim,NOUN,GRE,efendim,efendim,efendim,sir,sir,763275,+
233,yardım,NOUN,,yardım,yardım,yardım,help,help,752820,+
246,tüm,NOUN,,tüm,tüm,tüm,all,all,706547,+
353,üzgünüm,NOUN,GRE,üzgünüm,üzgün,üz,I am sorry,sorry,567153,+
403,kız,NOUN,,kızım,kız,kız,my daughter,girl,95342,+ım
415,iş,NOUN,,işim,iş,iş,my job,job,84281,+im
419,dur,NOUN,,durum,durum,dur,situation,situation,179459,+um
467,dostum,NOUN,GRE,dostum,dostum,dost,buddy,buddy,438955,+
530,bay,NOUN,GRE,bayım,bay,bay,sir,mr,100890,+ım


In [18]:
# Distinct first-person-singular noun candidates; list order is arbitrary (set).
noun_first_person_list = list(set(df_noun_first_person["word"]))

In [19]:
# Second-person-singular candidates: the attached part is exactly +ın/+in/+un/+ün,
# or the word itself ends in one of those endings (purely textual test).
df_noun_second_person = df_noun[
    df_noun["prefix_suffix"].isin(["+ın", "+in", "+un", "+ün"])
    | df_noun["word"].str.contains(r"(?:ın|in|un|ün)$", na=False)
]
df_noun_second_person

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
6,bir,NOUN,,birinin,biri,bir,someone,somebody,135905,+inin
24,ne,NOUN,Q,neyin,ne,ne,what,what,119777,+yin
85,şey,NOUN,,şeyin,şey,şey,your thing,thing,160429,+in
143,nasıl,NOUN,Q,nasılsın,nasıl,nasıl,how are you,how,130778,+sın
176,oldu,NOUN,,olduğun,ol,ol,you,die,95341,+ğun
...,...,...,...,...,...,...,...,...,...,...
1461,yaptığın,NOUN,,yaptığın,yap,yap,you do,do it,79162,+
1530,arkadaşı,NOUN,,arkadaşın,arkadaş,arkadaş,your friend,friend,84749,+n
1551,çocuğun,NOUN,,çocuğun,çocuk,çocuk,your child,child,73003,+
1555,aferin,NOUN,GRE,aferin,aferin,aferin,well done,well done,72635,+


In [20]:
# Distinct second-person-singular noun candidates; list order is arbitrary (set).
noun_second_person_list = list(set(df_noun_second_person["word"]))

In [21]:
# Third-person-singular candidates: the attached part is exactly +ı/+i/+u/+ü,
# or the word itself ends in one of those vowels (purely textual test).
# NOTE(review): df_noun_sign further down applies the very same conditions.
df_noun_third_person = df_noun[
    df_noun["prefix_suffix"].isin(["+ı", "+i", "+u", "+ü"])
    | df_noun["word"].str.contains(r"(?:ı|i|u|ü)$", na=False)
]
df_noun_third_person

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
4,bir,NOUN,,birini,biri,bir,one,somebody,259916,+ini
5,bir,NOUN,,birisi,biri,bir,someone,somebody,182370,+isi
22,ne,NOUN,Q,neyi,ne,ne,what,what,124260,+yi
23,ne,NOUN,Q,neydi,ne,ne,what was it,what,123671,+ydi
83,şey,NOUN,,şeyi,şey,şey,thing,thing,683786,+i
...,...,...,...,...,...,...,...,...,...,...
1593,verici,NOUN,,verici,verici,ver,transmitter,transmitter,69281,+
1597,olayı,NOUN,,olayı,olay,olay,incident,event,69118,+
1602,herkesi,NOUN,,herkesi,herkes,herkes,everyone,everyone,68852,+
1616,önceki,NOUN,,önceki,önce,önce,previous,before,68345,+


In [22]:
# Distinct third-person-singular noun candidates; list order is arbitrary (set).
noun_third_person_list = list(set(df_noun_third_person["word"]))

In [23]:
# First-person-plural candidates: the attached part is exactly +mız/+miz/+muz/+müz,
# or the word itself ends in one of those endings (purely textual test).
df_noun_first_plural_person = df_noun[
    df_noun["prefix_suffix"].isin(["+mız", "+miz", "+muz", "+müz"])
    | df_noun["word"].str.contains(r"(?:mız|miz|muz|müz)$", na=False)
]
df_noun_first_plural_person

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
549,hep,NOUN,,hepimiz,hep,hep,we all,all,155913,+imiz
704,ihtiyacım,NOUN,,ihtiyacımız,ihtiyaç,ihtiyaç,we need,need,117512,+ız
940,hepimiz,NOUN,,hepimiz,hep,hep,we all,all,155913,+
1041,ihtiyacı,NOUN,,ihtiyacımız,ihtiyaç,ihtiyaç,we need,need,117512,+mız
1138,ihtiyacımız,NOUN,,ihtiyacımız,ihtiyaç,ihtiyaç,we need,need,117512,+
1304,temiz,NOUN,,temiz,temiz,temiz,clean,clean,93441,+


In [24]:
# Distinct first-person-plural noun candidates; list order is arbitrary (set).
noun_first_plural_person_list = list(set(df_noun_first_plural_person["word"]))

In [25]:
# Second-person-plural candidates: the attached part is exactly +nız/+niz/+nuz/+nüz,
# or the word itself ends in one of those endings (purely textual test).
df_noun_second_plural_person = df_noun[
    df_noun["prefix_suffix"].isin(["+nız", "+niz", "+nuz", "+nüz"])
    | df_noun["word"].str.contains(r"(?:nız|niz|nuz|nüz)$", na=False)
]
df_noun_second_plural_person

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
551,hep,NOUN,,hepiniz,hep,hep,all of you,all,73633,+iniz
1542,hepiniz,NOUN,,hepiniz,hep,hep,all of you,all,73633,+


In [26]:
# Distinct second-person-plural noun candidates; list order is arbitrary (set).
noun_second_plural_person_list = list(set(df_noun_second_plural_person["word"]))

In [27]:
# Third-person-plural candidates: the attached part is exactly +leri/+ları,
# or the word itself ends in one of those endings (purely textual test).
df_noun_third_plural_person = df_noun[
    df_noun["prefix_suffix"].isin(["+leri", "+ları"])
    | df_noun["word"].str.contains(r"(?:leri|ları)$", na=False)
]
df_noun_third_plural_person

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
87,şey,NOUN,,şeyleri,şey,şey,things,thing,137646,+leri
279,şeyler,NOUN,,şeyleri,şey,şey,things,thing,137646,+i
557,çocuk,NOUN,,çocukları,çocuk,çocuk,children,child,78970,+ları
592,çocuklar,NOUN,,çocukları,çocuk,çocuk,children,child,78970,+ı
685,insan,NOUN,,insanları,insan,insan,people,human,91961,+ları
707,insanlar,NOUN,,insanları,insan,insan,people,human,91961,+ı
1032,şeyleri,NOUN,,şeyleri,şey,şey,things,thing,137646,+
1322,insanları,NOUN,,insanları,insan,insan,people,human,91961,+
1480,çocukları,NOUN,,çocukları,çocuk,çocuk,children,child,78970,+


In [28]:
# Distinct third-person-plural noun candidates; list order is arbitrary (set).
noun_third_plural_person_list = list(set(df_noun_third_plural_person["word"]))

##### Noun Case Suffixes (Place and Direction)

In [30]:
# "at / in / on" candidates: the attached part is exactly +de/+da/+te/+ta,
# or the word itself ends in one of those endings (purely textual test).
df_noun_at_in_on = df_noun[
    df_noun["prefix_suffix"].isin(["+de", "+da", "+te", "+ta"])
    | df_noun["word"].str.contains(r"(?:de|da|te|ta)$", na=False)
]
df_noun_at_in_on

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
54,ben,NOUN,,bende,ben,ben,me,i,175992,+de
132,zaman,NOUN,TIME,zamanda,zaman,zaman,at the time,time,127700,+da
398,hakkında,NOUN,,hakkında,hakkında,hak,about,about,519488,+
420,dur,NOUN,,durumda,durum,dur,in the case,situation,147099,+umda
502,şekilde,NOUN,,şekilde,şekil,şekil,like that,shape,415598,+
541,yer,NOUN,,yerde,yer,yer,on the ground,location,274936,+de
544,yer,NOUN,,yerinde,yer,yer,in place,location,92140,+inde
584,an,NOUN,,anda,an,an,at the moment,moment,325568,+da
599,anda,NOUN,,anda,an,an,at the moment,moment,325568,+
616,zor,NOUN,,zorunda,zor,zor,must,hard,275011,+unda


In [31]:
# Distinct "at / in / on" noun candidates; list order is arbitrary (set).
noun_at_in_on_list = list(set(df_noun_at_in_on["word"]))  # locative suffix

In [32]:
# "from" candidates: the attached part is exactly +den/+dan/+ten/+tan,
# or the word itself ends in one of those endings (purely textual test).
df_noun_from = df_noun[
    df_noun["prefix_suffix"].isin(["+den", "+dan", "+ten", "+tan"])
    | df_noun["word"].str.contains(r"(?:den|dan|ten|tan)$", na=False)
]
df_noun_from

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
46,o,NOUN,,onlardan,onlar,o,from them,they,82961,+nlardan
63,çok,NOUN,,çoktan,çok,çok,already,lots,82798,+tan
88,şey,NOUN,,şeyden,şey,şey,out of what,thing,83839,+den
129,neden,NOUN,Q,neden,neden,neden,why,why,1866089,+
178,oldu,NOUN,,olduğundan,ol,ol,because,be,82671,+ğundan
326,ol,NOUN,NEG,olmadan,ol,ol,without,be,137285,+madan
341,ol,NOUN,,olduğundan,ol,ol,because,be,82671,+duğundan
386,yüzden,NOUN,,yüzden,yüz,yüz,because,face,529452,+
515,onlar,NOUN,,onlardan,onlar,o,from them,they,82961,+dan
582,olduğu,NOUN,,olduğundan,ol,ol,because,be,82671,+ndan


In [None]:
# Distinct "from" noun candidates; list order is arbitrary (set).
noun_from_list = list(set(df_noun_from["word"]))  # ablative suffix

In [33]:
# "to" candidates: the attached part is exactly +e/+a, or the word itself ends
# in "e" or "a" (purely textual test, so it is very permissive).
df_noun_to = df_noun[
    df_noun["prefix_suffix"].isin(["+e", "+a"])
    | df_noun["word"].str.contains(r"(?:e|a)$", na=False)
]
df_noun_to

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
7,bir,NOUN,,birine,biri,bir,to someone,somebody,111455,+ine
9,bir,NOUN,,biriyle,biri,bir,with someone,somebody,82104,+iyle
25,ne,NOUN,Q,neye,ne,ne,to what,what,98750,+ye
54,ben,NOUN,,bende,ben,ben,me,i,175992,+de
86,şey,NOUN,,şeye,şey,şey,to what,thing,148073,+e
...,...,...,...,...,...,...,...,...,...,...
1594,dava,NOUN,,dava,dava,dava,case,case,69269,+
1598,anlamına,NOUN,,anlamına,anlam,anla,meaning,meaning,69099,+
1599,dua,NOUN,,dua,dua,dua,prayer,prayer,68974,+
1604,başıma,NOUN,,başıma,baş,baş,to me,top,68674,+


In [None]:
# Distinct "to" noun candidates, taken from df_noun_to (the list was previously
# built from df_noun_from by mistake); list order is arbitrary (set).
noun_to_list = list(set(df_noun_to["word"]))  # dative suffix

In [34]:
# Object-marking candidates: the attached part is exactly +ı/+i/+u/+ü, or the
# word itself ends in one of those vowels (purely textual test).
# NOTE(review): these are the same conditions as df_noun_third_person above.
df_noun_sign = df_noun[
    df_noun["prefix_suffix"].isin(["+ı", "+i", "+u", "+ü"])
    | df_noun["word"].str.contains(r"(?:ı|i|u|ü)$", na=False)
]
df_noun_sign

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
4,bir,NOUN,,birini,biri,bir,one,somebody,259916,+ini
5,bir,NOUN,,birisi,biri,bir,someone,somebody,182370,+isi
22,ne,NOUN,Q,neyi,ne,ne,what,what,124260,+yi
23,ne,NOUN,Q,neydi,ne,ne,what was it,what,123671,+ydi
83,şey,NOUN,,şeyi,şey,şey,thing,thing,683786,+i
...,...,...,...,...,...,...,...,...,...,...
1593,verici,NOUN,,verici,verici,ver,transmitter,transmitter,69281,+
1597,olayı,NOUN,,olayı,olay,olay,incident,event,69118,+
1602,herkesi,NOUN,,herkesi,herkes,herkes,everyone,everyone,68852,+
1616,önceki,NOUN,,önceki,önce,önce,previous,before,68345,+


In [None]:
# Distinct object-marking noun candidates; list order is arbitrary (set).
noun_sign_list = list(set(df_noun_sign["word"]))  # accusative suffix

#### Verb

In [94]:
# Rows tagged as verbs; the index labels of df_prefix_suffix_select are kept.
df_verb = df_prefix_suffix_select.loc[df_prefix_suffix_select["POS1"].eq("VERB")]
df_verb

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
56,de,VERB,,demek,de,de,means,say,753378,+mek
57,de,VERB,,dedim,de,de,i said,say,267039,+dim
58,de,VERB,,dedi,de,de,said,also,182523,+di
59,de,VERB,,dersin,de,de,you say,say,151593,+rsin
60,de,VERB,,dedin,de,de,you said,also,121622,+din
...,...,...,...,...,...,...,...,...,...,...
1613,bul,VERB,,bulduk,bul,bul,we found,find,77535,+duk
1614,bul,VERB,,bul,bul,bul,find,find,68524,+
1615,edeyim,VERB,,edeyim,et,et,let me,do,68506,+
1619,veriyor,VERB,,veriyorum,ver,ver,i give,give,126290,+um


##### Verb Negation

In [105]:
# Negation candidates: the attached part is exactly +ma/+me, or the word itself
# ends in "ma"/"me". A word that is exactly "ma" or "me" also ends in it, so
# the separate whole-word patterns were redundant and are folded in here.
df_verb_not_mean = df_verb[
    df_verb["prefix_suffix"].isin(["+ma", "+me"])
    | df_verb["word"].str.contains(r"(?:ma|me)$", na=False)
]
df_verb_not_mean

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
194,bak,VERB,NEG,bakma,bak,bak,do not look,look,100165,+ma
270,et,VERB,NEG,etme,et,et,don't,do,218533,+me
342,ol,VERB,NEG,olma,ol,ol,don't be,die,79121,+ma
642,yapma,VERB,NEG,yapma,yap,yap,don't do that,do it,283114,+
725,etme,VERB,NEG,etme,et,et,don't,do,218533,+
751,yap,VERB,NEG,yapma,yap,yap,don't do that,do it,283114,+ma
1244,bakma,VERB,NEG,bakma,bak,bak,do not look,look,100165,+
1393,unutma,VERB,NEG,unutma,unut,unut,don't forget,forget,85318,+
1396,konuşma,VERB,,konuşma,konuş,konuş,don't talk,talk,85079,+
1476,olma,VERB,NEG,olma,ol,ol,don't be,die,79121,+


In [106]:
# Distinct negation verb candidates; list order is arbitrary (set).
verb_not_mean_list = list(set(df_verb_not_mean["word"]))

##### Verb Persons

In [108]:
# First-person-singular candidates: the attached part is exactly +ım/+im/+um/+üm,
# or the word itself ends in one of those endings (purely textual test).
df_verb_first_person = df_verb[
    df_verb["prefix_suffix"].isin(["+ım", "+im", "+um", "+üm"])
    | df_verb["word"].str.contains(r"(?:ım|im|um|üm)$", na=False)
]
df_verb_first_person

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
57,de,VERB,,dedim,de,de,i said,say,267039,+dim
76,değil,VERB,NEG,değilim,değil,değil,i am not,not,448632,+im
119,iyi,VERB,,iyiyim,iyi,iyi,i'm fine,good,125245,+yim
140,burada,VERB,,buradayım,bura,bura,i am here,here,100375,+yım
174,oldu,VERB,,oldum,ol,ol,became,be,175870,+m
...,...,...,...,...,...,...,...,...,...,...
1578,getirdim,VERB,,getirdim,getir,getir,i brought,bring,70790,+
1600,istemedim,VERB,NEG,istemedim,iste,iste,i didn't want,want,68907,+
1610,bul,VERB,,buldum,bul,bul,i found,find,179420,+dum
1615,edeyim,VERB,,edeyim,et,et,let me,do,68506,+


In [109]:
# Distinct first-person-singular verb candidates; list order is arbitrary (set).
verb_first_person_list = list(set(df_verb_first_person["word"]))

In [110]:
# Second-person-singular candidates: the attached part is exactly +ın/+in/+un/+ün,
# or the word itself ends in one of those endings (purely textual test).
df_verb_second_person = df_verb[
    df_verb["prefix_suffix"].isin(["+ın", "+in", "+un", "+ün"])
    | df_verb["word"].str.contains(r"(?:ın|in|un|ün)$", na=False)
]
df_verb_second_person

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
59,de,VERB,,dersin,de,de,you say,say,151593,+rsin
60,de,VERB,,dedin,de,de,you said,also,121622,+din
78,değil,VERB,NEG,değilsin,değil,değil,you are not,not,215503,+sin
98,sen,VERB,,sensin,sen,sen,you are,you,123984,+sin
179,oldu,VERB,,oldun,ol,ol,you became,be,70121,+n
...,...,...,...,...,...,...,...,...,...,...
1516,affedersin,VERB,GRE,affedersin,affet,af,you forgive,forgive,75477,+
1579,dinleyin,VERB,,dinleyin,dinle,dinle,listen,listen,70632,+
1583,oldun,VERB,,oldun,ol,ol,you became,be,70121,+
1596,çekilin,VERB,,çekilin,çek,çek,withdraw,check,69201,+


In [111]:
# Distinct second-person-singular verb candidates; list order is arbitrary (set).
verb_second_person_list = list(set(df_verb_second_person["word"]))

In [112]:
# Third-person-singular candidates: the attached part is exactly +ı/+i/+u/+ü,
# or the word itself ends in one of those vowels (purely textual test).
df_verb_third_person = df_verb[
    df_verb["prefix_suffix"].isin(["+ı", "+i", "+u", "+ü"])
    | df_verb["word"].str.contains(r"(?:ı|i|u|ü)$", na=False)
]
df_verb_third_person

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
58,de,VERB,,dedi,de,de,said,also,182523,+di
66,var,VERB,,vardı,var,var,there was,there is,451341,+dı
77,değil,VERB,NEG,değildi,değil,değil,it wasn't,not,235756,+di
103,yok,VERB,NEG,yoktu,yok,yok,there was not,there is not,181801,+tu
170,oldu,VERB,,oldu,ol,ol,happened,be,1141161,+
...,...,...,...,...,...,...,...,...,...,...
1463,gerekiyordu,VERB,,gerekiyordu,gerek,gerek,was necessary,necessary,79130,+
1466,olma,VERB,,olmalı,ol,ol,must be,be,350916,+lı
1471,olma,VERB,NEG,olmadı,ol,ol,it didn't happen,be,125635,+dı
1474,olma,VERB,,olmasını,ol,ol,being,be,93571,+sını


In [113]:
# Distinct third-person-singular verb candidates; list order is arbitrary (set).
verb_third_person_list = list(set(df_verb_third_person["word"]))

In [114]:
# First-person-plural candidates: the attached part is exactly +mız/+miz/+muz/+müz,
# or the word itself ends in one of those endings (purely textual test).
df_verb_first_plural_person = df_verb[
    df_verb["prefix_suffix"].isin(["+mız", "+miz", "+muz", "+müz"])
    | df_verb["word"].str.contains(r"(?:mız|miz|muz|müz)$", na=False)
]
df_verb_first_plural_person

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix


In [116]:
# Distinct first-person-plural verb candidates; list order is arbitrary (set).
verb_first_plural_person_list = list(set(df_verb_first_plural_person["word"]))

In [115]:
# Second-person-plural candidates: the attached part is exactly +nız/+niz/+nuz/+nüz,
# or the word itself ends in one of those endings (purely textual test).
df_verb_second_plural_person = df_verb[
    df_verb["prefix_suffix"].isin(["+nız", "+niz", "+nuz", "+nüz"])
    | df_verb["word"].str.contains(r"(?:nız|niz|nuz|nüz)$", na=False)
]
df_verb_second_plural_person

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
365,gel,VERB,,geldiniz,gel,gel,you come,come,87109,+diniz
434,geldi,VERB,,geldiniz,gel,gel,you come,come,87109,+niz
510,istiyorsun,VERB,,istiyorsunuz,iste,iste,you want,want,71208,+uz
628,istiyor,VERB,,istiyorsunuz,iste,iste,you want,want,71208,+sunuz
998,geldin,VERB,,geldiniz,gel,gel,you come,come,87109,+iz
1146,affedersiniz,VERB,GRE,affedersiniz,affet,af,you forgive,forgive,115746,+
1371,geldiniz,VERB,,geldiniz,gel,gel,you come,come,87109,+
1515,affedersin,VERB,GRE,affedersiniz,affet,af,you forgive,forgive,115746,+iz
1575,istiyorsunuz,VERB,,istiyorsunuz,iste,iste,you want,want,71208,+


In [117]:
# Distinct second-person-plural verb candidates; list order is arbitrary (set).
verb_second_plural_person_list = list(set(df_verb_second_plural_person["word"]))

In [118]:
# Third-person-plural candidates: the attached part is exactly +ları/+leri,
# or the word itself ends in one of those endings (purely textual test).
df_verb_third_plural_person = df_verb[
    df_verb["prefix_suffix"].isin(["+ları", "+leri"])
    | df_verb["word"].str.contains(r"(?:leri|ları)$", na=False)
]
df_verb_third_plural_person

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix


In [119]:
# Distinct third-person-plural verb candidates; list order is arbitrary (set).
verb_third_plural_person_list = list(set(df_verb_third_plural_person["word"]))

##### Verb Tenses

In [126]:
# Present-tense candidates: the attached part ends in "yor", or the word ends in
# "yor", "mekte", "makta", "mada" or "mede". A word that is exactly one of
# those endings also ends in it, so the separate whole-word patterns were
# redundant and are folded into the single ending test.
df_verb_present_time = df_verb[
    df_verb["prefix_suffix"].str.contains(r"yor$", na=False)
    | df_verb["word"].str.contains(r"(?:yor|mekte|makta|mada|mede)$", na=False)
]
df_verb_present_time

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
282,biliyor,VERB,,biliyor,bil,bil,he knows,know,641399,+
295,gerek,VERB,,gerekiyor,gerek,gerek,needed,necessary,324783,+iyor
309,ol,VERB,,oluyor,ol,ol,is happening,be,503064,+uyor
357,gel,VERB,,geliyor,gel,gel,coming,come,416067,+iyor
385,ver,VERB,,veriyor,ver,ver,giving,give,68163,+iyor
405,oluyor,VERB,,oluyor,ol,ol,is happening,be,503064,+
499,geliyor,VERB,,geliyor,gel,gel,coming,come,416067,+
602,gerekiyor,VERB,,gerekiyor,gerek,gerek,needed,necessary,324783,+
625,istiyor,VERB,,istiyor,iste,iste,wants,want,304739,+
662,ediyor,VERB,,ediyor,et,et,doing,do,269847,+


In [None]:
# Distinct present-tense verb candidates; list order is arbitrary (set).
verb_present_time_list = list(set(df_verb_present_time["word"]))

In [130]:
# Future-tense candidates: the attached part or the word ends in "ecek"/"acak",
# or is exactly "eceğ"/"acağ". The whole-value "ecek"/"acak" patterns are
# already covered by the ending test and are folded into it.
# NOTE(review): the "eceğ"/"acağ" alternative only matches a value that is
# exactly that text, so softened forms with a further ending after "eceğ"/"acağ"
# are not selected - confirm whether that is intended.
future_pattern = r"(?:ecek|acak)$|^(?:eceğ|acağ)$"
df_verb_future_time = df_verb[
    df_verb["prefix_suffix"].str.contains(future_pattern, na=False)
    | df_verb["word"].str.contains(future_pattern, na=False)
]
df_verb_future_time

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
227,olacak,VERB,,olacak,ol,ol,will be,be,786194,+
305,ol,VERB,,olacak,ol,ol,will be,be,786194,+acak
328,ol,VERB,NEG,olmayacak,ol,ol,will not be,be,132589,+mayacak
761,yap,VERB,,yapacak,yap,yap,will make,do it,115620,+acak
1063,olmayacak,VERB,NEG,olmayacak,ol,ol,will not be,be,132589,+
1089,edecek,VERB,,edecek,et,et,will do,do,125925,+
1147,yapacak,VERB,,yapacak,yap,yap,will make,do it,115620,+
1388,olmaya,VERB,NEG,olmayacak,ol,ol,will not be,be,132589,+cak
1470,olma,VERB,NEG,olmayacak,ol,ol,will not be,be,132589,+yacak
1591,gidecek,VERB,,gidecek,git,git,will go,go,69388,+


In [None]:
# Distinct future-tense verb candidates; list order is arbitrary (set).
verb_future_time_list = list(set(df_verb_future_time["word"]))

In [137]:
# Past-tense verb forms: keep rows whose attached suffix or whose full word ends
# in one of the endings -mış/-miş/-muş/-müş, -dı/-di/-du/-dü or -tı/-ti/-tu/-tü.
# The same pattern is applied to both columns, which removes the copy-paste slip
# where one word-level check read "prefix_suffix", and every test now passes
# na=False so missing values never produce NaN in the mask.
# The former fully anchored variants ("^mış$", ...) were already covered by the
# end-anchored ones, so the selected rows are unchanged.
past_tense_regex = r"(?:mış|miş|muş|müş|dı|di|du|dü|tı|ti|tu|tü)$"
df_verb_past_time = df_verb[
    df_verb["prefix_suffix"].str.contains(past_tense_regex, na=False)
    | df_verb["word"].str.contains(past_tense_regex, na=False)
]
df_verb_past_time

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
58,de,VERB,,dedi,de,de,said,also,182523,+di
66,var,VERB,,vardı,var,var,there was,there is,451341,+dı
69,var,VERB,,varmış,var,var,there was,there is,118304,+mış
77,değil,VERB,NEG,değildi,değil,değil,it wasn't,not,235756,+di
103,yok,VERB,NEG,yoktu,yok,yok,there was not,there is not,181801,+tu
170,oldu,VERB,,oldu,ol,ol,happened,be,1141161,+
219,olur,VERB,,olurdu,ol,ol,would be,be,124365,+du
269,et,VERB,,etti,et,et,he,meat,245330,+ti
299,gerek,VERB,,gerekiyordu,gerek,gerek,was necessary,necessary,79130,+iyordu
301,ol,VERB,,oldu,ol,ol,happened,be,1141161,+du


In [139]:
# Distinct word forms matched by the past-tense filter (set() de-duplicates; the resulting order is arbitrary).
verb_past_time_list = list(set(df_verb_past_time["word"]))

#### Adjective Adverb Pronouns

In [140]:
# List the distinct POS1 tags present in the selected rows.
df_prefix_suffix_select["POS1"].unique()

array(['NUM', 'ADV', 'PRON', 'NOUN', 'CCONJ', 'ADP', 'ADJ', 'AUX', 'VERB'],
      dtype=object)

In [35]:
# Restrict the selected rows to adjectives, adverbs and pronouns (by POS1 tag).
adj_adv_pron_tags = ["ADJ", "ADV", "PRON"]
df_adj_adv_prons = df_prefix_suffix_select[df_prefix_suffix_select["POS1"].isin(adj_adv_pron_tags)]
df_adj_adv_prons

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
1,bir,ADV,,biraz,biraz,bir,a little,a little,1269641,+az
2,bir,PRON,,biri,biri,bir,somebody,somebody,837400,+i
3,bir,ADV,,birlikte,birlikte,bir,together,together,409940,+likte
8,bir,ADV,,birden,birden,bir,suddenly,suddenly,90124,+den
10,bu,PRON,,bu,bu,bu,this,this,11062659,+
...,...,...,...,...,...,...,...,...,...,...
1585,sık,ADJ,,sıkı,sıkı,sık,tight,tight,82533,+ı
1586,sık,ADV,,sık,sık,sık,often,often,69819,+
1587,arka,ADJ,,arka,arka,arka,back,back,69809,+
1618,resmi,ADJ,,resmi,resmi,resmi,formal,formal,68287,+


In [36]:
# "at/in/on" candidates: rows whose attached suffix is exactly +de/+da/+te/+ta,
# or whose full word ends in de/da/te/ta.
at_in_on_suffix_mask = df_adj_adv_prons["prefix_suffix"].isin(["+de", "+da", "+te", "+ta"])
at_in_on_word_mask = (
    df_adj_adv_prons["word"].str.contains(fr"de(?:$)", na=False)
    | df_adj_adv_prons["word"].str.contains(fr"da(?:$)", na=False)
    | df_adj_adv_prons["word"].str.contains(fr"te(?:$)", na=False)
    | df_adj_adv_prons["word"].str.contains(fr"ta(?:$)", na=False)
)
df_adj_adv_prons_at_in_on = df_adj_adv_prons[at_in_on_suffix_mask | at_in_on_word_mask]
df_adj_adv_prons_at_in_on

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
3,bir,ADV,,birlikte,birlikte,bir,together,together,409940,+likte
28,için,ADJ,,içinde,iç,iç,inside,in,443948,+de
97,sen,PRON,,sende,sen,sen,at you,joyful,145056,+de
138,burada,PRON,,burada,bura,bura,here,here,1779610,+
242,nerede,PRON,Q,nerede,nere,nere,where,where,723529,+
247,orada,PRON,,orada,ora,ora,there,there,704870,+
263,son,ADV,TIME,sonunda,sonunda,son,finally,finally,210471,+unda
280,işte,ADV,GRE,işte,işte,işte,there it is,there it is,646422,+
450,içinde,ADJ,,içinde,iç,iç,inside,in,443948,+
493,aslında,ADV,,aslında,aslında,asıl,actually,actually,419145,+


In [37]:
# Distinct word forms from the "at/in/on" selection (set() de-duplicates; the resulting order is arbitrary).
adj_adv_prons_at_in_on_list = list(set(df_adj_adv_prons_at_in_on["word"]))  # locative suffix (Turkish: "bulunma eki")

In [38]:
# "from" candidates: rows whose attached suffix is exactly +den/+dan/+ten/+tan,
# or whose full word ends in den/dan/ten/tan.
from_suffix_mask = df_adj_adv_prons["prefix_suffix"].isin(["+den", "+dan", "+ten", "+tan"])
from_word_mask = (
    df_adj_adv_prons["word"].str.contains(fr"den(?:$)", na=False)
    | df_adj_adv_prons["word"].str.contains(fr"dan(?:$)", na=False)
    | df_adj_adv_prons["word"].str.contains(fr"ten(?:$)", na=False)
    | df_adj_adv_prons["word"].str.contains(fr"tan(?:$)", na=False)
)
df_adj_adv_prons_from = df_adj_adv_prons[from_suffix_mask | from_word_mask]
df_adj_adv_prons_from

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
8,bir,ADV,,birden,birden,bir,suddenly,suddenly,90124,+den
15,bu,PRON,,bundan,bu,bu,from this,this,292873,+ndan
44,o,PRON,,ondan,o,o,from him,he,252864,+ndan
53,ben,PRON,,benden,ben,ben,from me,i,323857,+den
95,sen,PRON,,senden,sen,sen,from you,you,279734,+den
139,burada,PRON,,buradan,bura,bura,from here,here,387244,+n
202,gerçekten,ADV,,gerçekten,gerçekten,gerçek,really,really,927796,+
240,biz,PRON,,bizden,biz,biz,us,we,83554,+den
243,nerede,PRON,Q,nereden,nere,nere,from where,where,262992,+n
248,orada,PRON,,oradan,ora,ora,from there,there is,113223,+n


In [39]:
# Distinct word forms from the "from" selection (set() de-duplicates; the resulting order is arbitrary).
adj_adv_prons_from_list = list(set(df_adj_adv_prons_from["word"]))  # ablative suffix (Turkish: "ayrılma eki")

In [40]:
# "to" candidates: rows whose attached suffix is exactly +e/+a, or whose full
# word ends in e/a.
# NOTE(review): matching every word that ends in e/a also keeps uninflected
# words (the displayed result contains "ne", "kara" and "arka") — confirm that
# this breadth is intended.
to_suffix_mask = df_adj_adv_prons["prefix_suffix"].isin(["+e", "+a"])
to_word_mask = (
    df_adj_adv_prons["word"].str.contains(fr"e(?:$)", na=False)
    | df_adj_adv_prons["word"].str.contains(fr"a(?:$)", na=False)
)
df_adj_adv_prons_to = df_adj_adv_prons[to_suffix_mask | to_word_mask]
df_adj_adv_prons_to

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
3,bir,ADV,,birlikte,birlikte,bir,together,together,409940,+likte
13,bu,PRON,,buna,bu,bu,this,this,487789,+na
17,bu,PRON,,bununla,bu,bu,with this,this,101648,+nunla
19,ne,PRON,Q,ne,ne,ne,what,what,8025880,+
28,için,ADJ,,içinde,iç,iç,inside,in,443948,+de
...,...,...,...,...,...,...,...,...,...,...
1522,kara,ADJ,,kara,kara,kara,black,black,75039,+
1561,iç,ADJ,,içinde,iç,iç,inside,in,443948,+inde
1563,iç,ADJ,,içine,iç,iç,into,inner,121050,+ine
1587,arka,ADJ,,arka,arka,arka,back,back,69809,+


In [None]:
# Distinct word forms from the "to" selection (set() de-duplicates; the resulting order is arbitrary).
# Reads df_adj_adv_prons_to; the previous version read the "from" selection by mistake.
adj_adv_prons_to_list = list(set(df_adj_adv_prons_to["word"]))  # dative suffix (Turkish: "yönelme eki")

In [41]:
# Rows whose attached suffix is exactly +ı/+i/+u/+ü, or whose full word ends in
# one of those vowels.
sign_suffix_mask = df_adj_adv_prons["prefix_suffix"].isin(["+ı", "+i", "+u", "+ü"])
sign_word_mask = (
    df_adj_adv_prons["word"].str.contains(fr"ı(?:$)", na=False)
    | df_adj_adv_prons["word"].str.contains(fr"i(?:$)", na=False)
    | df_adj_adv_prons["word"].str.contains(fr"u(?:$)", na=False)
    | df_adj_adv_prons["word"].str.contains(fr"ü(?:$)", na=False)
)
df_adj_adv_prons_sign = df_adj_adv_prons[sign_suffix_mask | sign_word_mask]
df_adj_adv_prons_sign

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
2,bir,PRON,,biri,biri,bir,somebody,somebody,837400,+i
10,bu,PRON,,bu,bu,bu,this,this,11062659,+
11,bu,PRON,,bunu,bu,bu,this,this,2445337,+nu
16,bu,PRON,,bunları,bu,bu,these,this,204443,+nları
37,o,PRON,,onu,o,o,him-her-it,he,2486889,+nu
...,...,...,...,...,...,...,...,...,...,...
1456,çoğu,ADJ,,çoğu,çoğu,çoğu,most,most,79418,+
1507,orası,PRON,,orası,ora,ora,there,ora,75957,+
1532,kendim,PRON,,kendimi,kendi,kendi,myself,own,203298,+i
1585,sık,ADJ,,sıkı,sıkı,sık,tight,tight,82533,+ı


In [42]:
# Distinct word forms from the +ı/+i/+u/+ü selection (set() de-duplicates; the resulting order is arbitrary).
adj_adv_prons_sign_list = list(set(df_adj_adv_prons_sign["word"]))  # accusative suffix (Turkish: "belirtme eki")

In [23]:
# Show the rows whose attached suffix is exactly one of the twelve listed past-tense endings.
past_suffix_exact = [
    "+mış", "+miş", "+muş", "+müş",
    "+dı", "+di", "+du", "+dü",
    "+tı", "+ti", "+tu", "+tü",
]
df_prefix_suffix[df_prefix_suffix["prefix_suffix"].isin(past_suffix_exact)]

Unnamed: 0,search_word,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,prefix_suffix
160,de,VERB,,dedi,de,de,said,also,182523,+di
175,var,VERB,,vardı,var,var,there was,there is,451341,+dı
178,var,VERB,,varmış,var,var,there was,there is,118304,+mış
186,değil,VERB,NEG,değildi,değil,değil,it wasn't,not,235756,+di
220,yok,VERB,NEG,yoktu,yok,yok,there was not,there is not,181801,+tu
403,olur,VERB,,olurdu,ol,ol,would be,be,124365,+du
457,et,VERB,,etti,et,et,he,meat,245330,+ti
492,ol,VERB,,oldu,ol,ol,happened,be,1141161,+du
505,ol,VERB,,olmuş,ol,ol,it is ok,be,213607,+muş
549,gel,VERB,,geldi,gel,gel,came,come,464529,+di


In [20]:
# Number of distinct search words in the prefix/suffix table.
df_prefix_suffix["search_word"].nunique()

283

In [21]:
# Number of distinct word forms in the prefix/suffix table.
df_prefix_suffix["word"].nunique()

283

##### Copy Move And Delete

In [39]:
# Collect the result files in the working directory that match the pattern below.
# NOTE(review): the recorded output lists
# 'Turkish_English_5000_Word_Prefix_Suffix_Custom_Result.xlsx', which this pattern
# cannot match because the file name does not start with "Prefix_Suffix_" —
# confirm whether a leading "*" is missing.
output_file1 = glob.glob("Prefix_Suffix_*Result.*")
output_file1

['Turkish_English_5000_Word_Prefix_Suffix_Custom_Result.xlsx']

In [40]:
# Copy every matched result file to `path` (copy2 also preserves file metadata).
for result_file in output_file1:
    shutil.copy2(result_file, path)

In [41]:
# Delete the local copies of the result files listed in output_file1.
for i in output_file1:
    try:
        os.remove(i)
    except OSError as err:
        # Cleanup stays best-effort, but a failed delete is reported instead of
        # being silently swallowed by a bare except.
        print(f"Could not remove {i}: {err}")

In [42]:
# Collect the remaining prefix/suffix output files for this language pair from the working directory.
# NOTE(review): lang_pair is not assigned in the configuration cell at the top of
# the notebook (it is assigned in a later section) — confirm it is defined before
# this cell runs on a fresh kernel.
output_file2 = glob.glob(f"{lang_folder.capitalize()}_{lang_pair.capitalize()}_*_Prefix_Suffix_*.*")
output_file2

['Turkish_English_5000_Word_Prefix_Suffix_All.csv',
 'Turkish_English_5000_Word_Prefix_Suffix_Select.xlsx']

In [43]:
# Copy the remaining output files into the Web Scrapping data folder for this language pair.
pair_data_dir = f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Data/{lang_folder.capitalize()}/{lang_folder.capitalize()} {lang_pair.capitalize()}"
for output_name in output_file2:
    shutil.copy2(output_name, pair_data_dir)

In [44]:
# Delete the local copies of the output files listed in output_file2.
for j in output_file2:
    try:
        os.remove(j)
    except OSError as err:
        # Cleanup stays best-effort, but a failed delete is reported instead of
        # being silently swallowed by a bare except.
        print(f"Could not remove {j}: {err}")

### Concat Native And Etymology Prefix Suffix Result

In [3]:
import os
import pandas as pd
import numpy as np
import glob

In [4]:
# Language pair used to build the input and output file paths below
lang_folder = "Turkish"  # target language for the learner: Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian
lang_pair = "English"  # learner's native language: Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian, Intersect

# Number of native words; it is interpolated into the result file names
file_ext = 1000  # native word count

In [5]:
def lower_strip(x):
    """Return ``x`` lower-cased with surrounding whitespace removed.

    Values that do not offer string methods (for example NaN or numbers read
    from a spreadsheet cell) are returned unchanged.

    NOTE(review): ``str.lower()`` is locale-independent, so the Turkish
    dotted/dotless "I" is not handled specially ("I" becomes "i", and "İ"
    becomes "i" followed by a combining dot) — confirm this is acceptable for
    the Turkish word lists.
    """
    try:
        return x.lower().strip()
    except (AttributeError, TypeError):
        # Non-string input: pass it through. Only these two errors are caught so
        # that KeyboardInterrupt/SystemExit are no longer swallowed.
        return x

In [6]:
# Locate the manually curated native-word result workbook for this language pair and word count.
native_file = glob.glob(
    f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/"
    f"{lang_folder.capitalize()}/{lang_folder.capitalize()} {lang_pair.capitalize()}/"
    f"{lang_folder.capitalize()}_{lang_pair.capitalize()}_{file_ext}"
    f"_Word_Prefix_Suffix_Custom_Result_Manuel.xlsx"
)
native_file

['/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/Turkish English/Turkish_English_200_Word_Prefix_Suffix_Custom_Result_Manuel.xlsx']

In [7]:
# Read the first matched native workbook and keep only the search word and word columns.
df_native = pd.read_excel(native_file[0]).loc[:, ["search_word", "word"]]
df_native

Unnamed: 0,search_word,word
0,adam,adam
1,adam,adama
2,adam,adamdan
3,adam,adamdı
4,adam,adamdır
...,...,...
1618,şey,şeyler
1619,şey,şeylerden
1620,şey,şeylere
1621,şey,şeyleri


In [8]:
# Locate the shared-vocabulary (etymology) result workbook for this language pair.
etymology_file = glob.glob(
    f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/"
    f"{lang_folder.capitalize()}/{lang_folder.capitalize()} {lang_pair.capitalize()}/"
    f"{lang_folder.capitalize()}_{lang_pair.capitalize()}"
    f"_Shared_Word_Prefix_Suffix_Custom_Result.xlsx"
)
etymology_file

['/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/Turkish English/Turkish_English_Shared_Word_Prefix_Suffix_Custom_Result.xlsx']

In [9]:
# Read the first matched etymology workbook and keep only the search word and word columns.
df_etmology = pd.read_excel(etymology_file[0]).loc[:, ["search_word", "word"]]
df_etmology

Unnamed: 0,search_word,word
0,abaküs,abaküs
1,abandone,abandoned
2,abanoz,abanoz
3,abdomen,abdomende
4,abluka,abluka
...,...,...
6188,şut,şutu
6189,şut,şutunu
6190,şırınga,şırınga
6191,şırınga,şırıngayla


In [10]:
# Stack the native rows on top of the etymology rows; the original row labels are kept, so index values repeat.
frames_to_stack = [df_native, df_etmology]
df_native_etymology_concat = pd.concat(frames_to_stack)
df_native_etymology_concat

Unnamed: 0,search_word,word
0,adam,adam
1,adam,adama
2,adam,adamdan
3,adam,adamdı
4,adam,adamdır
...,...,...
6188,şut,şutu
6189,şut,şutunu
6190,şırınga,şırınga
6191,şırınga,şırıngayla


In [11]:
# Normalise both text columns with lower_strip, then drop rows that became exact duplicates and renumber the index.
for text_column in ("search_word", "word"):
    df_native_etymology_concat[text_column] = df_native_etymology_concat[text_column].apply(lower_strip)
df_native_etymology_concat = df_native_etymology_concat.drop_duplicates().reset_index(drop=True)
df_native_etymology_concat

Unnamed: 0,search_word,word
0,adam,adam
1,adam,adama
2,adam,adamdan
3,adam,adamdı
4,adam,adamdır
...,...,...
7811,şut,şutu
7812,şut,şutunu
7813,şırınga,şırınga
7814,şırınga,şırıngayla


In [12]:
# Write the combined native + shared word list next to the input workbooks, without the index column.
concat_output_file = (
    f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/"
    f"{lang_folder.capitalize()}/{lang_folder.capitalize()} {lang_pair.capitalize()}/"
    f"{lang_folder.capitalize()}_{lang_pair.capitalize()}_{file_ext}"
    f"_Native_And_Shared_Word_Prefix_Suffix_Custom_Concat.xlsx"
)
df_native_etymology_concat.to_excel(concat_output_file, index=False)

### Temp

In [None]:
# English, French, German, Spanish, Portuguese, Dutch, Italian

In [29]:
#df_pair1 = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/\
#Turkish English/Turkish_English_Shared_Vocabulary.xlsx")
#df_pair1

In [31]:
#df_pair2 = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/\
#Turkish French/Turkish_French_Shared_Vocabulary.xlsx")
#df_pair2

In [32]:
#df_pair3 = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/\
#Turkish German/Turkish_German_Shared_Vocabulary.xlsx")
#df_pair3

In [33]:
#df_pair4 = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/\
#Turkish Spanish/Turkish_Spanish_Shared_Vocabulary.xlsx")
#df_pair4

In [34]:
#df_pair5 = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/\
#Turkish Portuguese/Turkish_Portuguese_Shared_Vocabulary.xlsx")
#df_pair5

In [35]:
#df_pair6 = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/\
#Turkish Dutch/Turkish_Dutch_Shared_Vocabulary.xlsx")
#df_pair6

In [36]:
#df_pair7 = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/\
#Turkish Italian/Turkish_Italian_Shared_Vocabulary.xlsx")
#df_pair7

In [None]:
#set1 = set(df_pair1.dict_entry_main)
#set2 = set(df_pair2.dict_entry_main)
#set3 = set(df_pair3.dict_entry_main)
#set4 = set(df_pair4.dict_entry_main)
#set5 = set(df_pair5.dict_entry_main)
#set6 = set(df_pair6.dict_entry_main)
#set7 = set(df_pair7.dict_entry_main)

In [None]:
#df_ety_intersect = pd.DataFrame((((((set7.intersection(set6)).intersection(set5)).intersection(set4)).intersection(set3)).intersection(set2)).intersection(set1), columns=["dict_entry_main"])
#df_ety_intersect

In [None]:
#df_ety_intersect.to_excel("Turkish_Intersect_Shared_Vocabulary.xlsx", index=False)

In [19]:
#df_intersect = pd.read_excel("/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Web Scrapping/Result/Turkish/Turkish Intersect/Turkish_Intersect_Shared_Vocabulary.xlsx")
#df_intersect  # English, French, German, Spanish, Portuguese, Dutch, Italian

In [20]:
#df_pair1_merge = pd.merge(df_intersect,df_pair1, how="left", on="dict_entry_main")
#df_pair1_merge.drop_duplicates(inplace=True)
#df_pair1_merge

In [21]:
#df_pair2_merge = pd.merge(df_pair1_merge,df_pair2, how="left", on="dict_entry_main")
#df_pair2_merge.drop_duplicates(inplace=True)
#df_pair2_merge

In [22]:
#df_pair3_merge = pd.merge(df_pair2_merge,df_pair3, how="left", on="dict_entry_main")
#df_pair3_merge.drop_duplicates(inplace=True)
#df_pair3_merge

In [23]:
#df_pair4_merge = pd.merge(df_pair3_merge,df_pair4, how="left", on="dict_entry_main")
#df_pair4_merge.drop_duplicates(inplace=True)
#df_pair4_merge

In [24]:
#df_pair5_merge = pd.merge(df_pair4_merge,df_pair5, how="left", on="dict_entry_main")
#df_pair5_merge.drop_duplicates(inplace=True)
#df_pair5_merge

In [25]:
#df_pair6_merge = pd.merge(df_pair5_merge,df_pair6, how="left", on="dict_entry_main")
#df_pair6_merge.drop_duplicates(inplace=True)
#df_pair6_merge

In [26]:
#df_pair7_merge = pd.merge(df_pair6_merge,df_pair7, how="left", on="dict_entry_main")
#df_pair7_merge.drop_duplicates(inplace=True)
#df_pair7_merge

In [27]:
#df_pair7_merge.rename(columns={"dict_entry_main":"turkish_word_intersect"}, inplace=True)
#df_pair7_merge

In [28]:
#df_pair7_merge.to_excel("Turkish_Intersect_Shared_Vocabulary_With_Other_Languages.xlsx", index=False)