### Picking which words are valid for the crossword
The words will be saved in a CSV file, alongside any relevant information.

In [77]:
import os
import re
import numpy as np
import pandas as pd

# Every input and output file lives under ../data, resolved against the
# directory the notebook is run from.
_data_root = os.path.join(os.getcwd(), '..', 'data')
_gabra_root = os.path.join(_data_root, 'gabra')

# Inputs: korpus word frequencies plus the gabra wordforms and lexemes tables.
frequency_path = os.path.join(_data_root, 'korpus', 'korpus_frequency.csv')
wordforms_path = os.path.join(_gabra_root, 'wordforms.csv')
lessiku_path = os.path.join(_gabra_root, 'lexemes.csv')

# Output: the cleaned lexemes table written at the end of the notebook.
final_lexemes_path = os.path.join(_gabra_root, 'final', 'lexemes.csv')


In [78]:
# Word frequency counts taken from the korpus.
df_frequency = pd.read_csv(frequency_path)

# Lexemes table, restricted to the columns this notebook works with.
# The lexemes file is read as UTF-32.
_lexeme_columns = ["_id", "lemma", "root", "sources", "phonetic", "pos", "glosses"]
df_lexemes = pd.read_csv(lessiku_path, encoding='utf-32')[_lexeme_columns]

In [79]:
# Keep every row whose `sources` value is not exactly the string
# "['UserFeedback']" (rows listing UserFeedback next to another source stay),
# then discard rows that have a missing value in any column.
_keep_row = df_lexemes['sources'].ne("['UserFeedback']")
df_lexemes = df_lexemes.loc[_keep_row].dropna()

##### Parsing glosses and separating examples

In [80]:
def _parse_glosses(txt):
    """Split one raw ``glosses`` cell into a (glosses, examples) pair of strings.

    Every ``{...}`` group in ``txt`` is treated as one entry and split on
    commas. The first field of each entry supplies a gloss and, when
    examples are extracted at all, the second field supplies an example.
    Glosses and examples are each joined with ','.

    Examples are only extracted when the first entry has exactly two
    comma-separated fields; otherwise the examples string is ''.

    Unlike a bare ``result[0]`` / ``split(',')[1]`` lookup, this never raises
    IndexError: a cell with no ``{...}`` group yields ('', ''), and an entry
    without a second field contributes an empty example.
    """
    entries = [entry.split(',') for entry in re.findall('{(.*?)}', txt)]

    # Drop the first 10 characters and the last character of the first field
    # (presumably the "'gloss': '" key prefix and the closing quote --
    # confirm against the raw data).
    gloss = ','.join(fields[0][10:-1] for fields in entries)

    # Guard against cells without any entry before inspecting the first one.
    if not entries or len(entries[0]) != 2:
        return gloss, ''

    # Drop the first 14 characters and the last character of the second field
    # (presumably the example key prefix and the closing quote -- confirm).
    # Entries lacking a second field yield '' so positions still line up
    # with the glosses.
    example = ','.join(
        fields[1][14:-1] if len(fields) > 1 else ''
        for fields in entries
    )
    return gloss, example


glosses = []
examples = []

for txt in df_lexemes['glosses']:
    gloss, example = _parse_glosses(txt)
    glosses.append(gloss)
    examples.append(example)

df_lexemes['glosses'] = glosses
df_lexemes['examples'] = examples

# Blank out example strings shorter than 4 characters.
df_lexemes['examples'] = np.where(df_lexemes['examples'].str.len() < 4, '', df_lexemes['examples'])

In [81]:
# Display the lexemes table, now with the parsed `glosses` and `examples` columns.
df_lexemes

Unnamed: 0,_id,lemma,root,sources,phonetic,pos,glosses,examples
0,5200a366e36f237975000783,badbad,{'radicals': 'b-d-b-d'},"['Spagnol2011', 'Falzon2013']",bɐdbɐt,VERB,"to fornicate,to cough a lot",
1,5200a366e36f237975000784,tbadbad,{'radicals': 'b-d-b-d'},['Spagnol2011'],dbɐdbɐt,VERB,to act in a rough or anti-social manner,
2,5200a366e36f237975000785,bagħbas,{'radicals': 'b-għ-b-s'},"['Spagnol2011', 'Falzon2013']",bɐːbɐs,VERB,"to touch,to fondle",
3,5200a366e36f237975000786,tbagħbas,{'radicals': 'b-għ-b-s'},['Spagnol2011'],dbɐːbɐs,VERB,"to be damaged as a result of handling,to be me...",
4,5200a366e36f237975000787,bagħtar,{'radicals': 'b-għ-t-r'},['Spagnol2011'],bɐːtɐr,VERB,to wade through mud,
...,...,...,...,...,...,...,...,...
16833,569002ffdff0600b5ab1434e,mgħarwel,{'radicals': 'għ-r-w-l'},"['UserFeedback', 'DM2015']",mɐːrwɛl,ADJ,swarming,
16837,56966190189d1d6376783b94,ħaxixa,{'radicals': 'ħ-x-x'},['DM2015'],hɐʃɪʃɐ,NOUN,colloquial term for marijuana,
16846,569794e16e68020d08030c4d,kitba,{'radicals': 'k-t-b'},['DM2015'],kɪdbɐ,NOUN,"writing,the act of writing,written text,creati...",
16982,5714de59418362c403b9c3ff,xagħat,{'radicals': 'x-għ-t'},['DM2015'],ʃɐːt,NOUN,"caterpillar, ""Ix- xagħat li jfaqqas mill- bajd...",


### Parsing roots

In [82]:
# Root pattern: three hyphen-separated radicals, each one or two letters
# (a-z plus ġ, ħ, ċ, ż), optionally followed by a hyphen and up to two more
# letters for a fourth radical.
regex = re.compile('[a-zġħċż][a-zġħċż]?-[a-zġħċż][a-zġħċż]?-[a-zġħċż][a-zġħċż]?-?[a-zġħċż]?[a-zġħċż]?')

# Replace each raw root string with the list of root patterns found in it,
# e.g. "{'radicals': 'b-d-b-d'}" becomes ['b-d-b-d'].
df_lexemes['root'] = df_lexemes['root'].apply(regex.findall)
df_lexemes

Unnamed: 0,_id,lemma,root,sources,phonetic,pos,glosses,examples
0,5200a366e36f237975000783,badbad,[b-d-b-d],"['Spagnol2011', 'Falzon2013']",bɐdbɐt,VERB,"to fornicate,to cough a lot",
1,5200a366e36f237975000784,tbadbad,[b-d-b-d],['Spagnol2011'],dbɐdbɐt,VERB,to act in a rough or anti-social manner,
2,5200a366e36f237975000785,bagħbas,[b-għ-b-s],"['Spagnol2011', 'Falzon2013']",bɐːbɐs,VERB,"to touch,to fondle",
3,5200a366e36f237975000786,tbagħbas,[b-għ-b-s],['Spagnol2011'],dbɐːbɐs,VERB,"to be damaged as a result of handling,to be me...",
4,5200a366e36f237975000787,bagħtar,[b-għ-t-r],['Spagnol2011'],bɐːtɐr,VERB,to wade through mud,
...,...,...,...,...,...,...,...,...
16833,569002ffdff0600b5ab1434e,mgħarwel,[għ-r-w-l],"['UserFeedback', 'DM2015']",mɐːrwɛl,ADJ,swarming,
16837,56966190189d1d6376783b94,ħaxixa,[ħ-x-x],['DM2015'],hɐʃɪʃɐ,NOUN,colloquial term for marijuana,
16846,569794e16e68020d08030c4d,kitba,[k-t-b],['DM2015'],kɪdbɐ,NOUN,"writing,the act of writing,written text,creati...",
16982,5714de59418362c403b9c3ff,xagħat,[x-għ-t],['DM2015'],ʃɐːt,NOUN,"caterpillar, ""Ix- xagħat li jfaqqas mill- bajd...",


In [83]:
# Renumber the rows consecutively; the earlier filtering left gaps in the index.
df_lexemes = df_lexemes.reset_index(drop=True)

# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (matters on a fresh checkout).
_final_dir = os.path.dirname(final_lexemes_path)
if _final_dir:
    os.makedirs(_final_dir, exist_ok=True)

# Write the cleaned lexemes without the index column.
df_lexemes.to_csv(final_lexemes_path, index=False)