## Morphalou data: from XML to DataFrame, to .csv  
In this Notebook: Conversion from XML format (original Morphalou3 data) to .csv format. Some information from the initial data hasn't been kept in the .csv format, as it was of no use in this project.

In [1]:
import xml.etree.ElementTree as et
import pandas as pd
%matplotlib inline

### 1. Nouns

In [3]:
xtree = et.parse("data/commonNoun_Morphalou3_LMF.xml")
xroot = xtree.getroot()

dict_list = []
counter = 0

for le in xroot.iter('lexicalEntry'):
    le_dict = {}
    for lemma in le.iter('lemmatizedForm'):
        le_dict['lemma'] = lemma.find('orthography').text
        try:
            le_dict['gender'] = lemma.find('grammaticalGender').text
        except:
            pass
        le_dict['category'] = lemma.find('grammaticalCategory').text
        
    for form in le.iter('inflectedForm'):
        gn = form.find('grammaticalNumber').text
        le_dict[gn] = form.find('orthography').text
    
    dict_list.append(le_dict)
    counter += 1
    if counter % 1000 == 0:
        print('.', end='')

nouns_df = pd.DataFrame(dict_list)
nouns_df

......................................................................................................

Unnamed: 0,lemma,gender,category,invariable,singular,plural
0,100-mètres,masculine,commonNoun,100-mètres,,
1,2D,feminine,commonNoun,2D,,
2,3D,feminine,commonNoun,3D,,
3,A,masculine,commonNoun,µA,,
4,a,masculine,commonNoun,a,,
...,...,...,...,...,...,...
102220,φ,masculine,commonNoun,φ,,
102221,χ,masculine,commonNoun,χ,,
102222,ψ,masculine,commonNoun,ψ,,
102223,ω,masculine,commonNoun,ω,,


In [4]:
nouns_df.to_csv('data/csv/all_nouns.csv')

## 2. Verbs

In [9]:
xtree = et.parse("data/verb_Morphalou3_LMF.xml")
xroot = xtree.getroot()

dict_list = []
counter = 0

for le in xroot.iter('lexicalEntry'):
    le_dict = {}
    for lemma in le.iter('lemmatizedForm'):
        le_dict['lemma'] = lemma.find('orthography').text
       
        
    for form in le.iter('inflectedForm'):
        try:
            gn = form.find('grammaticalNumber').text  
        
        except:
            pass
        
        try:
            gm = form.find('grammaticalMood').text
       
        except:
            pass

        try:
            gt = form.find('grammaticalTense').text
            
        except:
            pass
        
        try:
            gp = form.find('grammaticalPerson').text
            
        except:
            pass
        try:
            gg=form.find('grammaticalGender').text
           
        except:
            pass

        try:
            le_dict[gn,gm,gt,gp,gg] = form.find('orthography').text
        except:
            pass
        
       
    
    dict_list.append(le_dict)
    counter += 1
    if counter % 1000 == 0:
        print('.', end='')

verbs_df = pd.DataFrame(dict_list)
verbs_df

..............

Unnamed: 0,lemma,"(singular, indicative, simplePast, thirdPerson, masculine)","(singular, indicative, simplePast, firstPerson, masculine)","(plural, indicative, imperfect, thirdPerson, masculine)","(singular, indicative, imperfect, secondPerson, masculine)","(singular, indicative, imperfect, thirdPerson, masculine)","(singular, participle, present, thirdPerson, masculine)","(singular, indicative, simplePast, secondPerson, masculine)","(singular, subjunctive, imperfect, firstPerson, masculine)","(plural, subjunctive, imperfect, thirdPerson, masculine)",...,"(invariable, participle, present, thirdPerson, invariable)","(plural, infinitive, imperfect, firstPerson, invariable)","(invariable, indicative, present, thirdPerson, masculine)","(invariable, infinitive, present, thirdPerson, masculine)","(plural, participle, present, secondPerson, feminine)","(singular, infinitive, imperfect, thirdPerson, feminine)","(plural, infinitive, past, firstPerson, masculine)","(invariable, infinitive, past, secondPerson, invariable)","(invariable, indicative, present, thirdPerson, feminine)","(plural, infinitive, present, secondPerson, invariable)"
0,abaisser,abaissa,abaissai,abaissaient,abaissais,abaissait,abaissant,abaissas,abaissasse,abaissassent,...,,,,,,,,,,
1,abalober,,,,,,,,,,...,,,,,,,,,,
2,abalourdir,,,,,,,,,,...,,,,,,,,,,
3,abandonner,abandonna,abandonnai,abandonnaient,abandonnais,abandonnait,abandonnant,abandonnas,abandonnasse,abandonnassent,...,,,,,,,,,,
4,abasourdir,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14757,îloter,,,,,,,,,,...,,,,,,,,,,
14758,ôter,ôta,ôtai,ôtaient,ôtais,ôtait,ôtant,ôtas,ôtasse,ôtassent,...,,,,,,,,,,
14759,œiller,,,,,,,,,,...,,,,,,,,,,
14760,œilletonner,,,,,,,,,,...,,,,,,,,,,


In [10]:
verbs_df.to_csv('data/csv/all_verbs.csv')

## 3. Adjectives

In [11]:
xtree = et.parse("data/adjective_Morphalou3_LMF.xml")
xroot = xtree.getroot()

dict_list = []
counter = 0

for le in xroot.iter('lexicalEntry'):
    le_dict = {}
    for lemma in le.iter('lemmatizedForm'):
        le_dict['lemma'] = lemma.find('orthography').text
       
        
    for form in le.iter('inflectedForm'):
        try:
            gn = form.find('grammaticalNumber').text  
        
        except:
            pass
        
        try:
            gg=form.find('grammaticalGender').text
           
        except:
            pass

        try:
            le_dict[gn,gg] = form.find('orthography').text
        except:
            pass
        
       
    
    dict_list.append(le_dict)
    counter += 1
    if counter % 1000 == 0:
        print('.', end='')

adjectives_df = pd.DataFrame(dict_list)
adjectives_df

....................................

Unnamed: 0,lemma,"(singular, masculine)","(plural, masculine)","(singular, feminine)","(plural, feminine)","(singular, invariable)","(plural, invariable)","(invariable, masculine)","(invariable, invariable)","(invariable, feminine)"
0,a-humain,a-humain,,,,,,,,
1,a-raciste,a-raciste,,,,,,,,
2,aalénien,aalénien,aaléniens,aalénienne,aaléniennes,,,,,
3,aaronide,aaronide,,,,,,,,
4,abactérien,abactérien,abactériens,abactérienne,abactériennes,,,,,
...,...,...,...,...,...,...,...,...,...,...
36518,œstrogène,,,,,œstrogène,œstrogènes,,,
36519,œstrogénique,,,,,œstrogénique,œstrogéniques,,,
36520,œstromane,,,,,œstromane,œstromanes,,,
36521,œstroprogestatif,œstroprogestatif,œstroprogestatifs,œstroprogestative,œstroprogestatives,,,,,


In [12]:
adjectives_df.to_csv("data/csv/all_adjectives.csv")

## 4. Adverbs

In [7]:
xtree = et.parse("data/adverb_Morphalou3_LMF.xml")
xroot = xtree.getroot()

dict_list = []
counter = 0

for le in xroot.iter('lexicalEntry'):
    le_dict = {}
    for lemma in le.iter('lemmatizedForm'):
        le_dict['lemma'] = lemma.find('orthography').text
       
        try:
            loc = lemma.find('locution').text  
        except:
            loc = "non"

        try:
            le_dict["locution"] = loc
        except:
            pass
        
    dict_list.append(le_dict)
    counter += 1
    if counter % 1000 == 0:
        print('.', end='')

adverbs_df = pd.DataFrame(dict_list)
adverbs_df

....

Unnamed: 0,lemma,locution
0,a contrario,oui
1,a fortiori,oui
2,a latere,oui
3,a minima,oui
4,a posteriori,oui
...,...,...
4150,évidemment,non
4151,évolutivement,non
4152,événementiellement,non
4153,ô,non


In [8]:
adverbs_df.to_csv("data/csv/all_adverbs.csv")

## 5. Grammatical Words

In [20]:
xtree = et.parse("data/grammaticalWords_Morphalou3_LMF.xml")
xroot = xtree.getroot()

dict_list = []
counter = 0

for le in xroot.iter('lexicalEntry'):
    le_dict = {}
    attributes = []
    for lemma in le.iter('lemmatizedForm'):
        le_dict["lemma"] = lemma.find('orthography').text
        
        try:
            gc = lemma.find("grammaticalCategory").text
            le_dict["grammaticalCategory"] = gc
        except:
            pass

        try:
            gsc = lemma.find("grammaticalSubCategory").text
            le_dict["grammaticalSubCategory"] = gsc
        except:
            pass

        try:
            loc = lemma.find("locution").text
            le_dict["locution"] = loc
        except:
            pass
       
    i = 1    
    for form in le.iter('inflectedForm'):
        try:
            ort = form.find('orthography').text  
            le_dict["inflectedForm_"+str(i)] = ort
            i += 1
        except:
            pass
       
    
    dict_list.append(le_dict)
    counter += 1
    if counter % 1000 == 0:
        print('.', end='')
        
gwords_df = pd.DataFrame(dict_list)
gwords_df


Unnamed: 0,lemma,grammaticalCategory,inflectedForm_1,grammaticalSubCategory,locution,inflectedForm_2,inflectedForm_3,inflectedForm_4,inflectedForm_5,inflectedForm_6,inflectedForm_7,inflectedForm_8
0,a,preposition,a,,,,,,,,,
1,afin de,conjunction,afin de,subordination,oui,,,,,,,
2,afin de,preposition,afin de,,oui,afin d',,,,,,
3,afin que,conjunction,afin que,subordination,oui,afin qu',,,,,,
4,ains,conjunction,ains,coordination,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
806,ça,pronoun,ça,demonstrative,,ç',,,,,,
807,ça,pronoun,ça,personal,,,,,,,,
808,ès,preposition,ès,,,,,,,,,
809,étant donné que,conjunction,étant donné que,subordination,oui,étant donné qu',,,,,,


In [21]:
gwords_df.to_csv("data/csv/all_grammaticalWords.csv")

## 6. Interjections

In [22]:
xtree = et.parse("data/interjection_Morphalou3_LMF.xml")
xroot = xtree.getroot()

dict_list = []
counter = 0

for le in xroot.iter('lexicalEntry'):
    le_dict = {}
    attributes = []
    for lemma in le.iter('lemmatizedForm'):
        le_dict["lemma"] = lemma.find('orthography').text
        

        try:
            loc = lemma.find("locution").text
            le_dict["locution"] = loc
        except:
            pass
       
    i = 1    
    for form in le.iter('inflectedForm'):
        try:
            ort = form.find('orthography').text  
            le_dict["inflectedForm_"+str(i)] = ort
            i += 1
        except:
            pass
       
    
    dict_list.append(le_dict)
    counter += 1
    if counter % 1000 == 0:
        print('.', end='')
        
interjections_df = pd.DataFrame(dict_list)
interjections_df


Unnamed: 0,lemma,inflectedForm_1,locution
0,acré,acré,
1,adieu,adieu,
2,aglagla,aglagla,
3,ah,ah,
4,aha,aha,
...,...,...,...
417,à plus,à plus,oui
418,çà,çà,
419,évohé,évohé,
420,évoé,évoé,


In [23]:
interjections_df.to_csv("data/csv/all_interjections.csv")

## 7. Others

In [24]:
xtree = et.parse("data/noCategory_Morphalou3_LMF.xml")
xroot = xtree.getroot()

dict_list = []
counter = 0

for le in xroot.iter('lexicalEntry'):
    le_dict = {}
    attributes = []
    for lemma in le.iter('lemmatizedForm'):
        le_dict["lemma"] = lemma.find('orthography').text
           
    dict_list.append(le_dict)
    counter += 1
    if counter % 1000 == 0:
        print('.', end='')
        
noCategory_df = pd.DataFrame(dict_list)
noCategory_df


Unnamed: 0,lemma
0,a demi-mot
1,a divinis
2,a fortiori
3,a maxima
4,a minima
...,...
347,à vau-l'eau
348,à vau-le-feu
349,à vau-vent
350,étouffée


In [25]:
noCategory_df.to_csv("data/csv/all_noCategory.csv")