In [1]:
import pandas as pd
import csv
import glob
import os
import re
import numpy as np
# Display configuration: show up to 20 columns and 10000 rows, never truncate cell text,
# and let numpy print wide arrays on a single line.
pd.options.display.max_columns = 20
pd.options.display.max_rows = 10000
pd.options.display.max_colwidth = None
np.set_printoptions(linewidth=460)

### Load inventories & preprocess before merging

In [2]:
# Input files (tab-separated inventories, paths relative to the working directory)
bfm_inventory = "inventaire_bfmgoldlem.tsv"
ofrlex_inventory = "inventaire_ofrlex.tsv"

# Load both inventories; QUOTE_NONE keeps quote characters as ordinary text.
# NOTE(review): with the read_csv defaults, cells whose text is "nan", "NA", "null", ... are
# parsed as missing values -- confirm whether keep_default_na=False is wanted for lexical data.
bfm = pd.read_csv(bfm_inventory, sep='\t', encoding='utf-8', quoting=csv.QUOTE_NONE)
ofr = pd.read_csv(ofrlex_inventory, sep='\t', encoding='utf-8', quoting=csv.QUOTE_NONE)

# Sort each inventory alphabetically by its own lemma column
bfm = bfm.sort_values(by=['lemma'], ascending=True)
ofr = ofr.sort_values(by=['lemme'], ascending=True)

# Proper nouns: first letter upper-case, remaining letters lower-case (lemma and form)
bfm.loc[bfm['cattex'] == "NOMpro", "lemma"] = bfm["lemma"].str.capitalize()
bfm.loc[bfm['cattex'] == "NOMpro", "form"] = bfm["form"].str.capitalize()

ofr.loc[ofr['upos'] == "PROPN", "lemme"] = ofr["lemme"].str.capitalize()
ofr.loc[ofr['upos'] == "PROPN", "form"] = ofr["form"].str.capitalize()

# Normalise the BFM text columns: expand the "œ" ligature and drop "*" markers.
# regex=False makes these literal replacements on every pandas version ("*" alone is not a
# valid regular expression).
# NOTE(review): only lower-case "œ" is rewritten; an upper-case "Œ" (e.g. produced by
# capitalize() above) is left untouched -- confirm this is intended.
for col in ('lemma', 'form', 'flp'):
    bfm[col] = (
        bfm[col]
        .str.replace("œ", "oe", regex=False)
        .str.replace("*", "", regex=False)
    )

# Strip digit runs from the BFM merge key. regex=True is required explicitly: since pandas 2.0
# the default is a literal match, which would search for the text "\d+" and remove nothing.
bfm['flp'] = bfm['flp'].str.replace(r'\d+', '', regex=True)

# Intended to protect forms whose text is literally 'nan'.
# NOTE(review): given the default NA parsing in read_csv above, such cells are presumably
# already missing values at this point, so this mask may never match -- verify.
ofr.loc[ofr.form == 'nan', 'form'] = ofr['form']+"__"
ofr['form'] = ofr['form'].str.replace("§", "", regex=False)

# OFrLex merge key `flp` = form_lemma_pos (the BFM file already provides its own `flp` column)
ofr['flp'] = ofr['form'] + "_" + ofr['raw_lemme'] + "_" + ofr['upos']

# Merge keys are compared case-insensitively
bfm['flp'] = bfm['flp'].str.lower()
ofr['flp'] = ofr['flp'].str.lower()

# New column `fl` = form_lemma
bfm['fl'] = bfm['form'].str.lower() + "_" + bfm['lemma'].str.lower()
ofr['fl'] = ofr['form'].str.lower() + "_" + ofr["raw_lemme"].str.lower()

# New column `fp` = form_pos (for BFM the POS is the last "_"-separated field of `flp`)
bfm['fp'] = bfm['form'].str.lower() + "_" + bfm['flp'].str.split("_").str[-1]
ofr['fp'] = ofr['form'].str.lower() + "_" + ofr["upos"].str.lower()

# Drop columns that are not used downstream (`n` is the lemma number column)
bfm = bfm.drop(columns=['occ_bfm', 'n'])
ofr = ofr.drop(columns=['inconnu'])

# Rename the BFM lemma/form columns to avoid confusion with the OFrLex columns after merging
bfm = bfm.rename(columns={'lemma': 'lemma_bfm', 'form': 'form_bfm'})

print(f"Entrées dans BFMGOLDLEM: {len(bfm)}")
print(f"Entrées dans OFrLex: {len(ofr)}")

Entrées dans BFMGOLDLEM: 31325
Entrées dans OFrLex: 864083


#### Settings
`merge_by_col` = column used as index to merge dataframes  
If `flp`: merge OFrLex and BFM data by **form_lemma_pos**  
If `fl`: merge OFrLex and BFM data by **form_lemma**  
If `fp`: merge OFrLex and BFM data by **form_pos**  

Each option outputs its own file  

In [3]:
### Settings

# Merge key shared by both dataframes:
#   'flp' -> form + lemma + pos, 'fl' -> form + lemma, 'fp' -> form + pos
merge_by_col = 'flp'

# Output files for the whole vocabulary:
#   - full inventory (OFrLex and BFM entries aligned, grouped by OFrLex lemma)
#   - entries shared by both resources
full_inventory_name = f"inventaire_{merge_by_col}_ofr-bfm.tsv"
common_inventory_name = f"inventaire_{merge_by_col}_commun_ofr-bfm.tsv"

# Output files restricted to verbs:
#   - every verb entry (from either resource)
#   - verb entries shared by both resources
verb_inventory_name = f"inventaire_verbes_ofr-bfm_{merge_by_col}.tsv"
common_verb_inventory_name = f"inventaire_verbes_commun_ofr-bfm_{merge_by_col}.tsv"

## Concat & group aligned ofrlex-bfmgoldlem items

#### Both tables (bfm & ofrlex) are merged with data aligned based on index, then sorted alphabetically by ofrlex lemmas
- When an item is missing (e.g. the ofrlex lemma), the row is filled with the information from the other table (e.g. the bfm lemma). This makes it possible to detect variants. 
- Items in the columns `lemme` and `form` that end with an asterisk correspond to bfm entries

In [4]:

# No index needs to be set on `bfm` / `ofr`: the merge below joins directly on the
# `merge_by_col` column. (DataFrame.set_index returns a new frame, so calling it without
# keeping the result has no effect.)

# Columns that identify one row of the grouped inventory
group_cols = ['lemme','form','upos','traits','form_bfm','cattex','feats_bfm','no_upos','file_src']

# Outer merge on the selected key keeps entries found in only one resource;
# every cell left empty by the merge is filled with the placeholder "_".
df = ofr.merge(bfm, on=merge_by_col, how='outer').fillna("_")

# Sort by OFrLex lemma
df_sorted = df.sort_values('lemme', ascending=True)
print(f"Number of items : {len(df_sorted)}")

# Combined OFrLex-BFM inventory. This is an alias of `df_sorted`, not a copy, so the
# assignments below also modify `df_sorted`.
inventaire_bfm_ofr = df_sorted

# Rows without an OFrLex lemma/form get the BFM value followed by "*". This keeps BFM-only
# entries when grouping and makes them easy to spot.
# The form is taken from the first "_"-separated field of the merge key.
inventaire_bfm_ofr.loc[inventaire_bfm_ofr['lemme'] == "_", "lemme"] = inventaire_bfm_ofr["lemma_bfm"] + "*"
inventaire_bfm_ofr.loc[inventaire_bfm_ofr['form'] == "_", "form"] = inventaire_bfm_ofr[merge_by_col].str.split("_").str[0] + "*"

# Proper nouns start with a capital letter (BFM-only rows are recognised by the trailing "*")
is_nompro = inventaire_bfm_ofr['cattex'] == "NOMpro"
inventaire_bfm_ofr.loc[is_nompro & inventaire_bfm_ofr['form'].str.endswith("*"), "form"] = inventaire_bfm_ofr['form'].str.capitalize()
inventaire_bfm_ofr.loc[is_nompro & inventaire_bfm_ofr['lemme'].str.endswith("*"), "lemme"] = inventaire_bfm_ofr['lemme'].str.capitalize()
inventaire_bfm_ofr.loc[inventaire_bfm_ofr['upos'] == "PROPN", "lemme"] = inventaire_bfm_ofr['lemme'].str.capitalize()

# Group identical rows (sorted by the grouping columns); the trailing group-size column
# added by size()/reset_index() is dropped again.
inv_all_grouped = (
    inventaire_bfm_ofr
    .groupby(group_cols)
    .size()
    .reset_index()
    .iloc[:, :-1]
)

inv_all_grouped.to_csv(full_inventory_name, sep='\t', encoding='utf-8', index=False, quoting=csv.QUOTE_NONE)

### For a more readable table, blank out repeated lemmas with:
# inv_all_grouped.loc[inv_all_grouped.lemme.duplicated(), ['lemme']] = ''


Number of items : 889183


In [5]:
# Preview rows labelled 85 to 120 of the grouped inventory (label slicing includes the end label)
inv_all_grouped.loc[85:120, :]

Unnamed: 0,lemme,form,upos,traits,form_bfm,cattex,feats_bfm,no_upos,file_src
85,Affrican___54382__1,Affrican,PROPN,"[pred=""Affrican___54382__1<Suj:(sn)>"",upos=PROPN,cat=np,@pl.nom.masc]",_,_,_,0,../ofrlex-dev\PROPN.lex
86,Affrican___54382__1,Affrican,PROPN,"[pred=""Affrican___54382__1<Suj:(sn)>"",upos=PROPN,cat=np,@sg.obl.masc]",_,_,_,0,../ofrlex-dev\PROPN.lex
87,Affrican___54382__1,Affricans,PROPN,"[pred=""Affrican___54382__1<Suj:(sn)>"",upos=PROPN,cat=np,@pl.obl.masc]",_,_,_,0,../ofrlex-dev\PROPN.lex
88,Affrican___54382__1,Affricans,PROPN,"[pred=""Affrican___54382__1<Suj:(sn)>"",upos=PROPN,cat=np,@sg.nom.masc]",_,_,_,0,../ofrlex-dev\PROPN.lex
89,Affrike___54383__1,Affrike,PROPN,"[pred=""Affrike___54383__1<Suj:(sn)>"",upos=PROPN,cat=np]",_,_,_,0,../ofrlex-dev\PROPN.lex
90,Affrique___54384__1,Affrique,PROPN,"[pred=""Affrique___54384__1<Suj:(sn)>"",upos=PROPN,cat=np]",_,_,_,0,../ofrlex-dev\PROPN.lex
91,Africain*,Affrican*,_,_,Affrican,NOMpro,_,_,_
92,Africain*,Alfricans*,_,_,Alfricans,NOMpro,_,_,_
93,Afriche___99999__1,Afriche,PROPN,"[pred=""Afriche___99999__1<Suj:(sn)>"",upos=PROPN,cat=np]",_,_,_,0,../ofrlex-dev\PROPN.lex
94,Afrique*,Affrike*,_,_,Affrike,NOMpro,_,_,_


### Common lemmas (based on selected index merging)

#### Sauvegarder lemmes en commun

In [6]:
# Shared entries, grouped by OFrLex lemma: keep rows where neither `traits` nor `cattex`
# holds the "_" placeholder.
has_traits = inv_all_grouped['traits'] != "_"
has_cattex = inv_all_grouped['cattex'] != "_"
inventaire_commun = inv_all_grouped.loc[has_traits & has_cattex]

print()
print(f">>> Paires {merge_by_col} en commun : {len(inventaire_commun)}")
print()

# Save the shared entries, then preview the first ten rows
inventaire_commun.to_csv(common_inventory_name, sep='\t', encoding='utf-8', index=False, quoting=csv.QUOTE_NONE)
inventaire_commun.head(10)


>>> Paires flp en commun : 14695



Unnamed: 0,lemme,form,upos,traits,form_bfm,cattex,feats_bfm,no_upos,file_src
17,Abel___54354__1,Abel,PROPN,"[pred=""Abel___54354__1<Suj:(sn)>"",upos=PROPN,cat=np,@sg.obl.masc]",Abel,NOMpro,_,0,../ofrlex-dev\PROPN.lex
26,Abisme___99999__1,Abisme,PROPN,"[pred=""Abisme___99999__1<Suj:(sn)>"",upos=PROPN,cat=np,@sg.masc]",Abisme,NOMpro,_,0,../ofrlex-dev\PROPN.lex
29,Abraham___54362__1,Abraham,PROPN,"[pred=""Abraham___54362__1<Suj:(sn)>"",upos=PROPN,cat=np]",Abraham,NOMpro,_,0,../ofrlex-dev\PROPN.lex
30,Abrahan___54363__1,Abrahan,PROPN,"[pred=""Abrahan___54363__1<Suj:(sn)>"",upos=PROPN,cat=np,@sg.obl.masc]",Abrahan,NOMpro,_,0,../ofrlex-dev\PROPN.lex
36,Acelin___99999__1,Acelin,PROPN,"[pred=""Acelin___99999__1<Suj:(sn)>"",upos=PROPN,cat=np,@sg.masc]",Acelin,NOMpro,_,0,../ofrlex-dev\PROPN.lex
50,Acorde___54370__1,Acorde,PROPN,"[pred=""Acorde___54370__1<Suj:(sn)>"",upos=PROPN,cat=np,@sg.fem]",Acorde,NOMpro,_,0,../ofrlex-dev\PROPN.lex
54,Acorionde___54371__1,Acorionde,PROPN,"[pred=""Acorionde___54371__1<Suj:(sn)>"",upos=PROPN,cat=np]",Acorionde,NOMpro,_,0,../ofrlex-dev\PROPN.lex
63,Adam___54375__1,Adam,PROPN,"[pred=""Adam___54375__1<Suj:(sn)>"",upos=PROPN,cat=np,@sg.obl.masc]",Adam,NOMpro,_,0,../ofrlex-dev\PROPN.lex
75,Aelis___54380__1,Aelis,PROPN,"[pred=""Aelis___54380__1<Suj:(sn)>"",upos=PROPN,cat=np]",Aelis,NOMpro,_,0,../ofrlex-dev\PROPN.lex
116,Agrevain___54392__1,Agrevains,PROPN,"[pred=""Agrevain___54392__1<Suj:(sn)>"",upos=PROPN,cat=np,@sg.nom.masc]",Agrevains,NOMpro,_,0,../ofrlex-dev\PROPN.lex


### Inventory for verbs

#### Sauvegarder les entrees pour les verbes (inventaire verbes)

In [7]:
# Verb inventory, grouped by lemma.
# A row counts as a verb when `upos` is "VERB" or when `cattex` contains "VER".
verb_group_cols = ['lemme','form','upos','traits','form_bfm','cattex','feats_bfm','no_upos','file_src']
upos_is_verb = inventaire_bfm_ofr['upos'] == "VERB"
cattex_is_verb = inventaire_bfm_ofr['cattex'].str.contains("VER")

# Group identical rows; the trailing group-size column is dropped again
verbs = (
    inventaire_bfm_ofr.loc[upos_is_verb | cattex_is_verb]
    .groupby(verb_group_cols)
    .size()
    .reset_index()
    .iloc[:, :-1]
)

# Save every verb entry (from either resource), then preview rows 7 to 19
all_verbs = verbs
all_verbs.to_csv(verb_inventory_name, sep='\t', encoding='utf-8', index=False, quoting=csv.QUOTE_NONE)
all_verbs.iloc[7:20]

Unnamed: 0,lemme,form,upos,traits,form_bfm,cattex,feats_bfm,no_upos,file_src
7,aaidier___746692__1__1,aaidier,VERB,"[pred=""aaidier___746692__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@inf]",_,_,_,0,../ofrlex-dev\VERB.lex
8,aaignier___746693__1__1,aaignier,VERB,"[pred=""aaignier___746693__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@inf]",_,_,_,0,../ofrlex-dev\VERB.lex
9,aairier___746694__1__1,aairier,VERB,"[pred=""aairier___746694__1__1<Suj:cln|sn>"",@pers,cat=v,upos=VERB,@inf]",_,_,_,0,../ofrlex-dev\VERB.lex
10,aaiser*,aeise*,_,_,aeise,VERcjg,VerbForm=Fin,_,_
11,aaiser*,aeisent*,_,_,aeisent,VERcjg,VerbForm=Fin,_,_
12,aaiser*,aeisiee*,_,_,aeisiee,VERppe,VerbForm=Part|Tense=Past,_,_
13,aaiser*,aeisier*,_,_,aeisier,VERinf,VerbForm=Inf,_,_
14,aaiser*,aeisiez*,_,_,aeisiez,VERppe,VerbForm=Part|Tense=Past,_,_
15,aaiser*,aeisié*,_,_,aeisié,VERppe,VerbForm=Part|Tense=Past,_,_
16,aaisier___746695__1__1,aais,VERB,"[pred=""aaisier___746695__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@sg.1.ind.prs]",_,_,_,0,../ofrlex-dev\VERB.lex


In [8]:
# Save only the shared verbs: rows whose `upos` contains "VERB" and whose `cattex`
# is not the "_" placeholder.
upos_has_verb = verbs['upos'].str.contains("VERB")
cattex_filled = verbs['cattex'] != "_"
verbes_commun = verbs.loc[upos_has_verb & cattex_filled]
verbes_commun.to_csv(common_verb_inventory_name, sep='\t', encoding='utf-8', index=False, quoting=csv.QUOTE_NONE)

verbes_commun.head(10)

Unnamed: 0,lemme,form,upos,traits,form_bfm,cattex,feats_bfm,no_upos,file_src
3885,abonder___746785__1__1,abonde,VERB,"[pred=""abonder___746785__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@sg.2.imp]",abonde,VERcjg,VerbForm=Fin,0,../ofrlex-dev\VERB.lex
3886,abonder___746785__1__1,abonde,VERB,"[pred=""abonder___746785__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@sg.3.ind.prs]",abonde,VERcjg,VerbForm=Fin,0,../ofrlex-dev\VERB.lex
4266,aborrer___746793__1__1,aborré,VERB,"[pred=""aborrer___746793__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@ptcp.pst]",aborré,VERppe,VerbForm=Part|Tense=Past,0,../ofrlex-dev\VERB.lex
5323,abriver___746821__1__1,abrivé,VERB,"[pred=""abriver___746821__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@ptcp.pst]",abrivé,VERppe,VerbForm=Part|Tense=Past,0,../ofrlex-dev\VERB.lex
5536,abstenir___746832__1__1,abstenir,VERB,"[pred=""abstenir___746832__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@inf]",abstenir,VERinf,VerbForm=Inf,0,../ofrlex-dev\VERB.lex
7165,aceindre___746866__1__1,aceindrons,VERB,"[pred=""aceindre___746866__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@pl.1.ind.fut]",aceindrons,VERcjg,VerbForm=Fin,0,../ofrlex-dev\VERB.lex
7181,aceindre___746866__1__1,aceint,VERB,"[pred=""aceindre___746866__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@ptcp.pst]",aceint,"VERppe, VERcjg","VerbForm=Part|Tense=Past, VerbForm=Fin",0,../ofrlex-dev\VERB.lex
7182,aceindre___746866__1__1,aceint,VERB,"[pred=""aceindre___746866__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@sg.3.ind.prs]",aceint,"VERppe, VERcjg","VerbForm=Part|Tense=Past, VerbForm=Fin",0,../ofrlex-dev\VERB.lex
7929,acesmer___746881__1__1,acesmer,VERB,"[pred=""acesmer___746881__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@inf]",acesmer,VERinf,VerbForm=Inf,0,../ofrlex-dev\VERB.lex
7946,acesmer___746881__1__1,acesmez,VERB,"[pred=""acesmer___746881__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@pl.2.imp]",acesmez,VERppe,VerbForm=Part|Tense=Past,0,../ofrlex-dev\VERB.lex
