In [28]:
import pandas as pd
import csv
import glob
import os
import re
import numpy as np
# Notebook-wide display settings: show up to 20 columns and 10000 rows,
# never truncate cell contents, and widen numpy's printed line width.
for option_name, option_value in (
    ('display.max_columns', 20),
    ('display.max_rows', 10000),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)
np.set_printoptions(linewidth=460)
from IPython.display import display, HTML

### Load inventories & preprocess before merging

In [29]:
# Input files
bfm_inventory = "inventaire_bfm.tsv"
ofrlex_inventory = "inventaire_ofrlex.tsv"

# Load both inventories as tab-separated UTF-8 text.
# QUOTE_NONE: quote characters are treated as ordinary data, not as field delimiters.
bfm = pd.read_csv(bfm_inventory, sep='\t', encoding='utf-8', quoting=csv.QUOTE_NONE)
ofr = pd.read_csv(ofrlex_inventory, sep='\t', encoding='utf-8', quoting=csv.QUOTE_NONE)

# Sort each inventory by its own lemma column
bfm = bfm.sort_values(by=['lemma'], ascending=True)
ofr = ofr.sort_values(by=['lemme'], ascending=True)

# Normalise the BFM `flp` key: strip asterisks and digit runs.
# `regex` is passed explicitly because its default differs between pandas versions:
# with regex=False the digit pattern would be matched literally and nothing would be
# removed, and "*" on its own is not a valid regular expression.
bfm['flp'] = bfm['flp'].str.replace("*", "", regex=False)
bfm['flp'] = bfm['flp'].str.replace(r'\d+', '', regex=True)

# Protect forms that are literally 'nan' so a later `fillna()` does not erase them.
# NOTE(review): with read_csv's default NA handling a literal "nan" field is presumably
# already parsed as a missing value, in which case this mask never matches -- confirm,
# and consider `keep_default_na=False` when loading if such forms must be preserved.
ofr.loc[ofr.form == 'nan', 'form'] = ofr['form'] + "__"
ofr['form'] = ofr['form'].str.replace("§", "", regex=False)

# Build the OFrLex `flp` key (form_lemma_pos); it can be used to merge both dataframes
ofr['flp'] = ofr['form'] + "_" + ofr['raw_lemme'] + "_" + ofr['upos']
bfm['flp'] = bfm['flp'].str.lower()
ofr['flp'] = ofr['flp'].str.lower()

# New column `fl` (form_lemma), the POS-less alternative merge key
bfm['fl'] = bfm['form'] + "_" + bfm['lemma']
ofr['fl'] = ofr['form'] + "_" + ofr["raw_lemme"]

# Drop irrelevant columns:
#   occ_bfm - not used in the merged inventory
#   form    - the BFM form is already part of the `flp` / `fl` keys
#   n       - lemma number
bfm = bfm.drop(columns=['occ_bfm', 'form', 'n'])
ofr = ofr.drop(columns=['inconnu'])

# Rename the BFM lemma column to avoid confusion with the OFrLex `lemme` column
bfm = bfm.rename(columns={'lemma': 'lemma_bfm'})

print(f"Entrées dans BFMGOLDLEM: {len(bfm)}")
print(f"Entrées dans OFrLex: {len(ofr)}")

Entrées dans BFMGOLDLEM: 31325
Entrées dans OFrLex: 864083


In [30]:
### settings

# Column used as the join key between the ofrlex & bfm inventories:
# `fl` matches form-lemma pairs, `flp` matches form-lemma-pos triples.
merge_by_col = 'fl'

# Output file for the full inventory (bfm entries & ofrlex entries aligned), grouped by ofrlex lemmas
full_inventory_name = f"inventaire_{merge_by_col}_ofr-bfm.tsv"

# Output file for the entries shared by both inventories
common_inventory_name = f"inventaire_{merge_by_col}_commun_ofr-bfm.tsv"

# Output file for the verb inventory
verb_inventory_name = f"inventaire_verbes_ofr-bfm_{merge_by_col}.tsv" 

# Output file for the verbs shared by both inventories
common_verb_inventory_name = f"inventaire_verbes_commun_ofr-bfm_{merge_by_col}.tsv"

### Concat & group aligned ofrlex-bfmgoldlem form-lemma-pos/form-lemma by OFrLex lemmas

In [32]:

# Columns that identify one row of the exported inventory
group_cols = ['lemme', 'form', 'upos', 'traits', 'cattex', 'feats_bfm', 'no_upos', 'file_src']

# Align both inventories on the key column chosen in `merge_by_col`.
# No index is set beforehand: `merge(on=...)` joins on the column itself, and
# `set_index` returns a new frame, so calling it without assignment has no effect.
# The outer join keeps entries found in only one inventory; cells without a
# counterpart are filled with the placeholder "_".
df = ofr.merge(bfm, on=merge_by_col, how='outer').fillna("_")

# Sort by ofrlex lemmas
df_sorted = df.sort_values('lemme', ascending=True)
# This is the size of the whole outer join, not only of the shared entries
print(f"Items in merged inventory (ofr + bfm, outer join) : {len(df_sorted)}")

## Combined ofr-bfm inventory, to be grouped by lemma
inventaire_bfm_ofr = df_sorted

# Rows without an ofr lemma get 'bfm:<lemma_bfm>' in `lemme`, so that grouping keeps
# them and lemmas missing from ofr remain visible.
missing_ofr_lemma = inventaire_bfm_ofr['lemme'] == "_"
inventaire_bfm_ofr.loc[missing_ofr_lemma, 'lemme'] = "bfm:" + inventaire_bfm_ofr['lemma_bfm']

# Likewise, rows without an ofr form get 'bfm:<form>', where <form> is the part of
# the merge key before its first underscore.
missing_ofr_form = inventaire_bfm_ofr['form'] == "_"
inventaire_bfm_ofr.loc[missing_ofr_form, 'form'] = (
    "bfm:" + inventaire_bfm_ofr[merge_by_col].str.split("_").str[0]
)

# Group by `lemme` (ofr) and the other identifying columns: this yields one sorted
# row per distinct combination; the trailing size column is dropped.
inv_all_grouped = (
    inventaire_bfm_ofr
    .groupby(group_cols)
    .size()
    .reset_index()
    .iloc[:, :-1]
)

inv_all_grouped.to_csv(full_inventory_name, sep='\t', encoding='utf-8', index=False, quoting=csv.QUOTE_NONE)

### For a more readable table, blank out repeated lemmas with:
# inv_all_grouped.loc[inv_all_grouped.lemme.duplicated(), ['lemme']] = ''


Items in shared inventory : 891151


#### Sauvegarder lemmes en commun

In [33]:
## Shared items, grouped by ofr lemma: keep the rows where neither `traits` nor
## `cattex` is the "_" placeholder (presumably entries attested in both inventories).
has_traits = inv_all_grouped['traits'] != "_"
has_cattex = inv_all_grouped['cattex'] != "_"
inventaire_commun = inv_all_grouped.loc[has_traits & has_cattex]

# Report the number of shared pairs, framed by blank lines
print()
print(f">>> Paires {merge_by_col} en commun : {len(inventaire_commun)}")
print()

# Save the shared entries, then preview the first 30 rows
inventaire_commun.to_csv(
    common_inventory_name,
    sep='\t',
    encoding='utf-8',
    index=False,
    quoting=csv.QUOTE_NONE,
)
inventaire_commun.head(30)


>>> Paires fl en commun : 18747



Unnamed: 0,lemme,form,upos,traits,cattex,feats_bfm,no_upos,file_src
3982,aate___203__1,aates,ADJ,"[pred=""aate___203__1<Suj:cln|sn>"",@pers,cat=adj,upos=ADJ,@pl.fem]",ADJqua,_,0,../ofrlex-dev\ADJ.lex
3983,aate___203__1,aates,ADJ,"[pred=""aate___203__1<Suj:cln|sn>"",@pers,cat=adj,upos=ADJ,@pl.obl.masc]",ADJqua,_,0,../ofrlex-dev\ADJ.lex
3984,aate___203__1,aates,ADJ,"[pred=""aate___203__1<Suj:cln|sn>"",@pers,cat=adj,upos=ADJ,@sg.nom.masc]",ADJqua,_,0,../ofrlex-dev\ADJ.lex
6117,abel___10361__1,abel,NOUN,"[pred=""abel___10361__1<Objde:(de-sinf|de-sn),Objà:(à-sinf)>"",upos=NOUN,cat=nc,@pl.nom.masc]",NOMpro,_,0,../ofrlex-dev\NOUN.lex
6118,abel___10361__1,abel,NOUN,"[pred=""abel___10361__1<Objde:(de-sinf|de-sn),Objà:(à-sinf)>"",upos=NOUN,cat=nc,@sg.obl.masc]",NOMpro,_,0,../ofrlex-dev\NOUN.lex
7027,abisme___10398__1,abisme,NOUN,"[pred=""abisme___10398__1<Objde:(de-sinf|de-sn),Objà:(à-sinf)>"",upos=NOUN,cat=nc,@pl.nom.masc]",NOMpro,_,0,../ofrlex-dev\NOUN.lex
7028,abisme___10398__1,abisme,NOUN,"[pred=""abisme___10398__1<Objde:(de-sinf|de-sn),Objà:(à-sinf)>"",upos=NOUN,cat=nc,@sg.fem]",NOMpro,_,0,../ofrlex-dev\NOUN.lex
7029,abisme___10398__1,abisme,NOUN,"[pred=""abisme___10398__1<Objde:(de-sinf|de-sn),Objà:(à-sinf)>"",upos=NOUN,cat=nc,@sg.obl.masc]",NOMpro,_,0,../ofrlex-dev\NOUN.lex
7364,able___10412__1,able,NOUN,"[pred=""able___10412__1<Objde:(de-sinf|de-sn),Objà:(à-sinf)>"",upos=NOUN,cat=nc,@pl.nom.masc]",ADJqua,_,0,../ofrlex-dev\NOUN.lex
7365,able___10412__1,able,NOUN,"[pred=""able___10412__1<Objde:(de-sinf|de-sn),Objà:(à-sinf)>"",upos=NOUN,cat=nc,@sg.obl.masc]",ADJqua,_,0,../ofrlex-dev\NOUN.lex


#### Sauvegarder les entrees pour les verbes (inventaire verbes)

In [26]:
## Verb inventory (grouped by lemma)
# Keep the rows tagged as verbs, then reduce them to one sorted row per distinct
# combination of the identifying columns (the trailing size column is dropped).
# NOTE(review): rows without an ofr match presumably carry upos == "_" after the
# merge and would not be selected here -- confirm if bfm-only verbs are expected.
verb_group_cols = ['lemme', 'form', 'upos', 'traits', 'cattex', 'feats_bfm', 'no_upos', 'file_src']
is_verb = inventaire_bfm_ofr['upos'] == "VERB"
verbs = (
    inventaire_bfm_ofr[is_verb]
    .groupby(verb_group_cols)
    .size()
    .reset_index()
    .iloc[:, :-1]
)

# Save every selected verb entry, then preview the first 50 rows
all_verbs = verbs
all_verbs.to_csv(
    verb_inventory_name,
    sep='\t',
    encoding='utf-8',
    index=False,
    quoting=csv.QUOTE_NONE,
)
all_verbs.head(50)

Unnamed: 0,lemme,form,upos,traits,cattex,feats_bfm,no_upos,file_src
0,aacier___746691__1__1,aacier,VERB,"[pred=""aacier___746691__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@inf]",_,_,0.0,../ofrlex-dev\VERB.lex
1,aagié___999999__1__1,aagié,VERB,"[pred=""aagié___999999__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@masc.ptcp.pst]",_,_,0.0,../ofrlex-dev\VERB.lex
2,aagié___999999__1__1,aagié,VERB,"[pred=""aagié___999999__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@masc.ptcp.pst]",_,_,0.0,../ofrlex-dev\mod-lex\VERppe-actif.lex
3,aagié___999999__1__1,aagié,VERB,"[pred=""aagié___999999__1__1<Suj:cln|sn,Obl2:(par-sn)>"",@pers,cat=adj,upos=VERB,@masc.ptcp.pst]",_,_,0.0,../ofrlex-dev\VERB.lex
4,aagié___999999__1__1,aagié,VERB,"[pred=""aagié___999999__1__1<Suj:cln|sn,Obl2:(par-sn)>"",@pers,cat=adj,upos=VERB,@masc.ptcp.pst]",_,_,0.0,../ofrlex-dev\mod-lex\VERppe-ppp.lex
5,aagié___999999__1__1,aagié,VERB,"[pred=""aagié___999999__1__1<Suj:cln|sn,Obl2:(par-sn)>"",@pers,cat=v,upos=VERB,@masc.ptcp.pst]",_,_,0.0,../ofrlex-dev\VERB.lex
6,aagié___999999__1__1,aagié,VERB,"[pred=""aagié___999999__1__1<Suj:cln|sn,Obl2:(par-sn)>"",@pers,cat=v,upos=VERB,@masc.ptcp.pst]",_,_,0.0,../ofrlex-dev\mod-lex\VERppe-passif.lex
7,aaidier___746692__1__1,aaidier,VERB,"[pred=""aaidier___746692__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@inf]",_,_,0.0,../ofrlex-dev\VERB.lex
8,aaignier___746693__1__1,aaignier,VERB,"[pred=""aaignier___746693__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@inf]",_,_,0.0,../ofrlex-dev\VERB.lex
9,aairier___746694__1__1,aairier,VERB,"[pred=""aairier___746694__1__1<Suj:cln|sn>"",@pers,cat=v,upos=VERB,@inf]",_,_,0.0,../ofrlex-dev\VERB.lex


In [27]:
# Save only the shared verbs (starting from the ofrlex verbs): rows whose ofrlex
# feature string declares a verb and whose `cattex` is not the "_" placeholder.
# The "upos=VERB" marker is part of the `traits` string; the `upos` column only
# holds the bare tag (e.g. "VERB"), so searching it for "upos=VERB" never matches.
is_ofr_verb = verbs['traits'].str.contains("upos=VERB", regex=False)
has_cattex = verbs['cattex'] != "_"
verbes_commun = verbs.loc[is_ofr_verb & has_cattex]
verbes_commun.to_csv(common_verb_inventory_name, sep='\t', encoding='utf-8', index=False, quoting=csv.QUOTE_NONE)

verbes_commun[:50]

Unnamed: 0,lemme,form,upos,traits,cattex,feats_bfm,no_upos,file_src
3818,abonder___746785__1__1,abonde,VERB,"[pred=""abonder___746785__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@sg.2.imp]",VERcjg,VerbForm=Fin,0.0,../ofrlex-dev\VERB.lex
3819,abonder___746785__1__1,abonde,VERB,"[pred=""abonder___746785__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@sg.3.ind.prs]",VERcjg,VerbForm=Fin,0.0,../ofrlex-dev\VERB.lex
4197,aborrer___746793__1__1,aborré,VERB,"[pred=""aborrer___746793__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@ptcp.pst]",VERppe,VerbForm=Part|Tense=Past,0.0,../ofrlex-dev\VERB.lex
5248,abriver___746821__1__1,abrivé,VERB,"[pred=""abriver___746821__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@ptcp.pst]",VERppe,VerbForm=Part|Tense=Past,0.0,../ofrlex-dev\VERB.lex
5451,abstenir___746832__1__1,abstenir,VERB,"[pred=""abstenir___746832__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@inf]",VERinf,VerbForm=Inf,0.0,../ofrlex-dev\VERB.lex
6973,aceindre___746866__1__1,aceindrons,VERB,"[pred=""aceindre___746866__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@pl.1.ind.fut]",VERcjg,VerbForm=Fin,0.0,../ofrlex-dev\VERB.lex
6989,aceindre___746866__1__1,aceint,VERB,"[pred=""aceindre___746866__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@ptcp.pst]","VERppe, VERcjg","VerbForm=Part|Tense=Past, VerbForm=Fin",0.0,../ofrlex-dev\VERB.lex
6990,aceindre___746866__1__1,aceint,VERB,"[pred=""aceindre___746866__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@sg.3.ind.prs]","VERppe, VERcjg","VerbForm=Part|Tense=Past, VerbForm=Fin",0.0,../ofrlex-dev\VERB.lex
7736,acesmer___746881__1__1,acesmer,VERB,"[pred=""acesmer___746881__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@inf]",VERinf,VerbForm=Inf,0.0,../ofrlex-dev\VERB.lex
7753,acesmer___746881__1__1,acesmez,VERB,"[pred=""acesmer___746881__1__1<Suj:cln|sn,Obj:(cla|sn)>"",@pers,cat=v,upos=VERB,@pl.2.imp]",VERppe,VerbForm=Part|Tense=Past,0.0,../ofrlex-dev\VERB.lex
