# Enrichissement du dictionnaire de Buchanan 

Amélie Leboeuf

### Importer les bibliothèques

In [1]:
import pandas as pd

### Lire les documents

Dictionnaire de Buchanan

In [2]:
# Buchanan's dictionary: semicolon-separated, UTF-8 encoded CSV.
cheminDuFichierCSV = "./../01_INPUT/1757_Buchanan-J.csv"
dfDicoBuch = pd.read_csv(
    cheminDuFichierCSV,
    sep=";",
    encoding="utf8",
)
# Show five random entries as a quick visual check.
dfDicoBuch.sample(5)

Unnamed: 0,nbSyll,pron,origin,warning,remarques,POS,lineId,pageId
11170,1,ray,,,,n.,2.0,177
1372,3,blŏʹmăry,,,,n.,45.0,38
10953,1,pye,F.,,,n.,64.0,173
11994,2,ſaʹltiſh,,,,a.,37.0,188
4434,3,ĕnĕʹrvāte,F.,,,v.,25.0,81


In [3]:
# Number of entries (rows) loaded from Buchanan's dictionary.
dfDicoBuch.shape[0]

15800

Traduction du dictionnaire de Buchanan

In [4]:
# Symbol table for Buchanan's notation: one row per character ('char'),
# with its occurrence count ('occ') and its plain-spelling equivalent ('graph').
cheminDuFichierCSV = "./../01_INPUT/Buchanan_PronChar_counts.csv"
dfTradBuch = pd.read_csv(
    cheminDuFichierCSV,
    sep=";",
    encoding="utf8",
)
# Show five random rows as a quick visual check.
dfTradBuch.sample(5)

Unnamed: 0,char,occ,graph
15,o,3838,o
27,j,251,j
53,(,28,
39,q,270,q
19,h,2383,h


In [5]:
# Inspect the row for the "ʹ" mark (it has no 'graph' equivalent).
dfTradBuch.loc[dfTradBuch['char'].eq("ʹ")]

Unnamed: 0,char,occ,graph
1,ʹ,12725,


In [6]:
# Drop the symbols that have no translation (NaN in the 'graph' column).
dfTradBuch = dfTradBuch.dropna(subset=['graph'])
dfTradBuch.sample(5)

Unnamed: 0,char,occ,graph
62,ë,2,e
72,ï,1,i
33,̇,266,o
50,ė,83,e
24,ĭ,5281,i


Dictionnaire de Bigi

In [7]:
# Bigi's dictionary: each entry is read as "<headword> [] <SAMPA pronunciation>".
cheminDuDicoBigi = "./../01_INPUT/2015_Bigi-B.dict"
dfDicoBigi = pd.read_csv(
    cheminDuDicoBigi,
    # Raw string: "\[" is not a valid Python string escape, so a plain literal
    # emits an invalid-escape warning on recent Python versions.
    delimiter=r" \[\] ",
    encoding="utf8",
    engine='python',  # a regex delimiter requires the Python parsing engine
    names=['hwd', 'pronSampa'],
    skiprows=8,  # skip the first 8 lines (presumably a file header — TODO confirm)
    # Treat only empty fields as missing. With pandas' default NA list, literal
    # headwords such as "null" or "nan" would be silently converted to NaN.
    keep_default_na=False,
    na_values=[""],
)
dfDicoBigi.head(n=5)

Unnamed: 0,hwd,pronSampa
0,a,@
1,a(2),eI
2,a'ready,@ r\ E d i:
3,a's,eI z
4,a.,eI


Traduction sampa vs ipa 

In [8]:
# SAMPA -> IPA correspondence table: two columns separated by one or more spaces.
cheminDuMappingSampaIpa = "./../01_INPUT/sampa-ipa.txt"
dfMappingSampaIpa = pd.read_csv(
    cheminDuMappingSampaIpa,
    sep="  *",
    encoding="utf8",
    engine='python',
    names=['sampa', 'ipa'],
)
# Show three random mappings as a quick visual check.
dfMappingSampaIpa.sample(3)

Unnamed: 0,sampa,ipa
41,G,ɣ
96,_R,̌
30,@U,oʊ


### Associer les tables de Buchanan

In [19]:
# For a given word, work out what each symbol / letter corresponds to
def TradDico(x, dfTradBuch):
    """Transliterate a word written in Buchanan's notation into plain spelling.

    Each character of ``x`` is looked up in the ``'char'`` column of
    ``dfTradBuch`` and replaced by the matching ``'graph'`` value. When a
    character appears on several rows, the first row with a usable (string)
    ``'graph'`` wins. Characters that have no entry, or whose ``'graph'`` is
    missing, are dropped from the result.

    Parameters
    ----------
    x : str
        The word to transliterate. ``None`` or NaN yields an empty string.
    dfTradBuch : pandas.DataFrame
        Symbol table with at least the columns ``'char'`` and ``'graph'``.

    Returns
    -------
    str
        The concatenation of the ``'graph'`` values of the recognised characters.
    """
    # A missing cell (None / float NaN) has nothing to transliterate.
    if x is None or (isinstance(x, float) and x != x):
        return ""

    # Build the char -> graph lookup once per call instead of filtering the
    # whole DataFrame for every character. The original test
    # `len(dfRowFinded==1)` had a misplaced parenthesis and only meant
    # "at least one row matched"; that intent is made explicit here.
    lookup = {}
    for char, graph in zip(dfTradBuch['char'], dfTradBuch['graph']):
        # Skip untranslated symbols (NaN/None) so they cannot break the join.
        if isinstance(graph, str):
            lookup.setdefault(char, graph)  # first occurrence wins

    return "".join(lookup.get(uneLettre, "") for uneLettre in x)

In [20]:
# Try the function on a single word.
x = "pŏʹſtage"
TradDico(x=x, dfTradBuch=dfTradBuch)

'postage'

In [21]:
# Run the transliteration over every 'pron' value of Buchanan's dictionary
# and store the result in a new 'motVedette' column.
dfDicoBuch['motVedette'] = dfDicoBuch['pron'].map(
    lambda pron: TradDico(pron, dfTradBuch)
)

In [22]:
# Display the final result
dfDicoBuch

Unnamed: 0,nbSyll,pron,origin,warning,remarques,POS,lineId,pageId,motVedette
0,3,aʹbăcus,,,,n.,2.0,19,abacus
1,2,ăbăʹft,,,,p.,3.0,19,abaft
2,3,ăbaiʹſănce,F.,,,n.,4.0,19,abaisance
3,3,ābăʹndon,F.,,,v.,5.0,19,abandon
4,4,ābăʹndŏned,,,,a.,6.0,19,abandoned
...,...,...,...,...,...,...,...,...,...
15795,4,zōŏʹgrăphy,G.,,,n.,19.0,241,zoography
15796,4,zōŏʹphytès,G.,,,n.,20.0,241,zoophytes
15797,4,zōŏʹphŏric,G.,,,n.,21.0,241,zoophoric
15798,4,zōŏʹtŏmiſt,G.,,,n.,22.0,241,zootomist


In [23]:
# Persist the enriched Buchanan dictionary (semicolon-separated, no index column).
path = "./../02_OUTPUT/DicoTradBuch.csv"
dfDicoBuch.to_csv(
    path,
    sep=";",
    encoding="utf8",
    index=False,
)

### Associer les tables de Bigi

In [9]:
# Keep only the first occurrence of each SAMPA symbol to avoid ambiguity.
# This is not ideal and should be discussed with the linguists.
dfMappingSampaIpa = dfMappingSampaIpa.drop_duplicates(subset=['sampa'], keep='first')

In [10]:
def sampa2ipa(strSampa, dfMapping):
    """Convert a whitespace-separated SAMPA transcription into an IPA string.

    Each SAMPA token is looked up in the ``'sampa'`` column of ``dfMapping``
    and replaced by the matching ``'ipa'`` value. When a token appears on
    several rows, the first row with a usable (string) ``'ipa'`` wins. Tokens
    without a mapping are rendered as ``'?'``.

    Parameters
    ----------
    strSampa : str
        SAMPA symbols separated by whitespace, e.g. ``"@ b { S"``.
        ``None`` or NaN yields an empty string.
    dfMapping : pandas.DataFrame
        Mapping table with at least the columns ``'sampa'`` and ``'ipa'``.

    Returns
    -------
    str
        The IPA symbols concatenated without separators.
    """
    # A missing cell (None / float NaN) has nothing to convert.
    if strSampa is None or (isinstance(strSampa, float) and strSampa != strSampa):
        return ""

    # Build the sampa -> ipa lookup once per call instead of filtering the
    # whole DataFrame for every token. The original test
    # `len(dfRowFinded==1)` had a misplaced parenthesis and only meant
    # "at least one row matched"; that intent is made explicit here.
    lookup = {}
    for sampa, ipa in zip(dfMapping['sampa'], dfMapping['ipa']):
        if isinstance(ipa, str):
            lookup.setdefault(sampa, ipa)  # first occurrence wins

    # split() without an argument ignores leading/trailing/repeated spaces,
    # so stray whitespace no longer produces empty tokens rendered as '?'.
    return "".join(lookup.get(valSampa, "?") for valSampa in strSampa.split())

In [12]:
# Convert every SAMPA pronunciation of Bigi's dictionary to IPA
# and store the result in a new 'ipa' column.
dfDicoBigi['ipa'] = dfDicoBigi['pronSampa'].map(
    lambda pronSampa: sampa2ipa(pronSampa, dfMappingSampaIpa)
)
dfDicoBigi.sample(5)

Unnamed: 0,hwd,pronSampa,ipa
116648,telluride,t E l j 3:r aI d,tɛljɜ:r?d
111753,starwalt,s t A r\ w @ l t,stɑɹwəlt
94498,radell,r\ A d eI l,ɹɑd?l
113938,suez's,s u E z i z,suɛziz
73956,mccartin,m @ k A r\ t @ n,məkɑɹtən


In [17]:
# Persist Bigi's dictionary with its IPA column (semicolon-separated, no index column).
path = "./../02_OUTPUT/DicoTradBigi.csv"
dfDicoBigi.to_csv(
    path,
    sep=";",
    encoding="utf8",
    index=False,
)

### Associer les tables de Buchanan et Bigi

In [28]:
# Reload the enriched Buchanan dictionary and keep only what the merge needs.
cheminDuFichierCSV = "./../02_OUTPUT/DicoTradBuch.csv"
dfDicoBuch = pd.read_csv(
    cheminDuFichierCSV,
    delimiter=";",
    encoding="utf8",
    # Treat only empty fields as missing. With pandas' default NA list, literal
    # headwords such as "null" or "nan" would be read as NaN and then removed
    # by the notnull() filter below.
    keep_default_na=False,
    na_values=[""],
)
# Discard entries whose headword could not be produced (empty 'motVedette').
dfDicoBuch = dfDicoBuch[dfDicoBuch['motVedette'].notnull()]
# Selecting the three useful columns makes a separate drop() of the others unnecessary.
dfDicoBuch = dfDicoBuch[["motVedette", "nbSyll", "pron"]]
# The Buchanan notation ('pron') is exposed under the name 'origin' from here on.
dfDicoBuch.columns = ['motVedette', 'nbSyll', 'origin']
dfDicoBuch.sample(n=5)

Unnamed: 0,motVedette,nbSyll,origin
3562,detraction,4,detrăʹction
13116,stomachful,3,ſtŏʹmăc_h_ful
4213,eclogue,2,ĕʹclŏgue
11745,rid,1,rĭd
423,amenity,4,amēʹnĭty


In [29]:
# Reload Bigi's dictionary (with IPA) and align its column names for the merge.
cheminDuFichierCSV = "./../02_OUTPUT/DicoTradBigi.csv"
dfDicoBigi = pd.read_csv(
    cheminDuFichierCSV,
    delimiter=";",
    encoding="utf8",
    # Treat only empty fields as missing. With pandas' default NA list, literal
    # headwords such as "null" or "nan" in the file would be read as NaN and
    # could never match in a merge on the headword.
    keep_default_na=False,
    na_values=[""],
)
# Columns are renamed by position: headword, SAMPA pronunciation, IPA pronunciation.
dfDicoBigi.columns = ['motVedette', 'sampa', 'ipa']
dfDicoBigi.sample(n=5)

Unnamed: 0,motVedette,sampa,ipa
103040,saxe,s { k s,sæks
32807,dope,d @U p,doʊp
68032,levies,l E v i: z,lɛviːz
49604,gunsalus,g @ n s A l @ s,gənsɑləs
37241,episteme,E p i s t i: m,ɛpistiːm


In [31]:
# Join both dictionaries on the headword; only headwords present in both are kept
# (pandas' default join type is 'inner').
dfDicoBuch = pd.merge(dfDicoBuch, dfDicoBigi, on=['motVedette'])
dfDicoBuch

Unnamed: 0,motVedette,nbSyll,origin,sampa,ipa
0,abacus,3,aʹbăcus,{ b @ k @ s,æbəkəs
1,abandon,3,ābăʹndon,@ b { n d @ n,əbændən
2,abandoned,4,ābăʹndŏned,@ b { n d @ n d,əbændənd
3,abash,2,ăbăʹſh,@ b { S,əbæʃ
4,abate,2,ăbāʹte,@ b eI t,əb?t
...,...,...,...,...,...
10328,zenith,2,zĕʹnith,z i: n @ T,ziːnəθ
10329,zest,1,zĕſt,z E s t,zɛst
10330,zink,1,zĭnk,z I N k,zɪŋk
10331,zodiac,3,zōʹdĭac,z @U 4 i: { k,zoʊɾiːæk


In [32]:
# Persist the merged Buchanan + Bigi table (semicolon-separated, no index column).
path = "./../02_OUTPUT/DicoTradBuch_ALL.csv"
dfDicoBuch.to_csv(
    path,
    sep=";",
    encoding="utf8",
    index=False,
)

In [35]:
# Smallest syllable count found in the merged table.
dfDicoBuch["nbSyll"].min()

0