In [1]:
import spacy
from spacy.matcher import Matcher
import json
from pathlib import Path

# Location of the JSON file that maps BHSA node ids to French translation data.
BHSA2FRENCH = "../../_private_/French/bhsa2french.json"

# The entries hold Hebrew and accented French text, so decode explicitly as
# UTF-8 instead of depending on the platform's default text encoding.
with open(BHSA2FRENCH, 'r', encoding='utf-8') as infile:
    bhsa2french = json.load(infile)

# load Spacy model
nlp = spacy.load("fr_core_news_lg")

In [10]:
from tf.app import use
# Load the 'bhsa' Text-Fabric app and keep its handle in `A`.
# NOTE(review): `hoist=globals()` hands this notebook's global namespace to the
# loader, presumably so it can inject its API names there, and `silent='deep'`
# presumably suppresses load messages — confirm against the Text-Fabric docs.
A = use('bhsa', hoist=globals(), silent='deep')

In [13]:
# Inspect the record stored under key '3281'.
bhsa2french['3281']

{'wid': '00100701300014',
 'string': 'בָּ֣א',
 'parse': 'vbqal-perf3ms----H',
 'French': 'entrèrent',
 'french_tense': 'IM'}

In [9]:
# Inspect the record stored under key '3'.
bhsa2french['3']

{'wid': '00100100100006',
 'string': 'בָּרָ֣א',
 'parse': 'vbqal-perf3ms----H',
 'French': 'créa',
 'french_tense': 'PS'}

In [15]:
# Preview how the pipeline tags and lemmatises the first 101 French entries,
# collecting every parse in `parses`.
parses = []

for i, verb in enumerate(bhsa2french):
    if i > 100:
        break

    parse = nlp(bhsa2french[verb]['French'])
    parses.append(parse)

    # one tab-indented tag per token (one per line); lemmas joined on one line
    tags = '\n'.join(f'\t{token.tag_}' for token in parse)
    lemmas = '|'.join(token.lemma_ for token in parse)

    # layout: index, text, tags, tab + space + lemmas, then a blank line
    print(i, parse, tags, '\t ' + lemmas, sep='\n', end='\n\n')

0
créa
	VERB__Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin
	 créer

1
était
	AUX__Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin
	 être

2
bonne
	ADJ__Gender=Fem|Number=Sing
	 bon

3
appela il
	VERB__Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin
	PRON__Gender=Masc|Number=Sing|Person=3
	 appeler|il

4
appela il
	VERB__Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin
	PRON__Gender=Masc|Number=Sing|Person=3
	 appeler|il

5
bon
	ADJ__Gender=Masc|Number=Sing
	 bon

6
bon
	ADJ__Gender=Masc|Number=Sing
	 bon

7
bon
	ADJ__Gender=Masc|Number=Sing
	 bon

8
se mirent à grouiller
	PRON__Person=3|Reflex=Yes
	VERB__Mood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin
	ADP
	VERB__VerbForm=Inf
	 se|mettre|à|grouiller

9
bon
	ADJ__Gender=Masc|Number=Sing
	 bon

10
bon
	ADJ__Gender=Masc|Number=Sing
	 bon

11
il créa
	PRON__Gender=Masc|Number=Sing|Person=3
	VERB__Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin
	 il|créer

12
il créa
	PRON__Gender=Masc|Number=Sing|Person

In [66]:
# Sample sentence for trying out the pipeline. The "~" and the run of spaces
# inside "au-      dessus" are part of the literal and are parsed as written.
test = "La terre était un chaos, elle était vide~; il y avait des ténèbres au-dessus de l'abîme, et le souffle de Dieu tournoyait au-      dessus des eaux."
parse = nlp(test)

In [68]:
# Take the token at index 2 of the parsed sample and display it.
t = parse[2]
t

était

In [80]:
# Show the token's tag string.
t.tag_

'AUX__Mood=Ind|Number=Sing|Person=3|Tense=Imp|VerbForm=Fin'

In [73]:
# Run the tense matcher over the parsed sample and show the raw matches.
# `matcher` is built in the tense-rules cell further down in this file, so
# this cell only works once that cell has been executed.
match = matcher(parse)

match

[(15231430011407509631, 12, 13), (15231430011407509631, 25, 26)]

In [77]:
# Unpack the second match into its id and the two indices used to slice the parse.
str_id, begin, end = match[1]

In [78]:
# Show the slice of the parse between the unpacked begin and end indices.
parse[begin:end]

tournoyait

In [40]:
import re

In [56]:
# Scratch check: a word wrapped in \b anchors is found as a whole word.
test = 'is'
re.search(r'\b' + test + r'\b', 'That is nuts')

<re.Match object; span=(5, 7), match='is'>

In [26]:
# french tense rules
#
# Each entry is (label, token_pattern). `label` is the tense code and
# `token_pattern` is a list of per-token attribute dicts that is registered
# with the Matcher under that label.
# The regexes are raw strings so that `\|` reaches the regex engine as an
# escaped literal pipe rather than being an invalid Python string escape.

tense_rules = [

    # -- passé simple --
    # one token whose tag is a finite indicative VERB with Tense=Past
    (
        'PS',
        [
            {'TAG': {'REGEX': r'VERB__Mood=Ind.*Tense=Past\|VerbForm=Fin'}},
        ]
    ),

    # -- passé composé --
    # present-tense finite AUX with lemma avoir/être, followed by a VERB past participle
    (
        'PC',
        [
            {'TAG': {'REGEX': r'AUX__Mood=Ind.*Tense=Pres\|VerbForm=Fin'}, 'LEMMA': {'IN': ['avoir', 'être']}},
            {'TAG': {'REGEX': r'VERB.*Tense=Past\|VerbForm=Part'}},
        ]
    ),

    # -- l'imparfait --
    # one token whose tag is a finite indicative VERB with Tense=Imp
    # NOTE(review): only VERB tags are covered; a token tagged AUX__...Tense=Imp
    # (e.g. "était") is not matched by this rule — confirm this is intended.
    (
        'IM',
        [
            {'TAG': {'REGEX': r'VERB__Mood=Ind.*Tense=Imp\|VerbForm=Fin'}},
        ]
    ),
]

# set up matcher object with the new rules and test it
# Each rule is registered under its tense label.
# NOTE(review): `matcher.add(key, None, pattern)` looks like the spaCy v2 call
# form, with the second argument being the on_match callback — confirm against
# the installed spaCy version.
matcher = Matcher(nlp.vocab)
for tag, ruleset in tense_rules:
    matcher.add(tag, None, ruleset)

In [4]:
def parse_french(bhsa2french):
    """Label every entry with the tense code of its first matcher hit.

    Each value of ``bhsa2french`` is updated in place: its ``'French'`` text
    is run through the global ``nlp`` pipeline and the global ``matcher``,
    and the rule name of the first returned match is stored under
    ``'french_tense'``. Entries with no match get ``None``. Nothing is
    returned.
    """
    for entry in bhsa2french.values():
        hits = matcher(nlp(entry['French']))

        if not hits:
            entry['french_tense'] = None
            continue

        # a hit is (match id, start, end); the id is looked up to get the rule name
        rule_id, _, _ = hits[0]
        entry['french_tense'] = nlp.vocab.strings[rule_id]

In [5]:
# Annotate every entry of bhsa2french with its 'french_tense' value (in place).
parse_french(bhsa2french)

In [6]:
# Spot-check the record under key '3' after tense parsing.
bhsa2french['3']

{'wid': '00100100100006',
 'string': 'בָּרָ֣א',
 'parse': 'vbqal-perf3ms----H',
 'French': 'créa',
 'french_tense': 'PS'}

In [50]:
# Interactive help lookup using IPython's `?` syntax.
# NOTE(review): looks like a leftover from exploration — consider removing.
list.remove?

Signature: list.remove(self, value, /)
Docstring:
Remove first occurrence of value.

Raises ValueError if the value is not present.
Type:      method_descriptor


In [57]:
# Eyeball the tense label assigned to each of the first 101 entries.
for i, verb in enumerate(bhsa2french):
    if i > 100:
        break
    data = bhsa2french[verb]

    # layout: index, French text, tense label (or None), then a blank line
    print(i, data['French'], data['french_tense'], sep='\n', end='\n\n')

0
créa
PS

1
était
None

2
bonne
None

3
appela il
PS

4
appela il
PS

5
bon
None

6
bon
None

7
bon
None

8
se mirent à grouiller
PS

9
bon
None

10
bon
None

11
il créa
PS

12
il créa
PS

13
Je donne
None

14
il avait fait
None

15
il avait fait
None

16
il avait fait
None

17
se reposa
PS

18
en créant
None

19
façonné il avait
None

20
il trouva
PS

21
il avait prise
None

22
attachera s
None

23
ils deviendront
None

24
était
None

25
avait faits
None

26
a t il dit
None

27
dit
None

28
vous serez
None

29
Je t ai entendu
PC

30
Aurais tu mangé
None

31
tu as mise
None

32
a donné
PC

33
as tu fait
None

34
tu as fait
PC

35
dit il
None

36
il dit
None

37
tu as écouté
PC

38
mangeras tu
None

39
est devenue
PC

40
est devenu
PC

41
prendre
None

42
manger
None

43
vivre
None

44
eut des relations
PS

45
J ai produit
PC

46
il porta regard
PS

47
es fâché
PC

48
es tu renfrogné
None

49
Je sais
None

50
as tu fait
None

51
a ouvert
PC

52
je serai
None

53
engendra
None

54
engen

In [41]:
import pandas as pd

In [43]:
# export very basic dataset

# Build one record per entry: the key as an integer node id, the French text,
# and the tense label, then load the records into a DataFrame.
rows = [
    {
        'bhsa_node': int(verb),
        'french': data['French'],
        'french_tense': data['french_tense'],
    }
    for verb, data in bhsa2french.items()
]

df = pd.DataFrame(rows)

df.head()

Unnamed: 0,bhsa_node,french,french_tense
0,3,créa,PS
1,15,était,
2,47,bonne,
3,69,appela il,PS
4,172,appela il,PS


In [50]:
# Tally rows per tense label; dropna=False keeps the unlabelled rows in the count.
parsed_ct = df['french_tense'].value_counts(dropna=False)

parsed_ct

NaN    9037
PC     2507
PS     1013
IM      667
Name: french_tense, dtype: int64

In [52]:
# Turn the per-label counts into proportions of the total.
parsed_pr = parsed_ct / parsed_ct.sum()

parsed_pr

NaN    0.683379
PC     0.189580
PS     0.076603
IM     0.050439
Name: french_tense, dtype: float64

In [54]:
# Write the dataset to CSV, leaving out the DataFrame index column.
df.to_csv('../../../results/datasets/qtl/french.csv', index=False)