<table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/AbdelMahm/DGI-Lab/blob/master/day2/NLP_with_NLTK.ipynb"><img src="https://colab.research.google.com/img/colab_favicon_256px.png" />Run in Google Colab</a>
  </td>
</table>

In [None]:
# NLP using NLTK

### Download Data: Corpora, Models and Packages

In [4]:
import sys
import os
import numpy as np
import pandas as pd

In [5]:
import nltk
#nltk.download() # run one single time

In [6]:
data_path = os.path.join("datasets", "")
scans_path = os.path.join("datasets", "scans", "")

import urllib.request

download_path = "https://raw.githubusercontent.com/AbdelMahm/DGI-Lab/master/"
os.makedirs(data_path, exist_ok=True)
for filename in ("BD_IDENTIFICATION_ETABLISSEMENTS_PRIVES_VF.xlsx", "glove.6B.50d.txt", "text_ar.txt", "text_fr.txt"):
    print("Downloading", filename)
    url = download_path + "day2/datasets/" + filename
    urllib.request.urlretrieve(url, data_path + filename)

os.makedirs(scans_path, exist_ok=True)
for filename in ("img_ara.png", "img_eng.jpg", "img_fra.png"):
    print("Downloading", filename)
    url = download_path + "day2/datasets/scans/" + filename
    urllib.request.urlretrieve(url, scans_path + filename)

Downloading BD_IDENTIFICATION_ETABLISSEMENTS_PRIVES_VF.xlsx


HTTPError: HTTP Error 404: Not Found

In [2]:

file_fr = open(download_path + 'text_fr.txt', 'r')
file_ar = open(download_path + 'text_ar.txt', 'r')

print(file_fr)
print(file_ar)
print('\n')

text_fr = file_fr.read()
text_ar = file_ar.read()

print(text_fr)
print('\n')
print(text_ar)

FileNotFoundError: [Errno 2] No such file or directory: 'text_fr.txt'

### System default encoding

In [76]:
import locale
locale.getpreferredencoding()

'UTF-8'

### Detect Language

In [77]:
from nltk.classify import textcat

cls = textcat.TextCat()
distances = cls.lang_dists(text_ar)
cls.guess_language(text_ar)

'arb'

## Regular expressions

### Detect Numbers using re

In [79]:
import re 

numbers = re.findall(r'[0-9]+', text_ar) 
print(numbers) 

['31', '109433', '445566778899']


### Detect Numbers using nltk

In [80]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'[0-9]+')
numbers=tokenizer.tokenize(text_ar)
print(numbers)

['31', '109433', '445566778899']


In [81]:
#Dates
dates_text = """5-2-2020, 15/2/2020, 2020/2/4 autre autre"""
dates = re.findall(r'(\d{1,4}[.\-/]\d{1,2}[.\-/]\d{1,4})', dates_text) 
print(dates) 

['5-2-2020', '15/2/2020', '2020/2/4']


In [82]:
# Email
email_text = """ahmed@dgi.gov.ma, maryam@dgi.ma ahmadi3maryam@gmail.com other text here"""
emails = re.findall(r'[\w.-]+@[\w.-]+', email_text) 
print(emails)

['ahmed@dgi.gov.ma', 'maryam@dgi.ma', 'ahmadi3maryam@gmail.com']


## Tokenization

### Word Tokenization

In [83]:
from nltk.tokenize import word_tokenize
print(word_tokenize(text_fr))
print('\n')
print(word_tokenize(text_ar))

['Monsieur', 'le', 'directeur', ',', "j'ai", 'le', 'plaisir', 'de', 'vous', 'informer', "d'un", 'cas', "d'evasion", 'fiscale', 'concernant', 'la', 'société', 'SOKA', 'dirigé', 'par', 'Monsieur', 'Ahmadi', 'Ahmed', '.', 'La', 'société', 'est', 'domiciliée', 'à', "l'adresse", '31', 'Boulevard', 'ANNASR', ',', 'Rabat', ',', 'Maroc', '.', 'Son', 'registre', 'de', 'commerce', 'RC', ':', '109433', 'et', 'ICE', ':', '445566778899', '.', 'Veuillez', 'agréer', 'Monsieur', ',', 'mes', 'salutations', 'distinguées', '.']


['السيد', 'المدير،', 'يسرني', 'أن', 'أبلغكم', 'بحالة', 'التهرب', 'الضريبي', 'فيما', 'يتعلق', 'بشركة', 'SOKA', 'برئاسة', 'السيد', 'أحمدي', 'أحمد', '.', 'يقع', 'مقر', 'الشركة', 'في', '31', 'شارع', 'النصر،', 'الرباط،', 'المغرب', '.', 'السجل', 'التجاري', '109433', ':', 'RC', 'و', '445566778899', ':', 'ICE', '.تفضلوا', 'سيدي', 'بقبول', 'أطيب', 'تحياتي', '.']


In [84]:
#this tokenizer is traind on more data
from nltk.tokenize import wordpunct_tokenize
print(wordpunct_tokenize(text_ar))

['السيد', 'المدير', '،', 'يسرني', 'أن', 'أبلغكم', 'بحالة', 'التهرب', 'الضريبي', 'فيما', 'يتعلق', 'بشركة', 'SOKA', 'برئاسة', 'السيد', 'أحمدي', 'أحمد', '.', 'يقع', 'مقر', 'الشركة', 'في', '31', 'شارع', 'النصر', '،', 'الرباط', '،', 'المغرب', '.', 'السجل', 'التجاري', '109433', ':', 'RC', 'و', '445566778899', ':', 'ICE', '.', 'تفضلوا', 'سيدي', 'بقبول', 'أطيب', 'تحياتي', '.']


### Sentence Tokenization

In [85]:
from nltk.tokenize import sent_tokenize
print(sent_tokenize(text_fr))
print('\n')
print(sent_tokenize(text_ar))

["Monsieur le directeur, j'ai le plaisir de vous informer d'un cas d'evasion fiscale concernant la société SOKA dirigé par Monsieur Ahmadi Ahmed.", "La société est domiciliée à l'adresse 31 Boulevard ANNASR, Rabat, Maroc.", 'Son registre de commerce RC: 109433 et ICE: 445566778899.', 'Veuillez agréer Monsieur, mes salutations distinguées.']


['السيد المدير، يسرني أن أبلغكم بحالة التهرب الضريبي فيما يتعلق بشركة SOKA برئاسة السيد أحمدي أحمد.', 'يقع مقر الشركة في 31 شارع النصر، الرباط، المغرب.', 'السجل التجاري 109433:RC و 445566778899 :ICE .تفضلوا سيدي بقبول أطيب تحياتي.']


## POS (Part-Of-Speech) Tagging & Chunking 

### POS Tagging

In [86]:
from nltk import pos_tag

text = text_fr.split()
print("After Split:",text)
print("\n")
pos_tagged = pos_tag(text)
print("After Token:",pos_tagged)

After Split: ['Monsieur', 'le', 'directeur,', "j'ai", 'le', 'plaisir', 'de', 'vous', 'informer', "d'un", 'cas', "d'evasion", 'fiscale', 'concernant', 'la', 'société', 'SOKA', 'dirigé', 'par', 'Monsieur', 'Ahmadi', 'Ahmed.', 'La', 'société', 'est', 'domiciliée', 'à', "l'adresse", '31', 'Boulevard', 'ANNASR,', 'Rabat,', 'Maroc.', 'Son', 'registre', 'de', 'commerce', 'RC:', '109433', 'et', 'ICE:', '445566778899.', 'Veuillez', 'agréer', 'Monsieur,', 'mes', 'salutations', 'distinguées.']


After Token: [('Monsieur', 'NNP'), ('le', 'CC'), ('directeur,', 'JJ'), ("j'ai", 'NN'), ('le', 'NN'), ('plaisir', 'NN'), ('de', 'IN'), ('vous', 'JJ'), ('informer', 'NN'), ("d'un", 'NN'), ('cas', 'NN'), ("d'evasion", 'NN'), ('fiscale', 'NN'), ('concernant', 'NN'), ('la', 'NN'), ('société', 'FW'), ('SOKA', 'NNP'), ('dirigé', 'NN'), ('par', 'NN'), ('Monsieur', 'NNP'), ('Ahmadi', 'NNP'), ('Ahmed.', 'NNP'), ('La', 'NNP'), ('société', 'NN'), ('est', 'JJS'), ('domiciliée', 'NN'), ('à', 'NNP'), ("l'adresse", 'VBZ'

### Parsing using a Chunk Parser

In [53]:
from nltk import RegexpParser

patterns= """mychunk:{<NNP.?>*<CD.?>}"""
chunk_parser = RegexpParser(patterns)
print("After Regex:",chunk_parser)
output = chunk_parser.parse(pos_tagged)
print("After chunk parsing",output)
output.draw()

After Regex: chunk.RegexpParser with 1 stages:
RegexpChunkParser with 1 rules:
       <ChunkRule: '<NNP.?>*<CD.?>'>
After chunk parsing (S
  Monsieur/NNP
  le/CC
  directeur,/JJ
  j'ai/NN
  le/NN
  plaisir/NN
  de/IN
  vous/JJ
  informer/NN
  d'un/NN
  cas/NN
  d'evasion/NN
  fiscale/NN
  concernant/NN
  la/NN
  société/FW
  SOKA/NNP
  dirigé/NN
  par/NN
  Monsieur/NNP
  Ahmadi/NNP
  Ahmed./NNP
  La/NNP
  société/NN
  est/JJS
  domiciliée/NN
  à/NNP
  l'adresse/VBZ
  (mychunk 31/CD)
  Boulevard/NNP
  ANNASR,/NNP
  Rabat,/NNP
  Maroc./NNP
  Son/NNP
  registre/FW
  de/FW
  commerce/NN
  (mychunk RC:/NNP 109433/CD)
  et/NN
  (mychunk ICE:/NNP 445566778899./CD)
  Veuillez/NNP
  agréer/NN
  Monsieur,/NNP
  mes/VBZ
  salutations/NNS
  distinguées./NN)


## Stemming and Lemmatization

### Stemming

In [87]:
import nltk
tokens_fr = nltk.word_tokenize(text_fr)
tokens_ar = nltk.word_tokenize(text_ar)

#### French Stemming

##### Using SnowballStemmer

In [88]:
from nltk.stem.snowball import SnowballStemmer
print (SnowballStemmer.languages)

('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


In [89]:
stemmer=SnowballStemmer("french", ignore_stopwords=True)
for w in tokens_fr:
    print("{} ===> {}".format(w,stemmer.stem(w)))

Monsieur ===> monsieur
le ===> le
directeur ===> directeur
, ===> ,
j'ai ===> j'ai
le ===> le
plaisir ===> plais
de ===> de
vous ===> vous
informer ===> inform
d'un ===> d'un
cas ===> cas
d'evasion ===> d'evas
fiscale ===> fiscal
concernant ===> concern
la ===> la
société ===> societ
SOKA ===> sok
dirigé ===> dirig
par ===> par
Monsieur ===> monsieur
Ahmadi ===> ahmad
Ahmed ===> ahmed
. ===> .
La ===> la
société ===> societ
est ===> est
domiciliée ===> domicili
à ===> à
l'adresse ===> l'adress
31 ===> 31
Boulevard ===> boulevard
ANNASR ===> annasr
, ===> ,
Rabat ===> rabat
, ===> ,
Maroc ===> maroc
. ===> .
Son ===> son
registre ===> registr
de ===> de
commerce ===> commerc
RC ===> rc
: ===> :
109433 ===> 109433
et ===> et
ICE ===> ice
: ===> :
445566778899 ===> 445566778899
. ===> .
Veuillez ===> veuill
agréer ===> agré
Monsieur ===> monsieur
, ===> ,
mes ===> mes
salutations ===> salut
distinguées ===> distingu
. ===> .


In [90]:
stemmer=SnowballStemmer("arabic", ignore_stopwords=True)
for w in tokens_ar:
    print("{} ===> {}".format(w,stemmer.stem(w)))

السيد ===> سيد
المدير، ===> مدير
يسرني ===> يسر
أن ===> أن
أبلغكم ===> ابلغ
بحالة ===> حال
التهرب ===> تهرب
الضريبي ===> ضريب
فيما ===> فيما
يتعلق ===> يتعلق
بشركة ===> شرك
SOKA ===> SOKA
برئاسة ===> رياس
السيد ===> سيد
أحمدي ===> احمد
أحمد ===> احمد
. ===> .
يقع ===> يقع
مقر ===> مقر
الشركة ===> شرك
في ===> في
31 ===> 31
شارع ===> شارع
النصر، ===> نصر
الرباط، ===> رباط
المغرب ===> مغرب
. ===> .
السجل ===> سجل
التجاري ===> تجار
109433 ===> 109433
: ===> :
RC ===> RC
و ===> و
445566778899 ===> 445566778899
: ===> :
ICE ===> ICE
.تفضلوا ===> .تفضل
سيدي ===> سيد
بقبول ===> قبول
أطيب ===> اطيب
تحياتي ===> تح
. ===> .


##### Using FrenchStemmer

In [91]:
from nltk.stem.snowball import FrenchStemmer

stemmer  = FrenchStemmer()
for w in tokens_fr:
    print("{} ===> {}".format(w,stemmer.stem(w)))

Monsieur ===> monsieur
le ===> le
directeur ===> directeur
, ===> ,
j'ai ===> j'ai
le ===> le
plaisir ===> plais
de ===> de
vous ===> vous
informer ===> inform
d'un ===> d'un
cas ===> cas
d'evasion ===> d'evas
fiscale ===> fiscal
concernant ===> concern
la ===> la
société ===> societ
SOKA ===> sok
dirigé ===> dirig
par ===> par
Monsieur ===> monsieur
Ahmadi ===> ahmad
Ahmed ===> ahmed
. ===> .
La ===> la
société ===> societ
est ===> est
domiciliée ===> domicili
à ===> à
l'adresse ===> l'adress
31 ===> 31
Boulevard ===> boulevard
ANNASR ===> annasr
, ===> ,
Rabat ===> rabat
, ===> ,
Maroc ===> maroc
. ===> .
Son ===> son
registre ===> registr
de ===> de
commerce ===> commerc
RC ===> rc
: ===> :
109433 ===> 109433
et ===> et
ICE ===> ice
: ===> :
445566778899 ===> 445566778899
. ===> .
Veuillez ===> veuill
agréer ===> agré
Monsieur ===> monsieur
, ===> ,
mes ===> me
salutations ===> salut
distinguées ===> distingu
. ===> .


#### Write stems in a file

In [92]:
fr_stem_file=open("text_fr_stems.txt",mode="a+")
fr_stem_file.truncate(0)
stem_sentence = []
for w in tokens_fr:
    stem_sentence.append(stemmer.stem(w))
    stem_sentence.append(" ")
stem_sentence = "".join(stem_sentence)        
fr_stem_file.write(stem_sentence)

fr_stem_file.close()

### Arabic Stemming

#### ArabicStemmer

In [93]:
from nltk.stem.snowball import ArabicStemmer

stemmer  = ArabicStemmer()

for w in tokens_ar:
    print("{} ===> {}".format(w,stemmer.stem(w)))

السيد ===> سيد
المدير، ===> مدير
يسرني ===> يسر
أن ===> أن
أبلغكم ===> ابلغ
بحالة ===> حال
التهرب ===> تهرب
الضريبي ===> ضريب
فيما ===> فيم
يتعلق ===> يتعلق
بشركة ===> شرك
SOKA ===> SOKA
برئاسة ===> رياس
السيد ===> سيد
أحمدي ===> احمد
أحمد ===> احمد
. ===> .
يقع ===> يقع
مقر ===> مقر
الشركة ===> شرك
في ===> في
31 ===> 31
شارع ===> شارع
النصر، ===> نصر
الرباط، ===> رباط
المغرب ===> مغرب
. ===> .
السجل ===> سجل
التجاري ===> تجار
109433 ===> 109433
: ===> :
RC ===> RC
و ===> و
445566778899 ===> 445566778899
: ===> :
ICE ===> ICE
.تفضلوا ===> .تفضل
سيدي ===> سيد
بقبول ===> قبول
أطيب ===> اطيب
تحياتي ===> تح
. ===> .


#### Arabic ARLSTem (More Recent 2017)

In [94]:
from nltk.stem.arlstem import ARLSTem

stemmer  = ARLSTem()

for w in tokens_ar:
    print("{} ===> {}".format(w,stemmer.stem(w)))

السيد ===> سيد
المدير، ===> مدير،
يسرني ===> سرني
أن ===> ان
أبلغكم ===> بلغ
بحالة ===> بحال
التهرب ===> تهرب
الضريبي ===> ضريبي
فيما ===> فيم
يتعلق ===> تعلق
بشركة ===> بشرك
SOKA ===> SOKA
برئاسة ===> برئاس
السيد ===> سيد
أحمدي ===> حمد
أحمد ===> حمد
. ===> .
يقع ===> يقع
مقر ===> مقر
الشركة ===> شرك
في ===> في
31 ===> 31
شارع ===> شارع
النصر، ===> نصر،
الرباط، ===> رباط،
المغرب ===> مغرب
. ===> .
السجل ===> سجل
التجاري ===> تجاري
109433 ===> 109433
: ===> :
RC ===> RC
و ===> و
445566778899 ===> 445566778899
: ===> :
ICE ===> ICE
.تفضلوا ===> .تفضل
سيدي ===> سيدي
بقبول ===> بقبول
أطيب ===> طيب
تحياتي ===> حياتي
. ===> .


### Lemmatization

In [62]:
import nltk
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
for w in tokens_fr:
    print("{} ===> {}".format(w, wordnet_lemmatizer.lemmatize(w)))  

Monsieur ===> Monsieur
le ===> le
directeur ===> directeur
, ===> ,
j'ai ===> j'ai
le ===> le
plaisir ===> plaisir
de ===> de
vous ===> vous
informer ===> informer
d'un ===> d'un
cas ===> ca
d'evasion ===> d'evasion
fiscale ===> fiscale
concernant ===> concernant
la ===> la
société ===> société
SOKA ===> SOKA
dirigé ===> dirigé
par ===> par
Monsieur ===> Monsieur
Ahmadi ===> Ahmadi
Ahmed ===> Ahmed
. ===> .
La ===> La
société ===> société
est ===> est
domiciliée ===> domiciliée
à ===> à
l'adresse ===> l'adresse
31 ===> 31
Boulevard ===> Boulevard
ANNASR ===> ANNASR
, ===> ,
Rabat ===> Rabat
, ===> ,
Maroc ===> Maroc
. ===> .
Son ===> Son
registre ===> registre
de ===> de
commerce ===> commerce
RC ===> RC
: ===> :
109433 ===> 109433
et ===> et
ICE ===> ICE
: ===> :
445566778899 ===> 445566778899
. ===> .
Veuillez ===> Veuillez
agréer ===> agréer
Monsieur ===> Monsieur
, ===> ,
mes ===> me
salutations ===> salutation
distinguées ===> distinguées
. ===> .


## Parsing

In [95]:
grammar = nltk.CFG.fromstring("""S -> NP VP
VP -> V NP
NP -> 'Ahmed' | 'book'
V -> 'reads'
""")

In [96]:
text = nltk.word_tokenize("Ahmed reads book")

In [97]:
parser = nltk.ChartParser(grammar)

trees = parser.parse_all(text)

for tree in trees:
    print(tree)

(S (NP Ahmed) (VP (V reads) (NP book)))


In [98]:
#parsers for other langauges
#!brew install stanford-parser

In [99]:
from nltk.corpus import treebank
tree = treebank.parsed_sents('wsj_0001.mrg')[0]
print(tree)

(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))


### Find RC in Excel File

In [100]:
import pandas as pd

charika_df = pd.read_excel('BD_IDENTIFICATION_ETABLISSEMENTS_PRIVES_VF.xlsx', sheet_name='BD_CHARIKA', index_col=0)

charika_df.head()

Unnamed: 0_level_0,Activite_CHARIKA,RC_CHARIKA,Tribunal_CHARIKA,Adresse_CHARIKA
RAISON_SOCIALE_CHARIKA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Instrimpex maghreb,"import & export, negoce international",109433,Casablanca,8 Angle Rue Cadi Bekkar Et Lot Ettabib - Casa...
Bach pull,"tricotage, confection de toute gamme , import...",109365,Casablanca,"75, Avenue (b) Drissia - 2, El Fida 20550 - A..."
Societe casablanca de marches,"tous les travaux de marches, reparation de ca...",110187,Casablanca,
Sasmie,import export et le commerce en general,54187,Rabat,
Coop voyages,agence de voyages,110751,Casablanca,


In [101]:
tokens = wordpunct_tokenize(text_ar)

RC_num = int(tokens[tokens.index('RC') - 2])

print(RC_num)

charika_df.index[charika_df['RC_CHARIKA'] == RC_num]

#charika_df.index[charika_df['RC_CHARIKA'] == RC_num].tolist()


109433


Index(['Instrimpex maghreb ', '1000 metal '], dtype='object', name='RAISON_SOCIALE_CHARIKA')

### Edit distance (Levenstein distance)

In [6]:
import nltk 
sent1 = "It might help to re-install Python if possible."
sent2 = "It can help to install Python again if possible."
 
nltk.edit_distance(sent1, sent2)

14

## Named Entity Detection

In [70]:
from nltk import pos_tag

text = text_fr.split()
print("After Split:",text)
print("\n")
pos_tagged = pos_tag(text)
print("After Token:",pos_tagged)

ne = nltk.ne_chunk(pos_tagged)
ne.draw()

After Split: ['Monsieur', 'le', 'directeur,', "j'ai", 'le', 'plaisir', 'de', 'vous', 'informer', "d'un", 'cas', "d'evasion", 'fiscale', 'concernant', 'la', 'société', 'SOKA', 'dirigé', 'par', 'Monsieur', 'Ahmadi', 'Ahmed.', 'La', 'société', 'est', 'domiciliée', 'à', "l'adresse", '31', 'Boulevard', 'ANNASR,', 'Rabat,', 'Maroc.', 'Son', 'registre', 'de', 'commerce', 'RC:', '109433', 'et', 'ICE:', '445566778899.', 'Veuillez', 'agréer', 'Monsieur,', 'mes', 'salutations', 'distinguées.']


After Token: [('Monsieur', 'NNP'), ('le', 'CC'), ('directeur,', 'JJ'), ("j'ai", 'NN'), ('le', 'NN'), ('plaisir', 'NN'), ('de', 'IN'), ('vous', 'JJ'), ('informer', 'NN'), ("d'un", 'NN'), ('cas', 'NN'), ("d'evasion", 'NN'), ('fiscale', 'NN'), ('concernant', 'NN'), ('la', 'NN'), ('société', 'FW'), ('SOKA', 'NNP'), ('dirigé', 'NN'), ('par', 'NN'), ('Monsieur', 'NNP'), ('Ahmadi', 'NNP'), ('Ahmed.', 'NNP'), ('La', 'NNP'), ('société', 'NN'), ('est', 'JJS'), ('domiciliée', 'NN'), ('à', 'NNP'), ("l'adresse", 'VBZ'

In [71]:
from cltk.corpus.utils.importer import CorpusImporter
# French CLTK Corpora
corpus_importer = CorpusImporter('french')
corpus_importer.list_corpora
corpus_importer.import_corpus('french_data_cltk')

In [72]:
# Arabic CLTK Corpora
#corpus_importer = CorpusImporter('arabic')
#corpus_importer.list_corpora
#corpus_importer.import_corpus('arabic_text_perseus')

In [73]:
from cltk.tag.ner import NamedEntityReplacer

text_str = """La France a célébré Vendredi dernier sa fête d'indépendance. François Sarkozy a prononcé son discours."""

ner_replacer = NamedEntityReplacer()

ner_replacer.tag_ner_fr(text_str)


[('La',),
 [('France', 'entity', 'LOC')],
 ('a',),
 ('célébré',),
 ('Vendredi',),
 ('dernier',),
 ('sa',),
 ('fête',),
 ("d'",),
 ('indépendance',),
 ('.',),
 [('François', 'entity', 'NAT')],
 ('Sarkozy',),
 ('a',),
 ('prononcé',),
 ('son',),
 ('discours',),
 ('.',)]