# NLP using NLTK

### Download Data: Corpora, Models and Packages

In [None]:
import sys
import os
import numpy as np
import pandas as pd

In [None]:
# One-time setup: fetch the NLTK data (corpora, models and packages).
# It only has to be run once per environment.
import nltk
nltk.download() # run one single time

In [None]:
import urllib.request

# Local folder for the sample texts; joining with "" leaves a trailing path
# separator so that file names can be appended directly.
data_path = os.path.join("datasets", "")
download_path = "https://raw.githubusercontent.com/AbdelMahm/FSR/master/IDDLO-29-20/Notebooks/datasets/"
os.makedirs(data_path, exist_ok=True)

# Fetch each sample text into the local folder, announcing it first.
sample_files = ("text_ar.txt", "text_fr.txt")
for filename in sample_files:
    print("Downloading", filename)
    url = f"{download_path}{filename}"
    destination = f"{data_path}{filename}"
    urllib.request.urlretrieve(url, destination)

In [None]:
import sys
import os

data_path = os.path.join("datasets", "")

# Read both sample texts with an explicit encoding instead of relying on the
# platform default (see the "System default encoding" section): with a
# non-UTF-8 default the Arabic text fails to decode or comes out garbled.
# The downloaded files are assumed to be UTF-8 encoded.
# The `with` block guarantees both handles are closed once the text is read.
with open(data_path + 'text_fr.txt', 'r', encoding='utf-8') as file_fr, \
        open(data_path + 'text_ar.txt', 'r', encoding='utf-8') as file_ar:
    # Show the file objects themselves (name, mode, encoding) before reading.
    print(file_fr)
    print(file_ar)
    print('\n')

    text_fr = file_fr.read()
    text_ar = file_ar.read()

# Show the raw content of each sample.
print(text_fr)
print('\n')
print(text_ar)

### System default encoding

In [None]:
# Query the preferred text encoding of the current locale; the notebook
# displays the returned name as the cell result.
import locale
locale.getpreferredencoding()

### Detect Language

In [None]:
from nltk.classify import textcat

# Build the TextCat language identifier and apply it to the French sample.
cls = textcat.TextCat()
# Result of lang_dists (presumably one distance per candidate language);
# kept for inspection only, it is not used by the call below.
distances = cls.lang_dists(text_fr)
# Displayed as the cell result: the language guessed for the text.
cls.guess_language(text_fr)

### Arabic Reshaping for display

In [None]:
#import arabic_reshaper
#from bidi.algorithm import get_display
 
#reshaped_text = arabic_reshaper.reshape(text_ar)
#bidi_text = get_display(reshaped_text)
#print(bidi_text)

## Regular expressions

### Detect Numbers using re

In [None]:
import re

# Collect every maximal run of ASCII digits found in the Arabic sample text.
digit_run = re.compile(r'[0-9]+')
numbers = digit_run.findall(text_ar)
print(numbers)

### Detect Numbers using nltk

In [None]:
from nltk.tokenize import RegexpTokenizer
# Same digit pattern as the `re` example, this time applied through NLTK's
# RegexpTokenizer: each match of the pattern becomes one token.
tokenizer = RegexpTokenizer(r'[0-9]+')
numbers=tokenizer.tokenize(text_ar)
print(numbers)

In [None]:
# Dates: three numeric fields (1-4 digits, 1-2 digits, 1-4 digits) separated
# by '.', '-' or '/'.
dates_text = """5-2-2020, 15/2/2020, 2020/2/4 autre autre"""
date_pattern = re.compile(r'(\d{1,4}[.\-/]\d{1,2}[.\-/]\d{1,4})')
# The pattern has a single capturing group, so group(1) is the whole date.
dates = [found.group(1) for found in date_pattern.finditer(dates_text)]
print(dates)

In [None]:
# Email: a run of word characters, dots or hyphens on each side of an '@'.
email_text = """ahmed@dgi.gov.ma, maryam@dgi.ma ahmadi3maryam@gmail.com other text here"""
email_pattern = re.compile(r'[\w.-]+@[\w.-]+')
emails = email_pattern.findall(email_text)
print(emails)

## Tokenization

### Word Tokenization

In [None]:
from nltk.tokenize import word_tokenize

# word_tokenize first splits the text into sentences with a Punkt model and
# uses the English one unless told otherwise; select the French model for the
# French sample so that sentence boundaries follow French conventions.
print(word_tokenize(text_fr, language='french'))
print('\n')
# No language is passed for the Arabic sample, so the default model is used.
# NOTE(review): confirm whether an Arabic Punkt model is available in NLTK.
print(word_tokenize(text_ar))

In [None]:
# wordpunct_tokenize is a rule-based (regular-expression) tokenizer: it splits
# the text into runs of alphanumeric characters and runs of punctuation, so
# no trained model or language setting is involved.
from nltk.tokenize import wordpunct_tokenize
print(wordpunct_tokenize(text_ar))

### Sentence Tokenization

In [None]:
from nltk.tokenize import sent_tokenize

# sent_tokenize relies on a language-specific Punkt model and falls back to
# English when none is named; use the French model for the French sample.
print(sent_tokenize(text_fr, language='french'))
print('\n')
# The Arabic sample is split with the default model.
# NOTE(review): confirm whether an Arabic Punkt model is available in NLTK.
print(sent_tokenize(text_ar))

## POS (Part-Of-Speech) Tagging & Chunking 

### POS Tagging

In [None]:
from nltk import pos_tag

# Whitespace-split the French sample into a list of word strings.
text = text_fr.split()
print("After Split:",text)
print("\n")
# Tag every word with a part-of-speech label.
# NOTE(review): pos_tag is called without a language argument while the input
# is French — confirm the default tagger is appropriate for this text.
pos_tagged = pos_tag(text)
print("After Token:",pos_tagged)

### Parsing using a Chunk Parser

In [None]:
from nltk import RegexpParser

# Chunk grammar with a single rule named "mychunk": any number of tags
# matching NNP.? followed by one tag matching CD.?.
patterns= """mychunk:{<NNP.?>*<CD.?>}"""
chunk_parser = RegexpParser(patterns)
print("After Regex:",chunk_parser)
# Apply the grammar to the tagged words produced by the POS tagging cell.
output = chunk_parser.parse(pos_tagged)
print("After chunk parsing",output)
# Render the resulting tree graphically.
output.draw()

## Stemming and Lemmatization

### Stemming

In [None]:
import nltk
# Token lists consumed by the stemming and lemmatization cells.
# The French sample is tokenized with the French Punkt sentence model rather
# than the English default, so sentence boundaries follow French conventions.
tokens_fr = nltk.word_tokenize(text_fr, language='french')
# The Arabic sample keeps the default model.
tokens_ar = nltk.word_tokenize(text_ar)

#### French Stemming

##### Using SnowballStemmer

In [None]:
from nltk.stem.snowball import SnowballStemmer
# List the languages for which a Snowball stemmer is available.
print (SnowballStemmer.languages)

In [None]:
# Snowball stemmer for French, created with ignore_stopwords=True.
stemmer = SnowballStemmer("french", ignore_stopwords=True)

# Print each French token next to its stem.
for token in tokens_fr:
    print(f"{token} ===> {stemmer.stem(token)}")

In [None]:
# Snowball stemmer for Arabic, created with ignore_stopwords=True.
stemmer = SnowballStemmer("arabic", ignore_stopwords=True)

# Print each Arabic token next to its stem.
for token in tokens_ar:
    print(f"{token} ===> {stemmer.stem(token)}")

##### Using FrenchStemmer

In [None]:
from nltk.stem.snowball import FrenchStemmer

# Dedicated French stemmer class, built with its default settings.
stemmer = FrenchStemmer()

# Print each French token next to its stem.
for token in tokens_fr:
    print(f"{token} ===> {stemmer.stem(token)}")

#### Write stems in a file

In [None]:
# Write the stem of every French token to text_fr_stems.txt, each stem
# followed by a single space.
# Uses whichever `stemmer` is currently bound: run the FrenchStemmer cell
# immediately before this one.
stem_sentence = "".join(stemmer.stem(w) + " " for w in tokens_fr)

# Mode "w" creates the file or empties an existing one in a single step, the
# explicit encoding writes accented characters the same way on every
# platform, and the `with` block closes the file even if the write fails.
with open("text_fr_stems.txt", mode="w", encoding="utf-8") as fr_stem_file:
    fr_stem_file.write(stem_sentence)

### Arabic Stemming

#### ArabicStemmer

In [None]:
from nltk.stem.snowball import ArabicStemmer

# Dedicated Arabic stemmer class, built with its default settings.
stemmer = ArabicStemmer()

# Print each Arabic token next to its stem.
for token in tokens_ar:
    print(f"{token} ===> {stemmer.stem(token)}")

#### Arabic ARLSTem (More Recent 2017)

In [None]:
from nltk.stem.arlstem import ARLSTem

# ARLSTem Arabic stemmer, built with its default settings.
stemmer = ARLSTem()

# Print each Arabic token next to its stem.
for token in tokens_ar:
    print(f"{token} ===> {stemmer.stem(token)}")

### Lemmatization

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# WordNet-based lemmatizer applied to the French tokens.
wordnet_lemmatizer = WordNetLemmatizer()

# Print each token next to the lemma returned for it.
for token in tokens_fr:
    lemma = wordnet_lemmatizer.lemmatize(token)
    print(f"{token} ===> {lemma}")

## Parsing

In [None]:
# Toy context-free grammar: a sentence S is a noun phrase followed by a verb
# phrase, a verb phrase is a verb followed by a noun phrase, and the only
# terminals are 'Ahmed', 'book' and 'reads'.
grammar = nltk.CFG.fromstring("""S -> NP VP
VP -> V NP
NP -> 'Ahmed' | 'book'
V -> 'reads'
""")

In [None]:
# Tokens of a sentence built only from the terminals of the toy grammar.
text = nltk.word_tokenize("Ahmed reads book")

In [None]:
# Chart parser driven by the toy grammar.
parser = nltk.ChartParser(grammar)

# Collect every parse of the token list under that grammar.
trees = parser.parse_all(text)

# Print each parse tree found.
for tree in trees:
    print(tree)

In [None]:
# Parsers for other languages
#!brew install stanford-parser

In [None]:
from nltk.corpus import treebank
# Take the first parsed sentence of the treebank file 'wsj_0001.mrg' and
# print its tree.
tree = treebank.parsed_sents('wsj_0001.mrg')[0]
print(tree)

### Find RC in Excel File

In [None]:
import pandas as pd

# Load the BD_CHARIKA sheet of the establishments workbook, using the first
# column as the row index.
excel_path = data_path + 'BD_IDENTIFICATION_ETABLISSEMENTS_PRIVES_VF.xlsx'
charika_df = pd.read_excel(
    excel_path,
    sheet_name='BD_CHARIKA',
    index_col=0,
)

# Preview the first rows as the cell result.
charika_df.head()

In [None]:
tokens = wordpunct_tokenize(text_ar)

# The RC number is read from the token sitting two positions before the
# literal 'RC' token in the Arabic sample.
rc_position = tokens.index('RC')
RC_num = int(tokens[rc_position - 2])

print(RC_num)

# Index labels of the rows whose RC_CHARIKA column equals that number
# (append .tolist() to get them as a plain list).
rc_matches = charika_df['RC_CHARIKA'] == RC_num
charika_df.index[rc_matches]


In [None]:
## Edit distance

In [None]:
import nltk 
# Two near-identical sentences to compare.
sent1 = "It might help to re-install Python if possible."
sent2 = "It can help to install Python again if possible."
 
# Edit distance between the two strings, displayed as the cell result.
# The arguments are whole strings, not token lists, so the comparison is
# presumably character by character — confirm against the NLTK documentation.
nltk.edit_distance(sent1, sent2)

## Named Entity Detection

In [None]:
from nltk import pos_tag

# Same preparation as the POS tagging cell: whitespace-split the French
# sample, then tag each word.
text = text_fr.split()
print("After Split:",text)
print("\n")
pos_tagged = pos_tag(text)
print("After Token:",pos_tagged)

# Group the tagged words into named-entity chunks and draw the resulting tree.
# Relies on `nltk` having been imported by an earlier cell.
ne = nltk.ne_chunk(pos_tagged)
ne.draw()

In [None]:
from cltk.corpus.utils.importer import CorpusImporter
# French CLTK Corpora
corpus_importer = CorpusImporter('french')
# Show the corpora available for French. A bare attribute access in the
# middle of a cell is evaluated and discarded, so print it explicitly.
print(corpus_importer.list_corpora)
# Download the French data set used by the named-entity tagging cell.
corpus_importer.import_corpus('french_data_cltk')

In [None]:
# Arabic CLTK Corpora
#corpus_importer = CorpusImporter('arabic')
#corpus_importer.list_corpora
#corpus_importer.import_corpus('arabic_text_perseus')

In [None]:
from cltk.tag.ner import NamedEntityReplacer

# Short French sample mentioning a country, a weekday and a person's name.
text_str = """La France a célébré Vendredi dernier sa fête d'indépendance. François Sarkozy a prononcé son discours."""

ner_replacer = NamedEntityReplacer()

# Tag the named entities of the French sentence; the returned value is
# displayed as the cell result.
ner_replacer.tag_ner_fr(text_str)
