# Transformers


## Sentiment analysis

In [3]:
!pip install transformers 



In [4]:
from transformers import pipeline

# Sentiment-analysis pipeline; the checkpoint is named explicitly.
nlp = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

# Score two short French phrases. The raw prediction dict is echoed for the
# first phrase only; every phrase gets the formatted label/percentage line.
for position, phrase in enumerate(("bien reçu", "mauvais remarque")):
    result = nlp(phrase)[0]  # the pipeline returns a list; keep its first entry
    if position == 0:
        print(result)
    print(f"label: {result['label']}, with score: {round(result['score']*100, 2)}%")

{'label': '4 stars', 'score': 0.42931947112083435}
label: 4 stars, with score: 42.93%
label: 1 star, with score: 57.96%


## Text generation

In [None]:
from transformers import pipeline

In [None]:
# French text generation with an explicitly named checkpoint.
TXT_GEN_FR = pipeline(
    'text-generation',
    model='dbddv01/gpt2-french-small',
)

# Continue a short French prompt; sampling is disabled (do_sample=False) and
# max_length=50 is forwarded to the generation call.
french_prompt = "je parle avec"
french_generation = TXT_GEN_FR(french_prompt, max_length=50, do_sample=False)
print(french_generation)

In [None]:
# Arabic text generation with an explicitly named checkpoint.
TXT_GEN_AR = pipeline(
    'text-generation',
    model='akhooli/gpt2-small-arabic',
)

# Continue a short Arabic prompt; sampling is disabled (do_sample=False) and
# max_length=50 is forwarded to the generation call.
arabic_prompt = "تعتبر مدينة الرباط"
arabic_generation = TXT_GEN_AR(arabic_prompt, max_length=50, do_sample=False)
print(arabic_generation)


## Named entity recognition (NER)

In [None]:

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# One "ner" pipeline per language; each passes the same checkpoint name for
# both the model and the tokenizer.
NER_ENG_REC = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    tokenizer="dslim/bert-base-NER",
)
NER_AR_REC = pipeline(
    "ner",
    model="hatmimoha/arabic-ner",
    tokenizer="hatmimoha/arabic-ner",
)
# NOTE(review): the checkpoint name suggests a part-of-speech tagging model
# rather than an NER model — confirm this is the intended French checkpoint.
NER_FR_REC = pipeline(
    "ner",
    model="gilf/french-postag-model",
    tokenizer="gilf/french-postag-model",
)

# Run every recognizer on a short sample and print the raw pipeline output
# (Arabic first, then English, then French).
for recognizer, sample in (
    (NER_AR_REC, "تعتبر مدينة الرباط"),
    (NER_ENG_REC, "good job"),
    (NER_FR_REC, "je parle avec"),
):
    print(recognizer(sample))

## Question answering

In [None]:
from transformers import pipeline
# No model is named, so the pipeline uses the library's default
# question-answering checkpoint.
# NOTE(review): the question and context below are French — confirm the
# default checkpoint is suitable for French text.
Ques_Answ = pipeline("question-answering")

# Passage the answer is extracted from.
context = """
La Mauritanie, en forme longue la République islamique de Mauritanie est un État d'Afrique du Nord-Ouest, situé entre 15 et 27 degrés de latitude nord et 5 et 17 degrés de longitude ouest..
"""
Question = "Quelle est La Mauritanie?"

# Ask the question against the passage and print the 'answer' field of the result.
Result = Ques_Answ(context=context, question=Question)
answer_text = Result['answer']
print("Reponse:", answer_text)




## Filling masked text

In [None]:
from transformers import pipeline

from pprint import pprint

# No model is named, so the pipeline uses the library's default fill-mask
# checkpoint. Note that this rebinds `nlp`, the name the sentiment-analysis
# cell also assigns.
nlp = pipeline("fill-mask")

# Insert the tokenizer's own mask token so the placeholder always matches
# whatever checkpoint was loaded.
masked_sentence = f"Les coronavirus sont des {nlp.tokenizer.mask_token} de la famille des Coronaviridae."
mask_predictions = nlp(masked_sentence)
pprint(mask_predictions)



In [None]:
# Arabic fill-mask with an explicitly named checkpoint; the sentence carries a
# literal [MASK] placeholder for the model to fill in.
arabic_fill_mask = pipeline(
    'fill-mask',
    model='CAMeL-Lab/bert-base-camelbert-ca',
)
arabic_masked_sentence = "سكان كلميمة يحذرون من استنزاف [MASK]‬ الفرشة المائية ."
pprint(arabic_fill_mask(arabic_masked_sentence))

## Summarization

In [None]:
from transformers import pipeline

# No model is named, so the pipeline uses the library's default summarization
# checkpoint.
# NOTE(review): the article below is French — confirm the default checkpoint
# is suitable for French text.
summarizer = pipeline("summarization")

# Text to summarize.
ARTICLE = """ 
La Mauritanie, en forme longue la République islamique de Mauritanie est un État d'Afrique du Nord-Ouest, situé entre 15 et 27 degrés de latitude nord et 5 et 17 degrés de longitude ouest.
"""

# Sampling is disabled; the length bounds are forwarded to the generation call.
article_summary = summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False)
print(article_summary)

## Translation

In [None]:
from transformers import pipeline
# English to French
# pprint is imported here so this cell also runs on its own; otherwise it only
# works after the fill-mask cell (which imports pprint) has been executed.
from pprint import pprint

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# No model is named, so the pipeline uses the library's default checkpoint for
# the "translation_en_to_fr" task.
translator = pipeline("translation_en_to_fr")

# max_length=40 is forwarded to generation and bounds the produced translation.
english_sentence = "The country's name derives from the name of the ancient Berber kingdom of Mauretania, located in present-day Morocco and Algeria."
pprint(translator(english_sentence, max_length=40))


In [None]:
# English to Arabic

from transformers import MarianTokenizer, MarianMTModel

# Single checkpoint name shared by the tokenizer and the model so the two
# cannot drift apart.
EN_AR_CHECKPOINT = "marefa-nlp/marefa-mt-en-ar"

tokenizer = MarianTokenizer.from_pretrained(EN_AR_CHECKPOINT)
model = MarianMTModel.from_pretrained(EN_AR_CHECKPOINT)

text = "The country's name derives from the name of the ancient Berber kingdom of Mauretania, located in present-day Morocco and Algeria."

# Encode by calling the tokenizer directly: `prepare_seq2seq_batch` is
# deprecated in Transformers. The text is wrapped in a list so the encoded
# batch has one row per input sentence.
encoded_text = tokenizer([text], return_tensors="pt", padding=True, truncation=True)
translated_tokens = model.generate(**encoded_text)

# Decode every generated sequence back to text, dropping special tokens.
Output_text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

print(Output_text)

In [None]:
# Arabic to English
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# pprint is imported here so this cell also runs on its own; otherwise it only
# works after the fill-mask cell (which imports pprint) has been executed.
from pprint import pprint

text_ar = "الجمهورية الإسلامية الموريتانية"

# Single checkpoint name shared by the model and the tokenizer.
MBART_CHECKPOINT = "facebook/mbart-large-50-many-to-many-mmt"

# Note: this rebinds `model` and `tokenizer`, the names the English-to-Arabic
# cell also assigns.
model = MBartForConditionalGeneration.from_pretrained(MBART_CHECKPOINT)
tokenizer = MBart50TokenizerFast.from_pretrained(MBART_CHECKPOINT)

# Declare the source language before encoding the Arabic text.
tokenizer.src_lang = "ar_AR"
encoded_ar = tokenizer(text_ar, return_tensors="pt")

# Force generation to begin with the token id registered for "en_XX" so the
# output is produced in English.
generated_tokens = model.generate(
    **encoded_ar,
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"],
)
pprint(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True))


## Feature extraction

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: three short English sentences.
sentences = [
    "its very easy",
    "you can use it ",
    "You are a very good software engineer, engineer.",
]

# stop_words='english' makes the vectorizer drop scikit-learn's built-in
# English stop words when building the vocabulary.
vectorizer = CountVectorizer(stop_words='english')

vectorizer.fit(sentences)

# Show the learned vocabulary as a plain list. get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and only fall back to the old method on versions
# that predate it.
(
    vectorizer.get_feature_names_out().tolist()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

['easy', 'engineer', 'good', 'software', 'use']