# NLP TP - TRANSFORMERS :         

    . Sentiment Analysis
    . Text Generation
    . Named Entity Recognition
    . Question Answering
    . Filling masked text
    . Summarization
    . Translation
    . Feature Extraction 
                                                                                                       Fadwa Saoiabi :)

# Sentiment Analysis

In [29]:
# Sentiment analysis on English sentences with the pipeline's default model.

from transformers import pipeline

nlp = pipeline("sentiment-analysis")

# The pipeline returns a list of predictions; keep the first one for each
# sentence and print its label with the score rounded to 4 decimals.
for text in ("I hate you", "I love you"):
    result = nlp(text)[0]
    print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

label: NEGATIVE, with score: 0.9991
label: POSITIVE, with score: 0.9999


In [2]:
# Sentiment analysis on French sentences.
# NOTE(review): `nlp` is not created in this cell; it must already exist from
# an earlier cell, so that cell has to be run first.

for text in ("Je suis vraiment fatiguée", "Je me sens bien"):
    result = nlp(text)[0]
    print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

label: NEGATIVE, with score: 0.9971
label: POSITIVE, with score: 0.8181


In [5]:
# English text, scored with an explicitly chosen multilingual model.

from transformers import pipeline

nlp = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")

# Print the top prediction (label + score rounded to 4 decimals) per sentence.
for text in ("I hate you", "I love you"):
    result = nlp(text)[0]
    print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

label: 1 star, with score: 0.6346
label: 5 stars, with score: 0.8547


In [6]:
# French text, scored with the multilingual sentiment model.

from transformers import pipeline

nlp = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")

# Print the top prediction (label + score rounded to 4 decimals) per sentence.
for text in ("Je suis vraiment fatiguée", "Je suis au top de ma forme"):
    result = nlp(text)[0]
    print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

label: 2 stars, with score: 0.4522
label: 5 stars, with score: 0.7895


In [7]:
# Arabic text, scored with the multilingual sentiment model.

from transformers import pipeline

nlp = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")

# Print the top prediction (label + score rounded to 4 decimals) per sentence.
for text in ("أنا أحبك كثيرا", "أكرهك"):
    result = nlp(text)[0]
    print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

label: 5 stars, with score: 0.6108
label: 1 star, with score: 0.2798


# Text Generation 

In [30]:
from transformers import pipeline

# Text-generation pipeline with the library's default model.
text_generator = pipeline("text-generation")

# Continue the prompt with sampling disabled and max_length set to 50.
generated = text_generator("As far as I am concerned, I will", max_length=50, do_sample=False)
print(generated)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'As far as I am concerned, I will be the first to admit that I am not a fan of the idea of a "free market." I think that the idea of a free market is a bit of a stretch. I think that the idea'}]


In [31]:
# NOTE(review): `pipeline` is not imported in this cell; it must already be
# in scope from an earlier cell.

# English generation with XLNet.
text_generator_eng = pipeline("text-generation",model="xlnet-base-cased")
english_output = text_generator_eng("My name is", max_length=50, do_sample=False)
print(english_output)

# Arabic generation with an Arabic GPT-2 checkpoint.
text_generator_ara = pipeline("text-generation",model="mofawzy/gpt2-arabic-sentence-generator")
arabic_output = text_generator_ara("هذه الصفحة تحتوي على جميع", max_length=50, do_sample=False)
print(arabic_output)

[{'generated_text': 'My name is "S. S. ". I am a "S. S. ". I am a "S. S. ". I am a "S. S. '}]


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'generated_text': 'هذه الصفحة تحتوي على جميع ما يمكن أن تكون عليه من قبل، ولكن هذا لا يعني أن يكون الكتاب أفضل من ذلك، ولكن هذا لا يستحق أن يكون أفضل من ذلك."\n"لم يعجبني الكتاب كثيراً ، ربما كان مملاً جداً ، ربما'}]


# Summarization

In [32]:
from transformers import pipeline

# Summarization pipeline with the library's default model.
summarizer = pipeline("summarization")

# Input text to be summarized.
ARTICLE = """ New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.
A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.
Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other.
In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.
Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the
2010 marriage license application, according to court documents"""

# Summary length is bounded by min_length=30 and max_length=130, sampling disabled.
summary = summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False)
print(summary)

[{'summary_text': ' Liana Barrientos, 39, is facing two criminal counts of "offering a false instrument for filing in the first degree" She has declared "I do" five more times, sometimes within two weeks of each other .'}]


# Translation

In [33]:
from transformers import pipeline

# English-to-German translation pipeline with the library's default model.
translator = pipeline("translation_en_to_de")

# Translate one sentence, with max_length set to 40.
translation = translator("Translation is the task of translating a text from one language to another. ", max_length=40)
print(translation)

[{'translation_text': 'Übersetzung ist die Aufgabe, einen Text von einer Sprache in eine andere zu übersetzen.'}]


In [17]:
# Translation from English to French using a model and a tokenizer directly.

# AutoModelWithLMHead is deprecated in transformers. T5 is an encoder-decoder
# model, so AutoModelForSeq2SeqLM is the supported class to load it with.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
tokenizer = AutoTokenizer.from_pretrained("t5-base")

# The prompt starts with the task prefix "translate English to French:".
inputs = tokenizer.encode("translate English to French: Hugging Face is a technology company based in New York and Paris", return_tensors="pt")

# Generate with 4 beams, stopping early, with max_length set to 40.
outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)

# Special tokens (e.g. <pad>, </s>) are kept in the decoded text; pass
# skip_special_tokens=True to tokenizer.decode to strip them.
print(tokenizer.decode(outputs[0]))

<pad> Hugging Face est une entreprise technologique basée à New York et à Paris.</s>


# Named Entity Recognition

In [34]:
from transformers import pipeline

# Named-entity-recognition pipeline with the library's default model.
nlp = pipeline("ner")

# Text in which entities are to be tagged.
Sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge which is visible from the window."

# Print the pipeline's raw result for the text.
entities = nlp(Sequence)
print(entities)

[{'entity': 'I-ORG', 'score': 0.9995786, 'index': 1, 'word': 'Hu', 'start': 0, 'end': 2}, {'entity': 'I-ORG', 'score': 0.9909764, 'index': 2, 'word': '##gging', 'start': 2, 'end': 7}, {'entity': 'I-ORG', 'score': 0.9982225, 'index': 3, 'word': 'Face', 'start': 8, 'end': 12}, {'entity': 'I-ORG', 'score': 0.99948806, 'index': 4, 'word': 'Inc', 'start': 13, 'end': 16}, {'entity': 'I-LOC', 'score': 0.99943453, 'index': 11, 'word': 'New', 'start': 40, 'end': 43}, {'entity': 'I-LOC', 'score': 0.9993196, 'index': 12, 'word': 'York', 'start': 44, 'end': 48}, {'entity': 'I-LOC', 'score': 0.9993794, 'index': 13, 'word': 'City', 'start': 49, 'end': 53}, {'entity': 'I-LOC', 'score': 0.98625827, 'index': 19, 'word': 'D', 'start': 79, 'end': 80}, {'entity': 'I-LOC', 'score': 0.95142704, 'index': 20, 'word': '##UM', 'start': 80, 'end': 82}, {'entity': 'I-LOC', 'score': 0.93365914, 'index': 21, 'word': '##BO', 'start': 82, 'end': 84}, {'entity': 'I-LOC', 'score': 0.9761654, 'index': 28, 'word': 'Manha

# Question Answering

In [43]:

from transformers import pipeline

# Question-answering pipeline with the library's default model.
nlp = pipeline("question-answering")

# Passage the answer is looked up in.
context = """The boys in the village attend school every day. When Tina passes by them, she feels so sad and she wishes she could study like them because she dreams of becoming a teacher. Tina really knows the value of education and always says to her: “I will let my children study”. If all the children think in the same way, there will be less illiteracy among the future generations."""

# Ask the question against the passage and print the pipeline's result.
answer = nlp(question="Why Tina feels sad?", context=context)
print(answer)




{'score': 0.49223577976226807, 'start': 141, 'end': 173, 'answer': 'she dreams of becoming a teacher'}


# Filling masked text

In [44]:
from transformers import pipeline

# Fill-mask pipeline with the library's default model.
nlp = pipeline("fill-mask")

# Use the tokenizer's own mask token so the placeholder matches the loaded model.
masked_sentence = f"The boys in the village {nlp.tokenizer.mask_token} every day."
print(nlp(masked_sentence))

[{'sequence': 'The boys in the village pray every day.', 'score': 0.40567484498023987, 'token': 10745, 'token_str': ' pray'}, {'sequence': 'The boys in the village suffer every day.', 'score': 0.03357984870672226, 'token': 6297, 'token_str': ' suffer'}, {'sequence': 'The boys in the village train every day.', 'score': 0.025151127949357033, 'token': 2341, 'token_str': ' train'}, {'sequence': 'The boys in the village cry every day.', 'score': 0.023954305797815323, 'token': 8930, 'token_str': ' cry'}, {'sequence': 'The boys in the village die every day.', 'score': 0.023221848532557487, 'token': 1597, 'token_str': ' die'}]


# Feature extraction

In [46]:
from transformers import AutoTokenizer, AutoModel
import torch

def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dimension 1, weighting each by the attention mask.

    Args:
        model_output: Indexable model result; element 0 is used as the token
            embeddings (assumed shape (batch, tokens, dim) -- TODO confirm).
        attention_mask: Tensor with one value per token (assumed 1 for real
            tokens and 0 for padding -- TODO confirm).

    Returns:
        Tensor of mask-weighted embedding sums divided by the mask sums.
    """
    embeddings = model_output[0]
    # Broadcast the mask across the embedding dimension so it can be multiplied in.
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_sum = torch.sum(embeddings * mask, 1)
    # Clamp the denominator so an all-zero mask does not divide by zero.
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / mask_total

# Sentences to embed.
sentences = ['Instagram is an American photo and video sharing social networking service.','Created by Kevin Systrom and Mike Krieger']

# Load the tokenizer and the model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/distilbert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/distilbert-base-nli-mean-tokens')

# Tokenize both sentences as one padded, truncated batch of PyTorch tensors.
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Run the forward pass without gradient tracking. The body of the `with`
# block must be indented; it was flush-left before, which is an
# IndentationError.
with torch.no_grad():
    model_output = model(**encoded_input)

# Pool the token embeddings into one vector per sentence using the attention mask.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

print("Sentence embeddings :")
print(sentence_embeddings)


Sentence embeddings:
tensor([[ 0.7959, -1.0001, -0.3122,  ...,  0.1940, -0.4849, -0.7758],
        [-1.0121, -0.1321, -0.1315,  ..., -0.2003,  0.2946, -0.5103]])


In [None]:
# Instagram is an American photo and video sharing social networking service created by Kevin Systrom and Mike Krieger. In April 2012