In [1]:
# Sample code for BERT tokenization using transformers library
from transformers import BertTokenizer
import torch

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Sample sentence
sentence = "Hello, how are you today?"

# Tokenize the sentence
tokens = tokenizer.tokenize(sentence)
print("Tokens:", tokens)

# Add special tokens, padding, and truncation for model input
inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=100)  # max_length is arbitrary here
print("Input IDs:", inputs['input_ids'])
print("Tokens:", tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]))

# Review token IDs and tokens
print("\nSpecial tokens added by BERT:")
for token in tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]):
    if token in ['[CLS]', '[SEP]']:
        print(token)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokens: ['hello', ',', 'how', 'are', 'you', 'today', '?']
Input IDs: tensor([[ 101, 7592, 1010, 2129, 2024, 2017, 2651, 1029,  102]])
Tokens: ['[CLS]', 'hello', ',', 'how', 'are', 'you', 'today', '?', '[SEP]']

Special tokens added by BERT:
[CLS]
[SEP]


In [2]:
from transformers import pipeline

# Create a sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Sample sentence for sentiment analysis
sentence = "I love this product!"

# Perform sentiment analysis on the sample sentence
result = sentiment_pipeline(sentence)

# Output the predicted sentiment and confidence score
print("Sentence:", sentence)
print("Predicted Sentiment:", result[0]['label'])
print("Confidence Score:", result[0]['score'])

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use mps:0


Sentence: I love this product!
Predicted Sentiment: POSITIVE
Confidence Score: 0.9998855590820312


In [None]:
pip install transformers torch

CICE 3

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

class BERTSentimentAnalyzer:
    def __init__(self, model_name='distilbert-base-uncased-finetuned-sst-2-english'):
        # Load pre-trained tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

    def preprocess(self, text):
        # Tokenize and prepare the input text as tensor
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        return inputs

    def predict_sentiment(self, text):
        # Preprocess the text
        inputs = self.preprocess(text)

        # Perform sentiment prediction
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Compute probabilities and determine sentiment
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=-1).item()
        confidence_score = probabilities[0][predicted_class].item()

        # Map predicted class index to sentiment label
        sentiment_labels = ['NEGATIVE', 'POSITIVE']
        predicted_label = sentiment_labels[predicted_class]

        return predicted_label, confidence_score

# Testing the custom sentiment analyzer
analyzer = BERTSentimentAnalyzer()

test_texts = [
    "I love this product!",
    "I hate this product.",
    "This is okay, nothing special.",
    "I'm really satisfied with the service."
]

for text in test_texts:
    sentiment, score = analyzer.predict_sentiment(text)
    print(f"Text: {text}")
    print(f"Predicted Sentiment: {sentiment}, Confidence Score: {score:.4f}\n")

: 

In [None]:
pip install --upgrade transformers torch

CICE 4

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

class BERTNamedEntityRecognizer:
    def __init__(self, model_name='dslim/bert-base-NER'):
        print("Loading tokenizer and model...")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForTokenClassification.from_pretrained(model_name)
            print("Tokenizer and model loaded successfully.")
        except Exception as e:
            print(f"Error during model initialization: {e}")
            raise

    def recognize_entities(self, text):
        # Tokenize the input text and prepare tensors
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Predict named entities
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Get the predictions as label IDs
        predictions = outputs.logits.argmax(dim=-1).squeeze().tolist()

        # Map predictions to tokens and labels
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        labels = self.model.config.id2label

        # Extract entities based on BIO scheme
        entities = []
        current_entity = None

        for token, prediction in zip(tokens, predictions):
            label = labels[prediction]

            # Skip special tokens like [CLS] and [SEP]
            if token in ['[CLS]', '[SEP]', '[PAD]']:
                continue

            if label.startswith('B-'):
                # Beginning of a new entity
                if current_entity:
                    entities.append(current_entity)
                current_entity = {
                    'text': token.replace('##', ''),  # Remove subword markers
                    'entity': label.split('-')[1],
                    'start': None,  # You can map token positions to text positions for more accuracy
                    'end': None
                }
            elif label.startswith('I-') and current_entity:
                # Continuation of the current entity
                current_entity['text'] += ' ' + token.replace('##', '')
            else:
                # End of an entity or no entity
                if current_entity:
                    entities.append(current_entity)
                    current_entity = None

        # Add the last entity if there is one
        if current_entity:
            entities.append(current_entity)

        return entities

# Testing the NER system
try:
    recognizer = BERTNamedEntityRecognizer()
    test_text = "Apple Inc. is looking at buying U.K. startup for $1 billion. Steve Jobs was the founder of Apple Inc."
    entities = recognizer.recognize_entities(test_text)
    print(f"Text: {test_text}")
    print("Recognized Entities:")
    for entity in entities:
        print(f"  {entity['text']}: {entity['entity']}")
except Exception as e:
    print(f"An error occurred: {e}")

Loading tokenizer and model...


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Tokenizer and model loaded successfully.
Text: Apple Inc. is looking at buying U.K. startup for $1 billion. Steve Jobs was the founder of Apple Inc.
Recognized Entities:
  Apple Inc: ORG
  U . K .: LOC
  Steve Job s: PER
  Apple Inc: ORG


EXERCICE 5

Explanation:

Initialization:

The BERTNamedEntityRecognizer class is initialized with a pre-trained model name, which loads the corresponding BERT model and tokenizer for NER tasks.
Token Classification:

The recognize_entities method processes the input text by tokenizing it and passing it through the model to predict entity labels using the B-I-O tagging scheme.
Entity Extraction:

It then groups these tokens into entities, handling subword markers and ignoring special tokens such as [CLS] and [SEP].
Testing:

The script tests the NER system with a sample sentence containing entities like organizations (Apple Inc.), locations (U.K.), and persons (Steve Jobs).

Feature	BERT	GPT
Architecture	Encoder-based	Decoder-based
Primary Purpose	Understanding context bidirectionally	Generating coherent text
Common Use Cases	Question answering, sentiment analysis	Text generation, completion, conversation
Strengths	Deep bidirectional understanding of context	Generates coherent and contextually relevant text
Weaknesses	Less effective at text generation	May lack depth in understanding context
Reflection on Differences and Similarities:

Both BERT and GPT are based on the Transformer architecture but are designed for different purposes. BERT excels in tasks requiring a deep understanding of text due to its bidirectional nature, while GPT is adept at generating text, thanks to its unidirectional approach which focuses on predicting the next token in a sequence. The choice between the two models should be guided by the specific requirements of the task at hand—whether it leans more towards understanding or generating text.

This table and reflection should help in understanding the unique advantages and applications of BERT and GPT in the field of NLP.

EXERCICE 6

BERT's Role in Retrieval

In RAG systems, BERT is primarily used for the retrieval component. BERT’s bidirectional context understanding capabilities are leveraged to interpret and retrieve relevant information from a large corpus of documents. This involves:

Generating Embeddings

BERT generates embeddings for documents and queries by converting them into fixed-size vector representations. These vectors capture the semantic essence of the text, allowing for meaningful comparisons between documents and queries based on their content.

Vector Database Usage

The embeddings produced by BERT are stored in a vector database. This database allows for efficient similarity searches — when a query is made, its embedding is matched against those in the database to identify and retrieve the most relevant documents.

Example of BERT and GPT Collaboration

In a typical RAG system, the process might work as follows:

A user submits a query.
BERT processes this query and generates an embedding.
The system searches the vector database for document embeddings that closely match the query embedding, retrieving the most relevant documents.
These documents are then fed into a generative model like GPT, which uses the information to generate a comprehensive, contextually accurate response.
By integrating BERT’s retrieval capabilities with GPT’s generative strength, RAG systems can produce responses that are not only contextually rich but also grounded in external knowledge, enhancing both the relevance and accuracy of generated content.