# Installing necessary libraries

In [53]:
%%capture
!pip install tf-keras
!pip intsall torch
!pip install transformers
!pip install datasets
!pip install sentencepiece
!pip install evaluate
!pip install nltk
!pip install rouge_score

# QA Pipeline

In [54]:
import transformers

transformers.logging.set_verbosity_error()

In [55]:
input_content = """Braille is a tactile writing system used by people who are visually impaired or blind. 
It was developed by Louis Braille in 1824, who himself was blind. Braille consists of patterns of raised dots arranged in cells, 
where each cell can have up to six dots. The arrangement of these dots represents different letters, numbers, punctuation marks, 
or even entire words, depending on the language and application.
The Braille system is versatile, enabling the blind to read and write not only text but also mathematical expressions 
(using the Nemeth Braille Code), music notation, and computer symbols. It can be read by touch, with the fingertips feeling the dot patterns. 
In modern applications, Braille is produced using specialized Braille printers (embossers) or manually with slates and styluses.
Advancements in technology have expanded Braille's accessibility through digital devices such as refreshable Braille displays, 
which dynamically change the dot patterns to represent screen content, helping users interact with computers, smartphones, 
and other digital systems. While voice recognition and audio interfaces offer alternatives, Braille remains essential for literacy, 
allowing users to understand written language structure and spelling independently."""

from transformers import pipeline

qa_pipeline = pipeline("question-answering",
                      model="deepset/minilm-uncased-squad2")

In [56]:
question = "What is braile consist of?"
answer = qa_pipeline(question = question, context=input_content)

print(answer)

{'score': 0.6161749958992004, 'start': 174, 'end': 215, 'answer': 'patterns of raised dots arranged in cells'}


In [57]:
print("Another Question")

question = "How Braile is produced?"
print("\nQuestion: ", question)
print('Answer: ', qa_pipeline(question = question, context=input_content)['answer'])

Another Question

Question:  How Braile is produced?
Answer:  using specialized Braille printers


## Evaluating QA pipeline performance

In [58]:
from evaluate import load

# Load the SQuAD v2 metric
squad_metric = load("squad_v2")

# Define the correct answer and the list of predicted answers
correct_answer = "New Delhi"
predicted_answers = ["New Delhi", "Mumbai", "Bangalore"]

# Initialize lists to store cumulative predictions and references
cum_predictions = []
cum_references = []

# Iterate through the predicted answers and evaluate each one
for index, predicted_answer in enumerate(predicted_answers):
    predictions = [
        {"prediction_text": predicted_answer, "id": str(index), "no_answer_probability": 0.0}
    ]
    
    references = [
        {"answers": {'answer_start': [1], 'text': [correct_answer]}, "id": str(index)}
    ]
    
    # Append to cumulative lists
    cum_predictions.append(predictions[0])
    cum_references.append(references[0])
    
    # Compute and print the F1 score for each answer
    results = squad_metric.compute(predictions=predictions, references=references)
    print(f"F1 for answer '{predicted_answer}': {results['f1']}")

# Compute and print cumulative results
cum_result = squad_metric.compute(predictions=cum_predictions, references=cum_references)
print(f"Cumulative Results: {cum_result}")


F1 for answer 'New Delhi': 100.0
F1 for answer 'Mumbai': 0.0
F1 for answer 'Bangalore': 0.0
Cumulative Results: {'exact': 33.333333333333336, 'f1': 33.333333333333336, 'total': 3, 'HasAns_exact': 33.333333333333336, 'HasAns_f1': 33.333333333333336, 'HasAns_total': 3, 'best_exact': 33.333333333333336, 'best_exact_thresh': 0.0, 'best_f1': 33.333333333333336, 'best_f1_thresh': 0.0}


# Summarization with pipeplines

In [59]:
import transformers

transformers.logging.set_verbosity_error()

In [60]:
input_content = """Braille is a tactile writing system used by people who are visually impaired or blind. 
It was developed by Louis Braille in 1824, who himself was blind. Braille consists of patterns of raised dots arranged in cells, 
where each cell can have up to six dots. The arrangement of these dots represents different letters, numbers, punctuation marks, 
or even entire words, depending on the language and application.
The Braille system is versatile, enabling the blind to read and write not only text but also mathematical expressions 
(using the Nemeth Braille Code), music notation, and computer symbols. It can be read by touch, with the fingertips feeling the dot patterns. 
In modern applications, Braille is produced using specialized Braille printers (embossers) or manually with slates and styluses.
Advancements in technology have expanded Braille's accessibility through digital devices such as refreshable Braille displays, 
which dynamically change the dot patterns to represent screen content, helping users interact with computers, smartphones, 
and other digital systems. While voice recognition and audio interfaces offer alternatives, Braille remains essential for literacy, 
allowing users to understand written language structure and spelling independently."""

In [61]:
from transformers import pipeline

extractive_summarizer = pipeline("summarization",
                                min_length=10,
                                max_length=100)

extractive_summary = extractive_summarizer(input_content)

print(extractive_summary[0].get("summary_text"))

 Braille is a tactile writing system used by people who are visually impaired or blind . It was developed by Louis Braille in 1824, who himself was blind . Braille consists of patterns of raised dots arranged in cells .


In [62]:
print(f"checkpoint used: {extractive_summarizer.model.config}")

checkpoint used: BartConfig {
  "_name_or_path": "sshleifer/distilbart-cnn-12-6",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": true,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_enc

## Evaluating with rouge score

In [63]:
import evaluate

rouge_evaluator = evaluate.load("rouge")

In [64]:
#evaluate exact match strings
reference_text = ["Coding should not stop"]
predict_text = ["Coding should not stop"]

eval_results = rouge_evaluator.compute(predictions = predict_text, references= reference_text)
print("Results for exact match: ", eval_results)

Results for exact match:  {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}


In [65]:
#evaluate no-match strings
reference_text = ["Coding should not stop"]
predict_text = ["This is something else"]

eval_results = rouge_evaluator.compute(predictions = predict_text, references= reference_text)
print("Results for no-match: ", eval_results)

Results for no-match:  {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}


In [66]:
# evaluate summary

eval_results =rouge_evaluator.compute(
    predictions = [extractive_summary[0].get("summary_text")],
    references = [input_content]
)

print("Results for summary generated: ", eval_results)

Results for summary generated:  {'rouge1': 0.33333333333333337, 'rouge2': 0.32727272727272727, 'rougeL': 0.33333333333333337, 'rougeLsum': 0.33333333333333337}


# Text Generation

In [67]:
import transformers

# to avoid warning messages
transformers.logging.set_verbosity_error()

In [68]:
from transformers import pipeline

text_generator = pipeline("text-generation",
                         model="gpt2")

transformers.set_seed(1)

In [69]:
input_text ="Natural language processing is a growing domain in machine learning"

synthetic_text = text_generator(input_text,
                               num_return_sequences = 3,
                               max_new_tokens = 50)

for index, text in enumerate(synthetic_text):
    print(f"Sequence No: {index+1} \n {text.get("generated_text")} \n---------")

Sequence No: 1 
 Natural language processing is a growing domain in machine learning. A large percentage of machine-created content is written in human-friendly, non-technical languages. For example, the following example shows the development of three artificial intelligences: one that can understand the English language but not language that is written on 
---------
Sequence No: 2 
 Natural language processing is a growing domain in machine learning and there are numerous applications, from the classification of text in books to the ability as an information-processing application for the computer.

In this talk I will discuss the core properties of natural language processing and then try to create software for applying it 
---------
Sequence No: 3 
 Natural language processing is a growing domain in machine learning. This means that learning based on the structure of speech and speech processing have tremendous potential to change the way we think about the world. It just has to be

In [174]:
import transformers

transformers.logging.set_verbosity_error()

In [175]:
import transformers

conversational_pipeline = pipeline("text2text-generation",model="facebook/blenderbot_small-90M")

# print(conversational_pipeline.model.config)

transformers.set_seed(49)

In [52]:
# sample inputs

first_input = "Do you have any hobbies?"
second_input = "I like to watch movies"
third_input = "action movies"

#create a context
bot_conversation = Conversation(first_input)
print(bot_conversation)

print("\nFirst Exchange: \n---------------")
bot_conversation.generated_responses.append(conversational_pipeline(first_input)[0]['generated_text'])
print("User Input: ", bot_conversation.past_user_inputs[0])
print("Bot Output: ", bot_conversation.generated_responses[0])

bot_conversation.add_user_input(second_input)

print("\nSecond Exchange: \n---------------")
bot_conversation.generated_responses.append(conversational_pipeline(second_input)[0]['generated_text'])
conversational_pipeline(second_input)
print("User Input: ", bot_conversation.past_user_inputs[1])
print("User Input: ", bot_conversation.generated_responses[1])

bot_conversation = Conversation(third_input)

print("\nThird Exchange: \n---------------")

conversational_pipeline(bot_conversation)
print("User Input: ", bot_conversation.past_user_inputs[2])
print("User Input: ", bot_conversation.generated_responses[2])

print("\nAccessing All ")

<__main__.Conversation object at 0x16cb3f770>

First Exchange: 
---------------
User Input:  Do you have any hobbies?
Bot Output:  yes , i love going to the beach . what about you ? do you have any hobbies ?


AttributeError: 'Conversation' object has no attribute 'add_user_input'

### Loading Dataset

In [176]:
from datasets import load_dataset

dataset_name = 'poem_sentiment'

poem_sentiments = load_dataset(dataset_name)

print(poem_sentiments)
print(poem_sentiments['test'][15:20])

print("\nSentiment lables used", poem_sentiments['train'].features.get('label').names)

DatasetDict({
    train: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 892
    })
    validation: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 105
    })
    test: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 104
    })
})
{'id': [15, 16, 17, 18, 19], 'verse_text': ["that we must change for heav'n, this mournful gloom", 'lo now, o daughter of kings, let us rise in the face of the day,', 'for penance, by a saintly styrian monk', 'upon a mountain crag, young angelo--', "down in lovah's lane."], 'label': [0, 1, 2, 2, 2]}

Sentiment lables used ['negative', 'positive', 'no_impact', 'mixed']


### Encoding Text

In [177]:
# Encoding Text
from transformers import DistilBertTokenizer

model_name = 'distilbert-base-uncased'
db_tokenizer = DistilBertTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return db_tokenizer(batch['verse_text'],
                       padding=True,
                       truncation=True)
    
enc_poem_sentiment = poem_sentiments.map(tokenize,
                                       batched=True,
                                       batch_size=None)

print(enc_poem_sentiment['train'][0:5])

{'id': [0, 1, 2, 3, 4], 'verse_text': ['with pale blue berries. in these peaceful shades--', 'it flows so long as falls the rain,', 'and that is why, the lonesome day,', 'when i peruse the conquered fame of heroes, and the victories of mighty generals, i do not envy the generals,', 'of inward strife for truth and liberty.'], 'label': [1, 2, 0, 3, 3], 'input_ids': [[101, 2007, 5122, 2630, 22681, 1012, 1999, 2122, 9379, 13178, 1011, 1011, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2009, 6223, 2061, 2146, 2004, 4212, 1996, 4542, 1010, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1998, 2008, 2003, 2339, 1010, 1996, 10459, 14045, 2154, 1010, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2043, 1045, 7304, 3366, 1996, 11438, 4476, 1997, 7348, 1010, 1998, 1996, 9248, 1997, 10478, 11593, 1010, 1045, 2079, 2025, 21103, 1996, 11593, 1010, 102, 0, 0], [101, 1997, 20546, 27865, 2005, 3606, 1998, 7044, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [121]:
# Explore Input Ids and Attention mask

print("Text: ",
     enc_poem_sentiment['train'][1].get('verse_text'))

print("Input Map: ",
     enc_poem_sentiment['train'][1].get('input_ids'))

print("Attention Mask: ",
     enc_poem_sentiment['train'][1].get('attention_mask'))

print("\nTotal Tokens: ",
     len(enc_poem_sentiment['train'][1].get('input_ids')))

print("Non Zero Tokens: ",
     len(list(
         filter(
             lambda x: x!=0, 
             enc_poem_sentiment['train'][1].get('input_ids')
         )
     )))

Text:  it flows so long as falls the rain,
Input Map:  [101, 2009, 6223, 2061, 2146, 2004, 4212, 1996, 4542, 1010, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Attention Mask:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Total Tokens:  28
Non Zero Tokens:  11


### Separating Train & validation sets

In [122]:
# Separate training and validation sets

training_dataset = enc_poem_sentiment['train']
validation_dataset = enc_poem_sentiment['validation']

print("\nColumn Names: ", training_dataset.column_names)
print("\nFeatures: ", training_dataset.features)

labels = training_dataset.features.get("label")
num_labels = len(labels.names)


Column Names:  ['id', 'verse_text', 'label', 'input_ids', 'attention_mask']

Features:  {'id': Value(dtype='int32', id=None), 'verse_text': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'positive', 'no_impact', 'mixed'], id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


### Creating model architecture

In [123]:
from transformers import TFAutoModelForSequenceClassification

# load transformer checkpoint from huggingface
sentiment_model = (
    TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
)

sentiment_model.get_config()

{'vocab_size': 30522,
 'max_position_embeddings': 512,
 'sinusoidal_pos_embds': False,
 'n_layers': 6,
 'n_heads': 12,
 'dim': 768,
 'hidden_dim': 3072,
 'dropout': 0.1,
 'attention_dropout': 0.1,
 'activation': 'gelu',
 'initializer_range': 0.02,
 'qa_dropout': 0.1,
 'seq_classif_dropout': 0.2,
 'return_dict': True,
 'output_hidden_states': False,
 'output_attentions': False,
 'torchscript': False,
 'torch_dtype': None,
 'use_bfloat16': False,
 'tf_legacy_loss': False,
 'pruned_heads': {},
 'tie_word_embeddings': True,
 'chunk_size_feed_forward': 0,
 'is_encoder_decoder': False,
 'is_decoder': False,
 'cross_attention_hidden_size': None,
 'add_cross_attention': False,
 'tie_encoder_decoder': False,
 'max_length': 20,
 'min_length': 0,
 'do_sample': False,
 'early_stopping': False,
 'num_beams': 1,
 'num_beam_groups': 1,
 'diversity_penalty': 0.0,
 'temperature': 1.0,
 'top_k': 50,
 'top_p': 1.0,
 'typical_p': 1.0,
 'repetition_penalty': 1.0,
 'length_penalty': 1.0,
 'no_repeat_ngram_s

In [124]:
# freeze the first layer if needed
sentiment_model.layers[0].trainable = True

#add remove layers if needed.
#sentiment_model.layers[append()/insert()/remove()]

print(sentiment_model.summary())

Model: "tf_distil_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  3076      
                                                                 
 dropout_39 (Dropout)        multiple                  0 (unused)
                                                                 
Total params: 66956548 (255.42 MB)
Trainable params: 66956548 (255.42 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


### training the sentiment model

In [168]:
# using features from pretrained model

import tensorflow as tf

batch_size = 64
tokenizer_columns = db_tokenizer.model_input_names

# convert to tf_tensors
train_dataset = training_dataset.to_tf_dataset(columns = tokenizer_columns,
                                               label_cols = ["label"], 
                                               shuffle=True,
                                               batch_size=batch_size)

val_dataset = validation_dataset.to_tf_dataset(columns = tokenizer_columns,
                                               label_cols = ["label"], 
                                               shuffle=True,
                                               batch_size=batch_size)
# compile the model
sentiment_model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 5e-5),
                       loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                       metrics = tf.metrics.SparseCategoricalAccuracy())

sentiment_model.fit(train_dataset,
                   validation_data = val_dataset,
                   epochs=50)



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tf_keras.src.callbacks.History at 0x3227dfb60>

### Predicting sentiments with custom models

In [178]:
from datasets import Dataset, DatasetDict

In [179]:
#Input data for inference to predict sentiment
# the 'label' is not needed but provided to compare true labels 
infer_data = {'id':[0,1],
             'verse_text': ['and be glad in the summer morning when the kindred ride on their way', 'not happy with the lifes ways'],
             'label':[1,0]}

In [180]:
infer_dataset = Dataset.from_dict(infer_data)
ds_dict = DatasetDict()

ds_dict['infer'] = infer_dataset

print(ds_dict)

#encode dataset similar to training
enc_dataset = ds_dict.map(tokenize, batched=True, batch_size=None)
#convert to tensors
infer_final_dataset = enc_dataset["infer"].to_tf_dataset(columns = tokenizer_columns,
                                               shuffle=True,
                                               batch_size=batch_size)
print(infer_final_dataset)

DatasetDict({
    infer: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 2
    })
})


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

<_PrefetchDataset element_spec={'input_ids': TensorSpec(shape=(None, 17), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 17), dtype=tf.int64, name=None)}>


In [181]:
predictions = sentiment_model.predict(infer_final_dataset)
predictions.logits



array([[-4.1005545,  6.787961 , -3.7919471, -2.8340926],
       [ 1.868875 , -3.9656112, -0.5426691,  0.5009544]], dtype=float32)

In [182]:
import numpy as np
pred_label_ids = np.argmax(predictions.logits, axis=1)

for index, pred_label_id in enumerate(pred_label_ids):
    print("\nPoem: ", infer_data['verse_text'][index],
         "\n\tPredicted Label: ", labels.names[pred_label_ids[index]],
         "\n\tTrue Label: ", labels.names[infer_data['label'][index]])


Poem:  and be glad in the summer morning when the kindred ride on their way 
	Predicted Label:  positive 
	True Label:  positive

Poem:  not happy with the lifes ways 
	Predicted Label:  negative 
	True Label:  negative
