<a href="https://colab.research.google.com/github/EdBerg21/AI-Professional-Prompts/blob/main/Copy_of_Smallest_train_ipynb_txt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install accelerate>=0.21.0

# RESTART the runtime here so the newly installed `accelerate` version is picked up

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.18.0 dill-0.3.8 multiprocess-0.70.16


# SUMMARIZATION

In [None]:

from datasets import load_dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments, pipeline

# Load a smaller dataset: only the first 1% of the AG News training split
dataset = load_dataset('ag_news', split='train[:1%]')

# Read the class names from the dataset so that predictions are reported with
# readable labels instead of the generic LABEL_0 ... LABEL_3 placeholders.
# NOTE(review): assumes the 'label' column is a ClassLabel feature exposing
# `.names` (4 classes for AG News) - confirm if the dataset is swapped out.
label_names = dataset.features['label'].names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# Initialize tokenizer and model; the classification head is sized from the
# dataset's label set rather than a hard-coded constant
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the module-level `tokenizer`.

    Sequences are truncated and padded with the "max_length" strategy.
    Returns whatever the tokenizer call returns for the batch.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Apply the tokenizer to the whole dataset in batches
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Shuffle once with a fixed seed, then carve both splits out of the same
# permutation: the first 1000 rows for training, the next 200 for evaluation
shuffled_datasets = tokenized_datasets.shuffle(seed=42)
train_dataset = shuffled_datasets.select(range(1000))
eval_dataset = shuffled_datasets.select(range(1000, 1200))

# Define training arguments.
# With 1000 training examples, a batch size of 16 and 3 epochs the run has only
# about 189 optimizer steps on a single device. A fixed 500-step warmup would
# therefore never finish, and the default logging interval (500 steps) would
# never print a loss. Warmup is expressed as a fraction of the run instead, and
# the loss is logged every 10 steps.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    warmup_ratio=0.1,                # warm up over the first 10% of training steps
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # report the training loss every 10 steps
)

# Build the Trainer from the model, the training arguments and both dataset splits
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset)

# Run fine-tuning
trainer.train()

# Write the fine-tuned model to ./trained_model
model.save_pretrained('./trained_model')

# Reload the fine-tuned model from disk for inference
model = DistilBertForSequenceClassification.from_pretrained('./trained_model')

# Wrap the model and tokenizer in a text-classification pipeline
text_classification = pipeline(task="text-classification", model=model, tokenizer=tokenizer)

# Classify one sample sentence and print the prediction
sample_text = "Microsoft Copilot is an AI designed to assist users."
result = text_classification(sample_text)
print(result)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[{'label': 'LABEL_3', 'score': 0.9061861038208008}]


In [None]:

# Install necessary libraries
!pip install transformers datasets torch
!pip install PyMuPDF  # For PDF processing

import fitz  # PyMuPDF
from transformers import BertTokenizerFast, BertForQuestionAnswering, Trainer, TrainingArguments
from transformers import BartTokenizer, BartForConditionalGeneration
from datasets import load_dataset

# Load your Q&A dataset
# SQuAD is only an example here; replace it with your actual dataset
qa_dataset = load_dataset('squad')

# The tokenizer and the question-answering model share the same checkpoint
qa_checkpoint = 'bert-base-uncased'
tokenizer_qa = BertTokenizerFast.from_pretrained(qa_checkpoint)
model_qa = BertForQuestionAnswering.from_pretrained(qa_checkpoint)

# Tokenize the Q&A dataset
def tokenize_qa(examples):
    """Tokenize a batch of question/context pairs and label the answer spans.

    Each context is tokenized together with its question using a maximum
    length of 384 and a stride of 128 with overflowing tokens returned, so a
    single example may produce several features.

    Args:
        examples: a batch with 'question', 'context' and 'answers' entries;
            each answers entry provides 'answer_start' and 'text' lists.

    Returns:
        The tokenizer output with the 'overflow_to_sample_mapping' and
        'offset_mapping' entries removed and 'start_positions' /
        'end_positions' added (one pair per feature). Features without an
        answer, or whose span does not contain the answer, are labeled with
        the index of the CLS token.
    """
    # Tokenize the questions and contexts
    tokenized_examples = tokenizer_qa(
        examples['question'], examples['context'],
        truncation="only_second", max_length=384,
        stride=128, return_overflowing_tokens=True,
        return_offsets_mapping=True, padding="max_length"
    )

    # Maps every produced feature back to the example it came from
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Per-token (start_char, end_char) offsets, used to locate the answer
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # The labels are the start and end token positions of the answer in the context
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We grab the sequence corresponding to that example
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer_qa.cls_token_id)

        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        # If no answer is provided, label the feature with the CLS index.
        # The emptiness check must run BEFORE indexing element 0, otherwise an
        # example without answers raises IndexError. An empty first answer
        # text is still treated as "no answer", as before.
        if len(answers["answer_start"]) == 0 or len(answers["text"][0]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
            continue

        # Character span of the first answer inside the context
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First token of the context (sequence id 1) in this feature
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1
        # Last token of the context in this feature
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index)
        if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Otherwise move the token_start_index and token_end_index to the two ends of the answer
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                token_start_index += 1
            tokenized_examples["start_positions"].append(token_start_index - 1)
            while offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

# Tokenize every split in batches and drop the original columns
raw_columns = qa_dataset['train'].column_names
tokenized_qa_datasets = qa_dataset.map(tokenize_qa, batched=True, remove_columns=raw_columns)

# Pick the training and evaluation splits out of the tokenized dataset
train_qa_dataset = tokenized_qa_datasets['train']
eval_qa_dataset = tokenized_qa_datasets['validation']

# Define training arguments for Q&A
import torch  # imported here because this cell does not otherwise import torch

training_args_qa = TrainingArguments(
    output_dir='./qa_results',          # output directory
    num_train_epochs=3,                 # total number of training epochs
    per_device_train_batch_size=16,     # batch size per device during training
    warmup_steps=500,                   # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                  # strength of weight decay
    logging_dir='./qa_logs',            # directory for storing logs
    logging_steps=10,                   # report the training loss every 10 steps
    # Mixed precision requires a CUDA device; fall back to full precision on a
    # CPU-only runtime instead of failing when the arguments are created
    fp16=torch.cuda.is_available(),
)

# Build the Trainer for Q&A from the model, the arguments and both dataset splits
trainer_qa = Trainer(
    model=model_qa,
    args=training_args_qa,
    train_dataset=train_qa_dataset,
    eval_dataset=eval_qa_dataset,
)

# Run fine-tuning of the Q&A model
trainer_qa.train()

# Write the fine-tuned Q&A model to ./qa_trained_model
model_qa.save_pretrained('./qa_trained_model')

# Initialize tokenizer and model for summarization; both come from the same checkpoint
summarization_checkpoint = 'facebook/bart-large-cnn'
tokenizer_sum = BartTokenizer.from_pretrained(summarization_checkpoint)
model_sum = BartForConditionalGeneration.from_pretrained(summarization_checkpoint)

# Function to summarize a PDF file
def summarize_pdf(pdf_path):
    """Extract the text of a PDF file and return a generated summary.

    Args:
        pdf_path: path of the PDF file to open with PyMuPDF (`fitz`).

    Returns:
        The decoded summary string produced by `model_sum`.

    Note:
        The input is truncated to 1024 tokens, so only the beginning of a long
        document contributes to the summary.
    """
    # The context manager closes the document even if text extraction fails
    with fitz.open(pdf_path) as pdf_document:
        # Join once instead of growing a string page by page
        text = "".join(page.get_text() for page in pdf_document)

    # Tokenize the text
    inputs = tokenizer_sum(text, return_tensors='pt', max_length=1024, truncation=True)

    # Generate summary; the attention mask is passed explicitly so generation
    # does not have to infer it from the input ids
    summary_ids = model_sum.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        num_beams=4, max_length=200, early_stopping=True
    )
    summary = tokenizer_sum.decode(summary_ids[0], skip_special_tokens=True)

    return summary

# Example usage of the summarization function
# Point pdf_path at your own PDF file
pdf_path = '/content/1709210740-importance-of-obligatory-prayer-baha-u-llah-the-bab.pdf'
summary = summarize_pdf(pdf_path)
print(summary)



Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Step,Training Loss
10,5.9576
20,5.9588
30,5.9293
40,5.8077
50,5.6914
60,5.4718
70,5.1775
80,4.9235
90,4.7074
100,4.6386


KeyboardInterrupt: 

# Q&A

In [None]:

# Install necessary libraries
!pip install transformers datasets torch
!pip install PyMuPDF  # For PDF processing

import fitz  # PyMuPDF
from transformers import BertTokenizerFast, BertForQuestionAnswering, Trainer, TrainingArguments
from datasets import load_dataset

# Load the SQuAD dataset
squad_dataset = load_dataset('squad')

# The tokenizer and the question-answering model share the same checkpoint
qa_checkpoint = 'bert-base-uncased'
tokenizer_qa = BertTokenizerFast.from_pretrained(qa_checkpoint)
model_qa = BertForQuestionAnswering.from_pretrained(qa_checkpoint)

# Tokenize the SQuAD dataset
def tokenize_qa(examples):
    """Tokenize a batch of question/context pairs and label the answer spans.

    Contexts are tokenized with a maximum length of 384 and a stride of 128
    with overflowing tokens returned, so one example may yield several
    features. The returned tokenizer output has 'overflow_to_sample_mapping'
    and 'offset_mapping' removed and gains 'start_positions' and
    'end_positions'. Features with no answer, or whose span does not contain
    the answer, are labeled with the CLS token index.
    """
    encodings = tokenizer_qa(
        examples['question'],
        examples['context'],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature -> originating example, and per-token character offsets
    feature_to_sample = encodings.pop("overflow_to_sample_mapping")
    all_offsets = encodings.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for feature_idx, offsets in enumerate(all_offsets):
        input_ids = encodings["input_ids"][feature_idx]
        cls_index = input_ids.index(tokenizer_qa.cls_token_id)
        sequence_ids = encodings.sequence_ids(feature_idx)
        answers = examples["answers"][feature_to_sample[feature_idx]]

        # No answer available: point both labels at the CLS token
        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Character span of the first answer
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # Locate the first and last tokens that belong to the context (sequence id 1)
        context_first = 0
        while sequence_ids[context_first] != 1:
            context_first += 1
        context_last = len(input_ids) - 1
        while sequence_ids[context_last] != 1:
            context_last -= 1

        # The answer is not fully contained in this feature: label with CLS
        answer_outside_span = (
            offsets[context_first][0] > start_char
            or offsets[context_last][1] < end_char
        )
        if answer_outside_span:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Walk inwards from both ends of the context to the answer boundaries
        while context_first < len(offsets) and offsets[context_first][0] <= start_char:
            context_first += 1
        start_positions.append(context_first - 1)
        while offsets[context_last][1] >= end_char:
            context_last -= 1
        end_positions.append(context_last + 1)

    encodings["start_positions"] = start_positions
    encodings["end_positions"] = end_positions
    return encodings

# Tokenize every split in batches and drop the original columns
raw_columns = squad_dataset['train'].column_names
tokenized_squad_datasets = squad_dataset.map(tokenize_qa, batched=True, remove_columns=raw_columns)

# Pick the training and evaluation splits out of the tokenized dataset
train_qa_dataset = tokenized_squad_datasets['train']
eval_qa_dataset = tokenized_squad_datasets['validation']

# Define training arguments for Q&A
import torch  # imported here because this cell does not otherwise import torch

training_args_qa = TrainingArguments(
    output_dir='./qa_results',          # output directory
    num_train_epochs=3,                 # total number of training epochs
    per_device_train_batch_size=16,     # batch size per device during training
    warmup_steps=500,                   # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                  # strength of weight decay
    logging_dir='./qa_logs',            # directory for storing logs
    logging_steps=100,                  # report the training loss every 100 steps
    # Mixed precision requires a CUDA device; fall back to full precision on a
    # CPU-only runtime instead of failing when the arguments are created
    fp16=torch.cuda.is_available(),
)

# Build the Trainer for Q&A from the model, the arguments and both dataset splits
trainer_qa = Trainer(
    model=model_qa,
    args=training_args_qa,
    train_dataset=train_qa_dataset,
    eval_dataset=eval_qa_dataset,
)

# Run fine-tuning of the Q&A model
trainer_qa.train()

# Write the fine-tuned Q&A model to ./qa_trained_model
model_qa.save_pretrained('./qa_trained_model')

# Function to ask a question from a PDF file
def ask_question_from_pdf(pdf_path, question):
    """Answer a question using the text extracted from a PDF file.

    Args:
        pdf_path: path of the PDF file to open with PyMuPDF (`fitz`).
        question: the question to ask about the document.

    Returns:
        The answer string decoded from the span between the highest-scoring
        start and end tokens. It is empty when the predicted end precedes the
        predicted start.

    Note:
        The question/text pair is truncated by the tokenizer, so only the
        beginning of a long document is visible to the model. This call also
        switches `model_qa` to evaluation mode.
    """
    # torch is used below but is not imported anywhere else in this notebook
    import torch

    # The context manager closes the document even if text extraction fails
    with fitz.open(pdf_path) as pdf_document:
        text = "".join(page.get_text() for page in pdf_document)

    # Tokenize the text with the question
    inputs = tokenizer_qa(question, text, return_tensors='pt', truncation=True, padding=True)
    # After training the model may live on the GPU; the inputs must be on the same device
    inputs = inputs.to(model_qa.device)

    # Get model predictions: evaluation mode disables dropout so the answer is
    # deterministic, and no_grad avoids building an autograd graph
    model_qa.eval()
    with torch.no_grad():
        outputs = model_qa(**inputs)
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    # Get the most likely beginning and end of answer with the argmax of the score
    answer_start = int(torch.argmax(answer_start_scores))
    answer_end = int(torch.argmax(answer_end_scores)) + 1

    # Convert tokens to answer (plain Python ints, independent of the tensor's device)
    answer_ids = inputs['input_ids'][0][answer_start:answer_end].tolist()
    answer = tokenizer_qa.convert_tokens_to_string(tokenizer_qa.convert_ids_to_tokens(answer_ids))

    return answer

# Example usage of the ask_question_from_pdf function
# Point pdf_path at your own PDF file
pdf_path = '/content/1709210740-importance-of-obligatory-prayer-baha-u-llah-the-bab.pdf'
question = "What is being fasted for?"
answer = ask_question_from_pdf(pdf_path, question)
print(answer)



Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Step,Training Loss
10,5.9212
20,5.9267
30,5.8528
40,5.8237
50,5.7249
60,5.6016
70,5.4131
80,5.1759
90,4.8373
100,4.6879


Checkpoint destination directory ./qa_results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./qa_results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


Step,Training Loss
10,5.9212
20,5.9267
30,5.8528
40,5.8237
50,5.7249
60,5.6016
70,5.4131
80,5.1759
90,4.8373
100,4.6879
