In [None]:
# from HuggingFace https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad

from transformers import DistilBertTokenizer, DistilBertModel
import torch
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased-distilled-squad')
model = DistilBertModel.from_pretrained('distilbert-base-cased-distilled-squad')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

print(outputs)

BaseModelOutput(last_hidden_state=tensor([[[ 1.1810, -0.4073,  0.9986,  ..., -0.7445,  0.0380, -0.5510],
         [ 1.6172, -0.6785,  1.6932,  ..., -0.8216, -0.2387, -0.6187],
         [ 2.0840, -0.5496,  1.3313,  ..., -0.7791,  0.1698, -0.3950],
         ...,
         [ 0.2879, -0.1813,  1.2631,  ..., -0.2022,  0.4699,  0.5535],
         [ 0.6069, -0.1943,  0.7584,  ..., -0.5106, -0.4027, -0.4910],
         [ 1.0183, -0.8215,  0.9088,  ..., -0.8094,  0.8372, -0.2027]]]), hidden_states=None, attentions=None)


In [2]:
# copilot modified for text output.

from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
import torch

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-distilled-squad')
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad')

question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

start_logits = outputs.start_logits
end_logits = outputs.end_logits

answer_start_index = torch.argmax(start_logits)
answer_end_index = torch.argmax(end_logits)

predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
answer = tokenizer.decode(predict_answer_tokens)

print(answer)

a nice puppet


In [34]:
from docx import Document

# Load the Word document
doc = Document('./example.docx')

# Extract text
text = []
for paragraph in doc.paragraphs:
    text.append(paragraph.text)

# Combine paragraphs into a single string (optional)
full_text = '\n'.join(text)

In [6]:
len(full_text)

7651

In [5]:
question, text = "What is the identity of the unknown acid?", full_text[:1000]

inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

start_logits = outputs.start_logits
end_logits = outputs.end_logits

answer_start_index = torch.argmax(start_logits)
answer_end_index = torch.argmax(end_logits)

predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
answer = tokenizer.decode(predict_answer_tokens)

print(answer)

citric acid


In [6]:
import os
import glob

# Define the directory you want to search in
directory = './data'

# Use glob to find all .docx files in the directory
docx_files = glob.glob(os.path.join(directory, '*.docx'))
print(type(docx_files))

# Print the list of .docx files
# for file in docx_files:
#     print(file)

<class 'list'>


In [30]:
questions = ["who wrote this?", 'what is the molar mass of the unknown?', "what is the identity of the unknown?", 'is this well written?']

In [31]:
for file in docx_files:
    
    answers = {file:{}}
    
    # Load the Word document
    doc = Document(f'{file}')

    # Extract text
    text = []
    for paragraph in doc.paragraphs:
        text.append(paragraph.text)

    # Combine paragraphs into a single string (optional)
    full_text = '\n'.join(text)

    for question in questions:
        
        asking, text = question, full_text[:1000]

        inputs = tokenizer(asking, text, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)

        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        answer_start_index = torch.argmax(start_logits)
        answer_end_index = torch.argmax(end_logits)

        predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
        answer = tokenizer.decode(predict_answer_tokens)

        answers[f'{file}'][question] = answer

In [10]:
story1 = """The Adventure of Bob's Day
Bob woke up in the morning when his alarm went off and he got out of bed and he was really tired. He went to the bathroom and brushed his teeth with toothpaste that was minty fresh and then he went downstairs. The kitchen was where he went next and he made some breakfast which was eggs and toast and orange juice.
Bob worked at an office building downtown where lots of other people worked too and he had to drive there in his car which was blue. Traffic was really bad and horrible and terrible and it made Bob feel very angry and mad. When he finally got to work, his boss Tom was waiting for him and Tom was mad because Bob was late.
"Your late Bob," said Tom angrily and mad. "I know and I'm sorry because of the traffic being bad," Bob replied back to Tom in response. Bob sat at his desk and typed on his computer all day long and it was really boring and not fun at all and he wished he could go home.
Finally it was time to go home and Bob was happy and glad. But then something crazy happened - there was a alien spaceship in the parking lot!!! The aliens were green and had three eyes and antennae on their heads that were long. "Take me to your leader!" they said to Bob loudly. But Bob just ran away really fast and got in his car and drove home really quick.
When Bob got home he was scared and frightened but happy to be alive and ok. He decided to never tell anyone about the aliens because they wouldn't believe him anyway. The end."""

In [32]:
answers

{'./data/Pendleton.docx': {'who wrote this?': '<s>',
  'what is the molar mass of the unknown?': ' 205.0 g/mol',
  'what is the identity of the unknown?': '<s>',
  'is this well written?': 'By counting the number of moles of acid that reacted with the NaOH and comparing it to the mass of the acid, the molar mass of the unknown acid was ascertained.. The mean molar mass of the unidentified acid was 205.0 g/mol, with a margin of error ascribed to systematic differences in handling and titration volume measurements'}}

In [20]:
question, text = "Summarize the story.", story1

inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

start_logits = outputs.start_logits
end_logits = outputs.end_logits

answer_start_index = torch.argmax(start_logits)
answer_end_index = torch.argmax(end_logits)

predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
answer = tokenizer.decode(predict_answer_tokens)

print(answer)

[CLS]


In [21]:
from transformers import LongformerTokenizer, LongformerForSequenceClassification
import torch

# Load pre-trained Longformer model and tokenizer
model_name = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizer.from_pretrained(model_name)
model = LongformerForSequenceClassification.from_pretrained(model_name)

# Function to analyze a string
def analyze_string(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=4096)
    
    # Perform the classification
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the predicted class (assuming binary classification)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    
    return predicted_class

# Example usage
text = "This lab report is very detailed and well-written."
result = analyze_string(text)
print("Predicted class:", result)


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Initializing global attention on CLS token...


Predicted class: 0


In [92]:
from transformers import LongformerForQuestionAnswering

# Load pre-trained Longformer model and tokenizer
model_name = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizer.from_pretrained(model_name)
model = LongformerForQuestionAnswering.from_pretrained(model_name)

# Function to ask a question
def ask_question(context, question):
    # Tokenize the input context and question
    inputs = tokenizer(context, question, return_tensors='pt', truncation=True, padding='max_length', max_length=4096)
    
    # Perform the question answering
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the answer
    start_scores, end_scores = outputs.start_logits, outputs.end_logits
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores) + 1

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_index:end_index]))
    
    return answer


  return self.fget.__get__(instance, owner)()
Some weights of LongformerForQuestionAnswering were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [93]:
# Example usage
context = "The unknown is sodium chloride."
question = "What is the molar mass of the unknown?"
answer = ask_question(full_text, question)
print("Answer:", answer)


Answer: 


In [89]:
from docx import Document

# Load the Word document
doc = Document('./example.docx')

# Extract text
text = []
for paragraph in doc.paragraphs:
    text.append(paragraph.text)

# Combine paragraphs into a single string (optional)
full_text = '\n'.join(text)

In [94]:
from transformers import RobertaTokenizer, RobertaForQuestionAnswering
import torch

# Load pre-trained RoBERTa model and tokenizer
model_name = 'deepset/roberta-base-squad2'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForQuestionAnswering.from_pretrained(model_name)

In [95]:
# Function to ask a question
def ask_question(context, question):
    # Tokenize the input context and question
    inputs = tokenizer(question, context, return_tensors='pt', truncation=True, padding='max_length', max_length=512)
    
    # Perform the question answering
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the answer
    start_scores, end_scores = outputs.start_logits, outputs.end_logits
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores) + 1

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_index:end_index]))
    
    return answer

# Example usage
context = "The unknown was sodium chloride."
question = "What is the unknown?"
answer = ask_question(full_text, question)
print("Answer:", answer)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Answer:  citric acid


In [38]:
from transformers import pipeline

classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')

Classification result: Well Written


In [40]:
def classify_report(report_text):
    labels = ["Well Written", "Poorly Written"]
    result = classifier(report_text, candidate_labels=labels)
    return result['labels'][0]

# Example usage
report_text = "This lab report describes the experimental procedure clearly and concisely."
result = classify_report(story1)
print("Classification result:", result)


Classification result: Well Written
