# Predicting with BART-large trained on the full dataset

In [None]:
import pandas as pd
from datasets import Dataset

# Read test_data.parquet into a DataFrame, then wrap that frame as a
# Hugging Face Dataset so it can be mapped over in later cells.
test_data = pd.read_parquet('test_data.parquet')
test_dataset = Dataset.from_pandas(test_data)

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# NOTE(review): `model_path` is not referenced by the two loads below, which
# use their own hard-coded directories. Kept unchanged in case another cell
# reads it -- confirm and remove if it is dead.
model_path = "./saved_model"

# Seq2seq model weights, read from a local directory.
model = AutoModelForSeq2SeqLM.from_pretrained("./trained_model_bart_large_on_full_dataset")

# Matching tokenizer, read from its own local directory.
tokenizer = AutoTokenizer.from_pretrained("./trained_tokenizer_bart_large_on_full_dataset")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from datasets import Dataset

def preprocess_data(batch):
    """Tokenize one batch of documents and summaries for a seq2seq model.

    Written to be passed to ``Dataset.map(..., batched=True)``; it relies on
    the notebook-level ``tokenizer`` that is current when the map runs.

    Args:
        batch: Mapping with a 'document' entry and a 'summary' entry, each
            holding the texts for the examples in the batch.

    Returns:
        The tokenizer output for the documents (truncated and padded to
        1024 tokens) with an added 'labels' entry containing the summary
        token ids (truncated and padded to 128 tokens).
    """
    # return_tensors="pt" is intentionally not requested: the values are
    # written straight back into the dataset, so building torch tensors
    # here would only add a conversion round-trip.
    inputs = tokenizer(batch['document'], max_length=1024, truncation=True, padding="max_length")
    targets = tokenizer(batch['summary'], max_length=128, truncation=True, padding="max_length")
    inputs['labels'] = targets['input_ids']
    return inputs

# Tokenize the whole test set, handing examples to preprocess_data in batches.
tokenized_test = test_dataset.map(function=preprocess_data, batched=True)

Map: 100%|██████████| 11334/11334 [00:07<00:00, 1541.51 examples/s]


In [6]:
# Predict summaries for the first few test documents, show them next to the
# reference summaries, and save the comparison table as CSV.
from IPython.display import display

# Number of leading test rows to summarise. Capped at the dataset length so a
# short dataset produces a shorter table instead of an IndexError.
N_EXAMPLES = min(10, len(tokenized_test))

# Select the rows once. This avoids reading the entire "document" column just
# to slice it, and avoids re-reading each row (padded token columns included)
# for every field we want.
sample_rows = tokenized_test.select(range(N_EXAMPLES))
documents = list(sample_rows["document"])  # Replace "document" with your dataset's input column name
reference_summaries = list(sample_rows["summary"])  # Replace "summary" with your actual column name
actual_documents = list(documents)

predictions = []
for doc in documents:
    # Tokenize the raw document text (truncated to 1024 tokens)
    inputs = tokenizer(doc, return_tensors="pt", max_length=1024, truncation=True)

    # Generate the summary with beam search. The attention mask is passed
    # explicitly so generate() does not have to infer it from the input ids.
    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=128,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
    )
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Store the generated summary
    predictions.append(generated_summary)


# Create a DataFrame for better visualization
results_df = pd.DataFrame({
    "Actual Document": actual_documents,
    "Reference Summary": reference_summaries,
    "Generated Summary": predictions
})

display(results_df)


results_df.to_csv("summary_predictions_with_bart_large_on_test_set.csv", index=False)


Unnamed: 0,Actual Document,Reference Summary,Generated Summary
0,"Prison Link Cymru had 1,099 referrals in 2015-...","There is a ""chronic"" need for more housing for...",More affordable homes should be built for ex-p...
1,Officers searched properties in the Waterfront...,"A man has appeared in court after firearms, am...",A man has been charged in connection with the ...
2,"Jordan Hill, Brittany Covington and Tesfaye Co...",Four people accused of kidnapping and torturin...,A judge has ordered four people accused of bea...
3,The 48-year-old former Arsenal goalkeeper play...,West Brom have appointed Nicky Hammond as tech...,West Brom have appointed Steve Round as their ...
4,Restoring the function of the organ - which he...,The pancreas can be triggered to regenerate it...,"A short, intense fasting regime can restore th..."
5,But there certainly should be.\nThese are two ...,Since their impending merger was announced in ...,The merger of the world's two biggest eyewear ...
6,Media playback is not supported on this device...,"A ""medal at any cost"" approach created a ""cult...",Wendy Houvenaghel says she was dropped from Br...
7,It's no joke. But Kareem Badr says people did ...,Have you heard the one about the computer prog...,If you go to a comedy club in the US and there...
8,Relieved that the giant telecoms company would...,The reaction from BT's investors told us much ...,When news of the proposed merger of BT and Ofc...
9,"""I'm really looking forward to it - the home o...",Manager Brendan Rodgers is sure Celtic can exp...,Celtic manager Brendan Rodgers is relishing th...


# Predicting with T5-small on the test set

In [7]:
import pandas as pd
from datasets import Dataset

# Read test_data.parquet into a DataFrame and wrap it as a Hugging Face
# Dataset; this rebinds test_data and test_dataset for the cells that follow.
test_data = pd.read_parquet('test_data.parquet')
test_dataset = Dataset.from_pandas(test_data)

In [8]:
# Seq2seq model weights, read from a local directory. This rebinds `model`,
# so later cells use this checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained("./trained_model_t5-small")

# Matching tokenizer, read from its own local directory; rebinds `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("./trained_tokenizer_t5-small")

from datasets import Dataset

def preprocess_data(batch):
    """Tokenize one batch of documents and summaries for a seq2seq model.

    Written to be passed to ``Dataset.map(..., batched=True)``; it relies on
    the notebook-level ``tokenizer`` that is current when the map runs.

    Args:
        batch: Mapping with a 'document' entry and a 'summary' entry, each
            holding the texts for the examples in the batch.

    Returns:
        The tokenizer output for the documents (truncated and padded to
        1024 tokens) with an added 'labels' entry containing the summary
        token ids (truncated and padded to 128 tokens).
    """
    # NOTE(review): documents are padded to 1024 tokens here, while the
    # generation loop in this cell truncates inputs at 512 -- confirm which
    # limit is intended for this checkpoint.
    #
    # return_tensors="pt" is intentionally not requested: the values are
    # written straight back into the dataset, so building torch tensors
    # here would only add a conversion round-trip.
    inputs = tokenizer(batch['document'], max_length=1024, truncation=True, padding="max_length")
    targets = tokenizer(batch['summary'], max_length=128, truncation=True, padding="max_length")
    inputs['labels'] = targets['input_ids']
    return inputs

# Tokenize the whole test set, handing examples to preprocess_data in batches.
tokenized_test = test_dataset.map(function=preprocess_data, batched=True)

# Generate predictions for the first few test rows, show them next to the
# reference summaries, and save the comparison table as CSV.
from IPython.display import display

# Number of leading test rows to summarise. Capped at the dataset length so a
# short dataset produces a shorter table instead of an IndexError.
N_EXAMPLES = min(10, len(tokenized_test))

# Select the rows once rather than indexing tokenized_test[i] separately for
# the input, the document column and the summary column; each such access
# also loads that row's padded token columns.
sample_rows = tokenized_test.select(range(N_EXAMPLES))
actual_documents = list(sample_rows["document"])  # Replace "document" with your actual column name
reference_summaries = list(sample_rows["summary"])  # Replace "summary" with your actual column name

predictions = []
for input_text in actual_documents:
    # Tokenize the input (truncated to 512 tokens).
    # NOTE(review): T5 checkpoints are often trained with a task prefix such
    # as "summarize: " -- confirm whether this one was; none is added here.
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Generate the summary with beam search. The attention mask is passed
    # explicitly so generate() does not have to infer it from the input ids.
    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=55,
        min_length=5,
        length_penalty=2.0,
        num_beams=4,
    )
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Save the generated summary
    predictions.append(generated_summary)


# Create a DataFrame for better visualization
results_df = pd.DataFrame({
    "Actual Document": actual_documents,
    "Reference Summary": reference_summaries,
    "Generated Summary": predictions
})

display(results_df)


results_df.to_csv("summary_predictions_with_t5-small_on_test_set.csv", index=False)

Map: 100%|██████████| 11334/11334 [00:08<00:00, 1305.49 examples/s]


Unnamed: 0,Actual Document,Reference Summary,Generated Summary
0,"Prison Link Cymru had 1,099 referrals in 2015-...","There is a ""chronic"" need for more housing for...",The need for housing for prison leavers in Wal...
1,Officers searched properties in the Waterfront...,"A man has appeared in court after firearms, am...",A man has appeared in court charged with firea...
2,"Jordan Hill, Brittany Covington and Tesfaye Co...",Four people accused of kidnapping and torturin...,Four men have appeared in court charged with a...
3,The 48-year-old former Arsenal goalkeeper play...,West Brom have appointed Nicky Hammond as tech...,West Brom have appointed West Brom's former yo...
4,Restoring the function of the organ - which he...,The pancreas can be triggered to regenerate it...,A diet that regenerated a special type of cell...
5,But there certainly should be.\nThese are two ...,Since their impending merger was announced in ...,The UK eyewear industry has merged with the UK...
6,Media playback is not supported on this device...,"A ""medal at any cost"" approach created a ""cult...",British Cycling's new chair Jonathan Browning ...
7,It's no joke. But Kareem Badr says people did ...,Have you heard the one about the computer prog...,US comedian Kareem Badr says he was able to ta...
8,Relieved that the giant telecoms company would...,The reaction from BT's investors told us much ...,Ofcom's chief executive has said a break-up of...
9,"""I'm really looking forward to it - the home o...",Manager Brendan Rodgers is sure Celtic can exp...,"Celtic midfielder David Rodgers says he is ""re..."
