# **Text Summarization using BART Transformers**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install tensorflow
!pip install torch
!pip install scikit-learn
!pip install pandas
!pip install datasets


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

## Loading data

In [20]:
import pandas as pd
import numpy as np
import nltk
import transformers
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

 I will use the processed data to train and fine tune the BART model on reviews as it is pretarained on summarizing articles. The columns used will be the 'clean_review_text' which is the customers review and it will be the input. The 'clean_summary' column is our target.

In [4]:
df = pd.read_csv("/content/drive/MyDrive/datasets_BART_project/processed_subset_data.csv")

### Split the datasets into training, validation and test datasets

In [5]:
# Reducing training set size

sample_size = int(0.05 * len(df))
subset = df.sample(n=sample_size).copy()

In [6]:
subset.shape

(9738, 19)

In [15]:
# split the datasets into training, validation and test datasets
X = subset["clean_review_text"]
y = subset["clean_summary"]

X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=0.8, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.6, random_state=42)

# Combine splits into DataFrames
train_df = pd.DataFrame({"clean_review_text": X_train, "clean_summary": y_train}).reset_index(drop=True)
validation_df = pd.DataFrame({"clean_review_text": X_valid, "clean_summary": y_valid}).reset_index(drop=True)
test_df = pd.DataFrame({"clean_review_text": X_test, "clean_summary": y_test}).reset_index(drop=True)


In [16]:
# Check size
print(train_df.shape)
print(validation_df.shape)
print(test_df.shape)

(7790, 2)
(779, 2)
(1169, 2)


In [17]:
train_df.head()

Unnamed: 0,clean_review_text,clean_summary
0,wow im glad purchased mask saw immediate resul...,love
1,im heaven thyroid problem affecting skin nothi...,excellent must
2,supossed extent docking station port thing not...,works great
3,recently tried new oat cleansing balm overall ...,good value good product
4,nice soft feel cast fits well snug may little ...,fits snuggly


In [21]:
# check number of null values and remove them in the training, validation, and test set

print(train_df.isnull().sum())
print(validation_df.isnull().sum())

clean_review_text    0
clean_summary        0
dtype: int64
clean_review_text    0
clean_summary        0
dtype: int64


In [19]:
train_df=train_df.dropna(axis=0)

validation_df = validation_df.dropna(axis=0)

test_df = test_df.dropna(axis=0)

In [22]:
# identifiers for each split
train_df["id"] = range(len(train_df))
validation_df["id"] = range(len(train_df), len(train_df) + len(validation_df))
test_df["id"] = range(len(train_df) + len(validation_df), len(train_df) + len(validation_df) + len(test_df))

# Convert from dataframe to hugging face dataset
train_dataset = Dataset.from_pandas(train_df)
validation_dataset = Dataset.from_pandas(validation_df)
test_dataset = Dataset.from_pandas(test_df)



In [23]:
print(validation_dataset)

Dataset({
    features: ['clean_review_text', 'clean_summary', 'id', '__index_level_0__'],
    num_rows: 773
})


In [24]:
# Create a DatasetDict
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset
})

In [25]:
# Save the datasets
dataset_dict.save_to_disk("/content/drive/MyDrive/datasets_finetuning_BART/training_dataset_splits")

Saving the dataset (0/1 shards):   0%|          | 0/7738 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/773 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1159 [00:00<?, ? examples/s]