In [None]:
!pip install transformers

!pip install datasets

!pip install evaluate

!pip install rouge-score

#!pip install py7zr

!pip install transformers[torch]

!pip install accelerate

!pip install torch

!pip install nltk sacrebleu

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [None]:
# Importing Libraries

# Data Handling
import pandas as pd
import numpy as np
from datasets import Dataset, load_metric
import shutil

# Data Visualization
import plotly.express as px
import plotly.graph_objs as go
import plotly.subplots as sp
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.io as pio
from IPython.display import display
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

# Statistics & Mathematics
import scipy.stats as stats
import statsmodels.api as sm
from scipy.stats import shapiro, skew, anderson, kstest, gaussian_kde,spearmanr
import math

# Hiding warnings
import warnings
warnings.filterwarnings("ignore")

# Transformers
from transformers import BartTokenizer, BartForConditionalGeneration      # BART tokenizer and architecture
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments         # These will help us to fine-tune our model
from transformers import pipeline                                         # Pipeline
from transformers import DataCollatorForSeq2Seq                           # DataCollator to batch the data
import torch                                                              # PyTorch
import evaluate                                                           # Hugging Face's library for model evaluation


# Other NLP libraries
from textblob import TextBlob                                             # This is going to help us fix spelling mistakes in texts
from sklearn.feature_extraction.text import TfidfVectorizer               # This helps identify the most common terms in the corpus
import re                                                                 # This library allows us to clean text data
import nltk                                                               # Natural Language Toolkit
nltk.download('punkt')                                                    # This divides a text into a list of sentences


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# # Configuring Pandas to exhibit larger columns
# '''
# This is going to allow us to fully read the dialogues and their summary
# '''
# pd.set_option('display.max_colwidth', 1000)

In [None]:
# Configuring notebook
# `template` is read as a global by histogram_boxplot and plot_correlation, and `colormap`
# by plot_correlation, so both must be defined before those helpers are called.
seed = 42
#paper_color =
#bg_color =
colormap = 'cividis'
template = 'plotly_dark'

In [None]:
# Selecting the compute device: CUDA when torch can see a GPU, CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print("GPU is available. \nUsing GPU")
else:
    print("GPU is not available. \nUsing CPU")

def display_feature_list(features, feature_type):
    '''
    Prints a header line for the given feature type followed by the feature
    names joined with ", ".

    Parameters:
    features = Names to list; when this is empty (or otherwise falsy) 'None' is printed
    feature_type = Label placed in front of " Features: " in the header
    '''
    print(f"\n{feature_type} Features: ")
    if not features:
        print('None')
        return
    print(', '.join(features))

def describe_df(df):
    """
    Prints a basic overview of a DataFrame (shape, missing values, duplicates,
    dtypes, head and tail) and stores three feature lists in the global
    variables categorical_features, continuous_features and binary_features.

    Classification rules:
    - categorical: the column dtype is 'object'
    - binary: the dtype is not 'object' and the column has at most 2 distinct values
    - continuous: every remaining non-object column
    """

    global categorical_features, continuous_features, binary_features

    columns = list(df.columns)
    is_object = {col: df[col].dtype == 'object' for col in columns}
    categorical_features = [col for col in columns if is_object[col]]
    binary_features = [col for col in columns if df[col].nunique() <= 2 and not is_object[col]]
    continuous_features = [col for col in columns if not is_object[col] and col not in binary_features]

    kind = type(df).__name__
    n_rows, n_cols = df.shape

    print(f"\n{kind} shape: {df.shape}")
    print(f"\n{n_rows:,.0f} samples")
    print(f"\n{n_cols:,.0f} attributes")
    print(f'\nMissing Data: \n{df.isnull().sum()}')
    print(f'\nDuplicates: {df.duplicated().sum()}')
    print(f'\nData Types: \n{df.dtypes}')

    # Same display order as before: categorical, continuous, binary
    for names, label in (
        (categorical_features, 'Categorical'),
        (continuous_features, 'Continuous'),
        (binary_features, 'Binary'),
    ):
        display_feature_list(names, label)

    print(f'\n{kind} Head: \n')
    display(df.head(5))
    print(f'\n{kind} Tail: \n')
    display(df.tail(5))

GPU is available. 
Using GPU


In [None]:
def histogram_boxplot(df,hist_color, box_color, height, width, legend, name):
    '''
    Draws one figure per numeric column of df: a box plot on the left and a
    kernel density curve (in the subplot titled "Histogram") on the right.

    Parameters:
    df = DataFrame whose numeric columns are plotted
    hist_color = Line color of the density curve
    box_color = Line color of the box plot
    height and width = Image size
    legend = Whether to display the legend or not
    name = Text placed in front of "Word Count" in the figure title

    The layout reads the global variable `template`. Any exception raised while
    building a figure is caught and printed, and the loop moves on to the next column.
    '''

    numeric_columns = df.select_dtypes(include = [np.number]).columns.tolist()

    for feat in numeric_columns:
        try:
            fig = make_subplots(
                rows=1, cols=2,
                subplot_titles=["Box Plot", "Histogram"],
                horizontal_spacing=0.2,
            )

            # Density estimate evaluated on 200 evenly spaced points across the column's range
            column = df[feat]
            kde = gaussian_kde(column)
            grid = np.linspace(min(column), max(column), 200)

            density_trace = go.Scatter(x=grid, y = kde(grid), mode = 'lines',
                                       fill = 'tozeroy', name="Density", line_color=hist_color)
            box_trace = go.Box(y=column, name="Box Plot", boxmean=True, line_color=box_color)
            fig.add_trace(density_trace, row=1, col=2)
            fig.add_trace(box_trace, row=1, col=1)

            figure_title = {'text': f'<b>{name} Word Count<br><sup><i>&nbsp;&nbsp;&nbsp;&nbsp;{feat}</i></sup></b>',
                            'x': .025, 'xanchor': 'left'}
            fig.update_layout(
                title=figure_title,
                margin=dict(t=100),
                showlegend=legend,
                template = template,
                #plot_bgcolor=bg_color,paper_bgcolor=paper_color,
                height=height, width=width,
            )

            # subplot column -> (x axis title, y axis title)
            axis_titles = {
                1: ("", f"<b>Words</b>"),
                2: (f"<b>Words</b>", "<b>Frequency</b>"),
            }
            for subplot_col, (x_title, y_title) in axis_titles.items():
                fig.update_yaxes(title_text=y_title, row=1, col=subplot_col, showgrid=False)
                fig.update_xaxes(title_text=x_title, row=1, col=subplot_col, showgrid=False)

            fig.show()
            print('\n')
        except Exception as e:
            print(f"An error occurred: {e}")

In [None]:
def plot_correlation(df, title, subtitle, height, width, font_size):
    '''
    Plots an annotated heatmap of the pairwise correlations between the numeric
    features of df, showing only the values below the diagonal.

    Parameters:
    df = DataFrame whose numeric columns are correlated
    title = Text placed in front of "Heatmap" in the figure title
    subtitle = Text shown under the title
    height = Define height
    width = Define width
    font_size = Define the font size for the annotations

    The global variables `colormap` and `template` are read for the color scale
    and the layout template.
    '''
    corr = np.round(df.corr(numeric_only = True), 2)

    # Cells on or above the diagonal are overwritten with the sentinel 100 and then
    # filtered out, which leaves ragged rows holding the strictly-lower triangle.
    upper = np.triu(np.ones_like(corr, dtype = bool))
    masked = np.where(~upper, corr, 100)
    lower_rows = [[value for value in row if value != 100] for row in masked.tolist()[1:]]

    fig = ff.create_annotated_heatmap(z=lower_rows[::-1],
                                      x=corr.index.tolist()[:-1],
                                      y=corr.columns.tolist()[1:][::-1],
                                      colorscale = colormap)

    heading = {'text': f"<b>{title} Heatmap<br><sup>&nbsp;&nbsp;&nbsp;&nbsp;<i>{subtitle}</i></sup></b>",
               'x': .025, 'xanchor': 'left', 'y': .95}
    fig.update_layout(title = heading,
                      margin = dict(t=210, l = 110),
                      yaxis = dict(autorange = 'reversed', showgrid = False),
                      xaxis = dict(showgrid = False),
                      template = template,
                      #plot_bgcolor=bg_color,paper_bgcolor=paper_color,
                      height = height, width = width)

    # A second heatmap trace is added hidden and then switched on; it is created with showscale = True
    scale_trace = go.Heatmap(z = lower_rows[::-1],
                             colorscale = colormap,
                             showscale = True,
                             visible = False)
    fig.add_trace(scale_trace)
    fig.data[1].visible = True

    for annotation in fig.layout.annotations:
        annotation.font.size = font_size

    fig.show()

In [None]:
def compute_tfidf(df_column, ngram_range=(1,1), max_features=15):
    '''
    Fits a TfidfVectorizer (English stop words) on a text column and returns
    the resulting matrix as a DataFrame with one column per retained term.

    Parameters:
    df_column = Series of texts; missing values are replaced by '' before fitting
    ngram_range = (min_n, max_n) passed to the vectorizer
    max_features = Maximum number of terms passed to the vectorizer
    '''
    tfidf = TfidfVectorizer(
        ngram_range=ngram_range,
        stop_words='english',
        max_features=max_features,
    )
    weights = tfidf.fit_transform(df_column.fillna(''))
    return pd.DataFrame(weights.toarray(), columns=tfidf.get_feature_names_out())

In [None]:
# Loading data
test = pd.read_csv('samsum-test.csv')

In [None]:
# Extracting info on the test dataset
describe_df(test)


DataFrame shape: (819, 3)

819 samples

3 attributes

Missing Data: 
id          0
dialogue    0
summary     0
dtype: int64

Duplicates: 0

Data Types: 
id          object
dialogue    object
summary     object
dtype: object

Categorical Features: 
id, dialogue, summary

Continuous Features: 
None

Binary Features: 
None

DataFrame Head: 



Unnamed: 0,id,dialogue,summary
0,13862856,"Hannah: Hey, do you have Betty's number?\nAman...",Hannah needs Betty's number but Amanda doesn't...
1,13729565,Eric: MACHINE!\r\nRob: That's so gr8!\r\nEric:...,Eric and Rob are going to watch a stand-up on ...
2,13680171,"Lenny: Babe, can you help me with something?\r...",Lenny can't decide which trousers to buy. Bob ...
3,13729438,"Will: hey babe, what do you want for dinner to...",Emma will be home soon and she will let Will k...
4,13828600,"Ollie: Hi , are you in Warsaw\r\nJane: yes, ju...",Jane is in Warsaw. Ollie and Jane has a party....



DataFrame Tail: 



Unnamed: 0,id,dialogue,summary
814,13611902-1,Alex: Were you able to attend Friday night's b...,Benjamin didn't come to see a basketball game ...
815,13820989,Jamilla: remember that the audition starts at ...,The audition starts at 7.30 P.M. in Antena 3.
816,13717193,"Marta: <file_gif>\r\nMarta: Sorry girls, I cli...","Marta sent a file accidentally,"
817,13829115,Cora: Have you heard how much fuss British med...,There was a meet-and-greet with James Charles ...
818,13818810,Rachel: <file_other>\r\nRachel: Top 50 Best Fi...,Rachel sends a list of Top 50 films of 2018. J...


In [None]:
# Removing 'id' from categorical features list
# A filter is used instead of list.remove() so that re-running this cell does not raise
# ValueError once 'id' is already gone.
categorical_features = [col for col in categorical_features if col != 'id']

In [None]:
print(test['dialogue'].iloc[0])

Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye


In [None]:
def clean_tags(text):
    '''
    Strips markup-style tags (e.g. <file_gif>) from a text and drops the
    dialogue lines that are left without any content.

    Parameters:
    text = String with one turn per line, written as "Speaker: utterance"

    Returns the cleaned string; the surviving lines are joined with a newline.
    '''
    without_tags = re.sub(r'<.*?>', '', text) # Replacing tags text by an empty string

    # Removing empty dialogues: a turn is empty when only whitespace follows the speaker's colon.
    # The speaker part is matched with [^:]* so that a real message that merely ends with a
    # colon ("Amy: here is the list:") is kept instead of being discarded.
    empty_turn = re.compile(r'^[^:]*:\s*$')
    kept_lines = [line for line in without_tags.split('\n') if not empty_turn.match(line)]

    return '\n'.join(kept_lines)

In [None]:
test1 = clean_tags(test['dialogue'].iloc[0]) # Applying function to example text
print('\n' *3)
print(test1)





Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye


In [None]:
# Defining function to clean every text in the dataset.
def clean_df(df, cols):
    '''
    Returns a copy of df in which every column listed in cols has its missing
    values replaced by '' and clean_tags applied to each entry.

    Parameters:
    df = DataFrame to clean; it is left unmodified
    cols = Names of the text columns to clean
    '''
    # Work on a copy so the caller's DataFrame (and anything already displayed from it) stays intact
    cleaned = df.copy()
    for col in cols:
        cleaned[col] = cleaned[col].fillna('').apply(clean_tags)
    return cleaned

In [None]:
test = clean_df(test,['dialogue', 'summary'])

In [None]:
test.tail(3) # Visualizing results

Unnamed: 0,id,dialogue,summary
816,13717193,"Marta: Sorry girls, I clicked something by acc...","Marta sent a file accidentally,"
817,13829115,Cora: Have you heard how much fuss British med...,There was a meet-and-greet with James Charles ...
818,13818810,Rachel: Top 50 Best Films of 2018\r\nRachel: :...,Rachel sends a list of Top 50 films of 2018. J...


In [None]:
test_ds = Dataset.from_pandas(test)

In [None]:
print('\n' * 2)
print(test_ds)




Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 819
})


In [None]:
test_ds[0] # Visualizing the first row

{'id': '13862856',
 'dialogue': "Hannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye",
 'summary': "Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry."}

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from datasets import load_metric

In [None]:
import os

In [None]:
# SECURITY: a Hugging Face access token used to be hard-coded on this line. A credential that
# has been saved in a notebook must be treated as leaked and should be revoked.
# Provide HF_TOKEN through the environment instead (shell export, Colab secrets, ...).
if not os.environ.get('HF_TOKEN'):
    print("HF_TOKEN is not set in the environment.")

In [None]:
# Load the pre-trained PEGASUS model and tokenizer
# The same checkpoint name is handed to both from_pretrained calls
model_name = "google/pegasus-xsum"
# The model is moved to `device` (CUDA when available, otherwise CPU)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
tokenizer = PegasusTokenizer.from_pretrained(model_name)  # No need for .to(device) here

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

In [None]:
# Make sure you have these packages installed
!pip install rouge nltk transformers

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
!pip install nltk
import nltk
nltk.download('wordnet')



[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Define batch size (optimized to fit within 12 GB RAM)
batch_size = 4

In [None]:
from datasets import load_metric

In [None]:
# Load ROUGE, BLEU, and METEOR metrics
# NOTE(review): load_metric is the legacy loader of the `datasets` package; the `evaluate`
# package imported at the top of this notebook provides evaluate.load() -- confirm that the
# installed `datasets` version still ships load_metric before re-running.
rouge_metric = load_metric('rouge')    # read by compute_rouge
bleu_metric = load_metric('bleu')      # read by compute_bleu
meteor_metric = load_metric('meteor')  # not referenced by any other visible cell

Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
from torch.cuda.amp import autocast

In [None]:
# Function to generate summaries in batches
def generate_summaries_in_batches(texts, batch_size):
    """
    Summarises `texts` chunk by chunk using the global `model`, `tokenizer`
    and `device`, and returns the decoded summaries in input order.

    Parameters:
    texts = List of input strings
    batch_size = Number of texts tokenized and generated together

    Each chunk is tokenized with truncation and padding, generated under
    torch.no_grad() and autocast() with beam search (5 beams, at most 60
    tokens, early stopping), then decoded without special tokens.
    """
    generated = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        encoded = tokenizer(chunk, truncation=True, padding=True, return_tensors="pt").to(device)  # Move to device
        with torch.no_grad(), autocast():
            output_ids = model.generate(**encoded, max_length=60, num_beams=5, early_stopping=True)
        generated += tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return generated

In [None]:
# Generate summaries for the test dataset
test_texts = test['dialogue'].tolist()
decoded_summaries = generate_summaries_in_batches(test_texts, batch_size)


In [None]:
# Prepare references
references = test['summary'].tolist()

In [None]:
# Function to compute ROUGE scores
def compute_rouge(predictions, references):
    """
    Scores `predictions` against `references` with the global `rouge_metric`
    and returns {rouge_name: F-measure * 100}.

    Two result layouts are accepted:
    - aggregate objects exposing `.mid.fmeasure` (the layout this function originally read)
    - plain numbers, where the value itself is taken as the F-measure
      (e.g. the ROUGE implementation of the `evaluate` package)
    """
    raw_scores = rouge_metric.compute(predictions=predictions, references=references)

    percentages = {}
    for name, value in raw_scores.items():
        mid = getattr(value, 'mid', None)
        fmeasure = mid.fmeasure if mid is not None else value
        percentages[name] = float(fmeasure) * 100
    return percentages

# Function to compute BLEU score
def compute_bleu(predictions, references):
    """
    Scores `predictions` against `references` with the global `bleu_metric`
    and returns {'bleu': score}, scaled by 100 like the values of compute_rouge.

    Parameters:
    predictions = List of generated texts (strings)
    references = List of reference texts (strings), one per prediction
    """
    # The `bleu` metric loaded in this notebook scores token sequences, not raw strings,
    # so both sides are split on whitespace first.
    tokenized_predictions = [prediction.split() for prediction in predictions]
    # Each prediction gets a list of references (here a single one)
    tokenized_refs = [[reference.split()] for reference in references]

    # Compute BLEU scores
    bleu_result = bleu_metric.compute(predictions=tokenized_predictions, references=tokenized_refs)
    # This metric reports its value under 'bleu' (0-1); it has no 'score' entry
    return {'bleu': bleu_result['bleu'] * 100}


In [None]:
# Calculate and print the scores
rouge_scores = compute_rouge(decoded_summaries, references)
print("ROUGE Scores:")
print(rouge_scores)

ROUGE Scores:
{'rouge1': 21.836228385565796, 'rouge2': 3.5137314072426604, 'rougeL': 17.21532652595203, 'rougeLsum': 17.18373797768097}


In [None]:
!pip install py7zr

Collecting py7zr
  Downloading py7zr-0.21.0-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.6/67.6 kB[0m [31m777.6 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting texttable (from py7zr)
  Downloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Collecting pycryptodomex>=3.16.0 (from py7zr)
  Downloading pycryptodomex-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyzstd>=0.15.9 (from py7zr)
  Downloading pyzstd-0.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.8/413.8 kB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyppmd<1.2.0,>=1.1.0 (from py7zr)
  Downloading pyppmd-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (138 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
import numpy as np

In [None]:
# Tokenize references into sentences
tokenized_references = [sent_tokenize(ref) for ref in references]

In [None]:
# Transpose the tokenized_references list
# zip() stops at the shortest inner list: row k holds the k-th sentence of every reference,
# and there are only as many rows as the shortest reference has sentences.
transposed_references = list(map(list, zip(*tokenized_references)))

In [None]:
# Convert the element at index 0 of the transposed_references list to a string
# str() of a list gives its printed form, i.e. brackets, quotes and commas are part of the text
transposed_references[0] = str(transposed_references[0])


In [None]:
# Convert the string in the list to a list of strings
# split() without arguments cuts on whitespace, so the pieces keep the brackets, quotes and
# commas that str() added
transposed_references[0] = transposed_references[0].split()

In [None]:
# Define the candidate
candidate = decoded_summaries # The full list of generated summaries (one string per test dialogue), not a single sample

In [None]:
from nltk.translate.bleu_score import sentence_bleu

In [None]:
# Calculate the BLEU score
# Corpus-level BLEU over aligned (reference, generated summary) pairs, computed straight from
# `references` and `decoded_summaries`. sentence_bleu() expects the tokens of ONE hypothesis;
# feeding it the whole list of summaries together with `transposed_references` compared
# unrelated items, which is why the score came out as 0.
from nltk.translate.bleu_score import corpus_bleu

bleu_score = corpus_bleu(
    [[nltk.word_tokenize(reference)] for reference in references],  # one reference per summary
    [nltk.word_tokenize(summary) for summary in decoded_summaries],
)
print("BLEU Score:")
print(bleu_score)

BLEU Score:
0


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import meteor_score as meteor_scorer

In [None]:
# Tokenize references into lists of words
tokenized_references = [word_tokenize(ref) for ref in references]

# Tokenize decoded summaries into lists of words
tokenized_summaries = [word_tokenize(summary) for summary in decoded_summaries]

In [None]:
# Define the compute_meteor function
def compute_meteor(predictions, references):
    """
    Returns the mean METEOR score over aligned (prediction, reference) pairs.

    Parameters:
    predictions = Iterable of tokenized predictions (lists of words)
    references = Iterable of tokenized references (lists of words), one per prediction

    Each pair is scored with meteor_scorer, the reference being wrapped in a
    list. When there are no pairs, 0.0 is returned instead of dividing by zero.
    """
    meteor_scores = [meteor_scorer([ref], pred) for pred, ref in zip(predictions, references)]
    if not meteor_scores:
        # Nothing to average
        return 0.0
    return sum(meteor_scores) / len(meteor_scores)

In [None]:
# Calculate the METEOR score
meteor_score = compute_meteor(tokenized_summaries, tokenized_references)
print("METEOR Score:")
print(meteor_score)

METEOR Score:
0.028454167486141144
