In [44]:
import os
import pandas as pd
import numpy as np
import librosa
import yaml

with open("../config/project_config.yaml", "r") as f:
    config = yaml.safe_load(f)

# Access model paths
model_large = config["models"]["summarization"]["large"]
model_small = config["models"]["summarization"]["small"]
model_tiny = config["models"]["summarization"]["tiny"]

# Access other config
max_length_large = config["settings"]["max_length_large"]
max_length_small = config["settings"]["max_length_small"]
min_length_large = config["settings"]["min_length_large"]
min_length_small = config["settings"]["min_length_small"]

# Access audio paths
data_path = config["data"]["processed_data_path"]

In [50]:
# Get datapath from configuration

df = pd.read_csv(data_path)[['speaker_id', 'chapter_id', 'sentence_id', 'file_path', 'wav2vec_transcript_clean']].rename(
    columns={
        'wav2vec_transcript_clean': 'transcript'
    })
df.head()

Unnamed: 0,speaker_id,chapter_id,sentence_id,file_path,transcript
0,1069,133709,0,/Users/christinexu/Desktop/MLDS/spring2025/clo...,had laid before her a pair of alternatives now...
1,1069,133709,41,/Users/christinexu/Desktop/MLDS/spring2025/clo...,vivid light of a judgment day the girl moreove...
2,1069,133709,16,/Users/christinexu/Desktop/MLDS/spring2025/clo...,asked isabel abruptly why as a kind of complim...
3,1069,133709,36,/Users/christinexu/Desktop/MLDS/spring2025/clo...,grown used to feeling rich the consciousness i...
4,1069,133709,20,/Users/christinexu/Desktop/MLDS/spring2025/clo...,henrietta doesnt oh hang henrietta said ralph ...


In [51]:
# Merge sentences in the sequence of sentence_id based on speaker_id and chapter_id
def merge_transcripts_by_chapter(df):
    """
    Merge transcripts from the same speaker and chapter, ordered by sentence_id.
    
    Args:
        df: DataFrame with speaker_id, chapter_id, sentence_id, and transcript columns
        
    Returns:
        DataFrame with merged transcripts by chapter
    """
    # Make a copy to avoid modifying the original
    merged_df = df.copy()
    
    # Ensure sentence_id is numeric for proper sorting
    merged_df['sentence_id'] = merged_df['sentence_id'].astype(int)
    
    # Sort by speaker_id, chapter_id, and sentence_id
    merged_df = merged_df.sort_values(['speaker_id', 'chapter_id', 'sentence_id'])
    
    # Group by speaker_id and chapter_id, and aggregate
    chapter_df = merged_df.groupby(['speaker_id', 'chapter_id']).agg({
        'transcript': lambda x: ' '.join(x),
        'sentence_id': 'count',  # Count sentences in each chapter
        'file_path': lambda x: list(x)  # Keep track of original files
    }).reset_index()
    
    # Rename columns for clarity
    chapter_df = chapter_df.rename(columns={
        'sentence_id': 'sentence_count',
        'transcript': 'chapter_transcript',
        'file_path': 'source_files'
    })
    
    return chapter_df

In [52]:
# Apply the function to your dataframe
chapter_transcripts = merge_transcripts_by_chapter(df)

# Display the first few merged chapters
print(f"Total chapters: {len(chapter_transcripts)}")
chapter_transcripts.head()

Total chapters: 585


Unnamed: 0,speaker_id,chapter_id,chapter_transcript,sentence_count,source_files
0,19,198,northanger abbey this little work was finished...,38,[/Users/christinexu/Desktop/MLDS/spring2025/cl...
1,19,227,chapter thirty catherines disposition was not ...,73,[/Users/christinexu/Desktop/MLDS/spring2025/cl...
2,26,495,in sixteen sixty five written by a citizen who...,91,[/Users/christinexu/Desktop/MLDS/spring2025/cl...
3,26,496,it was now mid july and the plague which had c...,27,[/Users/christinexu/Desktop/MLDS/spring2025/cl...
4,27,123349,some of the scottish emigrants heated with rep...,59,[/Users/christinexu/Desktop/MLDS/spring2025/cl...


### Define long audiofile

In [None]:
# Get the duration of each audio file
def get_duration(file_path):
    try:
        duration = librosa.get_duration(filename=file_path)
        return duration
    except:
        return None

# Apply to your DataFrame
df["duration_sec"] = df["file_path"].apply(get_duration)

	This alias will be removed in version 1.0.
  duration = librosa.get_duration(filename=file_path)


Unnamed: 0,file_path,duration_sec,is_long
0,/Users/christinexu/Desktop/MLDS/spring2025/clo...,15.06,True
1,/Users/christinexu/Desktop/MLDS/spring2025/clo...,8.925,False
2,/Users/christinexu/Desktop/MLDS/spring2025/clo...,16.405,True
3,/Users/christinexu/Desktop/MLDS/spring2025/clo...,14.165,True
4,/Users/christinexu/Desktop/MLDS/spring2025/clo...,15.06,True


In [23]:
# Define threshold at the 75th percentile
q75_duration = df["duration_sec"].quantile(0.75)

# Flag long audio files as those above the 75th percentile
df["is_long"] = df["duration_sec"] > q75_duration

# Optional: Print threshold
print(f"75th percentile duration threshold: {q75_duration:.2f} seconds")

# Preview result
df[["file_path", "duration_sec", "is_long"]].head()

75th percentile duration threshold: 15.16 seconds


Unnamed: 0,file_path,duration_sec,is_long
0,/Users/christinexu/Desktop/MLDS/spring2025/clo...,15.06,False
1,/Users/christinexu/Desktop/MLDS/spring2025/clo...,8.925,False
2,/Users/christinexu/Desktop/MLDS/spring2025/clo...,16.405,True
3,/Users/christinexu/Desktop/MLDS/spring2025/clo...,14.165,False
4,/Users/christinexu/Desktop/MLDS/spring2025/clo...,15.06,False


In [27]:
df["word_count"] = df["whisper_transcript"].apply(lambda x: len(x.split()))
q75_words = df["word_count"].quantile(0.75)
df["is_long"] = df["word_count"] > q75_words
print(f"75th percentile word count threshold: {q75_words:.0f} words")
df[["file_path", "word_count", "is_long"]].head()

75th percentile word count threshold: 42 words


Unnamed: 0,file_path,word_count,is_long
0,/Users/christinexu/Desktop/MLDS/spring2025/clo...,42,False
1,/Users/christinexu/Desktop/MLDS/spring2025/clo...,25,False
2,/Users/christinexu/Desktop/MLDS/spring2025/clo...,32,False
3,/Users/christinexu/Desktop/MLDS/spring2025/clo...,36,False
4,/Users/christinexu/Desktop/MLDS/spring2025/clo...,31,False


In [28]:
df.is_long.value_counts()

is_long
False    21502
True      7037
Name: count, dtype: int64

## Perform summarization task

In [20]:
from transformers import pipeline

# Long and short summarizer (same model, different max/min lengths)
bart_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Tiny summarizer (pegasus-xsum is optimized for one-sentence summaries)
pegasus_summarizer = pipeline("summarization", model="google/pegasus-xsum")


  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps:0
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps:0


In [21]:
#  Define Summary Functions
def generate_bart_summary(text, min_len=60, max_len=300):
    try:
        return bart_summarizer(text, min_length=min_len, max_length=max_len, do_sample=False)[0]["summary_text"]
    except Exception as e:
        return f"[ERROR: {e}]"

def generate_pegasus_summary(text):
    try:
        return pegasus_summarizer(text)[0]["summary_text"]
    except Exception as e:
        return f"[ERROR: {e}]"


In [57]:
test_sum = chapter_transcripts['chapter_transcript'].iloc[0]
# Generate summaries for long audio files
print(generate_bart_summary(test_sum, min_len=100, max_len=300))

Northanger abbey by catherine morland was written in the year eighteen o three and intended for immediate publication it was disposed of to a bookseller it was even advertised neither the author nor the public have any other concern than as some observation is necessary upon those parts of the work which thirteen years have made comparatively obsolete. The author never could learn or understand anything before she was taught and sometimes not even then for she was often inattentive and occasionally stupid her mother was three months in teaching her only to repeat the beggars petition and after all her next sister could say it better than she did.


In [56]:
test_sum

'northanger abbey this little work was finished in the year eighteen o three and intended for immediate publication it was disposed of to a bookseller it was even advertised neither the author nor the public have any other concern than as some observation is necessary upon those parts of the work which thirteen years have made comparatively obsolete the public are entreated to bear in mind that thirteen years have passed since it was finished many more since it was begun and that during that period places manners books and opinions have undergone considerable changes chapter one no one who had ever seen catherine morland in her infancy would have supposed her born to be an heroine her situation in life the character of her father and mother her own person and disposition were all equally against her her father was a clergyman without being neglected or poor and a very respectable man her mother was a woman of useful plain sense with a good temper and what is more remarkable with a good

In [None]:
# Apply to Long Audio Files
# Optionally filter to reduce compute time
long_df = df[df["is_long"]].copy()

# Generate three types of summaries
long_df["long_summary"] = long_df["whisper_transcript"].apply(lambda x: generate_bart_summary(x, min_len=100, max_len=300))
long_df["short_summary"] = long_df["whisper_transcript"].apply(lambda x: generate_bart_summary(x, min_len=40, max_len=100))
long_df["tiny_summary"] = long_df["whisper_transcript"].apply(generate_pegasus_summary)# 