In [6]:
# sentiment_analysis_per_second.py
import re
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [None]:
# -----------------------------
# Helper functions
# -----------------------------


def time_to_seconds(t):
    """Convert HH:MM:SS.mmm to seconds (float)."""
    h, m, s = t.split(":")
    return int(h) * 3600 + int(m) * 60 + float(s)


def parse_transcript(text):
    """
    Parse blocks like:
    [Speaker A (00:00:00.000 --> 00:00:01.200)]
    I'm nervous as hell.
    """
    pattern = re.compile(
        r"\[(Speaker [A-Z]) \((.*?) --> (.*?)\)\]\s*(.*?)\n(?=\[|$)",
        re.DOTALL,
    )

    entries = []

    for speaker, start, end, utterance in pattern.findall(text):
        start_s = time_to_seconds(start)
        end_s = time_to_seconds(end)

        utterance = utterance.strip().replace("\n", " ")
        entries.append((speaker, start_s, end_s, utterance))

    return entries


def expand_to_seconds(entries):
    """
    For each second, assign the sentiment of any utterance covering that second.
    """
    analyzer = SentimentIntensityAnalyzer()
    results = []

    for speaker, start, end, utt in entries:
        sentiment = analyzer.polarity_scores(utt)

        # seconds covered by this utterance
        start_sec = int(start)
        end_sec = int(end)

        for sec in range(start_sec, end_sec + 1):
            # Only include seconds actually inside the interval
            if start <= sec < end:
                results.append({
                    "second": sec,
                    "speaker": speaker,
                    "pos": sentiment["pos"],
                    "neu": sentiment["neu"],
                    "neg": sentiment["neg"],
                    "compound": sentiment["compound"],
                    "text": utt
                })

    return results


def save_csv(results, out_path="StephenKeala.csv"):
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["second", "speaker", "compound", "pos", "neu", "neg", "text"])

        for row in results:
            writer.writerow([
                row["second"],
                row["speaker"],
                row["compound"],
                row["pos"],
                row["neu"],
                row["neg"],
                row["text"]
            ])

    print(f"Saved CSV to {out_path}")


In [15]:
with open("DatasetCercetare/Transcription/Stephen_Keala.wav.txt", "r", encoding="utf-8") as f:
    transcript_text = f.read()

entries = parse_transcript(transcript_text)
results = expand_to_seconds(entries)
save_csv(results, "sentiment_output.csv")

Saved CSV to sentiment_output.csv


In [4]:
def time_to_seconds(t):
    """Convert HH:MM:SS.mmm to seconds (float)."""
    # Assuming input format is H:M:S.m or M:S.m or S.m
    parts = list(map(float, t.split(':')))
    
    if len(parts) == 1: # Only seconds (e.g., "1.200")
        return parts[0]
    elif len(parts) == 2: # Minutes and seconds (e.g., "00:01.200")
        m, s = parts
        return int(m) * 60 + s
    elif len(parts) == 3: # Hours, minutes, and seconds (e.g., "00:00:01.200")
        h, m, s = parts
        return int(h) * 3600 + int(m) * 60 + s
    else:
        raise ValueError(f"Invalid time format: {t}")


def parse_transcript(text):
    """
    Parse blocks like:
    [Speaker A (00:00:00.000 --> 00:00:01.200)]
    I'm nervous as hell.
    
    The regex is adjusted slightly to be robust to the timestamp format 
    used in your previous VTT files and the expected bracketed format.
    """
    pattern = re.compile(
        r"\[(Speaker [A-Z]) \((.*?) --> (.*?)\)\]\s*(.*?)\n(?=\[|$)",
        re.DOTALL,
    )

    entries = []

    # Ensure the time format passed to time_to_seconds can handle H:M:S.m or just M:S.m
    for speaker, start, end, utterance in pattern.findall(text):
        try:
            start_s = time_to_seconds(start)
            end_s = time_to_seconds(end)
        except ValueError as e:
            print(f"Skipping entry due to time error: {e} in {start} or {end}")
            continue

        utterance = utterance.strip().replace("\n", " ")
        entries.append((speaker, start_s, end_s, utterance))

    return entries


def expand_to_seconds(entries):
    """
    For each second, assign the sentiment of any utterance covering that second.
    """
    analyzer = SentimentIntensityAnalyzer()
    results = []

    # To avoid processing the same second multiple times (e.g., if utterances overlap),
    # use a dictionary to track the most recent/relevant entry for each second.
    second_map = {} 
    
    for speaker, start, end, utt in entries:
        sentiment = analyzer.polarity_scores(utt)

        start_sec = int(start)
        end_sec = int(end) # This is the last second *covered*
        
        # Iterate over all whole seconds from the start of the utterance
        for sec in range(start_sec, end_sec + 1):
            # Only include seconds actually inside the float interval [start, end)
            if start <= sec < end:
                # Create the data structure for this second
                data = {
                    "second": sec,
                    "speaker": speaker,
                    "pos": sentiment["pos"],
                    "neu": sentiment["neu"],
                    "neg": sentiment["neg"],
                    "compound": sentiment["compound"],
                    "text": utt
                }
                # Store it in the map, overwriting if a later utterance covers this second.
                second_map[sec] = data

    # Convert the map back to a list, sorted by second
    results = [second_map[sec] for sec in sorted(second_map.keys())]
    
    return results


def save_csv(results, out_path):
    """Saves the processed results to the specified CSV file."""
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["second", "speaker", "compound", "pos", "neu", "neg", "text"])

        for row in results:
            writer.writerow([
                row["second"],
                row["speaker"],
                row["compound"],
                row["pos"],
                row["neu"],
                row["neg"],
                row["text"]
            ])

    print(f"Saved CSV to {out_path}")

# -----------------------------
# Main processing logic
# -----------------------------

def process_all_transcriptions(base_dir="DatasetCercetare"):
    """
    Iterates through all .txt files in the Transcription folder,
    processes them, and saves the output as .csv files.
    """
    input_dir = os.path.join(base_dir, "Transcription")
    
    if not os.path.exists(input_dir):
        print(f"Error: Directory not found. Please ensure the path '{input_dir}' exists.")
        return

    print(f"--- Starting batch processing in: {input_dir} ---")
    
    for filename in os.listdir(input_dir):
        if filename.endswith(".txt"):
            input_filepath = os.path.join(input_dir, filename)
            
            # Determine output filename (e.g., Stephen_Keala.wav.txt -> Stephen_Keala.wav.csv)
            # Removes '.txt' and adds '.csv'
            base_name = filename[:-4]
            output_filename = f"{base_name}.csv"
            output_filepath = os.path.join(input_dir, output_filename)
            
            print(f"Processing file: {filename}")
            
            try:
                # 1. Read the transcript text
                with open(input_filepath, "r", encoding="utf-8") as f:
                    transcript_text = f.read()
                
                # 2. Parse the transcript
                entries = parse_transcript(transcript_text)
                
                # 3. Expand to seconds and calculate sentiment
                results = expand_to_seconds(entries)
                
                # 4. Save the CSV
                save_csv(results, output_filepath)
                
            except Exception as e:
                print(f"!! An ERROR occurred while processing {filename}: {e} !!")

In [8]:
process_all_transcriptions("DatasetCercetare")

--- Starting batch processing in: DatasetCercetare/Transcription ---
Processing file: Alex_Paige.txt
Saved CSV to DatasetCercetare/Transcription/Alex_Paige.csv
Processing file: Marshall_Britney.txt
Saved CSV to DatasetCercetare/Transcription/Marshall_Britney.csv
Processing file: Stephen_Keala.txt
Saved CSV to DatasetCercetare/Transcription/Stephen_Keala.csv
Processing file: Stephen_Miette.txt
Saved CSV to DatasetCercetare/Transcription/Stephen_Miette.csv
Processing file: Sarah_TexasGuy.txt
Saved CSV to DatasetCercetare/Transcription/Sarah_TexasGuy.csv
Processing file: Zahariah_Erin.txt
Saved CSV to DatasetCercetare/Transcription/Zahariah_Erin.csv
Processing file: Chase_Gianna.txt
Saved CSV to DatasetCercetare/Transcription/Chase_Gianna.csv
Processing file: Nate_Alexis.txt
Saved CSV to DatasetCercetare/Transcription/Nate_Alexis.csv
Processing file: Moose_Paige.txt
Saved CSV to DatasetCercetare/Transcription/Moose_Paige.csv
Processing file: Eli_Gianna.txt
Saved CSV to DatasetCercetare/Tr