In [1]:
# sentiment_analysis_per_second.py
import re
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [10]:
import re
import csv
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# -----------------------------
# Helper functions
# -----------------------------


def time_to_seconds(t):
    """Turn a colon-separated ``HH:MM:SS.mmm`` timestamp into float seconds.

    The string must split into exactly three ``:``-separated fields; hours and
    minutes are parsed as integers and the seconds field as a float (so the
    fractional part is optional). A string with a different number of fields,
    or with non-numeric fields, raises ``ValueError``.
    """
    hours, minutes, seconds = t.split(":")
    # Whole-number part first (kept as an int), fractional seconds added last.
    whole_part = int(hours) * 3600 + int(minutes) * 60
    return whole_part + float(seconds)


def parse_transcript(text):
    """
    Parse a diarized transcript into a list of utterance tuples.

    Expected block format:
    [Speaker A (00:00:00.000 --> 00:00:01.200)]
    I'm nervous as hell.

    Each block starts with a header ``[Speaker X (START --> END)]``; the
    utterance is everything between that header and the next header (or the
    end of the text). Slicing between headers, rather than matching the
    utterance with a single regex, means that:

    * the final utterance is kept even if the text has no trailing newline,
    * utterance lines that start with ``[`` (e.g. ``[laughs]``) are kept,
    * a header with no utterance yields an empty string instead of swallowing
      the following header.

    Parameters
    ----------
    text : str
        Full transcript content.

    Returns
    -------
    list[tuple[str, float, float, str]]
        ``(speaker, start_seconds, end_seconds, utterance)`` in document
        order. Utterance whitespace is stripped and inner newlines are
        replaced by single spaces. Returns ``[]`` when no header is found.

    Any exception raised by ``time_to_seconds`` for a malformed timestamp
    propagates to the caller.
    """
    header_re = re.compile(r"\[(Speaker [A-Z]) \((.*?) --> (.*?)\)\]")
    headers = list(header_re.finditer(text))

    entries = []

    for idx, header in enumerate(headers):
        speaker, start, end = header.groups()

        # The utterance runs up to the next header, or to the end of the text
        # for the last block.
        if idx + 1 < len(headers):
            body_end = headers[idx + 1].start()
        else:
            body_end = len(text)

        utterance = text[header.end():body_end].strip().replace("\n", " ")
        entries.append(
            (speaker, time_to_seconds(start), time_to_seconds(end), utterance)
        )

    return entries


def expand_to_seconds(entries):
    """
    Expand utterances into one row per whole second they cover.

    ``entries`` is an iterable of ``(speaker, start, end, utterance)`` tuples.
    Each utterance is scored once with ``polarity_scores``; a row is then
    emitted for every integer second ``s`` satisfying ``start <= s < end``.
    An utterance whose interval contains no integer second produces no rows.

    Returns a list of dicts with keys ``second``, ``speaker``, ``pos``,
    ``neu``, ``neg``, ``compound`` and ``text``, in entry order.
    """
    scorer = SentimentIntensityAnalyzer()
    rows = []

    for who, t_start, t_end, spoken in entries:
        scores = scorer.polarity_scores(spoken)

        # Candidate ticks span the truncated bounds; the filter below keeps
        # only those that actually fall inside [t_start, t_end).
        candidate_ticks = range(int(t_start), int(t_end) + 1)

        rows.extend(
            {
                "second": tick,
                "speaker": who,
                "pos": scores["pos"],
                "neu": scores["neu"],
                "neg": scores["neg"],
                "compound": scores["compound"],
                "text": spoken,
            }
            for tick in candidate_ticks
            if t_start <= tick < t_end
        )

    return rows


def save_csv(results, out_path="StephenKeala.csv"):
    """Write per-second sentiment rows to a UTF-8 CSV file and report the path.

    ``results`` is an iterable of dicts holding at least the keys listed in
    ``columns`` below; a header line is written first, then one line per dict
    with the values in that same column order. An existing file at
    ``out_path`` is overwritten.
    """
    # Single source of truth for both the header and the per-row value order.
    columns = ["second", "speaker", "compound", "pos", "neu", "neg", "text"]

    with open(out_path, "w", newline="", encoding="utf-8") as handle:
        sheet = csv.writer(handle)
        sheet.writerow(columns)
        sheet.writerows([record[col] for col in columns] for record in results)

    print(f"Saved CSV to {out_path}")


In [15]:
# Driver cell: read one transcript, score it per second, and write the CSV.
# Requires parse_transcript, expand_to_seconds and save_csv to be defined
# by an earlier cell.
with open("DatasetCercetare/Transcription/Stephen_Keala.wav.txt", "r", encoding="utf-8") as f:
    transcript_text = f.read()

# (speaker, start_s, end_s, utterance) tuples, one per transcript block.
entries = parse_transcript(transcript_text)
# One dict per (utterance, whole second) pair with the sentiment scores.
results = expand_to_seconds(entries)
# Explicit output name; overrides save_csv's default out_path.
save_csv(results, "sentiment_output.csv")

Saved CSV to sentiment_output.csv


In [12]:

# Print a per-second sentiment table (Speaker A vs Speaker B) for the
# transcript at `filepath`.
# NOTE(review): `filepath`, `parse_transcript_from_content` and
# `analyze_sentiment_per_second` are not defined in the visible cells —
# confirm they exist before this cell runs on a fresh kernel.
if not os.path.exists(filepath):
    print(f"❌ File '{filepath}' not found. Please ensure it's in the same directory.")
else:
    print(f"✅ Loading transcript: {filepath}")
    segments = parse_transcript_from_content(filepath)
    print(f"✅ Parsed {len(segments)} segments. Consolidated into Speaker A and Speaker B.")

    sentiment_data = analyze_sentiment_per_second(segments)

    print("\n=== SENTIMENT ANALYSIS PER SECOND ===\n")
    print("Second\tSpeaker A\tSpeaker B")
    print("-" * 40)

    # Print results for each second
    max_len = max(len(sentiment_data['A']), len(sentiment_data['B']))

    # The earliest segment start is the same for every row, so compute it once
    # instead of rescanning `segments` on each iteration. It is only evaluated
    # when there is at least one row, so an empty `segments` list with no rows
    # still prints just the table header without raising.
    first_sec = int(min(seg['start_sec'] for seg in segments)) if max_len else 0

    for i in range(max_len):
        sec = i + first_sec
        # A speaker with fewer rows than the other is shown as 'neutral'.
        a_sent = sentiment_data['A'][i][1] if i < len(sentiment_data['A']) else 'neutral'
        b_sent = sentiment_data['B'][i][1] if i < len(sentiment_data['B']) else 'neutral'
        print(f"{sec}\t{a_sent}\t\t{b_sent}")

✅ Loading transcript: DatasetCercetare/Transcription/Daemahni_Gianna.wav.txt
✅ Parsed 0 segments. Consolidated into Speaker A and Speaker B.
⚠️ No valid segments found. Check transcript format.

=== SENTIMENT ANALYSIS PER SECOND ===

Second	Speaker A	Speaker B
----------------------------------------
