In [3]:
def _parse_segment_line(line, line_number):
    """Parse one ``start=<sec>s stop=<sec>s <speaker>`` line into ``[start, stop, speaker]``.

    Raises:
        ValueError: if the line has fewer than three whitespace-separated
            fields or a time field cannot be read as a number.
    """
    parts = line.split()
    if len(parts) < 3:
        raise ValueError(
            f"line {line_number}: expected 'start=<sec>s stop=<sec>s <speaker>', got {line!r}"
        )
    try:
        # rstrip("s") drops the unit suffix without eating a digit when it is absent.
        start_time = float(parts[0].split("=")[1].rstrip("s"))
        stop_time = float(parts[1].split("=")[1].rstrip("s"))
    except (IndexError, ValueError) as exc:
        raise ValueError(f"line {line_number}: cannot parse times in {line!r}") from exc
    return [start_time, stop_time, parts[2]]


def process_speaker_segments(input_file, output_file, max_gap=0.1, min_duration=1.0):
    """Merge, filter and re-merge speaker segments of a diarization file.

    The input is read line by line; each non-blank line must look like
    ``start=<sec>s stop=<sec>s <speaker>``. Processing steps:

    1. Skip blank lines and zero-length segments (start == stop).
    2. Merge a segment into the previous one when the speaker is the same and
       the distance between the previous stop and the new start is at most
       ``max_gap`` seconds.
    3. Drop segments whose duration is not greater than ``min_duration``.
    4. Merge neighbouring segments of the same speaker that became adjacent
       after filtering (regardless of the gap between them).

    The result is written to ``output_file`` as
    ``start=<sec>s stop=<sec>s <speaker>`` with one decimal place, and a
    confirmation message is printed.

    Args:
        input_file: path of the diarization file to read.
        output_file: path of the file to write.
        max_gap: largest gap, in seconds, bridged in step 2.
        min_duration: duration, in seconds, a segment must exceed in step 3.

    Raises:
        ValueError: if a non-blank line cannot be parsed.
    """
    # Differences are rounded before comparison so binary float noise
    # (3.9 - 3.8 == 0.10000000000000009) cannot flip a threshold decision.
    precision = 6

    with open(input_file, "r") as file:
        lines = file.readlines()

    merged_segments = []
    for line_number, line in enumerate(lines, start=1):
        line = line.strip()
        if not line:
            continue  # blank lines carry no segment

        start_time, stop_time, speaker = _parse_segment_line(line, line_number)

        # Zero-length segments are discarded
        if start_time == stop_time:
            continue

        previous = merged_segments[-1] if merged_segments else None
        if (
            previous is not None
            and previous[2] == speaker
            and round(abs(previous[1] - start_time), precision) <= max_gap
        ):
            # Same speaker, (almost) contiguous: extend, never shrink
            previous[1] = max(previous[1], stop_time)
        else:
            merged_segments.append([start_time, stop_time, speaker])

    # Keep only segments strictly longer than min_duration
    filtered_segments = [
        seg for seg in merged_segments if round(seg[1] - seg[0], precision) > min_duration
    ]

    # Removing short segments can leave the same speaker on both sides of a
    # hole; join those neighbours into one interval.
    final_segments = []
    for seg in filtered_segments:
        if final_segments and final_segments[-1][2] == seg[2]:
            final_segments[-1][1] = max(final_segments[-1][1], seg[1])
        else:
            final_segments.append(list(seg))

    # Save the result
    with open(output_file, "w") as file:
        for start, stop, speaker in final_segments:
            file.write(f"start={start:.1f}s stop={stop:.1f}s {speaker}\n")

    print(f"Обработанные интервалы сохранены в {output_file}")

# Input/output locations, relative to the notebook's working directory
input_path = "results/diarization/interview_2.txt"
output_path = "results/diarization/interview_2_merged.txt"

# Merge the raw segments, then echo the written file for a quick visual check
process_speaker_segments(input_file=input_path, output_file=output_path)
with open(output_path, "r") as file:
    merged_text = file.read()
print(merged_text)


Обработанные интервалы сохранены в results/diarization/interview_2_merged.txt
start=0.0s stop=3.8s speaker_SPEAKER_01
start=3.8s stop=5.5s speaker_SPEAKER_00
start=5.4s stop=10.0s speaker_SPEAKER_01
start=10.3s stop=14.3s speaker_SPEAKER_03
start=14.8s stop=24.7s speaker_SPEAKER_01
start=28.2s stop=50.5s speaker_SPEAKER_02
start=51.1s stop=127.8s speaker_SPEAKER_00
start=127.1s stop=131.8s speaker_SPEAKER_02
start=134.2s stop=156.4s speaker_SPEAKER_00
start=160.1s stop=171.5s speaker_SPEAKER_02
start=172.0s stop=204.9s speaker_SPEAKER_00
start=204.0s stop=207.0s speaker_SPEAKER_02
start=207.3s stop=208.7s speaker_SPEAKER_00
start=208.8s stop=217.7s speaker_SPEAKER_02
start=218.3s stop=239.1s speaker_SPEAKER_00
start=241.5s stop=246.9s speaker_SPEAKER_02
start=247.9s stop=261.0s speaker_SPEAKER_00
start=263.0s stop=271.7s speaker_SPEAKER_02
start=271.6s stop=298.6s speaker_SPEAKER_00
start=300.2s stop=313.0s speaker_SPEAKER_02
start=314.0s stop=362.2s speaker_SPEAKER_00
start=364.3s sto

In [4]:
import re

def convert_seconds_to_timestamp(seconds):
    """Format a number of seconds as ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond first and then split into
    hours, minutes, seconds and milliseconds. Negative values are not handled
    specially.

    Args:
        seconds: time offset in seconds (int or float).

    Returns:
        The timestamp string, e.g. ``"00:00:03,800"`` for ``3.8``.
    """
    # Work in whole milliseconds. Truncating (seconds % 1) * 1000 instead
    # loses a millisecond to float noise: 3.8 would yield 799, 24.7 -> 699.
    total_millisec = int(round(seconds * 1000))
    total_sec, millisec = divmod(total_millisec, 1000)
    total_min, sec = divmod(total_sec, 60)
    hours, minutes = divmod(total_min, 60)
    return f"{hours:02}:{minutes:02}:{sec:02},{millisec:03}"

def convert_diarization_file(input_path, output_path):
    """
    Rewrite a diarization file with timestamps in HH:MM:SS,SSS form.

    Every input line shaped like ``start=<sec>s stop=<sec>s speaker_SPEAKER_<n>``
    becomes ``<start> - <stop> - <speaker>`` in the output file; lines that do
    not match that shape are left out. A confirmation message is printed once
    the file has been written.
    """
    segment_pattern = r"start=([\d\.]+)s stop=([\d\.]+)s (speaker_SPEAKER_\d+)"

    with open(input_path, "r") as source:
        raw_lines = source.readlines()

    rendered = []
    for raw in raw_lines:
        found = re.match(segment_pattern, raw.strip())
        if not found:
            continue  # not a segment line

        start_text, stop_text, speaker = found.groups()
        start_seconds, stop_seconds = float(start_text), float(stop_text)
        start_stamp = convert_seconds_to_timestamp(start_seconds)
        stop_stamp = convert_seconds_to_timestamp(stop_seconds)
        rendered.append(f"{start_stamp} - {stop_stamp} - {speaker}")

    # Write the converted lines, one per row
    with open(output_path, "w") as sink:
        sink.writelines(entry + "\n" for entry in rendered)

    print(f"Файл с преобразованным временем сохранён в {output_path}")

# Source (merged segments) and destination (timestamp-formatted) files
input_file = "results/diarization/interview_2_merged.txt"
output_file = "results/diarization/interview_2_converted.txt"

# Convert the merged segments to HH:MM:SS,SSS timestamps
convert_diarization_file(input_path=input_file, output_path=output_file)



Файл с преобразованным временем сохранён в results/diarization/interview_2_converted.txt
