In [None]:
import re
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tree import Tree

In [None]:
# Fetch the NLTK data packages this notebook relies on. The category noted for
# each package is the directory it is unpacked into under the NLTK data folder.
# Tokenizer data (tokenizers/) — presumably what word_tokenize loads.
nltk.download('punkt')
# Named-entity chunker model (chunkers/) — presumably what ne_chunk loads.
nltk.download('maxent_ne_chunker')
# Word list corpus (corpora/).
nltk.download('words')
# Second tokenizer package (tokenizers/); NOTE(review): looks like the variant
# newer NLTK releases ask for — confirm against the installed version.
nltk.download('punkt_tab')
# Part-of-speech tagger model (taggers/) — presumably what pos_tag loads.
nltk.download('averaged_perceptron_tagger_eng')
# Second chunker package (chunkers/); being the cell's last expression, its
# return value is what the notebook displays as the cell result.
nltk.download('maxent_ne_chunker_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.


True

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the transcript paths used by main()
# are located under this mount point, so this must run before main().
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import re
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tree import Tree

# Download necessary NLTK datasets if not already present.
# Both the legacy package names and the newer "*_tab" / "*_eng" names are
# requested so that this cell works on a fresh kernel by itself, without
# relying on an earlier cell having fetched the newer packages.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')

# Reads the transcript file into a list of lines
def read_transcript(file_path, encoding="utf-8"):
    """Read a transcript file and return its lines.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        list[str]: The file's lines as produced by ``readlines`` (each line
        keeps its trailing newline when it has one).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents cannot be decoded with ``encoding``.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.readlines()

# Extracts named entities (Person names) using NLTK
def extract_named_entities(lines):
    """Collect the PERSON entities mentioned in the spoken text of a transcript.

    Only lines of the form ``SpeakerN: text`` are examined; the text after the
    label is tokenized, POS-tagged and chunked with NLTK, and every chunk
    labelled ``PERSON`` is recorded.

    Args:
        lines: Iterable of transcript lines.

    Returns:
        list[str]: Unique person names in order of first appearance.
        Multi-token names are joined with single spaces. The order is
        deterministic, so callers that pick the first matching name get the
        same result on every run.
    """
    speaker_line = re.compile(r"(Speaker\d+): (.+)")

    # A dict de-duplicates like a set but keeps insertion order, which avoids
    # the run-to-run ordering differences of iterating a set of strings.
    found = {}
    for line in lines:
        match = speaker_line.match(line)
        if not match:
            continue
        text = match.group(2)  # only the text after "SpeakerX:"

        chunks = ne_chunk(pos_tag(word_tokenize(text)))
        for chunk in chunks:
            # Entities are Tree nodes; plain (token, tag) tuples are ordinary words.
            if isinstance(chunk, Tree) and chunk.label() == "PERSON":
                name = " ".join(leaf[0] for leaf in chunk)  # combine multi-word names
                found[name] = None
    return list(found)

# Filters extracted named entities to associate speaker labels
def filter_speakers(lines, named_entities):
    """Map each ``SpeakerN`` label in a transcript to a display name.

    Lines are scanned in order. For every line of the form ``SpeakerN: text``
    the first rule below that yields a not-yet-claimed name decides the
    speaker's name:

    1. A self-introduction, ``my name is <Name>`` (one or two capitalised
       words).
    2. A confirmation, ``<First> <Last>, that's me``.
    3. Heuristic fallback: the first entry of ``named_entities`` that occurs
       as a whole word in the line. This is only tried while the speaker has
       no real name yet, so it never replaces an established name.
    4. If the speaker is still unmapped, a placeholder
       ``Unknown Speaker <k>`` is assigned; a later line may replace it.

    Only the trigger phrases are matched case-insensitively; the captured name
    itself must be capitalised, so ordinary lowercase words following the
    phrase are not mistaken for part of the name.

    Args:
        lines: Iterable of transcript lines.
        named_entities: Candidate person names; the first match wins, so the
            order of this sequence matters.

    Returns:
        dict[str, str]: Mapping from speaker label to assigned name or
        placeholder. Each real name is assigned to at most one speaker.
    """
    speaker_line = re.compile(r"(Speaker\d+): (.+)")
    # The scoped (?i:...) flag limits case-insensitivity to the fixed phrase.
    intro_patterns = (
        re.compile(r"(?i:my name is) ([A-Z][a-z]+(?: [A-Z][a-z]+)?)"),
        re.compile(r"([A-Z][a-z]+ [A-Z][a-z]+),? (?i:that'?s me)"),
    )

    speaker_mapping = {}
    seen_names = set()      # names already given to some speaker
    named_speakers = set()  # speakers holding a real name, not a placeholder

    for line in lines:
        match = speaker_line.match(line)
        if not match:
            continue
        speaker, text = match.groups()

        resolved_name = None

        # Explicit introductions are the strongest evidence and are always tried.
        for pattern in intro_patterns:
            intro = pattern.search(text)
            if intro and intro.group(1) not in seen_names:
                resolved_name = intro.group(1)
                break

        # Fall back to entity mentions only while the speaker is still unnamed.
        if resolved_name is None and speaker not in named_speakers:
            for name in named_entities:
                if not name or name in seen_names:
                    continue
                # Whole-word match so a short name is not found inside a longer word.
                if re.search(rf"(?<!\w){re.escape(name)}(?!\w)", text):
                    resolved_name = name
                    break

        if resolved_name is not None:
            speaker_mapping[speaker] = resolved_name
            seen_names.add(resolved_name)
            named_speakers.add(speaker)
        elif speaker not in speaker_mapping:
            # No name found by any method: assign a numbered placeholder.
            speaker_mapping[speaker] = f"Unknown Speaker {len(speaker_mapping) + 1}"

    return speaker_mapping

# Replaces SpeakerX labels with corresponding names
def replace_speaker_labels(lines, speaker_mapping):
    """Return a copy of ``lines`` with mapped speaker labels swapped for names.

    A line is rewritten only when it starts with ``SpeakerN: <text>`` and the
    label is a key of ``speaker_mapping``; the rewritten line is
    ``<name>: <text>`` followed by a newline. Every other line (blank lines,
    unlabelled text, unmapped labels) is passed through unchanged.

    Args:
        lines: Iterable of transcript lines.
        speaker_mapping: Mapping from speaker label to replacement name.

    Returns:
        list[str]: The relabelled lines, in the original order.
    """
    label_pattern = re.compile(r"(Speaker\d+): (.+)")

    def relabel(raw_line):
        # Lines without a leading speaker label are kept verbatim.
        parsed = label_pattern.match(raw_line)
        if parsed is None:
            return raw_line
        label, utterance = parsed.groups()
        # Labels with no known replacement are kept verbatim as well.
        if label not in speaker_mapping:
            return raw_line
        return f"{speaker_mapping[label]}: {utterance}\n"

    return [relabel(raw_line) for raw_line in lines]

# Writes the updated transcript lines to the output file
def write_transcript(output_path, updated_lines, encoding="utf-8"):
    """Write transcript lines to a text file, replacing any existing content.

    Args:
        output_path: Path of the file to create or overwrite.
        updated_lines: Iterable of strings written back to back; no line
            separators are added, so each entry should carry its own newline.
        encoding: Text encoding used to encode the file. Defaults to UTF-8 so
            the output does not depend on the platform's locale encoding.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    with open(output_path, "w", encoding=encoding) as file:
        file.writelines(updated_lines)

def main(
    input_path="/content/drive/My Drive/Research/RGU/ner-transcript.txt",
    output_path="/content/drive/My Drive/Research/RGU/updated_transcript.txt",
):
    """Run the full relabelling workflow on one transcript file.

    Reads the transcript, extracts person names, maps speaker labels to
    names, rewrites the labels and writes the result to ``output_path``.

    Args:
        input_path: Transcript file to read. Defaults to the transcript in the
            project's Google Drive folder.
        output_path: File the relabelled transcript is written to. Defaults to
            a file in the same Drive folder as the input.
    """
    # Workflow
    lines = read_transcript(input_path)
    named_entities = extract_named_entities(lines)
    speaker_mapping = filter_speakers(lines, named_entities)
    updated_lines = replace_speaker_labels(lines, speaker_mapping)
    write_transcript(output_path, updated_lines)

    print(f"Updated transcript written to {output_path}")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


Updated transcript written to /content/drive/My Drive/Research/RGU/updated_transcript3.txt
