<a href="https://colab.research.google.com/github/Abraham2025-debug/HRNET/blob/main/Annotation_tool_Streamlit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.43.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.43.2-py2.py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[

In [2]:
!pip install SpeechRecognition

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.14.1-py3-none-any.whl.metadata (31 kB)
Downloading SpeechRecognition-3.14.1-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.1


In [4]:
%%writefile app.py
import streamlit as st

# Minimal placeholder page: a title followed by one line of text.
st.title("Hello, Streamlit!")
st.write("This is a test app.")

Writing app.py


In [3]:
import streamlit as st
import json
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import cv2
import numpy as np
from PIL import Image
import moviepy.editor as mp
import speech_recognition as sr

# JSON file (relative to the working directory) in which annotations are stored.
ANNOTATION_FILE = "annotations.json"

# First run: seed the store with an empty JSON object so later reads succeed.
if not os.path.exists(ANNOTATION_FILE):
    with open(ANNOTATION_FILE, "w") as f:
        f.write(json.dumps({}))

# Load JSON annotations
def load_annotations():
    """Read the annotation store from ``ANNOTATION_FILE``.

    Returns:
        dict: The parsed JSON content, or an empty dict when the file does
        not exist (for example if it was removed after start-up).

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid
            JSON. This is deliberately not swallowed, so a damaged store is
            noticed instead of being silently replaced by an empty one.
    """
    try:
        # Explicit encoding so the result does not depend on the platform default.
        with open(ANNOTATION_FILE, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {}

# Save JSON annotations
def save_annotations(data):
    """Overwrite ``ANNOTATION_FILE`` with ``data`` as JSON (4-space indent).

    Args:
        data: JSON-serialisable object, normally the dict of all annotations.

    Raises:
        TypeError: If ``data`` contains values JSON cannot represent. The
            existing file is left untouched in that case.
    """
    # Serialise before opening: opening with "w" truncates the file, so a
    # serialisation error after that point would wipe the existing store.
    serialised = json.dumps(data, indent=4)
    with open(ANNOTATION_FILE, "w", encoding="utf-8") as f:
        f.write(serialised)

# Page header and step 1: the two video uploads (context first, then utterance).
st.title("Multimodal Annotation Tool")
st.subheader("Step 1: Upload Videos")
st.info("Please upload the context video first, followed by the utterance video.")
context_video = st.file_uploader(
    "Upload Context Video",
    type=["mp4", "avi"],
    key="context_video",
)
utterance_video = st.file_uploader(
    "Upload Utterance Video",
    type=["mp4", "avi"],
    key="utterance_video",
)

# Step 2: the remaining modalities (audio, transcript, extracted frames).
st.subheader("Step 2: Upload Additional Modalities")
audio_file = st.file_uploader(
    "Upload Audio",
    type=["wav", "mp3"],
)
text_file = st.file_uploader(
    "Upload Transcript",
    type=["txt"],
)
extracted_frames = st.file_uploader(
    "Upload Extracted Frames (ZIP or images)",
    accept_multiple_files=True,
)

# Extract Audio from Video
def extract_audio(video_path, output_audio_path):
    """Write the audio track of a video file to ``output_audio_path``.

    Args:
        video_path: Path of the video file to read.
        output_audio_path: Destination path for the extracted audio.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = mp.VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError(f"No audio track found in video: {video_path}")
        video.audio.write_audiofile(output_audio_path)
    finally:
        # Always release the reader the clip holds, even when extraction fails.
        video.close()

# Transcribe audio to text
def transcribe_audio(audio_path):
    """Transcribe an audio file with ``Recognizer.recognize_google``.

    Args:
        audio_path: Path of the audio file handed to ``sr.AudioFile``.

    Returns:
        str: The transcript, or a fixed fallback message when the recogniser
        raises ``sr.UnknownValueError`` or ``sr.RequestError``.
    """
    engine = sr.Recognizer()
    with sr.AudioFile(audio_path) as audio_source:
        captured = engine.record(audio_source)
        try:
            text = engine.recognize_google(captured)
        except sr.UnknownValueError:
            text = "Could not understand audio."
        except sr.RequestError:
            text = "Error connecting to speech recognition service."
    return text

# Process audio features
def process_audio_features(audio_path):
    """Locate coarse prosodic events in an audio file.

    Per-frame pitch comes from ``librosa.piptrack`` and per-frame energy from
    ``librosa.feature.rms``. Frames are flagged with simple thresholds relative
    to the mean, and their frame indices are converted to seconds.

    Args:
        audio_path: Path of the audio file passed to ``librosa.load``.

    Returns:
        dict: Four lists of timestamps in seconds:
            ``pitch_change_times`` - frame-to-frame pitch jump above 1.5x the mean jump;
            ``stress_times`` - energy above 1.5x the mean energy;
            ``stretch_times`` - pitch and energy both above their means;
            ``silence_times`` - energy below 0.5x the mean energy.
    """
    # One hop size for every frame-based step so indices from the pitch and
    # energy tracks refer to the same instants. 512 is what the original code
    # assumed when converting frames to time.
    hop_length = 512

    # Named `sample_rate` rather than `sr` so the module-level
    # `speech_recognition as sr` alias is not shadowed.
    y, sample_rate = librosa.load(audio_path)

    # piptrack returns candidate pitches per frequency bin together with their
    # magnitudes. The frame's pitch is the candidate with the largest
    # magnitude; taking the maximum of `pitches` would instead select the
    # highest-frequency candidate regardless of its strength.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sample_rate, hop_length=hop_length)
    strongest_bins = np.argmax(magnitudes, axis=0)
    pitch_values = pitches[strongest_bins, np.arange(pitches.shape[1])]

    # Energy (RMS amplitude per frame)
    energy = librosa.feature.rms(y=y, hop_length=hop_length)[0]
    mean_energy = np.mean(energy)

    # Silence (unusual pauses): well below the average energy
    silent_frames = np.where(energy < mean_energy * 0.5)[0]

    # Pitch changes: jumps clearly larger than the average jump.
    # A track with fewer than two frames has no jumps at all.
    pitch_diff = np.abs(np.diff(pitch_values))
    if pitch_diff.size:
        pitch_change_indices = np.where(pitch_diff > np.mean(pitch_diff) * 1.5)[0]
    else:
        pitch_change_indices = np.array([], dtype=int)

    # Stress: well above the average energy
    stress_indices = np.where(energy > mean_energy * 1.5)[0]

    # Word stretching: above-average pitch together with above-average energy
    stretched_indices = np.where((pitch_values > np.mean(pitch_values)) & (energy > mean_energy))[0]

    def to_times(indices):
        """Convert frame indices to a plain list of seconds."""
        return librosa.frames_to_time(indices, sr=sample_rate, hop_length=hop_length).tolist()

    return {
        "pitch_change_times": to_times(pitch_change_indices),
        "stress_times": to_times(stress_indices),
        "stretch_times": to_times(stretched_indices),
        "silence_times": to_times(silent_frames),
    }

# Display video
def display_video(video_file):
    """Render ``video_file`` on the page by passing it to ``st.video``."""
    st.video(video_file)

# Display audio waveform and spectrogram
def display_audio(audio_file):
    """Plot the waveform and a log-frequency spectrogram of an audio file.

    Args:
        audio_file: Path (or file-like object) passed to ``librosa.load``;
            loaded with ``sr=None`` so the native sample rate is kept.

    The figure is handed to ``st.pyplot`` and then closed.
    """
    # `sample_rate` rather than `sr`, to avoid shadowing the module-level
    # `speech_recognition as sr` alias.
    y, sample_rate = librosa.load(audio_file, sr=None)
    fig, ax = plt.subplots(2, 1, figsize=(8, 4))
    try:
        librosa.display.waveshow(y, sr=sample_rate, ax=ax[0])
        ax[0].set(title="Waveform")

        # The STFT is complex-valued; take its magnitude before converting to dB.
        spec = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
        img = librosa.display.specshow(spec, sr=sample_rate, x_axis='time', y_axis='log', ax=ax[1])
        ax[1].set(title="Spectrogram")
        fig.colorbar(img, ax=ax[1])
        # Keep the two stacked panels' titles and tick labels from overlapping.
        fig.tight_layout()
        st.pyplot(fig)
    finally:
        # The script re-runs on every interaction; unclosed figures would
        # otherwise accumulate in pyplot's figure registry.
        plt.close(fig)

# Display extracted frames with metadata
def display_frames(frames):
    """Show each frame as an image captioned with its file name.

    Args:
        frames: Iterable of image sources. Each item may be a filesystem path
            or a file-like object exposing a ``name`` attribute (as uploaded
            files do).

    Items that cannot be opened as an image (for example a ZIP archive) are
    reported with a warning and skipped, so the rest of the page still renders.
    """
    for frame in frames:
        # Uploaded files are file-like objects, not paths: read their `name`
        # attribute, falling back to the item itself when it is a plain path.
        label = os.path.basename(str(getattr(frame, "name", frame)))
        try:
            image = Image.open(frame)
        except OSError:
            # Pillow's "cannot identify image file" error is an OSError subclass.
            st.warning(f"Skipping '{label}': not a readable image file.")
            continue
        st.image(image, caption=label)

# Show uploaded modalities: each section is rendered only once its file(s) exist.
if context_video:
    st.subheader("Context Video")
    display_video(context_video)
if utterance_video:
    st.subheader("Utterance Video")
    display_video(utterance_video)
if audio_file:
    st.subheader("Audio Features and Transcription")
    # Keep the upload's real extension (the uploader accepts wav and mp3) so
    # the file on disk is not mislabelled as WAV.
    audio_suffix = os.path.splitext(audio_file.name)[1].lower() or ".wav"
    extracted_audio_path = "extracted_audio" + audio_suffix
    with open(extracted_audio_path, "wb") as f:
        # getvalue() returns the whole upload regardless of the buffer's
        # current read position, unlike read().
        f.write(audio_file.getvalue())
    audio_features = process_audio_features(extracted_audio_path)
    st.write("Detected Aural Patterns:")
    st.json(audio_features)
    display_audio(extracted_audio_path)
    st.subheader("Transcription")
    try:
        transcript = transcribe_audio(extracted_audio_path)
    except ValueError:
        # speech_recognition's AudioFile raises ValueError for formats it
        # cannot read (it handles WAV/AIFF/FLAC, not MP3).
        st.warning("Transcription is only available for WAV, AIFF or FLAC audio.")
    else:
        st.write(transcript)
if extracted_frames:
    st.subheader("Extracted Frames")
    display_frames(extracted_frames)

# Annotation form: free-text timestamp, optional face index, and the note itself.
st.subheader("Annotate")
timestamp = st.text_input("Timestamp (e.g., 00:00:12.345)")
selected_face = st.text_input("Face Index (if multiple)")
annotation_text = st.text_area("Annotation")

if st.button("Save Annotation"):
    # Annotations are keyed by timestamp, so an empty or whitespace-only value
    # would create an entry under the key "" (and overwrite any earlier one).
    timestamp_key = timestamp.strip()
    if not timestamp_key:
        st.error("Please enter a timestamp before saving the annotation.")
    else:
        annotations = load_annotations()
        annotation_entry = {
            "timestamp": timestamp_key,
            "face_index": selected_face,
            "annotation": annotation_text
        }
        # Saving again with the same timestamp replaces the previous entry.
        annotations[timestamp_key] = annotation_entry
        save_annotations(annotations)
        st.success("Annotation saved!")

# List everything currently stored, re-read from disk so a just-saved entry shows up.
st.subheader("Existing Annotations")
annotations = load_annotations()
if not annotations:
    st.write("No annotations yet.")
else:
    st.json(annotations)


  if event.key is 'enter':

2025-03-19 06:04:56.670 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-03-19 06:04:56.742 Session state does not function when running a script without `streamlit run`


In [5]:
pip freeze > requirements.txt