In [44]:
from dotenv import load_dotenv
import os
from pathlib import Path
import boto3
import tempfile
import spacy
import sys

import spacy
import boto3
from transformers import pipeline, Wav2Vec2Processor, Wav2Vec2ForCTC
import random

# Make the project root importable so `src` resolves from inside the notebook.
# NOTE: assumes the kernel's working directory is the notebook's own folder and
# that src/ lives one level above it.
notebook_path = Path.cwd()  # Get current working directory
project_root = notebook_path.parent  # Assuming notebook is in app/ and src/ is at project root
# Keep the cell idempotent: re-running it must not keep appending the same entry.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
import importlib
# When the helper module is already loaded, reload it so that edits made to
# src/s3_utils.py are picked up without restarting the kernel.
if 'src.s3_utils' in sys.modules:
    importlib.reload(sys.modules['src.s3_utils'])

from src.s3_utils import list_audio_files, load_ner_model_from_s3, trigger_lambda, fetch_result_json

In [45]:
# Load environment variables
env_path = Path("../config/secrets.env")
# load_dotenv returns False when nothing was loaded (file missing or empty).
# Report that here instead of letting later cells run with empty settings.
if not load_dotenv(dotenv_path=env_path):
    print(f"WARNING: no environment variables were loaded from {env_path.resolve()}")
s3 = boto3.client('s3', region_name="us-east-1")

In [46]:
# Whitespace-stripped settings used by later cells; an unset variable becomes "".
bucket, prefix_data, prefix_model = (
    os.getenv(env_name, "").strip()
    for env_name in ("BUCKET_NAME", "BUCKET_PREFIX_DATA", "BUCKET_PREFIX_MODEL")
)

# Echo the raw (unstripped) values, quoted so stray whitespace is easy to spot.
print("Environment variables as loaded:")
print(f"BUCKET_NAME: '{os.getenv('BUCKET_NAME')}'")
print(f"BUCKET_PREFIX_DATA: '{os.getenv('BUCKET_PREFIX_DATA')}'")
print(f"BUCKET_PREFIX_MODEL: '{os.getenv('BUCKET_PREFIX_MODEL')}'")
# Same lookups with a fallback: prints 'default' only when the variable is unset.
print(f"Default data prefix value: '{os.getenv('BUCKET_PREFIX_DATA', 'default')}'")
print(f"Default model prefix value: '{os.getenv('BUCKET_PREFIX_MODEL', 'default')}'")

Environment variables as loaded:
BUCKET_NAME: 'cloud-engineer-team7'
BUCKET_PREFIX_DATA: 'LibriSpeech/'
BUCKET_PREFIX_MODEL: 'ner_custom_model/'
Default data prefix value: 'LibriSpeech/'
Default model prefix value: 'ner_custom_model/'


In [55]:
# Configuration read from the environment: unset variables become "" and
# surrounding whitespace is removed.
BUCKET, PREFIX_DATA, PREFIX_MODEL, lambda_func = map(
    lambda env_name: os.getenv(env_name, "").strip(),
    (
        "BUCKET_NAME",
        "BUCKET_PREFIX_DATA",
        "BUCKET_PREFIX_MODEL",
        "LAMBDA_FUNCTION_NAME",
    ),
)

In [56]:
# Display the configuration values read from the environment as one tuple.
PREFIX_DATA, PREFIX_MODEL, BUCKET, lambda_func

('LibriSpeech/',
 'ner_custom_model/',
 'cloud-engineer-team7',
 'transcribe-audio-lambda')

In [18]:
# List the object keys stored under the model prefix of the bucket
def list_files_in_bucket(bucket_name, prefix):
    """Return every object key in ``bucket_name`` that starts with ``prefix``.

    A single ``list_objects_v2`` call returns at most 1000 keys, so the listing
    is paginated to avoid silently truncating large prefixes.

    Args:
        bucket_name: Name of the S3 bucket to list.
        prefix: Key prefix used to filter the listing.

    Returns:
        A list of key strings in the order S3 returns them; an empty list when
        nothing matches.
    """
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")
    keys = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # Pages without matches carry no "Contents" entry at all.
        keys.extend(obj["Key"] for obj in page.get("Contents", []))
    return keys
# Show the keys under the model prefix (`bucket` / `prefix_model` are set in an earlier cell).
list_files_in_bucket(bucket, prefix_model)

['ner_custom_model/config.cfg',
 'ner_custom_model/meta.json',
 'ner_custom_model/ner/cfg',
 'ner_custom_model/ner/model',
 'ner_custom_model/ner/moves',
 'ner_custom_model/tokenizer',
 'ner_custom_model/vocab/key2row',
 'ner_custom_model/vocab/lookups.bin',
 'ner_custom_model/vocab/strings.json',
 'ner_custom_model/vocab/vectors',
 'ner_custom_model/vocab/vectors.cfg']

In [36]:
# List the audio objects under the data prefix and preview a few at random.
files = list_audio_files(bucket, prefix_data)
import random
# Cap the sample size: random.sample raises ValueError when asked for more
# items than the population holds (e.g. an empty or nearly empty prefix).
random.sample(files, min(5, len(files)))

['LibriSpeech/test-clean/2961/961/2961-961-0003.flac',
 'LibriSpeech/test-clean/1089/134691/1089-134691-0004.flac',
 'LibriSpeech/test-clean/1089/134691/1089-134691-0020.flac',
 'LibriSpeech/test-clean/2961/961/2961-961-0007.flac',
 'LibriSpeech/test-clean/260/123288/260-123288-0007.flac']

## Testing loading files from S3

In [4]:
def list_s3_audio(bucket, prefix):
    """Return the keys of all ``.flac`` / ``.wav`` objects under ``prefix``.

    The listing is paginated because a single ``list_objects_v2`` call returns
    at most 1000 keys, which would silently drop files from larger datasets.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Key prefix used to filter the listing.

    Returns:
        A list of matching key strings in the order S3 returns them; an empty
        list when nothing matches. The suffix check is case-sensitive.
    """
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")
    audio_suffixes = (".flac", ".wav")
    return [
        obj["Key"]
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix)
        for obj in page.get("Contents", [])  # pages without matches have no "Contents"
        if obj["Key"].endswith(audio_suffixes)
    ]

def download_from_s3(bucket, s3_key):
    """Download one S3 object to a new temporary file and return its path.

    The temporary file keeps the key's extension (e.g. ``.flac``) and is NOT
    deleted automatically; the caller is responsible for removing it.

    Args:
        bucket: Name of the S3 bucket holding the object.
        s3_key: Key of the object to download.

    Returns:
        Filesystem path (str) of the downloaded copy.

    Raises:
        Whatever ``download_file`` raises; the temporary file is removed first.
    """
    s3 = boto3.client("s3")
    # Only the unique path is needed. Close the handle immediately so that no
    # file descriptor is leaked and the path can be written while not held open.
    with tempfile.NamedTemporaryFile(delete=False, suffix=Path(s3_key).suffix) as tmp:
        local_path = tmp.name
    try:
        s3.download_file(bucket, s3_key, local_path)
    except Exception:
        # Do not leave an orphaned placeholder behind when the download fails.
        try:
            Path(local_path).unlink()
        except FileNotFoundError:
            pass  # already gone; nothing to clean up
        raise
    return local_path

In [17]:
# Preview the first five audio keys found under the data prefix.
list_s3_audio(bucket, prefix_data)[:5]

['LibriSpeech/test-clean/1089/134686/1089-134686-0000.flac',
 'LibriSpeech/test-clean/1089/134686/1089-134686-0001.flac',
 'LibriSpeech/test-clean/1089/134686/1089-134686-0002.flac',
 'LibriSpeech/test-clean/1089/134686/1089-134686-0003.flac',
 'LibriSpeech/test-clean/1089/134686/1089-134686-0004.flac']

## Test NER model from S3

In [None]:
# Helper that invokes the audio-processing Lambda for a single S3 object
import boto3
import json

def trigger_lambda(bucket, key, model=None, lambda_function_name="AudioProcessorLambda"):
    """Synchronously invoke the audio-processing Lambda for one S3 object.

    Args:
        bucket: Name of the S3 bucket holding the audio object.
        key: Key of the audio object to process.
        model: Optional model name forwarded to the Lambda. When ``None`` the
            payload contains only ``bucket`` and ``key``.
        lambda_function_name: Name of the Lambda function to invoke. Defaults
            to ``"AudioProcessorLambda"``.

    Returns:
        The Lambda's response payload decoded from JSON.
    """
    lambda_client = boto3.client('lambda')
    payload = {
        "bucket": bucket,
        "key": key
    }
    if model is not None:
        # NOTE(review): the payload field name "model" is an assumption --
        # confirm it against the Lambda handler.
        payload["model"] = model
    response = lambda_client.invoke(
        FunctionName=lambda_function_name,
        InvocationType='RequestResponse',  # wait for the function's result
        Payload=json.dumps(payload)
    )
    # response['Payload'] is a file-like stream, so decode it with json.load.
    return json.load(response['Payload'])

In [20]:
def load_ner_model_from_s3(bucket_name: str, prefix: str):
    """
    Downloads a spaCy NER model folder from S3 and loads it into memory.

    Every object under ``prefix`` is mirrored into a fresh temporary directory
    (keeping the key layout relative to ``prefix``) and the result is passed to
    ``spacy.load``.

    Args:
        bucket_name: Name of the S3 bucket holding the model.
        prefix: Key prefix of the model folder, e.g. ``"ner_custom_model/"``.

    Returns:
        A ``(nlp, temp_dir)`` tuple: the loaded pipeline and the
        ``tempfile.TemporaryDirectory`` holding the downloaded files. The
        caller should call ``temp_dir.cleanup()`` when the files are no longer
        needed.

    Raises:
        Any error from the S3 download or ``spacy.load``; the temporary
        directory is removed before the error propagates.
    """
    s3 = boto3.client('s3')
    temp_dir = tempfile.TemporaryDirectory()
    local_model_path = Path(temp_dir.name) / "ner_model"

    try:
        # Recursively download all files in the prefix
        paginator = s3.get_paginator('list_objects_v2')
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
            for obj in page.get('Contents', []):
                key = obj['Key']
                # Skip "folder" placeholder objects (keys ending in "/"). A
                # placeholder equal to the prefix would be written as a plain
                # file where the model directory belongs and break every
                # following download.
                if key.endswith('/'):
                    continue
                print(key)
                rel_path = Path(key).relative_to(prefix)
                local_file_path = local_model_path / rel_path
                local_file_path.parent.mkdir(parents=True, exist_ok=True)
                s3.download_file(bucket_name, key, str(local_file_path))

        print(f"Model downloaded to {local_model_path}")
        nlp = spacy.load(str(local_model_path))
    except Exception:
        # Nobody receives the handle on failure, so release the files here.
        temp_dir.cleanup()
        raise
    return nlp, temp_dir  # Keep temp_dir alive

def test_ner_model(nlp):
    text = "Apple is looking to buy a startup in San Francisco"
    doc = nlp(text)
    for ent in doc.ents:
        print(f"{ent.text}: {ent.label_}")


#### Sample Test

In [23]:
# Smoke-test the NER model stored under this bucket/prefix.
bucket = "cloud-engineer-team7"
prefix = "ner_custom_model/"

nlp_model, tmp = load_ner_model_from_s3(bucket, prefix)
try:
    test_ner_model(nlp_model)
finally:
    # Always remove the downloaded model files, even when the smoke test raises.
    tmp.cleanup()

ner_custom_model/config.cfg
ner_custom_model/meta.json
ner_custom_model/ner/cfg
ner_custom_model/ner/model
ner_custom_model/ner/moves
ner_custom_model/tokenizer
ner_custom_model/vocab/key2row
ner_custom_model/vocab/lookups.bin
ner_custom_model/vocab/strings.json
ner_custom_model/vocab/vectors
ner_custom_model/vocab/vectors.cfg
Model downloaded to /var/folders/m1/rwxd3g8d2zl60slvhbm1p89w0000gn/T/tmpn1or3v_f/ner_model
Apple: ORG
San Francisco: GPE


In [None]:
# Load the model again under the name `ner_model`. `tmp` is rebound to the second
# value returned by the loader; presumably the temporary-directory handle that
# should be cleaned up once the files are no longer needed -- TODO confirm.
ner_model, tmp = load_ner_model_from_s3(bucket, prefix)


In [51]:
# Peek at the first entry of `files` (the audio listing built in an earlier cell).
files[0]

'LibriSpeech/test-clean/1089/134686/1089-134686-0000.flac'

In [59]:
# Invoke the Lambda for one hard-coded LibriSpeech file and show its response.
# NOTE(review): this call relies on a `trigger_lambda` that accepts the `model` and
# `lambda_function_name` keyword arguments -- confirm the definition currently in
# scope supports them.
lambda_response = trigger_lambda(
    bucket="cloud-engineer-team7",
    key="LibriSpeech/test-clean/2830/3980/2830-3980-0056.flac",
    model="Whisper",
    lambda_function_name=lambda_func
)
lambda_response

{'status': 'success',
 'model_used': 'Whisper',
 'transcript': ' Otherwise, Paul should have written, grace from God the Father, and peace from our Lord Jesus Christ.'}