<a href="https://colab.research.google.com/github/x1001000/raspberrypi3-yamnet-sed/blob/main/colab_notebooks/%E9%9B%A8%E8%81%B2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Run after a runtime reset (fresh VM)

## install

In [None]:
!pip install soundfile
!pip install git+https://github.com/nficano/pytube
!pip install pydub

Collecting git+https://github.com/nficano/pytube
  Cloning https://github.com/nficano/pytube to /tmp/pip-req-build-5bcugynn
  Running command git clone -q https://github.com/nficano/pytube /tmp/pip-req-build-5bcugynn
Building wheels for collected packages: pytube
  Building wheel for pytube (setup.py) ... [?25l[?25hdone
  Created wheel for pytube: filename=pytube-10.7.1-cp37-none-any.whl size=42890 sha256=b9120f4c00d3c7c28cd02a8cc0e644c45a911a1bec91b92c8ba15a308c41f397
  Stored in directory: /tmp/pip-ephem-wheel-cache-9wq9r2tz/wheels/44/da/40/3b5e03abe33a91895343814fb44b309512375408f4a909555b
Successfully built pytube
Installing collected packages: pytube
Successfully installed pytube-10.7.1
Collecting pydub
  Downloading https://files.pythonhosted.org/packages/a6/53/d78dc063216e62fc55f6b2eebb447f6a4b0a59f55c8406376f76bf959b08/pydub-0.25.1-py2.py3-none-any.whl
Installing collected packages: pydub
Successfully installed pydub-0.25.1


## download YAMNet (15M bytes)

In [None]:
# !curl -O https://storage.googleapis.com/audioset/yamnet.h5
# !git clone https://github.com/tensorflow/models
# !cp models/research/audioset/yamnet/* .
!git clone https://github.com/x1001000/raspberrypi3-yamnet-sed
!cp raspberrypi3-yamnet-sed/yamnet/* .

Cloning into 'raspberrypi3-yamnet-sed'...
remote: Enumerating objects: 352, done.[K
remote: Counting objects: 100% (90/90), done.[K
remote: Compressing objects: 100% (81/81), done.[K
remote: Total 352 (delta 48), reused 20 (delta 9), pack-reused 262[K
Receiving objects: 100% (352/352), 52.03 MiB | 25.42 MiB/s, done.
Resolving deltas: 100% (200/200), done.


# Run after a runtime restart

## import

In [None]:
import numpy as np
import resampy
import soundfile as sf
import tensorflow as tf

import params as yamnet_params
import yamnet as yamnet_model

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import models

from pytube import Playlist, YouTube
from pydub import AudioSegment

from IPython.display import display, Audio
from time import sleep

## load YAMNet (3.7M params)

In [None]:
# Build the YAMNet frames model from its default parameters and load the weights file.
params = yamnet_params.Params()
yamnet = yamnet_model.yamnet_frames_model(params)
yamnet.load_weights('yamnet.h5')

# Class display names read from the zh-TW class map, with one extra label
# appended as the last entry.
yamnet_classes = np.append(
    yamnet_model.class_names('yamnet_class_map_zh-tw.csv'),
    '台灣垃圾車',
)

yamnet.summary()

Model: "yamnet_frames"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
tf.compat.v1.shape (TFOpLambda) (1,)                 0           input_1[0][0]                    
__________________________________________________________________________________________________
tf.__operators__.getitem (Slici ()                   0           tf.compat.v1.shape[0][0]         
__________________________________________________________________________________________________
tf.math.maximum_1 (TFOpLambda)  ()                   0           tf.__operators__.getitem[0][0]   
______________________________________________________________________________________

## def data_from_YouTube(url)

In [None]:
def data_from_YouTube(url):
    """Download the audio of one YouTube video and return it as a YAMNet-ready waveform.

    The audio-only stream is downloaded into the current working directory,
    exported next to it as ``<downloaded file>.wav``, read back as int16
    samples, scaled to floats, mixed down to mono and resampled to
    ``params.sample_rate`` when needed.

    Args:
        url: URL of a single YouTube video.

    Returns:
        A ``(title, waveform)`` tuple, where ``waveform`` is a 1-D float32
        numpy array sampled at ``params.sample_rate``; or ``None`` (after
        printing a message) when the video offers no audio-only stream.
    """
    # Build the YouTube object once and reuse it, instead of repeating
    # pytube's per-video setup for the title and again for the streams.
    video = YouTube(url)
    title = video.title
    stream = video.streams.get_audio_only()
    if not stream:
        print(f'No audio_only stream from {title}!')
        return None

    # download() returns the path it wrote; reuse that path instead of assuming
    # the file landed under its default filename in the working directory.
    downloaded_path = stream.download()
    print(downloaded_path, '✅已下載，匯出wav檔...', end='')
    file_name = downloaded_path + '.wav'
    # export() returns the open output file handle; close it so handles do not
    # pile up while a whole playlist is being processed.
    AudioSegment.from_file(downloaded_path).export(file_name, format='wav').close()
    print('匯出waveform陣列...')

    # https://github.com/tensorflow/models/blob/master/research/audioset/yamnet/inference.py#L40
    wav_data, sr = sf.read(file_name, dtype=np.int16)
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    # Scale int16 samples to roughly [-1.0, +1.0]. tf.int16.max is 32767; the
    # referenced inference.py divides by 32768.0 instead.
    waveform = (wav_data / tf.int16.max).astype('float32')

    # Convert to mono and the sample rate expected by YAMNet.
    if len(waveform.shape) > 1:
        waveform = np.mean(waveform, axis=1)
    if sr != params.sample_rate:
        waveform = resampy.resample(waveform, sr, params.sample_rate)

    return title, waveform

## def data_from_Playlist(url)

In [None]:
def data_from_Playlist(url, begin=1, end=None):
    """Download every selected video of a YouTube playlist as a waveform.

    Args:
        url: URL of the YouTube playlist.
        begin: 1-based position of the first video to process.
        end: 1-based position of the last video to process (inclusive);
            ``None`` means through the end of the playlist.

    Returns:
        A ``(titles, waveforms)`` pair of equally long lists, one entry per
        video that produced audio. Videos for which ``data_from_YouTube``
        returns ``None`` (no audio-only stream) are skipped.
    """
    titles = []
    waveforms = []
    # Use a distinct loop name so the playlist `url` parameter is not shadowed.
    for video_url in Playlist(url).video_urls[begin-1:end]:
        result = data_from_YouTube(video_url)
        # data_from_YouTube returns None when there is no audio-only stream;
        # unpacking that directly would raise TypeError and abort the playlist.
        if result is not None:
            title, waveform = result
            titles.append(title)
            waveforms.append(waveform)
        sleep(10) # HTTP 429 hack
    return titles, waveforms

# Run inference on a playlist with/without the custom output layer

## load the model as the output layer

In [None]:
# Load the saved Keras model that serves as the custom output layer. In the
# inference cell it is only referenced by the commented-out
# `model.predict(embeddings[:-1])` alternative.
# NOTE(review): '20210316-3.h5' is presumably among the files copied from
# raspberrypi3-yamnet-sed/yamnet/ into the working directory — confirm.
model = models.load_model('20210316-3.h5')

## Run inference averaged over 5-second windows (MA = 5)

In [None]:
playlist = ['https://www.youtube.com/playlist?list=PL0Q2eQA7p-wTWf8KtSNes33asI9Nc4iRA'] # rain sounds

MA = 5                        # length, in seconds, of each window whose frame scores are averaged
SR = int(params.sample_rate)  # samples per second of the waveforms returned by data_from_Playlist

titles, waveforms = data_from_Playlist(*playlist)
print('\nRunning inference...')
for title, waveform in zip(titles, waveforms):
    print('\n'+title)
    #display(Audio(waveform, rate=SR)) # crashes if waveform is big
    # `i` is the end of the current window in whole seconds. The stop is
    # len(waveform)//SR + 1 so that the last full window is still processed
    # when the clip length is an exact multiple of MA seconds.
    for i in range(MA, len(waveform)//SR + 1, MA):
        scores, embeddings, spectrogram = yamnet(waveform[(i-MA)*SR:i*SR])
        # Scores is a matrix of (time_frames, num_classes) classifier scores.
        # Average them along time to get an overall classifier output for the clip.
        prediction = np.mean(scores[:-1], axis=0) # the last frame's scores come from insufficient samples
        # With the custom output layer: use the line below instead of the line above.
        # prediction = np.mean(model.predict(embeddings[:-1]), axis=0)
        # Report the highest-scoring classes and their scores.
        top5 = np.argsort(prediction)[::-1][:5]
        # `k` (not `i`) indexes classes, so the window timestamp is not shadowed.
        print(f'{i//60}:{i%60:2d}',
            ''.join(f" {prediction[k]:.2f} 👉{yamnet_classes[k][:12].ljust(12, '　')}" for k in top5))

/content/フリー効果音『雨』.mp4 ✅已下載，匯出wav檔...匯出waveform陣列...
/content/Severe Thunder Storm.mp4 ✅已下載，匯出wav檔...匯出waveform陣列...

Running inference...

フリー効果音『雨』
0: 5  0.79 👉雨　　　　　　　　　　　 0.78 👉水　　　　　　　　　　　 0.60 👉雨表面上　　　　　　　　 0.51 👉雨滴　　　　　　　　　　 0.13 👉流　　　　　　　　　　　
0:10  0.78 👉雨　　　　　　　　　　　 0.74 👉水　　　　　　　　　　　 0.61 👉雨表面上　　　　　　　　 0.52 👉雨滴　　　　　　　　　　 0.07 👉流　　　　　　　　　　　
0:15  0.85 👉雨　　　　　　　　　　　 0.79 👉水　　　　　　　　　　　 0.65 👉雨表面上　　　　　　　　 0.53 👉雨滴　　　　　　　　　　 0.09 👉流　　　　　　　　　　　
0:20  0.87 👉雨　　　　　　　　　　　 0.82 👉水　　　　　　　　　　　 0.70 👉雨表面上　　　　　　　　 0.54 👉雨滴　　　　　　　　　　 0.08 👉流　　　　　　　　　　　
0:25  0.74 👉雨　　　　　　　　　　　 0.60 👉水　　　　　　　　　　　 0.57 👉雨表面上　　　　　　　　 0.47 👉雨滴　　　　　　　　　　 0.12 👉火　　　　　　　　　　　
0:30  0.85 👉雨　　　　　　　　　　　 0.83 👉水　　　　　　　　　　　 0.66 👉雨表面上　　　　　　　　 0.59 👉雨滴　　　　　　　　　　 0.07 👉流　　　　　　　　　　　
0:35  0.78 👉雨　　　　　　　　　　　 0.72 👉水　　　　　　　　　　　 0.59 👉雨表面上　　　　　　　　 0.52 👉雨滴　　　　　　　　　　 0.09 👉火　　　　　　　　　　　
0:40  0.91 👉雨　　　　　　　　　　　 0.82 👉水　　　　　　　　　　　 0.72 👉雨表面上　　　　　　　　 0.58 👉雨滴　　　　　　　　　　 0.06 👉火　　　　　　　　　　　
0:45  0.93 👉雨　　　　　　　　　　　 0.90 👉水　　　　　　　　　　