In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install ffmpeg

Collecting ffmpeg
  Downloading ffmpeg-1.4.tar.gz (5.1 kB)
Building wheels for collected packages: ffmpeg
  Building wheel for ffmpeg (setup.py) ... [?25l[?25hdone
  Created wheel for ffmpeg: filename=ffmpeg-1.4-py3-none-any.whl size=6083 sha256=90d204d6e64ac49fdedbaf8ea91de351a600c7c38070216023216c5f91ffac80
  Stored in directory: /root/.cache/pip/wheels/64/80/6e/caa3e16deb0267c3cbfd36862058a724144e19fdb9eb03af0f
Successfully built ffmpeg
Installing collected packages: ffmpeg
Successfully installed ffmpeg-1.4


In [3]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import librosa
import librosa.display

from pydub import AudioSegment
from pydub.utils import make_chunks

import fastai
from fastai.vision import *
fastai.__version__

'1.0.61'

In [6]:
learn = load_learner('/content/drive/MyDrive/Colab Notebooks/GBCLessons/Math 2/classifier/')

In [5]:
file_path = '/content/drive/MyDrive/Colab Notebooks/GBCLessons/Math 2/classifier/Data/test_data/'

In [14]:
import math

def classify_speaker(audio_path):
  samples, sample_rate = librosa.load(audio_path)

  # Create spectrograms to analyze
  make_spectrograms(samples, sample_rate)

  # Read and tally predictions
  prediction = predict_speaker()

  # Clean up data
  clean_up_spectrograms(samples, sample_rate)

  print(f'\r\nI think that the speaker is {prediction}.')
  
def make_spectrograms(samples, sample_rate):
  chunk_count = math.floor(len(samples) / sample_rate)

  for i in range(chunk_count):
    start_timestamp = i * sample_rate
    end_timestamp = (i+1) * sample_rate

    chunk = samples[start_timestamp:end_timestamp]

    mel_spec_power = librosa.feature.melspectrogram(chunk, sr=sample_rate, power=2.0) 
    mel_spec_db = librosa.power_to_db(mel_spec_power, ref=np.max)

    filename = (f'spec_{i}.png')
    plt.imsave(filename, mel_spec_db)

def predict_speaker():
  speaker = ''
  predictions = {}
  count = 1

  for filename in os.listdir('.'):
    if filename[-4:] == '.png':
      img = open_image('/content/' + filename)
      speaker_name,_,prob = learn.predict(img)
      p = float(torch.max(prob) * 100)
      using_sample_char = '✘'

      # Only consider value if confidence is above 70%
      if p > 70:
        using_sample_char = '✔'

        if str(speaker_name) not in predictions:
          predictions[str(speaker_name)] = round(p,2)
        else:
          predictions[str(speaker_name)] += round(p,2)

      print(f'Sample {count} = {speaker_name} ({round(p,2)}% probability) {using_sample_char}')
      count += 1

  maxval = max(predictions.values())
  res = [(k, v) for k, v in predictions.items() if v == maxval]
  return res[0][0]

def clean_up_spectrograms(samples, sample_rate):
  chunk_count = math.floor(len(samples) / sample_rate)
  for i in range(chunk_count):
    filename = (f'spec_{i}.png')
    os.remove(filename) 

In [16]:
classify_speaker(file_path + 'dan_sample.wav')

Sample 1 = Juan (68.9% probability) ✘
Sample 2 = Dan (97.52% probability) ✔
Sample 3 = Juan (97.95% probability) ✔
Sample 4 = Mike (91.65% probability) ✔

I think that the speaker is Juan.


In [15]:
classify_speaker(file_path + 'Hom_test.wav')

Sample 1 = Hom (70.27% probability) ✔
Sample 2 = Ed (72.65% probability) ✔
Sample 3 = Hom (99.42% probability) ✔
Sample 4 = Hom (98.73% probability) ✔
Sample 5 = Ed (81.7% probability) ✔
Sample 6 = Hom (99.08% probability) ✔

I think that the speaker is Hom.


In [17]:
classify_speaker(file_path + 'eduardo_test.wav')

Sample 1 = Hom (84.36% probability) ✔
Sample 2 = Ed (83.01% probability) ✔
Sample 3 = Ed (70.82% probability) ✔
Sample 4 = Ed (99.92% probability) ✔
Sample 5 = Hom (99.72% probability) ✔

I think that the speaker is Ed.


In [18]:
classify_speaker(file_path + 'Juan_test.wav')

Sample 1 = Juan (54.21% probability) ✘
Sample 2 = Dan (94.81% probability) ✔
Sample 3 = Juan (77.45% probability) ✔
Sample 4 = Juan (96.71% probability) ✔

I think that the speaker is Juan.


In [19]:
classify_speaker(file_path + 'Mike.wav')

Sample 1 = Juan (96.77% probability) ✔
Sample 2 = Dan (94.49% probability) ✔
Sample 3 = Ed (69.54% probability) ✘

I think that the speaker is Juan.
