In [1]:
# Mount Google Drive so the model and test clips referenced under
# /content/drive later in this notebook are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install ffmpeg



In [3]:
!pip install pydub



In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import IPython

import librosa
import librosa.display

from pydub import AudioSegment
from pydub.utils import make_chunks

import fastai
from fastai.vision import *
fastai.__version__

'1.0.61'

In [5]:
# Load the trained fastai Learner; predict_speaker() reads it as the global `learn`.
# NOTE(review): presumably this directory holds the default export file that
# load_learner looks for — confirm.
learn = load_learner('/content/drive/MyDrive/ML/GBC/Math2/classifier')

In [6]:
# Directory holding the test recordings. The trailing slash matters: clip
# filenames are appended to this string directly.
file_path = '/content/drive/MyDrive/ML/GBC/Math2/classifier/Data/test_data/'

In [7]:
import math

def classify_speaker(audio_path):
  """Guess which speaker is talking in an audio file and print the result.

  The clip is loaded with ``librosa.load``, rendered to per-chunk spectrogram
  images by ``make_spectrograms``, classified by ``predict_speaker``, and the
  temporary images are then removed by ``clean_up_spectrograms``.

  Args:
    audio_path: Path of the audio file to classify.

  Returns:
    None. The winning speaker label is printed, not returned.

  Raises:
    Any exception raised while loading, rendering, or predicting. The
    spectrogram cleanup is still attempted in that case.
  """
  samples, sample_rate = librosa.load(audio_path)

  try:
    # Create spectrograms to analyze
    make_spectrograms(samples, sample_rate)

    # Read and tally predictions
    prediction = predict_speaker()
  finally:
    # Clean up even on failure: predict_speaker() scans the working directory
    # for images, so leftovers would be counted as samples of the next clip.
    clean_up_spectrograms(samples, sample_rate)

  print(f'\r\nI think that the speaker is {prediction}.')
  
def make_spectrograms(samples, sample_rate):
  """Write one mel-spectrogram PNG per full ``sample_rate``-sized chunk of audio.

  Chunk ``i`` covers ``samples[i * sample_rate:(i + 1) * sample_rate]`` (one
  second of audio when ``sample_rate`` is the clip's true sampling rate) and
  is saved as ``spec_{i}.png`` in the current working directory. A trailing
  partial chunk is dropped.

  Args:
    samples: Sequence of audio samples, as returned by ``librosa.load``.
    sample_rate: Number of samples per chunk; also passed to librosa as ``sr``.

  Returns:
    None.
  """
  chunk_count = int(len(samples) // sample_rate)

  for i in range(chunk_count):
    start = i * sample_rate
    chunk = samples[start:start + sample_rate]

    # Pass the audio as ``y=``: the argument is keyword-only in librosa >= 0.10,
    # and the keyword form is accepted by older releases as well.
    mel_spec_power = librosa.feature.melspectrogram(y=chunk, sr=sample_rate, power=2.0)
    # Convert power to decibels relative to the loudest bin in this chunk.
    mel_spec_db = librosa.power_to_db(mel_spec_power, ref=np.max)

    plt.imsave(f'spec_{i}.png', mel_spec_db)

def predict_speaker(min_confidence=70):
  """Classify every ``spec_<n>.png`` in the working directory and pick a speaker.

  Images are processed in chunk order. Each one is run through the global
  ``learn`` model and reported on its own line. A prediction whose top
  probability (as a percentage) exceeds ``min_confidence`` adds that
  percentage, rounded to two decimals, to the predicted speaker's score;
  other predictions are printed but ignored. The highest-scoring speaker
  wins, with ties going to the speaker that was scored first.

  Args:
    min_confidence: Percentage a prediction must exceed to be counted.
      Defaults to 70.

  Returns:
    The winning speaker label as a ``str``.

  Raises:
    ValueError: If no prediction exceeded ``min_confidence`` (this includes
      the case where no spectrogram images are present).
  """
  # Imported locally so this function does not depend on ``os`` leaking in
  # from a star import elsewhere in the notebook.
  import os
  import re

  # Only consider images named like the ones make_spectrograms writes, and
  # order them by chunk index so "Sample N" follows the audio timeline.
  spec_name = re.compile(r'spec_(\d+)\.png')
  chunks = []
  for filename in os.listdir('.'):
    match = spec_name.fullmatch(filename)
    if match:
      chunks.append((int(match.group(1)), filename))
  chunks.sort()

  predictions = {}
  for count, (_index, filename) in enumerate(chunks, start=1):
    # Open from the directory that was listed rather than assuming /content.
    img = open_image(os.path.join('.', filename))
    speaker_name, _, prob = learn.predict(img)
    p = float(torch.max(prob) * 100)

    # Only consider value if confidence is above the threshold
    counted = p > min_confidence
    if counted:
      label = str(speaker_name)
      predictions[label] = predictions.get(label, 0) + round(p, 2)

    using_sample_char = '✔' if counted else '✘'
    print(f'Sample {count} = {speaker_name} ({round(p,2)}% probability) {using_sample_char}')

  if not predictions:
    raise ValueError(
        f'No spectrogram sample exceeded {min_confidence}% confidence; '
        'cannot pick a speaker.')

  # max() returns the first key with the highest score.
  return max(predictions, key=predictions.get)

def clean_up_spectrograms(samples, sample_rate):
  """Delete the ``spec_<i>.png`` images generated for a clip.

  The number of files to remove is derived the same way the images are
  generated: one per full ``sample_rate``-sized chunk of ``samples``. Files
  are removed from the current working directory; other files are untouched.

  Args:
    samples: The audio samples the spectrograms were generated from.
    sample_rate: The chunk size used when generating them.

  Returns:
    None.
  """
  # Imported locally so this function does not depend on ``os`` leaking in
  # from a star import elsewhere in the notebook.
  import os

  chunk_count = int(len(samples) // sample_rate)
  for i in range(chunk_count):
    try:
      os.remove(f'spec_{i}.png')
    except FileNotFoundError:
      # The image was never written or is already gone (for example when
      # generation stopped early); keep removing the remaining ones.
      continue

Next, we attempt to classify recordings of each member of our group reading the following passage aloud:

"NLP is an important component in a wide range of software applications that we use in our daily lives. In this section, we'll introduce some key applications and also take a look at some common tasks that you'll see across different NLP applications. This section reinforces the applications we showed you in Figure 1-1, which you'll see in more detail throughout the book."

In [8]:
# One recording per group member, all stored directly under file_path.
dan_reading = f'{file_path}nlp_dan.wav'
ed_reading = f'{file_path}nlp_ed.wav'
hom_reading = f'{file_path}nlp_hom.wav'
juan_reading = f'{file_path}nlp_juan.wav'
mike_reading = f'{file_path}nlp_mike.wav'

In [9]:
# Show Dan's clip as an IPython Audio object so it can be listened to before classifying.
IPython.display.Audio(dan_reading)

In [10]:
# Classify Dan's clip: prints one line per analysed sample, then the overall guess.
classify_speaker(dan_reading)

Sample 1 = Juan (56.43% probability) ✘
Sample 2 = Juan (96.25% probability) ✔
Sample 3 = Juan (54.64% probability) ✘
Sample 4 = Dan (97.44% probability) ✔
Sample 5 = Juan (96.51% probability) ✔
Sample 6 = Dan (99.99% probability) ✔
Sample 7 = Dan (51.73% probability) ✘
Sample 8 = Dan (44.86% probability) ✘
Sample 9 = Juan (60.36% probability) ✘
Sample 10 = Dan (58.85% probability) ✘
Sample 11 = Juan (77.21% probability) ✔
Sample 12 = Dan (97.3% probability) ✔
Sample 13 = Mike (47.2% probability) ✘
Sample 14 = Dan (94.02% probability) ✔
Sample 15 = Dan (99.99% probability) ✔
Sample 16 = Juan (70.31% probability) ✔
Sample 17 = Dan (92.19% probability) ✔
Sample 18 = Juan (97.42% probability) ✔
Sample 19 = Dan (76.7% probability) ✔
Sample 20 = Dan (74.46% probability) ✔
Sample 21 = Dan (76.61% probability) ✔
Sample 22 = Juan (66.33% probability) ✘

I think that the speaker is Dan.


In [11]:
# Show Ed's clip as an IPython Audio object so it can be listened to before classifying.
IPython.display.Audio(ed_reading)

In [12]:
# Classify Ed's clip: prints one line per analysed sample, then the overall guess.
classify_speaker(ed_reading)

Sample 1 = Ed (96.0% probability) ✔
Sample 2 = Ed (99.76% probability) ✔
Sample 3 = Ed (88.2% probability) ✔
Sample 4 = Ed (36.73% probability) ✘
Sample 5 = Ed (100.0% probability) ✔
Sample 6 = Ed (99.99% probability) ✔
Sample 7 = Ed (100.0% probability) ✔
Sample 8 = Ed (99.81% probability) ✔
Sample 9 = Dan (44.51% probability) ✘
Sample 10 = Ed (99.98% probability) ✔
Sample 11 = Ed (69.64% probability) ✘
Sample 12 = Ed (79.64% probability) ✔
Sample 13 = Ed (99.99% probability) ✔
Sample 14 = Ed (100.0% probability) ✔
Sample 15 = Ed (99.98% probability) ✔
Sample 16 = Ed (99.61% probability) ✔
Sample 17 = Ed (99.93% probability) ✔
Sample 18 = Ed (100.0% probability) ✔
Sample 19 = Ed (100.0% probability) ✔
Sample 20 = Ed (100.0% probability) ✔
Sample 21 = Ed (99.99% probability) ✔
Sample 22 = Ed (99.4% probability) ✔
Sample 23 = Ed (100.0% probability) ✔
Sample 24 = Ed (99.48% probability) ✔
Sample 25 = Ed (99.95% probability) ✔
Sample 26 = Ed (100.0% probability) ✔
Sample 27 = Ed (83.69% 

In [13]:
# Show Hom's clip as an IPython Audio object so it can be listened to before classifying.
IPython.display.Audio(hom_reading)

In [14]:
# Classify Hom's clip: prints one line per analysed sample, then the overall guess.
classify_speaker(hom_reading)

Sample 1 = Ed (62.37% probability) ✘
Sample 2 = Hom (99.54% probability) ✔
Sample 3 = Juan (100.0% probability) ✔
Sample 4 = Hom (90.06% probability) ✔
Sample 5 = Hom (99.56% probability) ✔
Sample 6 = Mike (88.01% probability) ✔
Sample 7 = Juan (84.84% probability) ✔
Sample 8 = Hom (100.0% probability) ✔
Sample 9 = Ed (62.88% probability) ✘
Sample 10 = Ed (50.62% probability) ✘
Sample 11 = Hom (99.31% probability) ✔
Sample 12 = Hom (81.87% probability) ✔
Sample 13 = Ed (80.06% probability) ✔
Sample 14 = Hom (90.46% probability) ✔
Sample 15 = Hom (83.58% probability) ✔
Sample 16 = Juan (96.47% probability) ✔
Sample 17 = Mike (69.82% probability) ✘
Sample 18 = Dan (77.07% probability) ✔
Sample 19 = Hom (52.23% probability) ✘
Sample 20 = Ed (53.54% probability) ✘
Sample 21 = Ed (51.07% probability) ✘
Sample 22 = Hom (63.63% probability) ✘
Sample 23 = Juan (99.6% probability) ✔
Sample 24 = Hom (71.64% probability) ✔
Sample 25 = Hom (49.58% probability) ✘
Sample 26 = Juan (99.86% probabilit

In [15]:
# Show Juan's clip as an IPython Audio object so it can be listened to before classifying.
IPython.display.Audio(juan_reading)

In [16]:
# Classify Juan's clip: prints one line per analysed sample, then the overall guess.
classify_speaker(juan_reading)

Sample 1 = Dan (99.46% probability) ✔
Sample 2 = Juan (93.5% probability) ✔
Sample 3 = Juan (35.45% probability) ✘
Sample 4 = Ed (52.59% probability) ✘
Sample 5 = Juan (98.1% probability) ✔
Sample 6 = Ed (93.88% probability) ✔
Sample 7 = Juan (83.38% probability) ✔
Sample 8 = Juan (95.53% probability) ✔
Sample 9 = Juan (99.98% probability) ✔
Sample 10 = Mike (51.42% probability) ✘
Sample 11 = Juan (65.36% probability) ✘
Sample 12 = Dan (59.09% probability) ✘
Sample 13 = Mike (60.34% probability) ✘
Sample 14 = Dan (64.6% probability) ✘
Sample 15 = Dan (76.42% probability) ✔
Sample 16 = Ed (65.17% probability) ✘
Sample 17 = Dan (45.52% probability) ✘
Sample 18 = Juan (70.75% probability) ✔
Sample 19 = Juan (58.85% probability) ✘
Sample 20 = Dan (99.96% probability) ✔
Sample 21 = Juan (96.49% probability) ✔

I think that the speaker is Juan.


In [17]:
# Show Mike's clip as an IPython Audio object so it can be listened to before classifying.
IPython.display.Audio(mike_reading)

In [18]:
# Classify Mike's clip: prints one line per analysed sample, then the overall guess.
classify_speaker(mike_reading)

Sample 1 = Dan (99.89% probability) ✔
Sample 2 = Mike (99.98% probability) ✔
Sample 3 = Dan (99.33% probability) ✔
Sample 4 = Mike (99.47% probability) ✔
Sample 5 = Mike (96.33% probability) ✔
Sample 6 = Mike (99.9% probability) ✔
Sample 7 = Mike (51.36% probability) ✘
Sample 8 = Mike (92.34% probability) ✔
Sample 9 = Mike (99.66% probability) ✔
Sample 10 = Mike (99.35% probability) ✔
Sample 11 = Mike (98.91% probability) ✔
Sample 12 = Mike (82.92% probability) ✔
Sample 13 = Mike (98.5% probability) ✔
Sample 14 = Mike (84.37% probability) ✔
Sample 15 = Mike (95.69% probability) ✔
Sample 16 = Mike (95.31% probability) ✔
Sample 17 = Mike (99.03% probability) ✔
Sample 18 = Mike (95.74% probability) ✔
Sample 19 = Dan (68.68% probability) ✘
Sample 20 = Mike (98.42% probability) ✔
Sample 21 = Mike (99.27% probability) ✔
Sample 22 = Mike (100.0% probability) ✔
Sample 23 = Mike (97.07% probability) ✔
Sample 24 = Mike (99.98% probability) ✔
Sample 25 = Mike (98.81% probability) ✔

I think that 