In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install ffmpeg



In [3]:
!pip install pydub



In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import librosa
import librosa.display

from pydub import AudioSegment
from pydub.utils import make_chunks

import fastai
from fastai.vision import *
fastai.__version__

'1.0.61'

In [5]:
learn = load_learner('/content/drive/MyDrive/Colab Notebooks/GBCLessons/Math 2/classifier/')

In [6]:
file_path = '/content/drive/MyDrive/Colab Notebooks/GBCLessons/Math 2/classifier/Data/test_data/'

In [7]:
import math

def classify_speaker(audio_path):
  samples, sample_rate = librosa.load(audio_path)

  # Create spectrograms to analyze
  make_spectrograms(samples, sample_rate)

  # Read and tally predictions
  prediction = predict_speaker()

  # Clean up data
  clean_up_spectrograms(samples, sample_rate)

  print(f'\r\nI think that the speaker is {prediction}.')
  
def make_spectrograms(samples, sample_rate):
  chunk_count = math.floor(len(samples) / sample_rate)

  for i in range(chunk_count):
    start_timestamp = i * sample_rate
    end_timestamp = (i+1) * sample_rate

    chunk = samples[start_timestamp:end_timestamp]

    mel_spec_power = librosa.feature.melspectrogram(chunk, sr=sample_rate, power=2.0) 
    mel_spec_db = librosa.power_to_db(mel_spec_power, ref=np.max)

    filename = (f'spec_{i}.png')
    plt.imsave(filename, mel_spec_db)

def predict_speaker():
  speaker = ''
  prediction = 0
  predictions = {}
  count = 1

  for filename in os.listdir('.'):
    if filename[-4:] == '.png':
      count += 1
      img = open_image('/content/' + filename)
      speaker_name,_,prob = learn.predict(img)
      p = float(torch.max(prob) * 100)
      # Only consider value if confidence is above 70%
      if p > 70:
        print(f'Sample {count} = {speaker_name} ({round(p,2)}% probability)')
        if str(speaker_name) not in predictions:
          predictions[str(speaker_name)] = 1
        else:
          predictions[str(speaker_name)] += 1

  return max(predictions)

def clean_up_spectrograms(samples, sample_rate):
  chunk_count = math.floor(len(samples) / sample_rate)
  for i in range(chunk_count):
    filename = (f'spec_{i}.png')
    os.remove(filename) 

In [8]:
classify_speaker(file_path + 'Juan.wav')

Sample 2 = Juan (84.96% probability)
Sample 4 = Juan (72.61% probability)
Sample 5 = Juan (82.35% probability)
Sample 6 = Dan (99.45% probability)
Sample 7 = Dan (93.39% probability)
Sample 8 = Hom (74.76% probability)
Sample 9 = Dan (84.36% probability)
Sample 10 = Juan (99.12% probability)
Sample 12 = Ed (75.81% probability)
Sample 14 = Dan (95.79% probability)

I think that the speaker is Juan.


In [9]:
classify_speaker(file_path + 'Hom_test.wav')

Sample 3 = Hom (95.55% probability)
Sample 5 = Hom (98.77% probability)
Sample 6 = Ed (79.94% probability)
Sample 7 = Hom (98.27% probability)

I think that the speaker is Hom.
