<a href="https://colab.research.google.com/github/CatMcQueen/audio_fingerprint_identify/blob/main/Shazam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pydub
#!pip install ffmpeg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
#import ffmpeg
import os
# to convert mp3 to wav files
from pydub import AudioSegment

# for data transformation
import numpy as np
# for visualizing the data
import matplotlib.pyplot as plt
# for making the spectogram
import matplotlib.mlab as mlab
# for opening the media file
import wave
#import scipy.io.wavfile as wavfile
# to make the fingerprints from the spectograms
from skimage.feature import peak_local_max
# for making the hash
import hashlib
# for making the database
import sqlite3


In [3]:
# change these values ####
# User-tunable settings; the cells below read these as module-level globals.
show_plots = False  # when True, fingerprint() draws the spectrogram and the detected peaks
debug = False  # when True, fingerprint() prints the length of each song
fan_value = 15 # exclusive upper bound of the pairing loop in hash_image(): each anchor peak is paired with at most fan_value-1 following peaks
threshold = 41000  # lower bound on the peak-detection threshold used by fingerprint()
finger_reduct_val = 256 # used by hash_image() both as the max time delta between paired peaks and as the max number of hash characters kept
sample_time = 30 # length, in seconds, of the random excerpt fingerprint() cuts when sample=True
mp3base = './mp3/' # where the mp3 are stored
wavbase = './wav/'  # where the wav are stored after conversion
searchfilename = 'dreams.wav'  # NOTE(review): not referenced by any cell visible here -- confirm it is still needed
######################


In [4]:
## Global Values ##
# Derived values and directory listings shared by the cells below.
sample_freq = 44100 # this is a forced val, DO NOT CHANGE THIS ONE
# number of audio frames in a sample_time-second excerpt at sample_freq
samples_count = sample_time * sample_freq
# make the wav folder if it doesn't exist
# isExist is read again by the MAIN cell: the mp3 -> wav conversion only runs
# when the wav folder had to be created here
isExist = os.path.exists(wavbase)
if not isExist:
  os.makedirs(wavbase)
# directory listings are taken once, when this cell executes
all_mp3    = os.listdir(mp3base)
all_wav    = os.listdir(wavbase)
# NOTE(review): avgsamples is not used by any cell visible here -- confirm
avgsamples = sample_time * sample_freq/100; # set up histogram block size

In [18]:
## CREATE DATABASE 

class SongDatabase():
  """In-memory store of song fingerprints.

  Rows are kept in ``self.database`` as a 2-D NumPy array with the columns
  ``[song id, song name, hash, offset]``.  Because a row mixes integers and
  strings, NumPy stores every cell as a string; song ids are therefore
  converted back with ``int()`` whenever they need numeric ordering.
  """

  def __init__(self):
    # self database = <songid, name of song, hash, offset>
    self.database = None
    self.song_names = None
    self.song_count = None

  def clear(self):
    """Forget every stored fingerprint and the cached search metadata."""
    self.database = None
    self.song_names = None
    self.song_count = None

  def add_values(self, values=None):
    """Append fingerprint rows to the database.

    Args:
      values: sequence of ``[song id, song name, hash, offset]`` rows (a
        single flat row is accepted too).  ``None`` or an empty sequence is
        ignored, so a song without fingerprints never disturbs the rows
        that are already stored.

    Raises:
      ValueError: if the rows do not have the same number of columns as the
        rows already stored (raised by NumPy).
    """
    if values is None:
      return
    rows = np.array(values)
    if rows.size == 0:
      return
    if rows.ndim == 1:
      # a single flat row -> one-row table
      rows = rows.reshape(1, -1)

    if self.database is None or self.database.size == 0:
      self.database = rows
    else:
      self.database = np.append(self.database, rows, axis=0)

  def search(self, values=None):
    """Score every stored song against a list of query hashes.

    Each query hash adds one vote to a song for every stored row of that
    song carrying the same hash.

    Args:
      values: iterable of hash strings taken from the sample to identify.

    Returns:
      ``(scaled_histogram, song_name)`` where ``scaled_histogram[k]`` is the
      vote count of the k-th song (songs ordered by numeric id) expressed as
      a percentage of ``len(values)``, and ``song_name`` is the name of the
      song with the most votes.

    Raises:
      ValueError: if the database holds no rows.
    """
    if self.database is None or len(self.database) == 0:
      raise ValueError('cannot search an empty SongDatabase')
    queries = [] if values is None else list(values)

    # np.unique sorts the ids as *strings* ('0', '1', '10', '2', ...), so
    # re-order them numerically to keep histogram slots and names aligned.
    song_ids, first_rows = np.unique(self.database[:, 0], return_index=True)
    order = sorted(range(len(song_ids)), key=lambda k: int(song_ids[k]))
    self.song_count = song_ids[order]
    self.song_names = [self.database[first_rows[k], 1] for k in order]
    slot_of = {str(sid): slot for slot, sid in enumerate(self.song_count)}

    # hash -> histogram slots of every stored row with that hash; one pass
    # over the database instead of one full scan per query hash
    slots_by_hash = {}
    for sid, stored_hash in zip(self.database[:, 0], self.database[:, 2]):
      slots_by_hash.setdefault(str(stored_hash), []).append(slot_of[str(sid)])

    histogram = np.zeros(len(self.song_count))
    for query in queries:
      for slot in slots_by_hash.get(str(query), ()):
        histogram[slot] += 1

    most_likely_idx = int(np.argmax(histogram))
    # max(..., 1) keeps an empty query from producing NaN percentages
    scaled_histogram = histogram * 100 / max(len(queries), 1)
    return scaled_histogram, self.song_names[most_likely_idx]

In [6]:
# create the functions to make the fingerprinting
def fingerprint(wavefile, sample=False):
  """Locate the spectrogram peaks of a 16-bit PCM wav file.

  Reads the module-level settings ``samples_count``, ``threshold``,
  ``debug`` and ``show_plots``.

  Args:
    wavefile: path of the wav file to read.
    sample: when True, only a randomly positioned excerpt of at most
      ``samples_count`` frames is analysed (the whole file if it is shorter).

  Returns:
    The coordinate array produced by ``peak_local_max`` on the spectrogram:
    one row per peak, column 0 the frequency-bin index and column 1 the
    time-bin index.

  Raises:
    ValueError: if the file does not use 16-bit samples.
  """
  nfft = 4096

  # the context manager closes the file even when reading fails
  with wave.open(wavefile, 'rb') as wave_obj:
    sample_freq  = wave_obj.getframerate()
    n_channels   = wave_obj.getnchannels()
    sample_width = wave_obj.getsampwidth()
    signal_wave  = wave_obj.readframes(wave_obj.getnframes())

  if sample_width != 2:
    raise ValueError(
        'only 16-bit wav files are supported, got {}-byte samples in {}'.format(
            sample_width, wavefile))

  signal_array = np.frombuffer(signal_wave, dtype=np.int16)
  # frames are interleaved channel samples; drop a trailing partial frame and
  # average the channels in floating point so int16 values cannot overflow
  usable    = len(signal_array) - len(signal_array) % n_channels
  total_sig = signal_array[:usable].reshape(-1, n_channels).mean(axis=1)
  n_samples = len(total_sig)
  t_audio   = n_samples / sample_freq

  # if this is the file that we're sampling, trim it to a random excerpt
  # (noise injection is currently disabled)
  if sample:
    sample_set   = min(samples_count, n_samples)
    latest_start = n_samples - sample_set
    # randint needs a non-empty range; a short file is used from its start
    start = np.random.randint(0, high=latest_start) if latest_start > 0 else 0
    total_sig = total_sig[start:start + sample_set]

  # and make the spectogram (50% overlap, passed as an integer sample count)
  data, freqs, t = mlab.specgram(total_sig, Fs=sample_freq, NFFT=nfft, noverlap=nfft // 2)

  # keep only peaks brighter than median + stdev/8, but never below the
  # configured threshold (so the cut-off scales with loud recordings)
  median = np.median(data)
  stdev  = np.std(data) / 8
  sumval = max(median + stdev, threshold)
  xy = peak_local_max(data, min_distance=25, threshold_abs=sumval)

  if debug:
    print('The song is {} seconds long.'.format(t_audio))
  if show_plots:
    fig , (ax1,ax2)  = plt.subplots(2, 1, figsize=(15,5),sharex = False, sharey=False)
    z1_plot = ax1.specgram(total_sig, Fs=sample_freq, vmin=-20, vmax=50)
    z2_plot = ax2.plot(xy[:,1], xy[:,0], '.')
    plt.show()

  return xy # return the fingerprint image to compute the hash


In [7]:
# and the function to make the hash values
def hash_image(xy, fan=None, max_delta=None, hash_len=None):
  """Turn spectrogram peaks into (hash, anchor time) fingerprints.

  Peaks are ordered by time and every peak (the anchor) is paired with the
  peaks that follow it.  A pair is hashed when the two peaks lie at
  different frequencies and no more than ``max_delta`` time bins apart.

  Args:
    xy: array-like of peak rows, column 0 the frequency bin and column 1
      the time bin.
    fan: exclusive bound on the pairing distance, i.e. each anchor is paired
      with at most ``fan - 1`` following peaks.  Defaults to the module-level
      ``fan_value``.
    max_delta: largest accepted time difference between the two peaks of a
      pair.  Defaults to the module-level ``finger_reduct_val``.
    hash_len: number of leading hex characters of the SHA-1 digest to keep.
      Defaults to the module-level ``finger_reduct_val``.

  Returns:
    List of ``(hex digest, anchor time bin)`` tuples; empty when ``xy`` holds
    no peaks.
  """
  fan       = fan_value if fan is None else fan
  max_delta = finger_reduct_val if max_delta is None else max_delta
  hash_len  = finger_reduct_val if hash_len is None else hash_len

  peaks = np.asarray(xy)
  if peaks.size == 0:
    return []

  # Sort whole rows by time (then frequency).  Sorting each column on its own
  # would break the frequency/time pairing of every peak.  np.lexsort uses
  # its LAST key as the primary one.
  peaks = peaks[np.lexsort((peaks[:, 0], peaks[:, 1]))]

  results = []
  n_peaks = len(peaks)
  for i in range(n_peaks):
    # freq is y axis, time is x axis
    freq1, time1 = peaks[i]
    for j in range(i + 1, min(i + fan, n_peaks)):
      freq2, time2 = peaks[j]
      tdelta = time2 - time1

      # take the values and put them in the hash using sha1 hash
      if tdelta >= 0 and tdelta <= max_delta and freq1 != freq2:
        token = "{freq1}|{freq2}|{delta}".format(freq1=freq1, freq2=freq2, delta=tdelta)
        digest = hashlib.sha1(token.encode('utf-8')).hexdigest()
        results.append((digest[0:hash_len], time1))
  return results

In [8]:
# convert files to wave format
def convert_mp3_files():
  """Convert every mp3 listed in ``all_mp3`` to a wav file in ``wavbase``.

  Directory entries whose extension is not ``.mp3`` (case-insensitive) are
  skipped instead of being handed to the mp3 decoder.  The output keeps the
  source file's base name with a ``.wav`` extension.
  """
  for filenm in all_mp3:
    stem, ext = os.path.splitext(filenm)
    if ext.lower() != '.mp3':
      continue
    # save off the audio in wav format
    src = os.path.join(mp3base, filenm)
    dst = os.path.join(wavbase, stem + '.wav')
    AudioSegment.from_mp3(src).export(dst, format="wav")

In [9]:
# create the database by reading in the files
def create_database(db):
  """Fingerprint every wav file in ``all_wav`` and store its hashes in ``db``.

  Each stored song gets the next consecutive integer id (starting at 0) and
  contributes one ``[id, song name, hash, offset]`` row per fingerprint.
  Entries that are not ``.wav`` files, and songs that yield no fingerprints,
  are skipped and do not consume an id, so no empty batch is ever handed to
  the database.

  Args:
    db: object exposing ``add_values(rows)``.
  """
  song_id = 0
  for wavfile in all_wav:
    songname, ext = os.path.splitext(wavfile)
    if ext.lower() != '.wav':
      continue

    # get the hash values
    xy        = fingerprint(os.path.join(wavbase, wavfile))
    imagehash = hash_image(xy)
    if not imagehash:
      continue

    data = [[song_id, songname, fp_hash, offset] for fp_hash, offset in imagehash]
    db.add_values(data)
    song_id += 1

In [10]:
def search_database(db, n_trials=10):
  """Identify randomly chosen song excerpts and print the outcome.

  For each trial a wav file is drawn at random from ``all_wav``, a random
  excerpt of it is fingerprinted, and the resulting hashes are looked up in
  ``db``.  The best match, the true song name and the match histogram are
  printed.  A trial that cannot be evaluated is reported and skipped.

  Args:
    db: object exposing ``search(hashes)`` returning ``(histogram, name)``.
    n_trials: number of random excerpts to test.
  """
  candidates = [name for name in all_wav if name.lower().endswith('.wav')]
  if not candidates:
    print('No wav files available to search')
    return

  # parse the randomly selected files:
  indexes = np.random.randint(0, high=len(candidates), size=(n_trials, 1))
  for idx in indexes:
    songfile  = candidates[idx[0]]
    songname  = songfile[:-4]
    xy        = fingerprint(wavbase + songfile, sample=True)
    imagehash = np.array(hash_image(xy))

    #hashes only search
    try:
      hashkey = imagehash[:,0]
      histogram, song_choice = db.search(hashkey)
    except Exception as err:
      print('Something went wrong with this sample')
      print(imagehash.shape)
      print(repr(err))
      # nothing to report for this trial; do not fall through and print
      # undefined or stale results from a previous iteration
      continue

    print('Most Likely: {}, Actual: {}'.format(song_choice, songname))
    print(histogram)
  

In [11]:
########### MAIN ############
# Convert the mp3 library to wav only when the wav folder did not exist when
# the globals cell ran (isExist is set there), then refresh the listing so
# the freshly written files are picked up.
# NOTE(review): a wav folder that exists but is empty skips the conversion.
if not isExist:
  convert_mp3_files()
  all_wav   = os.listdir(wavbase)


In [12]:
# initialize the database
# starts empty; rows are added by create_database()
db = SongDatabase()

In [13]:
# create the database off the wav file
# fingerprints the files listed in all_wav and stores their hashes in db
create_database(db)

In [19]:
# create sample to test
# each trial prints the best match, the true song and the match histogram;
# the excerpts are drawn with np.random, so the output differs between runs
search_database(db) 

Most Likely: dreams, Actual: dreams
[0.02788845 0.        ]
Most Likely: dreams, Actual: dreams
[0.02622951 0.        ]
Most Likely: Broke For Free - Night Owl, Actual: Broke For Free - Night Owl
[0.         0.25697211]
Most Likely: Broke For Free - Night Owl, Actual: Broke For Free - Night Owl
[0.         0.16074766]
Most Likely: dreams, Actual: dreams
[0.00520833 0.        ]
Most Likely: Broke For Free - Night Owl, Actual: Broke For Free - Night Owl
[0.         0.57476636]
Most Likely: dreams, Actual: dreams
[0.01908397 0.        ]
Most Likely: Broke For Free - Night Owl, Actual: Broke For Free - Night Owl
[0.         0.34560327]
Most Likely: dreams, Actual: dreams
[0.015625 0.      ]
Most Likely: dreams, Actual: dreams
[0.0183727 0.       ]


In [15]:
# NOTE(review): re-assigns the show_plots setting from the configuration cell;
# this cell's execution count (15) is out of order with the cells above.
show_plots = False