In [4]:
import numpy
import math

def framesig(sig,frame_len,frame_step,winfunc=lambda x:numpy.ones((1,x))):
    slen = len(sig)
    frame_len = int(round(frame_len))
    frame_step = int(round(frame_step))
    if slen <= frame_len: 
        numframes = 1
    else:
        numframes = 1 + int(math.ceil((1.0*slen - frame_len)/frame_step))
    padlen = int((numframes-1)*frame_step + frame_len)
    zeros = numpy.zeros((padlen - slen,))
    padsignal = numpy.concatenate((sig,zeros))
    indices = numpy.tile(numpy.arange(0,frame_len),(numframes,1)) + numpy.tile(numpy.arange(0,numframes*frame_step,frame_step),(frame_len,1)).T
    indices = numpy.array(indices,dtype=numpy.int32)
    frames = padsignal[indices]
    win = numpy.tile(winfunc(frame_len),(numframes,1))
    return frames*win
    
def magspec(frames,NFFT):
    complex_spec = numpy.fft.rfft(frames,NFFT)
    return numpy.absolute(complex_spec)
          
def powspec(frames,NFFT):
    return 1.0/NFFT * numpy.square(magspec(frames,NFFT))
    
def preemphasis(signal,coeff=0.95):
    return numpy.append(signal[0],signal[1:]-coeff*signal[:-1])

In [5]:
import numpy
from scipy.fftpack import dct

def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True):
    feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph)
    feat = numpy.log(feat)
    feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
    feat = lifter(feat,ceplifter)
    if appendEnergy:
        feat[:,0] = numpy.log(energy)
    return feat

def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    highfreq= highfreq or samplerate/2
    signal = preemphasis(signal,preemph)
    frames = framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = powspec(frames,nfft)
    energy = numpy.sum(pspec,1)                                     # this stores the total energy in each frame
    energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
    fb = get_filterbanks(nfilt,nfft,samplerate)
    feat = numpy.dot(pspec,fb.T)                                    # compute the filterbank energies
    feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat)       # if feat is zero, we get problems with log
    return feat,energy

def hz2mel(hz):
    return 2595 * numpy.log10(1+hz/700.0)
    
def mel2hz(mel):
    return 700*(10**(mel/2595.0)-1)

def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
    highfreq= highfreq or samplerate/2
    # compute points evenly spaced in mels
    lowmel = hz2mel(lowfreq)
    highmel = hz2mel(highfreq)
    melpoints = numpy.linspace(lowmel,highmel,nfilt+2)
    # our points are in Hz, but we use fft bins, so we have to convert
    #  from Hz to fft bin number
    bin = numpy.floor((nfft+1)*mel2hz(melpoints)/samplerate)
#     print nfilt,nfft/2+1
    fbank = numpy.zeros([nfilt,int(nfft/2+1)])
    for j in xrange(0,nfilt):
        for i in xrange(int(bin[j]),int(bin[j+1])):
            fbank[j,i] = (i - bin[j])/(bin[j+1]-bin[j])
        for i in xrange(int(bin[j+1]),int(bin[j+2])):
            fbank[j,i] = (bin[j+2]-i)/(bin[j+2]-bin[j+1])
    return fbank                 
    
def lifter(cepstra,L=22):
    if L > 0:
        nframes,ncoeff = numpy.shape(cepstra)
        n = numpy.arange(ncoeff)
        lift = 1+ (L/2)*numpy.sin(numpy.pi*n/L)
        return lift*cepstra
    else:
        # values of L <= 0, do nothing
        return cepstra

In [15]:
from __future__ import division

import csv
import os.path
import numpy as np
import scipy.io.wavfile as wavfile

from sklearn import svm

class trainModel():
    def __init__(self, data_dir):
        self.data_dir = os.path.abspath(data_dir)
        self.train_file = "training_data.csv"
        
        self.gen_features()
        mfcc_list, speaker_names = self.get_tdata()
        
        # generate speaker_ids from speaker_names
        self.spkr_ntoi = {}
        self.spkr_iton = {}

        i = 0 
        for name in speaker_names:
            if name not in self.spkr_ntoi:
                self.spkr_ntoi[name] = i
                self.spkr_iton[i] = name
                i += 1
        speaker_ids = map(lambda n: self.spkr_ntoi[n], speaker_names)
        
        # train a linear svm now
        self.recognizer = svm.LinearSVC()
        self.recognizer.fit(mfcc_list, speaker_ids)

    def mfcc_to_fvec(self, ceps):
        mean = np.mean(ceps, axis=0)                               # calculate the mean 
        std = np.std(ceps, axis=0)                                 # and standard deviation of MFCC vectors 
        fvec = np.concatenate((mean, std)).tolist()                # use [mean, std] as the feature vector
        return fvec
        
    def gen_features(self):
        with open(self.train_file, 'w') as ohandle:
            melwriter = csv.writer(ohandle)
            speakers = os.listdir(self.data_dir)
            
            for spkr_dir in speakers:
                for soundclip in os.listdir(os.path.join(self.data_dir, spkr_dir)):
                    clip_path = os.path.abspath(os.path.join(self.data_dir, spkr_dir, soundclip))
                    sample_rate, data = wavfile.read(clip_path)
                    mfcc_vectors = mfcc(data, sample_rate)
                
                    feature_vector = self.mfcc_to_fvec(mfcc_vectors)
                    feature_vector.append(spkr_dir)
                    melwriter.writerow(feature_vector)

    def get_tdata(self):
        mfcc_list = []
        speaker_names = []

        with open(self.train_file, 'r') as icsv_handle:
            melreader = csv.reader(icsv_handle)
            for row in melreader:
                mfcc_list.append(map(float, row[:-1]))
                speaker_names.append(row[-1])
        return mfcc_list, speaker_names
        
        
    def predict(self, soundclip):
        sample_rate, data = wavfile.read(os.path.abspath(soundclip))
        ceps = mfcc(data, sample_rate)
        fvec = self.mfcc_to_fvec(ceps)
        speaker_id = self.recognizer.predict([fvec])[0]
        return self.spkr_iton[speaker_id]



if __name__ == "__main__":
    trained_model = trainModel("train_data")

    test_dir = os.path.abspath("test_data")
    testset_size = 0
    testset_error = 0

    for spkr_dir in os.listdir(test_dir):
        for soundclip in os.listdir(os.path.join(test_dir, spkr_dir)):
            clippath = os.path.abspath(os.path.join(test_dir, spkr_dir, soundclip))
            prediction = trained_model.predict(clippath)
            
            testset_size += 1
            if prediction != spkr_dir:
                testset_error += 1    
                print "%s %s %s " % (prediction, spkr_dir, u"[\u2717]")
            else:
                print "%s %s %s " % (prediction, spkr_dir, u"[\u2713]")

    if testset_size == 0:
        print "No test data available."
    else:
        print "Error on test data: %.2f%%\n" % (testset_error / testset_size * 100)
        print "Accuracy : %.2f%%\n" % (100 - (testset_error / testset_size * 100))

Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Male Female [✗] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Male Female [✗] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Male Female [✗] 
Female Female [✓] 
Male Female [✗] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Male Female [✗] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Male Female [✗] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Femal

Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Male Female [✗] 
Female Female [✓] 
Male Female [✗] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Male Female [✗] 
Female Female [✓] 
Female Female [✓] 
Male Female [✗] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Female Female [✓] 
Male Female [✗] 
Female Female [✓] 
Female Female [✓] 
Male Male [✓] 
Female Male [✗] 
Male Male [✓] 
Male Male [✓] 
Female Male [✗] 
Female Male [✗] 
Female Male [✗] 
Male Male [✓] 
Male Male [✓] 
Male Male [✓] 
Male Male [✓] 
Male Male [✓] 
Female Male [✗] 
Male Ma

In [5]:
import os
import cPickle
import numpy as np
from scipy.io.wavfile import read
from sklearn.mixture import GMM 
import python_speech_features as mfcc
from sklearn import preprocessing
import warnings
warnings.filterwarnings("ignore")

def get_MFCC(sr,audio):
    features = mfcc.mfcc(audio,sr, 0.025, 0.01, 13,appendEnergy = False)
    features = preprocessing.scale(features)
    return features

data_dir = "pygender\\"
dest = os.path.abspath(data_dir)
source = os.path.join(dest, "train_data\\youtube\\male\\")
#path to training data
# source   = "D:\\pygender\\train_data\\youtube\\male\\"   
#path to save trained model
# dest     = "D:\\pygender\\"         
files    = [os.path.join(source,f) for f in os.listdir(source) if f.endswith('.wav')] 
features = np.asarray(());

for f in files:
    sr,audio = read(f)
    vector   = get_MFCC(sr,audio)
    if features.size == 0:
        features = vector
    else:
        features = np.vstack((features, vector))

gmm = GMM(n_components = 8, n_iter = 200, covariance_type='diag', n_init = 3)
gmm.fit(features)
picklefile = f.split("\\")[-2].split(".wav")[0]+".gmm"

# model saved as male.gmm
cPickle.dump(gmm,open(dest + picklefile,'w'))
print 'modeling completed for gender:',picklefile

modeling completed for gender: male.gmm


In [6]:
import os
import cPickle
import numpy as np
from scipy.io.wavfile import read
import python_speech_features as mfcc
from sklearn import preprocessing
import warnings
warnings.filterwarnings("ignore")

def get_MFCC(sr,audio):
    features = mfcc.mfcc(audio,sr, 0.025, 0.01, 13,appendEnergy = False)
    feat     = np.asarray(())
    for i in range(features.shape[0]):
        temp = features[i,:]
        if np.isnan(np.min(temp)):
            continue
        else:
            if feat.size == 0:
                feat = temp
            else:
                feat = np.vstack((feat, temp))
    features = feat;
    features = preprocessing.scale(features)
    return features

data_dir = "pygender\\"
modelpath = os.path.abspath(data_dir)
sourcepath = os.path.join(modelpath, "test_data\\AudioSet\\female_clips\\")
# #path to testing data
# sourcepath = "D:\\pygender\\test_data\\AudioSet\\female_clips\\"      
# #path to saved models    
# modelpath  = "D:\\pygender\\"   

gmm_files = [os.path.join(modelpath,fname) for fname in 
              os.listdir(modelpath) if fname.endswith('.gmm')]
models    = [cPickle.load(open(fname,'r')) for fname in gmm_files]
genders   = [fname.split("\\")[-1].split(".gmm")[0] for fname 
              in gmm_files]
files     = [os.path.join(sourcepath,f) for f in os.listdir(sourcepath) 
              if f.endswith(".wav")] 

for f in files:
    print f.split("\\")[-1]
    sr, audio  = read(f)
    features   = get_MFCC(sr,audio)
    scores     = None
    log_likelihood = np.zeros(len(models)) 
    for i in range(len(models)):
        gmm    = models[i]         #checking with each model one by one
        scores = np.array(gmm.score(features))
        log_likelihood[i] = scores.sum()
    winner = np.argmax(log_likelihood)
    print "\tdetected as - ", genders[winner],"\n\tscores:female ",log_likelihood[0],",male ", log_likelihood[1],"\n"

--EQQVMYe50.wav


ValueError: attempt to get argmax of an empty sequence