In [1]:
import scipy.io.wavfile as wf
import numpy as np
import matplotlib.pyplot as mp
import wave as wav
import os
import librosa
import soundfile
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
import tensorflow as tf
import generate_masking_threshold as generate_mask
import base64
import hashlib
import json 
import time
import uuid
import requests
import scipy
import random
#import scikits.audiolab

def compute_mfcc(audio, **kwargs):
    """
    Build a differentiable TensorFlow graph that turns raw audio into
    MFCC-style features (the original author notes this mirrors the
    DeepSpeech front end).

    Args:
        audio: 2-D tensor whose static shape is (batch_size, num_samples);
            it is cast to float32 before processing.
        **kwargs: accepted for call compatibility; not used by the body.

    Returns:
        A 3-D tensor (batch, frames, features). Feature 0 of every frame is
        the log frame energy; the remaining features are liftered cepstral
        coefficients 1.. of the (at most) 26 kept DCT outputs.

    Note:
        The mel filterbank is read from "filterbanks.npy" in the current
        working directory each time this function is called.
    """
    n_batch, n_samples = audio.get_shape().as_list()
    signal = tf.cast(audio, tf.float32)

    # Step 1: pre-emphasis (y[t] = x[t] - 0.97 * x[t-1]); the first sample is
    # kept as-is and 512 zeros are appended so the last frames are full length.
    first_sample = signal[:, :1]
    emphasized = signal[:, 1:] - 0.97 * signal[:, :-1]
    tail_padding = np.zeros((n_batch, 512), dtype=np.float32)
    signal = tf.concat((first_sample, emphasized, tail_padding), 1)

    # Step 2: cut into 512-sample frames with a hop of 320 samples and apply
    # a Hamming window to each frame.
    frame_starts = range(0, n_samples - 320, 320)
    frames = tf.stack([signal[:, start:start + 512] for start in frame_starts], 1)
    frames = frames * np.hamming(512)

    # Step 3: per-frame power spectrum from a 512-point real FFT.
    spectrum = tf.spectral.rfft(frames, [512])
    power = 1.0 / 512 * tf.square(tf.abs(spectrum))

    # Step 4: total frame energy and filterbank energies; eps keeps the
    # logarithms below finite.
    tiny = np.finfo(float).eps
    frame_energy = tf.reduce_sum(power, axis=2) + tiny
    mel_bank = np.load("filterbanks.npy").T
    batched_bank = np.array([mel_bank] * n_batch, dtype=np.float32)
    mel_energy = tf.matmul(power, batched_bank) + tiny

    # Step 5: log + orthonormal DCT-II, keeping the first 26 coefficients.
    cepstra = tf.spectral.dct(tf.log(mel_energy), type=2, norm='ortho')[:, :, :26]

    # Step 6: sinusoidal liftering with lifter parameter 22.
    _, _, n_coeff = cepstra.get_shape().as_list()
    lifter = 1 + (22 / 2.) * np.sin(np.pi * np.arange(n_coeff) / 22)
    cepstra = lifter * cepstra
    n_frames = cepstra.get_shape().as_list()[1]

    # Step 7: replace coefficient 0 with the log frame energy.
    log_energy = tf.reshape(tf.log(frame_energy), (-1, n_frames, 1))
    return tf.concat((log_energy, cepstra[:, :, 1:]), axis=2)

def pad(mfcc2, mfcc1):
    """
    Zero-pad ``mfcc2`` at the end of axis 1 so that its axis-1 length
    matches that of ``mfcc1``.

    Both arguments must be rank-3 tensors with a statically known axis-1
    size. Axes 0 and 2 are left untouched.

    Returns:
        The padded copy of ``mfcc2``.
    """
    reference_frames = mfcc1.get_shape().as_list()[1]
    current_frames = mfcc2.get_shape().as_list()[1]
    missing_frames = reference_frames - current_frames
    # No padding on the first and last axes; only trailing padding on axis 1.
    return tf.pad(mfcc2, [[0, 0], [0, missing_frames], [0, 0]], "CONSTANT")

def SNR(origanl, current, session=None):
    """
    Signal-to-noise ratio, in dB, of ``current`` relative to ``origanl``.

    Computed as ``20 * log10(||origanl||_2 / ||current - origanl||_2)``.

    Args:
        origanl: the clean signal. Either host data (NumPy array, list,
            tuple, number) or a TensorFlow tensor/variable.
        current: the perturbed signal, same kinds of value as ``origanl``.
        session: optional session used to evaluate tensor arguments. When
            omitted, the module-level ``sess`` is used, which is what the
            previous implementation always did.

    Returns:
        The SNR in decibels, or ``inf`` when the two signals are identical.
    """
    def _evaluate(value):
        # Host-side data needs neither a session nor new graph ops.
        if isinstance(value, (np.ndarray, np.generic, list, tuple, int, float)):
            return np.asarray(value)
        # Fall back to the notebook-global session only when one is needed.
        runner = session if session is not None else sess
        return runner.run(value)

    origanl_ar = _evaluate(origanl)
    # Subtract in NumPy: doing it in TensorFlow added a fresh op to the
    # graph on every call, which grows the graph when called from a loop.
    noise_ar = _evaluate(current) - origanl_ar

    noise_norm = np.linalg.norm(noise_ar, ord=2)
    if noise_norm == 0:
        # No perturbation at all: report an infinite SNR instead of
        # dividing by zero.
        return np.inf
    return 20 * np.log10(np.linalg.norm(origanl_ar, ord=2) / noise_norm)

def new_input(noise, audio_sequence1):
    """
    Add ``noise`` to ``audio_sequence1``, clip the sum to the signed 16-bit
    range [-2**15, 2**15 - 1], and return the evaluated result.

    A temporary ``tf.Session`` is opened for the evaluation and all global
    variables are (re-)initialised inside it before the sum is computed.
    """
    lowest, highest = -2**15, 2**15-1
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        clipped = tf.clip_by_value(noise+audio_sequence1, lowest, highest)
        result = session.run(clipped)
    return result

def psychoacoustic_loss(adv_audio, original, psd_max_a, th):
    """
    Mean amount by which the perturbation's PSD exceeds the threshold ``th``.

    Only the first batch entry of every argument is used. The perturbation
    ``adv_audio[0] - original[0]`` is passed, together with ``psd_max_a[0]``,
    to the module-level ``transform`` callable; whatever lies above
    ``th[0]`` is kept (ReLU) and averaged.

    Returns:
        A scalar tensor.
    """
    perturbation = adv_audio[0, :] - original[0, :]
    perturbation_psd = transform(perturbation, psd_max_a[0])
    # Only the part above the threshold contributes to the penalty.
    excess = tf.nn.relu(perturbation_psd - th[0])
    return tf.reduce_mean(excess)

def truncate(mfcc1, mfcc2):
    """
    Cut ``mfcc2`` down to the axis-1 length of ``mfcc1``.

    The slice starts at the origin and keeps one batch entry, as many
    axis-1 entries as ``mfcc1`` has (static shape), and 26 entries on the
    last axis.

    Returns:
        The sliced ``mfcc2`` tensor.
    """
    reference_frames = mfcc1.get_shape().as_list()[1]
    begin = [0, 0, 0]
    extent = [1, reference_frames, 26]
    return tf.slice(mfcc2, begin, extent)

def loss(original, target, noise, psychoacoustic_loss, alpha):
    """
    Total attack objective: distance between the MFCCs of the perturbed
    audio and of the target audio, plus a weighted psychoacoustic penalty.

    Args:
        original: source audio tensor; axis 1 is the sample axis.
        target: target audio tensor; axis 1 is the sample axis.
        noise: perturbation added to ``original``.
        psychoacoustic_loss: precomputed penalty term (a tensor or number).
        alpha: weight applied to ``psychoacoustic_loss``.

    Returns:
        ``tf.norm(mfcc(original + noise) - mfcc(target), 2)
        + alpha * psychoacoustic_loss``, where the target MFCC is padded
        (when the original has more samples) or truncated (otherwise) to the
        frame count of the perturbed audio's MFCC.
    """
    adversarial_mfcc = compute_mfcc(original + noise)
    original_samples = original.get_shape().as_list()[1]
    target_samples = target.get_shape().as_list()[1]

    target_mfcc = compute_mfcc(target)
    if original_samples > target_samples:
        # Target is shorter: extend it with zero frames.
        target_mfcc = pad(target_mfcc, adversarial_mfcc)
    else:
        # Target is at least as long: drop the surplus frames.
        target_mfcc = truncate(adversarial_mfcc, target_mfcc)

    feature_distance = tf.norm(adversarial_mfcc - target_mfcc, 2)
    return feature_distance + alpha * psychoacoustic_loss
    
def getSign(timestamp, nonce, appkey=None, secret=None):
    """
    Build the request signature: the upper-case hexadecimal SHA-256 digest
    of the UTF-8 encoding of ``appkey + timestamp + secret + nonce``.

    Args:
        timestamp: timestamp string included in the signed payload.
        nonce: nonce string included in the signed payload.
        appkey: application key to sign with. Defaults to the key that was
            previously hard-coded here.
        secret: application secret to sign with. Defaults to the secret
            that was previously hard-coded here.

    Returns:
        A 64-character upper-case hex string.
    """
    # SECURITY(review): these defaults are real credentials committed to the
    # notebook. They are kept only so existing calls keep working; rotate
    # them and pass appkey/secret in from outside the source instead.
    if appkey is None:
        appkey = "zvpcvm5hxib3jz3vt2jshqxjndcywwo2qnx7s6iy"
    if secret is None:
        secret = '3c8a61e2bdb0ee2d0af74814142ba2ee'
    payload = (appkey + timestamp + secret + nonce).encode('utf-8')
    return hashlib.sha256(payload).hexdigest().upper()
def identifyFeatureByGroupId(confirmFeatureFileName):
    """
    Send an audio file to the voiceprint identification endpoint and return
    the two best matches.

    The file is read as bytes, base64-encoded, and POSTed as JSON together
    with a timestamp, a nonce and the signature produced by ``getSign``.

    Args:
        confirmFeatureFileName: path of the audio file to identify. The
            request declares it as 16000 Hz "wav" audio.

    Returns:
        A 4-tuple ``(featureInfo_0, score_0, featureInfo_1, score_1)`` taken
        from the first two entries of the response's ``data`` list.

    Raises:
        KeyError / IndexError: if the response has no ``data`` list with at
            least two entries.
    """
    # Read through a context manager so the file handle is closed right
    # away instead of being left for the garbage collector.
    with open(confirmFeatureFileName, 'rb') as audio_file:
        identify_feature = audio_file.read()
    # Voiceprint audio as a base64 string
    audio_data = base64.b64encode(identify_feature)
    timestamp = str(int(time.time() * 1000))
    nonce = str(uuid.uuid1()).replace('-', '')
    sign = getSign(timestamp, nonce)
    headers = {"Content-Type": "application/json"}
    appkey = "zvpcvm5hxib3jz3vt2jshqxjndcywwo2qnx7s6iy"
    groupId = '12'
    host = 'https://ai-vpr.hivoice.cn'
    identifyFeatureByGroupIdEndPoint = '/vpr/v1/identifyFeatureByGroupId'
    identify_feature_param = {
        "appkey": appkey,
        "timestamp": timestamp,
        "nonce": nonce,
        "sign": sign,
        "groupId": groupId,
        "topN": 6,
        "audioData": audio_data.decode(),
        "audioSampleRate": 16000,
        "audioFormat": "wav"
    }
    identify_feature_resp = requests.post(url=host + identifyFeatureByGroupIdEndPoint,
                                          data=json.dumps(identify_feature_param),
                                          headers=headers)
    identify_feature_result = json.loads(identify_feature_resp.content)
    matches = identify_feature_result['data']
    return matches[0]['featureInfo'], matches[0]['score'], matches[1]['featureInfo'], matches[1]['score']


class Transform(object):
    '''
    Callable that maps a waveform tensor to a normalised power spectral
    density (PSD) computed from its short-time Fourier transform.

    The STFT uses frames of ``window_size`` samples with a hop of a quarter
    of the window.
    '''
    def __init__(self, window_size):
        """Store the window size and derive the STFT frame length and hop."""
        self.window_size = window_size
        self.frame_length = int(window_size)
        self.frame_step = int(window_size // 4)
        # Constant factor applied to the STFT magnitude before squaring.
        self.scale = 8. / 3.

    def __call__(self, x, psd_max_ori):
        """
        Return ``10**9.6 / psd_max_ori * psd(x)``.

        Args:
            x: waveform tensor passed to the STFT.
            psd_max_ori: normalisation value(s); reshaped to (-1, 1, 1) so
                it broadcasts over the last two axes of the PSD.
        """
        stft = tf.contrib.signal.stft(x, self.frame_length, self.frame_step)
        magnitude = self.scale * tf.abs(stft / self.window_size)
        psd = tf.square(magnitude)
        reference_level = tf.pow(10., 9.6)
        psd_max = tf.reshape(psd_max_ori, [-1, 1, 1])
        return reference_level / psd_max * psd

In [2]:

# Generate adversarial examples: for each of 1000 randomly chosen source
# utterances, optimise an additive noise so that the MFCCs of (source + noise)
# approach the MFCCs of a fixed target utterance, then save the result as a
# 16-bit wav file.
path = "/mnt/data/Chenpinji/cmu_dataset"
filename2 = '/mnt/data/Chenpinji/cmu_dataset/cmu7/wav/arctic_a0008.wav'

# The target utterance and the PSD transform are the same for every example,
# so they are prepared once instead of once per iteration.
sample_rate2, audio_sequence2 = wf.read(filename2)
audio2 = np.expand_dims(audio_sequence2,0)
len2 = len(audio2[0])
transform = Transform(2048)  # read as a global by psychoacoustic_loss

for cnt in range(1000):
    # Start every example from an empty graph. Without this, the variables,
    # losses and optimiser state of all previous examples stay in the default
    # graph, and the name filter below also matches their stale "noise"
    # variables.
    tf.reset_default_graph()

    # Pick a random source utterance. The index ranges match the original
    # ones (0..9 speakers, 0..500 files) but are clamped to what is actually
    # on disk so a short directory cannot raise IndexError.
    speakers = os.listdir(path)
    select = random.randint(0, min(9, len(speakers) - 1))
    filename_1 = path + '/' + speakers[select] + '/' + 'wav'
    utterances = os.listdir(filename_1)
    select2 = random.randint(0, min(500, len(utterances) - 1))
    audioname = utterances[select2]
    filename1 = filename_1 + '/'+audioname
    sample_rate1, audio_sequence1 = wf.read(filename1)
    audio1 = np.expand_dims(audio_sequence1,0)
    len1 = len(audio1[0])

    # Masking threshold and PSD maximum of the source audio.
    th, psd_max = generate_mask.generate_th(audio_sequence1.astype(float), 16000, 2048)
    th=np.expand_dims(th,0)
    psd_max=np.expand_dims(psd_max,0)
    psd_max_a = tf.cast(psd_max, tf.float32)

    #stage 1, nearly no constraint on noise size, which aims to make the attack succeed
    # The audio is the variables' initial value, so it is in place before the
    # first optimiser step. (Previously the variables started as zeros and
    # the assign ops only ran as a side effect of the first logging fetch,
    # i.e. after one training step on all-zero audio.)
    original = tf.Variable(audio1.astype(np.float32), trainable=False)
    target = tf.Variable(audio2.astype(np.float32), trainable=False)
    with tf.variable_scope("noise"):
        noise=tf.Variable(tf.random_normal([1,len1],stddev = 50))
    t_vars = tf.trainable_variables()
    noise_vars = [var for var in t_vars if "noise" in var.name]

    # Build every tensor that is fetched later exactly once. Calling loss()
    # or psychoacoustic_loss() inside the training loop would add a new
    # sub-graph at each log.
    adv_audio = original + noise
    psy_loss = psychoacoustic_loss(adv_audio, original, psd_max_a, th)
    total_loss = loss(original, target, noise, psy_loss, 0.00001)
    train_teacher = tf.train.AdamOptimizer(2).minimize(total_loss, var_list=noise_vars)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        select3 = random.randint(500,1000)  # number of optimisation steps
        for i in range(select3):
            sess.run(train_teacher)
            # Report every 100 steps and once more after the final step.
            if i % 100 == 0 or i == select3 - 1:
                a,b,NewInput = sess.run([total_loss, psy_loss, adv_audio])
                print(i)
                print('current loss:',a,'SNR:',SNR(original, NewInput))
                print('current psychoacoustic_loss', b)

        # NewInput now holds the audio after the last step.
        # NOTE(review): the +193 offset presumably continues the numbering of
        # an earlier run - confirm before re-running from scratch.
        audio_name = '/mnt/data/Chenpinji/mfcc_Gen1000/'+'mfccGen_'+str(cnt+193) + '.wav'
        scaled = np.array(np.clip(np.round(NewInput[0]),-2 ** 15, 2 ** 15 - 1),dtype = np.int16)
        wf.write(audio_name, 16000 ,scaled)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

0
current loss: 920.5245 SNR: 37.118709087371826
current psychoacoustic_loss 860.05023
100
current loss: 694.377 SNR: 30.698511600494385
current psychoacoustic_loss 4829.8394
200
current loss: 635.0102 SNR: 26.755526065826416
current psychoacoustic_loss 12508.816
300
current loss: 599.88226 SNR: 24.19147491455078
current psychoacoustic_loss 22863.82
400
current loss: 572.66626 SNR: 22.268283367156982
current psychoacoustic_loss 35686.766
500
current loss: 550.5837 SNR: 20.72648525238037
current psychoacoustic_loss 50686.473
600
current loss: 531.6199 SNR: 19.44007158279419
current psychoacoustic_loss 67237.99
700
current loss: 5

KeyboardInterrupt: 