In [5]:
import numpy as np
import sounddevice as sd
import pyaudio
import math
import matplotlib.pyplot as plt
import random
%matplotlib inline

In [113]:
# Shorthand for pi, used when building the sine lookup table.
PI = np.pi
# Sample dtype for the audio buffers; the PyAudio streams in this notebook are opened as paFloat32.
TYPE=np.float32

## Let's start by implementing the robot voice

In [108]:
def robot_voice(samples, index_offset, prev_val, table):
    """
    Apply the 'robot voice' effect to one chunk of audio.

    The chunk first goes through a first-difference filter
    (y[n] = x[n] - x[n-1]) to remove its DC component, and is then multiplied
    sample by sample with a lookup table that is read cyclically.

    Parameters
    ----------
    samples : 1-D array-like
        The current chunk of raw samples. It is NOT modified.
    index_offset : int
        Position in `table` at which reading starts for this chunk.
    prev_val : scalar
        Last raw sample of the previous chunk (0 for the very first chunk).
    table : 1-D array-like
        Non-empty lookup table (e.g. samples of a sine).

    Returns
    -------
    Must return 3 things :
        - the processed array for robot voice (same dtype as `samples`)
        - the next index to be used for the lookup table
        - the last value still 'unprocessed'

    Raises
    ------
    ValueError
        If `table` is empty.
    """
    x = np.asarray(samples)
    table = np.asarray(table)
    N = len(table)
    if N == 0:
        raise ValueError("table must not be empty")

    # Nothing to process: hand the carried state back unchanged.
    if x.size == 0:
        return x.copy(), int(index_offset % N), prev_val

    last_val = x[-1]  # For the robot_voice() call of the next chunk

    # Remove the DC component. The difference is written to a separate buffer
    # so that every term uses the *raw* neighbouring sample; doing it in place
    # would make the second output depend on the already-filtered first one.
    diff = np.empty_like(x)
    diff[0] = x[0] - prev_val
    diff[1:] = x[1:] - x[:-1]

    # Cyclic read positions in the table for each sample of this chunk.
    positions = (index_offset + np.arange(len(x))) % N
    out = (diff * table[positions]).astype(x.dtype)

    next_index = int((index_offset + len(x)) % N)
    return out, next_index, last_val

In [115]:
# Parameters
rate = 16000 # Hz, sampling rate used for both the input and the output stream
fsin = 600 # Hz, frequency of the sine stored in the lookup table
CHUNK = 2**10 # samples. The higher the value, the larger the delay but the better the quality
LEN = 10 # seconds; change this to adjust the duration of the simulation

In [116]:
def sin_table(fsin, rate):
    """
    Build a lookup table holding samples of sin(2*pi*fsin*n/rate).

    The table is intended to be read cyclically (index modulo its length), so
    it has to span a whole number of sine periods; otherwise the phase jumps
    every time the index wraps around. When `fsin` and `rate` are both whole
    numbers, the shortest such table has rate / gcd(rate, fsin) samples, which
    is what is returned. If rate/fsin is already an integer this is exactly
    one period.

    For non-integer inputs no exact period can be derived this way, so the
    table falls back to a single (approximate) period of ceil(rate/fsin)
    samples.

    Parameters
    ----------
    fsin : number
        Frequency of the sine, in Hz. Must be positive.
    rate : number
        Sampling rate, in Hz. Must be positive.

    Returns
    -------
    1-D float array with the table values.

    Raises
    ------
    ValueError
        If `fsin` or `rate` is not strictly positive.
    """
    if fsin <= 0 or rate <= 0:
        raise ValueError("fsin and rate must be strictly positive")

    if float(fsin).is_integer() and float(rate).is_integer():
        # Smallest number of samples after which the sampled sine repeats exactly.
        n_samples = int(rate) // math.gcd(int(rate), int(fsin))
        x = np.arange(n_samples, dtype=float)
    else:
        x = np.arange(0, rate / fsin)

    return np.sin(x * 2 * np.pi * fsin / rate)

# Build the lookup table once from the configured sine frequency and sampling rate.
SIN_TABLE = sin_table(fsin, rate)

In [117]:
# Real-time robot voice: read CHUNK samples from the input stream, process
# them, and write the result to the output stream, for about LEN seconds.
p = pyaudio.PyAudio()

stream = None  # input stream
player = None  # output stream

try:
    stream = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, input=True, frames_per_buffer=CHUNK)
    player = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, output=True, frames_per_buffer=CHUNK)

    # State carried from one chunk to the next by robot_voice().
    index = 0
    prev_value = 0

    for i in range(int(LEN*rate/CHUNK)):

        # Input. np.frombuffer replaces np.fromstring, whose binary mode is
        # deprecated. The negation returns a new, writable array.
        data = -np.frombuffer(stream.read(CHUNK), dtype=TYPE)

        # Processing
        data, index, prev_value = robot_voice(data, index, prev_value, SIN_TABLE)

        # Output: hand PyAudio the raw float32 bytes explicitly.
        player.write(np.asarray(data, dtype=TYPE).tobytes(), CHUNK)

finally:
    # Release both streams and the PortAudio session even if the loop fails.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    if player is not None:
        player.stop_stream()
        player.close()
    p.terminate()

##  Now we move on to the granular synthesis

In [118]:
def ms2smp(ms, Fs):
    """
    Convert a duration of 'ms' milliseconds into a number of samples at sampling rate Fs.

    Both arguments are converted to float first; the result is truncated to an int.
    """
    sampling_rate = float(Fs)
    duration_ms = float(ms)
    n_samples = sampling_rate * duration_ms / 1000.0
    return int(n_samples)

In [119]:
def win_taper(N, a):
    """
    Build a trapezoidal (tapered) window of length N.

    The window ramps up linearly from 0 over its first R = int(N * a / 2)
    samples, stays at 1 in the middle, and ramps back down to 0 over its
    last R samples.

    OUTPUT :
        - win is an N-long array containing the 'shading values' (ie the trapezoid)
        - stride is the integer N - R - 1, i.e. the hop to use between two
          consecutive grains so that the tail of one overlaps the head of the next
    """
    ramp_len = int(N * a / 2)

    # Rising edge: 0, 1/R, ..., (R-1)/R ; the falling edge is its mirror image.
    rising = np.arange(0, ramp_len) / float(ramp_len)
    plateau = np.ones(N - 2 * ramp_len)
    falling = rising[::-1]

    win = np.concatenate((rising, plateau, falling))
    stride = N - ramp_len - 1
    return win, stride

In [120]:
def double_len(x, G):
    """
    Stretch x to twice its length by repeating every grain of G samples twice.

    The input is cut into consecutive grains of G samples; each grain is
    copied twice, back to back, into the output. Every sample of x is used:
    the last full grain is included, and if len(x) is not a multiple of G the
    shorter final grain is doubled as well, so the output has no trailing
    block of zeros.

    Parameters
    ----------
    x : 1-D array-like
        Input samples.
    G : int
        Grain size in samples. Must be positive.

    Returns
    -------
    Array of length 2 * len(x) with dtype TYPE.

    Raises
    ------
    ValueError
        If G is not strictly positive.
    """
    if G <= 0:
        raise ValueError("grain size G must be a positive number of samples")

    N = len(x)
    y = np.zeros(2 * N, dtype=TYPE)

    # Stop at N (not N - G) so that the last grain is not skipped.
    for n in range(0, N, G):
        grain = x[n:n+G]
        g = len(grain)  # equals G except possibly for the final grain
        y[2*n:2*n+g] = grain
        y[2*n+g:2*n+2*g] = grain
    return y

In [121]:
def subsample(x, t):

    """
    Read array x at the fractional position t using linear interpolation.

    The result blends x[int(t)] and x[int(t) + 1]; the weight of the first
    one is 1 minus the fractional part of t.
    If the second sample lies outside x, only the weighted first sample is
    returned. If the first sample is outside x as well, the result is 0.
    """

    # Integer part of t
    base = int(t)

    # 1 - fractional part of t => coefficient for x[base]
    w_first = 1.0 - (t - base)

    try:
        first = x[base]
    except IndexError:
        # Position entirely outside the array
        return 0

    try:
        second = x[base + 1]
    except IndexError:
        # No right-hand neighbour: keep only the contribution of x[base]
        return w_first * first

    return w_first * first + (1 - w_first) * second

In [122]:
def resample(x, factor):

    """
    Turntable-style pitch shifting: re-read x with a step of 'factor' samples.

    If factor > 1 the signal is read faster, so it gets shorter and its pitch goes up.
    If factor < 1 the signal is read slower, so it gets longer and its pitch goes down.

    The output has floor(len(x) / factor) samples and is returned with dtype TYPE.
    """
    out_len = int(np.floor(len(x) / factor))
    out = np.zeros(out_len)

    # Output sample k is x evaluated at the fractional position k * factor
    read_positions = (float(k) * factor for k in range(0, out_len))
    for k, pos in enumerate(read_positions):
        out[k] = subsample(x, pos)

    return out.astype(TYPE)

In [170]:
def GS_pshift(x, factor, G, overlap=0.2):
    """
    Pitch-shift x by 'factor' with granular synthesis, keeping its length.

    x is processed in overlapping grains. For each grain, int(G * factor + 0.5)
    input samples are resampled by 'factor' to about G samples, multiplied by
    a tapered window of length G, and added into the output at the position
    where the grain started.

    Parameters
    ----------
    x : 1-D array
        Input samples.
    factor : float
        Resampling factor passed to resample().
    G : int
        Grain size in samples.
    overlap : float, optional
        Taper fraction passed to win_taper().

    Returns
    -------
    Array of len(x) samples with dtype TYPE. Positions not reached by any
    grain (the end of the buffer) stay at 0.
    """
    N = len(x)
    y = np.zeros(N, dtype=TYPE)

    # size of input buffer given grain size and resampling factor
    igs = int(G * factor + 0.5)
    win, stride = win_taper(G, overlap)

    # Remember the step must be 'stride' and not len(win) because the next
    # window overlaps the last samples of the previous window.
    for n in range(0, N - max(igs, G), stride):

        w = resample(x[n:n+igs], factor)

        # Rounding in igs and in resample() means w is not always exactly G
        # samples long. Bring it to length G so it can be multiplied by win.
        missing = G - len(w)
        if missing == 1 and len(w) > 0:
            # One sample short: extend with half of the last value.
            w = np.append(w, w[-1]/2.)
        elif missing > 0:
            # Several samples short (can happen for small factors): zero-pad
            # the tail, where the window is close to 0 anyway.
            w = np.concatenate((w, np.zeros(missing, dtype=w.dtype)))
        elif missing < 0:
            # Too long: drop the extra samples.
            w = w[:G]

        y[n:n+G] += w * win

    return y

In [171]:
# Parameters (same values as for the robot voice, repeated so this part can be run on its own)
rate = 16000 # Hz, sampling rate used for both the input and the output stream
fsin = 600 # Hz
CHUNK = 2**10 # samples. The higher the value, the larger the delay but the better the quality
LEN = 10 # seconds; change this to adjust the duration of the simulation

# Grain size in samples passed to GS_pshift: one eighth of a chunk
G = int(CHUNK/8)

In [172]:
# Real-time pitch shifting: read CHUNK samples from the input stream, process
# them with GS_pshift, and write the result to the output stream, for about
# LEN seconds.
p = pyaudio.PyAudio()

stream = None  # input stream
player = None  # output stream

try:
    stream = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, input=True, frames_per_buffer=CHUNK)
    player = p.open(format=pyaudio.paFloat32, channels=1, rate=rate, output=True, frames_per_buffer=CHUNK)

    for i in range(int(LEN*rate/CHUNK)):

        # Input. np.frombuffer replaces np.fromstring, whose binary mode is
        # deprecated. The negation returns a new, writable array.
        data = -np.frombuffer(stream.read(CHUNK), dtype=TYPE)

        # Processing.
        # NOTE : THE RESULT HAS TO BE IN NP.FLOAT32
        # => return result.astype(TYPE) in the processing function

        data = GS_pshift(data, 1.5, G)

        # Output: hand PyAudio the raw float32 bytes explicitly.
        player.write(np.asarray(data, dtype=TYPE).tobytes(), CHUNK)

finally:
    # Release both streams and the PortAudio session even if the loop fails.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    if player is not None:
        player.stop_stream()
        player.close()
    p.terminate()