In [4]:
import pandas as pd
import numpy as np
import random
import re

import os, sys, glob, pickle

import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

import soundfile as sf

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import IPython.display as ipd

import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint
from tqdm import tqdm, tqdm_pandas

import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 


In [20]:
import keras
from keras import regularizers
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Embedding, LSTM
from keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.utils import np_utils, to_categorical
from keras.callbacks import (EarlyStopping, LearningRateScheduler,
                             ModelCheckpoint, TensorBoard, ReduceLROnPlateau)
from keras import losses, models, optimizers
from keras.activations import relu, softmax
from keras.layers import (Convolution2D, GlobalAveragePooling2D, BatchNormalization, Flatten, Dropout,
                          GlobalMaxPool2D, MaxPool2D, concatenate, Activation, Input, Dense)

# sklearn
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Other  
from tqdm import tqdm, tqdm_pandas
import scipy
from scipy.stats import skew
import librosa
import librosa.display
import json
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import seaborn as sns
import glob 
import os
import sys
import IPython.display as ipd  # To play sound in the notebook
import warnings
# ignore warnings 
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [None]:
import pyttsx3 as pt
import speech_recognition as sr
import pywhatkit
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys

import os
import sys
os.system("clear")

In [5]:
# Load the table from 'combined_data.csv' into audio_df and preview five random rows.
audio_df = pd.read_csv('combined_data.csv')
audio_df.sample(5)

Unnamed: 0,Emotion,Path
8222,female_disgust,../SER/CREMA/1061_IWL_DIS_XX.wav
4797,male_fear,../SER/CREMA/1019_TAI_FEA_XX.wav
10010,male_calm,../SER/CREMA/1083_IOM_NEU_XX.wav
10827,female_disgust,../SER/SER-Ravdess-data/Actor_02/03-01-07-01-0...
8022,male_sad,../SER/CREMA/1059_DFA_SAD_XX.wav


In [231]:
# Work on a copy so that rows removed later do not alter audio_df itself.
audio_df_copy = audio_df.copy()

In [271]:
audio_df.Emotion.unique()

array(['male_angry', 'male_happy', 'male_calm', 'male_sad',
       'female_angry', 'female_happy', 'female_calm', 'female_sad'],
      dtype=object)

In [269]:
# Remove every row labelled 'female_disgust'.
# The result is assigned back instead of using drop(..., inplace=True), so the
# cell is idempotent and no other reference to the frame is mutated behind its back.
todrop = audio_df_copy.index[audio_df_copy['Emotion'] == 'female_disgust']
audio_df_copy = audio_df_copy.drop(index=todrop)

In [11]:
def prepare_data(df, n, mfcc, aug=0):
    """Load every audio file listed in ``df.Path`` and build a 4-D feature array.

    Each clip is loaded at 44.1 kHz (2.5 s, starting 0.5 s into the file),
    cropped or zero-padded at a random offset to exactly 2.5 s, and converted
    to either MFCCs or a dB-scaled mel spectrogram.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``Path`` column holding audio file paths.
    n : int
        Number of feature bands per frame (MFCC coefficients or mel bands).
    mfcc : int
        ``1`` selects MFCC features; any other value selects the mel spectrogram.
    aug : int, optional
        Accepted because the notebook's callers pass ``aug=0``; it is not used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(df), n, 216, 1)``, one slice per row of ``df``.

    Notes
    -----
    The random offsets come from ``np.random`` and are not seeded here, so
    padded clips differ between runs unless the caller seeds NumPy first.
    """
    sampling_rate = 44100
    audio_duration = 2.5
    # Frame count the output array is sized for (2.5 s at 44.1 kHz).
    # NOTE(review): relies on librosa's default hop length - confirm if that changes.
    n_frames = 216

    # Must be an int: it is used as a slice bound and a pad width below.
    input_length = int(sampling_rate * audio_duration)
    X = np.empty(shape=(df.shape[0], n, n_frames, 1))

    for cnt, file_path in enumerate(tqdm(df.Path)):
        data, _ = librosa.load(file_path, sr=sampling_rate,
                               res_type="kaiser_fast",
                               duration=audio_duration,
                               offset=0.5)

        # Random offset / padding
        if len(data) > input_length:
            max_offset = len(data) - input_length
            offset = np.random.randint(max_offset)
            data = data[offset:(input_length + offset)]
        else:
            if input_length > len(data):
                max_offset = input_length - len(data)
                offset = np.random.randint(max_offset)
            else:
                offset = 0
            data = np.pad(data, (offset, input_length - len(data) - offset), "constant")

        # Which feature? Both use the `n` argument, not notebook-level globals.
        if mfcc == 1:
            # MFCC extraction (audio passed by keyword, as recent librosa requires)
            features = librosa.feature.mfcc(y=data, sr=sampling_rate, n_mfcc=n)
        else:
            # Log-mel spectrogram. The mel filter bank is built for the real
            # sampling rate, and since melspectrogram returns a power
            # spectrogram by default, power_to_db is the matching conversion.
            melspec = librosa.feature.melspectrogram(y=data, sr=sampling_rate, n_mels=n)
            features = librosa.power_to_db(melspec)

        X[cnt,] = np.expand_dims(features, axis=-1)

    return X


In [272]:
# Extract 30-band MFCC features for every row of audio_df_copy.
# Only the arguments prepare_data requires (df, n, mfcc) are passed.
n_mfcc = 30
mfcc = prepare_data(audio_df_copy, n=n_mfcc, mfcc=1)

100%|██████████| 7664/7664 [11:29<00:00, 11.12it/s]


In [273]:
mfcc.shape

(7664, 30, 216, 1)

In [274]:
lb = LabelEncoder()

In [276]:
#  Split between train and test
X_train, X_test, y_train, y_test = train_test_split(mfcc
                                                    , audio_df_copy.Emotion
                                                    , test_size=0.25
                                                    , shuffle=True
                                                    , random_state=42
                                                   )


# One-hot encode the target.
# The encoder is fitted on the training labels only and that same mapping is
# applied to the test labels. Re-fitting on y_test could assign different
# integer ids if the test split misses a class, and it also changes which
# mapping `lb.inverse_transform` uses later on.
lb = LabelEncoder()
y_train = to_categorical(lb.fit_transform(y_train))
# num_classes pins the one-hot width to the training classes even if the
# highest-numbered class is absent from the test split.
y_test = to_categorical(lb.transform(y_test), num_classes=len(lb.classes_))


In [277]:
y_test.shape

(1916, 8)

In [339]:

# Normalization as per the standard NN process
def get_2d_conv_model(n, nclass=8):
    ''' Create a standard deep 2D convolutional neural network.

    Architecture: four identical blocks of
    Conv2D(64, kernel (2,10), same padding) -> BatchNorm -> ReLU -> MaxPool2D -> Dropout(0.2),
    followed by Flatten -> Dense(64) -> Dropout -> BatchNorm -> ReLU -> Dropout
    and a softmax output layer.

    Parameters
    ----------
    n : int
        Number of feature bands (height of the input); the input shape is (n, 216, 1).
        NOTE(review): with four default-sized poolings the height is halved four
        times, so n presumably needs to be at least 16 - confirm before using small n.
    nclass : int, optional
        Number of output classes (default 8, the value previously hard-coded).

    Returns
    -------
    A compiled Keras model using Adam (learning rate 0.001),
    categorical cross-entropy loss and the 'acc' metric.
    '''
    inp = Input(shape=(n, 216, 1))  # 2D matrix of n feature bands by 216 frames.

    # Four identical convolutional blocks; layers are created in the same
    # order as when the blocks were written out one by one.
    x = inp
    for _ in range(4):
        x = Convolution2D(64, (2, 10), padding="same")(x)
        x = BatchNormalization()(x)
        x = Activation("relu")(x)
        x = MaxPool2D()(x)
        x = Dropout(rate=0.2)(x)

    # Dense head
    x = Flatten()(x)
    x = Dense(64)(x)
    x = Dropout(rate=0.2)(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = Dropout(rate=0.2)(x)

    out = Dense(nclass, activation=softmax)(x)
    model = models.Model(inputs=inp, outputs=out)

    opt = optimizers.Adam(0.001)
    model.compile(optimizer=opt, loss=losses.categorical_crossentropy, metrics=['acc'])
    return model

# Build the CNN for n_mfcc input bands and train it for 150 epochs (batch size 32).
# NOTE(review): X_test/y_test are passed as validation data and the same arrays
# are used for prediction afterwards; there is no separate validation split.
# No callbacks (early stopping, checkpointing) are supplied to fit().
model = get_2d_conv_model(n=n_mfcc)
model_history = model.fit(X_train, y_train, validation_data=(X_test, y_test), 
                    batch_size=32, verbose = 2, epochs=150)

Epoch 1/150
180/180 - 153s - loss: 1.8103 - acc: 0.3173 - val_loss: 1.7406 - val_acc: 0.3706 - 153s/epoch - 847ms/step
Epoch 2/150
180/180 - 155s - loss: 1.3391 - acc: 0.4843 - val_loss: 1.7861 - val_acc: 0.3408 - 155s/epoch - 862ms/step
Epoch 3/150
180/180 - 150s - loss: 1.1262 - acc: 0.5743 - val_loss: 1.4492 - val_acc: 0.4802 - 150s/epoch - 836ms/step
Epoch 4/150
180/180 - 150s - loss: 1.0141 - acc: 0.6103 - val_loss: 1.3281 - val_acc: 0.5678 - 150s/epoch - 835ms/step
Epoch 5/150
180/180 - 150s - loss: 0.9349 - acc: 0.6491 - val_loss: 0.7946 - val_acc: 0.6931 - 150s/epoch - 835ms/step
Epoch 6/150
180/180 - 150s - loss: 0.8881 - acc: 0.6548 - val_loss: 0.8431 - val_acc: 0.6748 - 150s/epoch - 834ms/step
Epoch 7/150
180/180 - 149s - loss: 0.8453 - acc: 0.6815 - val_loss: 0.9695 - val_acc: 0.6096 - 149s/epoch - 830ms/step
Epoch 8/150
180/180 - 151s - loss: 0.8163 - acc: 0.6872 - val_loss: 0.7266 - val_acc: 0.7135 - 151s/epoch - 836ms/step
Epoch 9/150
180/180 - 151s - loss: 0.7750 - acc:

In [341]:
# Class probabilities for the test set, collapsed to the most likely class index per sample.
preds = model.predict(X_test, batch_size=16, verbose=1).argmax(axis=1)
preds



array([7, 0, 5, ..., 4, 7, 0], dtype=int64)

In [342]:
# Predicted labels: integer class ids mapped back to emotion strings
preds = pd.DataFrame(
    {'predictedvalues': lb.inverse_transform(preds.astype(int).flatten())}
)

# Actual labels: undo the one-hot encoding, then map ids back to strings
actual = pd.DataFrame(
    {'actualvalues': lb.inverse_transform(y_test.argmax(axis=1).astype(int).flatten())}
)

# Combine both into a single dataframe (joined on the default index)
finaldf = actual.join(preds)

In [343]:
finaldf.sample(10)

Unnamed: 0,actualvalues,predictedvalues
1294,female_angry,female_angry
1024,female_angry,female_angry
994,female_sad,female_sad
63,female_happy,female_happy
1892,female_sad,female_sad
130,male_angry,male_angry
707,female_sad,female_sad
1498,female_sad,female_sad
1742,male_calm,male_calm
310,male_calm,male_calm


In [344]:
# print("-------COMMANDS-------\nplay music (This automatically plays music base on your mood)\nplay song_name (self choosen music or movie) \n\n--control playing song as-----\npause\nresume\nstart again")
# print("to repeat song:REPLAY")


Testing code section

In [345]:
n_mfcc = 30

In [346]:
nwpath = ['../SER/output10.wav']

In [347]:
#deleting  recorded file
# os.remove('../SER/testing.wav')

In [372]:
newinput = pd.DataFrame({'Path': ['../SER/output16.wav']})
newinput

Unnamed: 0,Path
0,../SER/output16.wav


In [373]:
# Extract MFCC features for the single recording listed in `newinput`.
# Only the arguments prepare_data requires (df, n, mfcc) are passed.
n_mfcc = 30
mfcc2 = prepare_data(newinput, n=n_mfcc, mfcc=1)

100%|██████████| 1/1 [00:00<00:00, 17.85it/s]


In [374]:
# Predict on the new recording and keep the index of the most likely class.
preds2 = model.predict(mfcc2, batch_size=16, verbose=1).argmax(axis=1)
preds2



array([3], dtype=int64)

In [375]:
# Map the predicted class id back to its label string and print it.
final = lb.inverse_transform(preds2.astype(int).flatten())
print(final)

['female_sad']


1

In [176]:
# Module-level recognizer instance; getaudio() uses it to listen and transcribe.
recognizer = sr.Recognizer()

In [205]:
def getaudio():
    """Record one utterance from ``sr.Microphone()`` and transcribe it.

    Uses the module-level ``recognizer``. The ambient-noise adjustment runs
    before listening, and the captured audio is sent to
    ``recognize_google`` with ``language="en"``.

    Returns
    -------
    str or None
        The lower-cased transcript, or ``None`` when the speech could not be
        understood or the recognition request failed.
    """
    with sr.Microphone() as source:
        recognizer.adjust_for_ambient_noise(source)
        print('listei')
        audio2 = recognizer.listen(source)

    try:
        s = recognizer.recognize_google(audio2, language="en").lower()
    except (sr.UnknownValueError, sr.RequestError):
        # Only the recognizer's own failure modes mean "no result"; anything
        # else (e.g. KeyboardInterrupt, programming errors) is allowed to surface.
        print('rtr')
        return None

    print(s)
    return s
        

In [178]:
def Speak(text):
    """Speak ``text`` aloud through a newly initialised pyttsx3 engine.

    The second installed voice is selected when one exists; otherwise the
    engine's default voice is kept instead of failing with an IndexError.
    Blocks until the utterance has finished (``runAndWait``).
    """
    # Initialize the engine
    engine = pt.init()
    voices = engine.getProperty('voices')
    if len(voices) > 1:
        engine.setProperty('voice', voices[1].id)
    engine.say(text)
    engine.runAndWait()

In [211]:
Speak('kim')

In [227]:
commands = 'play'

In [229]:
# if commands == 'turn on music':
#     newinput = pd.DataFrame({'Path': '../SER/femi.wav'})    
#     mfcc2 = prepare_data(newinput, n = n_mfcc, aug = 0, mfcc = 1)
#     preds2 = model.predict(mfcc2, batch_size=16, verbose=1)
#     preds2=preds2.argmax(axis=1)
#     final = preds2.astype(int).flatten()
#     final = (lb.inverse_transform((final)))
#     print(final)
 



j
