# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import librosa
import librosa.display
from IPython.display import Audio
import speech_recognition as sr
from tqdm import tqdm
import warnings
from textblob import TextBlob
warnings.filterwarnings('ignore')
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import text2emotion as te
from sklearn.model_selection import train_test_split
import  tensorflow as tf
from transformers import BertTokenizer
import keras
from tensorflow.keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Input
from keras import regularizers
from keras.models import Model, Sequential
import warnings
warnings.simplefilter('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ankus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ankus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ankus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!





## Load the Dataset

In [2]:
# Walk the TESS directory tree and collect every audio file together with its
# emotion label. The label is the last underscore-separated token of the file
# name with the extension stripped and lower-cased
# (e.g. 'OAF_back_angry.wav' -> 'angry').
filePaths = []
labels = []
for dirname, _, files in os.walk('archive/TESS/'):
    for filename in files:
        # Skip stray non-audio files (OS metadata, notes, ...) so they neither
        # turn into bogus labels nor crash the audio loaders later on.
        if not filename.lower().endswith('.wav'):
            continue
        filePaths.append(os.path.join(dirname, filename))
        label = filename.split('_')[-1]
        label = (label.split('.')[0]).lower()
        labels.append(label)
print('Dataset is Loaded')

Dataset is Loaded


# Data Processing

In [3]:
# Number of collected files, followed by the first 15 paths as a sanity check.
print(len(filePaths))
filePaths[:15]

5600


['archive/TESS/OAF_angry\\OAF_back_angry.wav',
 'archive/TESS/OAF_angry\\OAF_bar_angry.wav',
 'archive/TESS/OAF_angry\\OAF_base_angry.wav',
 'archive/TESS/OAF_angry\\OAF_bath_angry.wav',
 'archive/TESS/OAF_angry\\OAF_bean_angry.wav',
 'archive/TESS/OAF_angry\\OAF_beg_angry.wav',
 'archive/TESS/OAF_angry\\OAF_bite_angry.wav',
 'archive/TESS/OAF_angry\\OAF_boat_angry.wav',
 'archive/TESS/OAF_angry\\OAF_bone_angry.wav',
 'archive/TESS/OAF_angry\\OAF_book_angry.wav',
 'archive/TESS/OAF_angry\\OAF_bought_angry.wav',
 'archive/TESS/OAF_angry\\OAF_burn_angry.wav',
 'archive/TESS/OAF_angry\\OAF_cab_angry.wav',
 'archive/TESS/OAF_angry\\OAF_calm_angry.wav',
 'archive/TESS/OAF_angry\\OAF_came_angry.wav']

In [4]:
# First 15 labels; they should line up one-to-one with the paths listed above.
labels[:15]

['angry',
 'angry',
 'angry',
 'angry',
 'angry',
 'angry',
 'angry',
 'angry',
 'angry',
 'angry',
 'angry',
 'angry',
 'angry',
 'angry',
 'angry']

In [5]:
## Build the working frame: one row per clip, holding its path and emotion label
df = pd.DataFrame({'audio_file': filePaths, 'label': labels})
df.head()

Unnamed: 0,audio_file,label
0,archive/TESS/OAF_angry\OAF_back_angry.wav,angry
1,archive/TESS/OAF_angry\OAF_bar_angry.wav,angry
2,archive/TESS/OAF_angry\OAF_base_angry.wav,angry
3,archive/TESS/OAF_angry\OAF_bath_angry.wav,angry
4,archive/TESS/OAF_angry\OAF_bean_angry.wav,angry


In [6]:
# Number of clips per emotion label (checks class balance).
df['label'].value_counts()

label
angry      800
disgust    800
fear       800
happy      800
neutral    800
ps         800
sad        800
Name: count, dtype: int64

In [7]:
# Display the frame; pandas elides the middle rows of a long frame.
df

Unnamed: 0,audio_file,label
0,archive/TESS/OAF_angry\OAF_back_angry.wav,angry
1,archive/TESS/OAF_angry\OAF_bar_angry.wav,angry
2,archive/TESS/OAF_angry\OAF_base_angry.wav,angry
3,archive/TESS/OAF_angry\OAF_bath_angry.wav,angry
4,archive/TESS/OAF_angry\OAF_bean_angry.wav,angry
...,...,...
5595,archive/TESS/YAF_sad\YAF_witch_sad.wav,sad
5596,archive/TESS/YAF_sad\YAF_yearn_sad.wav,sad
5597,archive/TESS/YAF_sad\YAF_yes_sad.wav,sad
5598,archive/TESS/YAF_sad\YAF_young_sad.wav,sad


In [8]:
# Sanity-check the speech-to-text pipeline on the first clip before running it
# over the whole dataset.
r = sr.Recognizer()
with sr.AudioFile(filePaths[0]) as source:
    # record() reads the clip through to the end; listen() is designed for live
    # input and stops at the first pause it detects.
    audio_text = r.record(source)

try:
    text = r.recognize_google(audio_text)
except (sr.UnknownValueError, sr.RequestError) as err:
    # UnknownValueError: no intelligible speech; RequestError: API/network
    # failure. Anything else (including Ctrl-C) is allowed to propagate instead
    # of being hidden by a bare `except`.
    print('Error encountered')
    print(repr(err))
else:
    print('Generating audio transcripts')
    print(text)

Generating audio transcripts
say the word back


In [None]:
# Transcribe every clip with the Google Web Speech API (one network call per
# file, so this cell is slow). Failed recognitions are stored as the integer 0,
# which the following cells use as the "no transcript" sentinel.
audioTexts = []
for file in tqdm(filePaths):
    with sr.AudioFile(file) as source:
        # record() reads the whole file; listen() may cut the clip at a pause.
        audio_text = r.record(source)
    try:
        text = r.recognize_google(audio_text)
    except (sr.UnknownValueError, sr.RequestError):
        # Catch only recognition/API failures. A bare `except` here would also
        # swallow KeyboardInterrupt and make the loop impossible to stop.
        text = 0
    audioTexts.append(text)
print(audioTexts[0:10])

 20%|██        | 1120/5600 [27:41<40:04,  1.86it/s]   

In [None]:
# Store the transcripts next to their clips (0 marks a failed recognition).
df['audioTexts']=audioTexts

In [None]:
# Rows whose transcription failed, i.e. carry the sentinel value 0.
df[df['audioTexts']==0]

In [None]:
# Keep only the clips that were transcribed successfully.
# reset_index(drop=True) returns an independent frame with a gap-free 0..n-1
# index. Without it the filtered frame keeps holes in its index, which breaks
# later positional operations (column-wise concat with a freshly built frame,
# label lookups such as `[0]`).
df = df[df['audioTexts'] != 0].reset_index(drop=True)
df

In [None]:
# Bar chart of the number of remaining clips per emotion label.
sns.countplot(data=df, x='label')

In [None]:
def waveplot(data, sr, emotion):
    """Show the time-domain waveform of one clip.

    data    -- audio samples, forwarded to librosa.display.waveshow
    sr      -- sampling rate of `data` (shadows the speech_recognition alias
               inside this function only)
    emotion -- text used as the figure title
    """
    _, ax = plt.subplots(figsize=(10, 4))
    ax.set_title(emotion, size=15)
    librosa.display.waveshow(data, sr=sr, ax=ax)
    plt.show()
    
def spectogramplot(data, sr, emotion):
    """Show the dB-scaled STFT spectrogram of one clip.

    data    -- audio samples, forwarded to librosa.stft
    sr      -- sampling rate of `data`, used to scale the axes
    emotion -- text used as the figure title
    """
    x = librosa.stft(data)
    # The STFT is complex-valued; take magnitudes before converting to decibels.
    amp_db = librosa.amplitude_to_db(np.abs(x))
    plt.figure(figsize=(10,4))
    plt.title(emotion, size=15)
    librosa.display.specshow(amp_db, sr=sr, x_axis='time', y_axis='hz')
    plt.colorbar()
    # Render explicitly, as waveplot does, so the figure also appears outside
    # the inline backend and does not depend on end-of-cell flushing.
    plt.show()

In [None]:
# Waveform, spectrogram and audio player for the second 'angry' clip in df.
emotion = 'angry'
path = df.loc[df['label'] == emotion, 'audio_file'].to_numpy()[1]
data, samplingrate = librosa.load(path)
waveplot(data, samplingrate, emotion)
spectogramplot(data, samplingrate, emotion)
Audio(path)

In [None]:
# Waveform, spectrogram and audio player for the first 'fear' clip in df.
emotion = 'fear'
path = df.loc[df['label'] == emotion, 'audio_file'].to_numpy()[0]
data, samplingrate = librosa.load(path)
waveplot(data, samplingrate, emotion)
spectogramplot(data, samplingrate, emotion)
Audio(path)

In [None]:
# Waveform, spectrogram and audio player for the first clip labelled 'ps'.
emotion = 'ps'
path = df.loc[df['label'] == emotion, 'audio_file'].to_numpy()[0]
data, samplingrate = librosa.load(path)
waveplot(data, samplingrate, emotion)
spectogramplot(data, samplingrate, emotion)
Audio(path)

In [None]:
# Waveform, spectrogram and audio player for the first 'angry' clip in df.
emotion = 'angry'
path = df.loc[df['label'] == emotion, 'audio_file'].to_numpy()[0]
data, samplingrate = librosa.load(path)
waveplot(data, samplingrate, emotion)
spectogramplot(data, samplingrate, emotion)
Audio(path)

In [None]:
# Waveform, spectrogram and audio player for the first 'disgust' clip in df.
emotion = 'disgust'
path = df.loc[df['label'] == emotion, 'audio_file'].to_numpy()[0]
data, samplingrate = librosa.load(path)
waveplot(data, samplingrate, emotion)
spectogramplot(data, samplingrate, emotion)
Audio(path)

In [None]:
# Waveform, spectrogram and audio player for the first 'fear' clip in df
# (this selects the same clip as the earlier 'fear' cell).
emotion = 'fear'
path = df.loc[df['label'] == emotion, 'audio_file'].to_numpy()[0]
data, samplingrate = librosa.load(path)
waveplot(data, samplingrate, emotion)
spectogramplot(data, samplingrate, emotion)
Audio(path)

In [None]:
# Waveform, spectrogram and audio player for the first 'happy' clip in df.
emotion = 'happy'
path = df.loc[df['label'] == emotion, 'audio_file'].to_numpy()[0]
data, samplingrate = librosa.load(path)
waveplot(data, samplingrate, emotion)
spectogramplot(data, samplingrate, emotion)
Audio(path)

In [None]:
# Waveform, spectrogram and audio player for the first 'neutral' clip in df.
emotion = 'neutral'
path = df.loc[df['label'] == emotion, 'audio_file'].to_numpy()[0]
data, samplingrate = librosa.load(path)
waveplot(data, samplingrate, emotion)
spectogramplot(data, samplingrate, emotion)
Audio(path)

In [None]:
# Print the text2emotion result for every transcript, one line per row.
# NOTE(review): this prints once per clip and the same call is repeated in the
# next cell to build the 'emotion' column.
for text in df['audioTexts']:
    print(te.get_emotion(text))

In [None]:
# Score every transcript with text2emotion and expand the per-row result into
# one column per key.
# presumably get_emotion returns a dict of emotion -> score; verify against the
# text2emotion docs.
df['emotion'] = df['audioTexts'].apply(te.get_emotion)

# Build the expanded frame on df's own index. A default RangeIndex would not
# match a frame that has had rows filtered out, so the column-wise concat would
# pair scores with the wrong clips and introduce NaN rows.
emotion_scores = pd.DataFrame(df['emotion'].tolist(), index=df.index)
df = pd.concat([df, emotion_scores], axis=1)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Remove every row that has a missing value in any column (modifies df in place).
df.dropna(inplace=True)

In [None]:
# Score each transcript with TextBlob and bucket the polarity into
# Positive (> 0), Neutral (== 0) or Negative (everything else).
sentiment_list = []
polarity_score = []
for text in df['audioTexts']:
    polarity = TextBlob(text).sentiment.polarity

    sentiment = (
        "Positive" if polarity > 0
        else "Neutral" if polarity == 0
        else "Negative"
    )

    # Echo the per-row result, then collect it for the new columns.
    print(f"Text: {text}", f" Sentiment: {sentiment} ( Polarity: {polarity})")
    sentiment_list.append(sentiment)
    polarity_score.append(polarity)

In [None]:
# Attach the sentiment category and raw polarity computed in the loop above;
# both lists have one entry per row of df, in row order.
df['Sentiment'] = sentiment_list
df['Sentiment Score'] = polarity_score
df

In [None]:
# Persist the enriched frame to the working directory. index=False is not
# passed, so the row index is written as the first CSV column.
df.to_csv('tess_audio_text_data.csv')

# Feature Extraction Using MFCC

In [None]:
def extract_mfcc(file, n_mfcc=40, duration=3, offset=0.5):
    """Return the time-averaged MFCC vector of one audio file.

    Parameters
    ----------
    file : path of the audio file, passed to librosa.load.
    n_mfcc : number of MFCC coefficients to compute. Defaults to 40; the model
        cells in this notebook hard-code 40 as their input width.
    duration : length of audio to load, passed to librosa.load (default 3).
    offset : start position of the loaded audio, passed to librosa.load
        (default 0.5).

    Returns
    -------
    1-D array with one value per coefficient: the mean of each coefficient
    over all frames.
    """
    # `sr` is local here and shadows the speech_recognition alias only inside
    # this function.
    y, sr = librosa.load(file, duration=duration, offset=offset)
    # Transpose so that averaging over axis 0 collapses the time axis.
    mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc).T, axis=0)
    return mfcc

In [None]:
# Try the extractor on the first remaining clip. Use positional access: rows
# have been filtered out of df, so the index label 0 is not guaranteed to exist.
extract_mfcc(df['audio_file'].iloc[0])

In [None]:
# MFCC feature vector for every clip, one array per row of df.
X_mfcc = df['audio_file'].apply(extract_mfcc)

In [None]:
# Inspect the per-clip feature vectors.
X_mfcc

In [None]:
# Stack the per-clip vectors into a single 2-D array (one row per clip).
X = np.array(list(X_mfcc))
X.shape

In [None]:
## Append a trailing axis of length 1 to the feature array
## (re-running this cell adds another axis each time)
X = X[..., np.newaxis]
X.shape

In [None]:
# One-hot encode the emotion labels. The encoder object `enc` is reused by
# later cells. (OneHotEncoder is already imported in the first cell.)
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
y = enc.fit_transform(df[['label']])

In [None]:
# Convert the encoder output to a dense NumPy array for Keras.
y = y.toarray()

In [None]:
# Expect (number of rows in df, number of distinct labels).
y.shape

# LSTM 

In [None]:
# LSTM classifier: one recurrent layer followed by two dropout-regularised
# dense layers and a 7-way softmax output.
model = Sequential()
model.add(LSTM(256, return_sequences=False, input_shape=(40,1)))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(7, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train the model.
# Keras takes the validation_split fraction from the END of the arrays, before
# any shuffling. The rows are still in directory order (grouped by emotion), so
# an unshuffled split would validate on only the last few classes. Shuffle
# features and labels together first, with a fixed seed for reproducibility.
shuffle_order = np.random.default_rng(42).permutation(len(X))
history = model.fit(X[shuffle_order], y[shuffle_order], validation_split=0.2, epochs=50, batch_size=64)

In [None]:
# Training vs. validation accuracy per epoch for the LSTM model.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
# Derive the x-axis from the recorded history rather than repeating the epoch
# count from the fit() call, so the plot cannot fail on a length mismatch if
# the number of epochs changes.
epochs = list(range(len(acc)))

plt.plot(epochs, acc, label='train accuracy')
plt.plot(epochs, val_acc, label='val accuracy')
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch; reuses `epochs` from the accuracy cell.
loss = history.history['loss']
val_loss = history.history['val_loss']

for curve, curve_label in ((loss, 'train loss'), (val_loss, 'val loss')):
    plt.plot(epochs, curve, label=curve_label)
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()
plt.show()

In [None]:
# Split the data into train and test sets (90:10).
# random_state makes the split reproducible; stratify keeps every emotion
# represented in both parts, which the one-hot encoding of the labels relies on.
X = df['audio_file']
Y = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.10, random_state=42, stratify=Y
)

In [None]:
# Turn the training file paths into an MFCC array with a trailing axis of length 1.
train_features = [extract_mfcc(audio_path) for audio_path in X_train]
X_train = np.expand_dims(np.array(train_features), -1)
X_train.shape

In [None]:
# Turn the test file paths into an MFCC array with a trailing axis of length 1.
test_features = [extract_mfcc(audio_path) for audio_path in X_test]
X_test = np.expand_dims(np.array(test_features), -1)
X_test.shape

In [None]:
# Fit the one-hot encoder on the training labels and encode them as a dense array.
train_label_column = y_train.to_numpy().reshape(-1, 1)
y_train = enc.fit_transform(train_label_column).toarray()
y_train.shape

In [None]:
# Encode the test labels with the encoder already fitted on the training labels.
# Re-fitting on the test set (fit_transform) would derive the categories from
# the test data alone, so a class missing from the test split would shift or
# drop one-hot columns relative to the training targets.
y_test = enc.transform(y_test.to_numpy().reshape(-1, 1)).toarray()
y_test.shape

# AutoEncoder for MFCC Features

In [None]:
# Building the Auto-encoder neural network
# The graph ends in a 7-unit softmax, so as built here it outputs class
# probabilities rather than a reconstruction of its input.

# Building the Input Layer
# Input width is taken from the second dimension of X_test.
# NOTE(review): X_test was given a trailing axis by np.expand_dims in the cell
# that builds it, so the arrays have one more dimension than this Input
# declares - confirm Keras accepts that.
input_layer = Input(shape =(X_test.shape[1], ))

# Building the Encoder network
# Three dense layers narrowing from 512 to 32 units; the first two add an L1
# activity penalty (1e-1 and 1e-5 respectively).
encoded = Dense(512, activation ='relu', activity_regularizer = regularizers.l1(1e-1))(input_layer)
encoded = Dense(64, activation ='relu', activity_regularizer = regularizers.l1(1e-5))(encoded)
encoded = Dense(32, activation ='relu')(encoded)

# Building the Decoder network
# Two further dense layers that keep narrowing (16, then 8 units).
decoded = Dense(16, activation ='relu')(encoded)
decoded = Dense(8, activation ='relu')(decoded)

# Building the Output Layer
# 7 units with softmax activation.
output_layer = Dense(7, activation ='softmax')(decoded)

In [None]:
# Wrap the layer graph in a Model and compile it for multi-class
# classification (categorical cross-entropy, Adam, accuracy metric).
autoencoder = Model(input_layer, output_layer)
autoencoder.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the model for 100 epochs, holding out 10% of the training arrays for
# validation; the returned History object feeds the plots below.
history = autoencoder.fit(X_train, y_train, validation_split=0.1, epochs = 100)

In [None]:
# Evaluate on the held-out test set; the loss and accuracy are kept for the
# comparison table at the end of the notebook.
lossMFCC, accuracyMFCC=autoencoder.evaluate(X_test, y_test)
print("Accuracy: %.2f%%" % (accuracyMFCC * 100.0))

In [None]:
# Training vs. validation accuracy per epoch for the MFCC dense model.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
# Derive the x-axis from the recorded history rather than repeating the epoch
# count from the fit() call, so the plot cannot fail on a length mismatch if
# the number of epochs changes.
epochs = list(range(len(acc)))

plt.plot(epochs, acc, label='train accuracy')
plt.plot(epochs, val_acc, label='val accuracy')
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch; reuses `epochs` from the accuracy cell.
loss = history.history['loss']
val_loss = history.history['val_loss']

for curve, curve_label in ((loss, 'train loss'), (val_loss, 'val loss')):
    plt.plot(epochs, curve, label=curve_label)
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()
plt.show()

# Bert Tokenizer

The BERT tokenizer uses subword-based tokenization: unknown words are split into smaller words or characters so that the model can still derive some meaning from the tokens. Because BERT is trained to predict missing words in text, and because it reads each sentence in both directions rather than in one fixed direction, it does a better job at understanding the meaning of homonyms than previous NLP methodologies.

In [None]:
# Boolean Series: True where a transcript has already appeared in an earlier row.
df['audioTexts'].duplicated()

In [None]:
# Keep only the first row for each distinct transcript.
# NOTE(review): if the same sentence was recorded under several emotions, only
# the first emotion's label survives here - confirm this is intended.
data =df.drop_duplicates(subset=['audioTexts'], keep='first')
data

In [None]:
# Split the de-duplicated transcripts into train and test sets (90:10).
# Labels are one-hot encoded before splitting so both parts share the same
# columns; random_state makes the shuffled split reproducible.
x = data['audioTexts']
y = enc.fit_transform(data['label'].to_numpy().reshape(-1, 1)).toarray()
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.10, shuffle=True, random_state=42
)

In [None]:
# Shapes of the one-hot label arrays for the train and test parts.
y_train.shape, y_test.shape

In [None]:
# Load the tokenizer published with the 'bert-large-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

In [None]:
# Tokenise the training transcripts, map tokens to vocabulary ids and pad or
# truncate every sequence to a fixed length.
max_len = 15
# Truncate each sentence to max_len tokens. The original slice used the number
# of sentences as the cut-off, which silently shortens sequences whenever the
# split holds fewer than max_len texts.
tokenized_texts = [tokenizer.tokenize(com)[:max_len] for com in X_train]
X_train = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=max_len, truncating='post', padding='post')
X_train.shape

In [None]:
# Tokenise the test transcripts, map tokens to vocabulary ids and pad or
# truncate every sequence to a fixed length.
max_len = 15
# Truncate each sentence to max_len tokens. The original slice used the number
# of sentences as the cut-off, which silently shortens sequences whenever the
# split holds fewer than max_len texts (likely for a small test split).
tokenized_texts = [tokenizer.tokenize(com)[:max_len] for com in X_test]
X_test = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=max_len, truncating='post', padding='post')
X_test.shape

# AutoEncoder for Bert Features

In [None]:
# Building the Auto-encoder neural network
# The graph ends in a 7-unit softmax, so as built here it outputs class
# probabilities rather than a reconstruction of its input.

# Building the Input Layer
# Input width is the padded sequence length (second dimension of X_test).
# NOTE(review): the inputs are the integer token ids produced by
# convert_tokens_to_ids, fed straight into Dense layers - confirm that is
# intended.
input_layer = Input(shape =(X_test.shape[1], ))

# Building the Encoder network
# Three dense layers narrowing from 512 to 32 units; the first two add an L1
# activity penalty (1e-1 and 1e-5 respectively).
encoded = Dense(512, activation ='relu', activity_regularizer = regularizers.l1(1e-1))(input_layer)
encoded = Dense(64, activation ='relu', activity_regularizer = regularizers.l1(1e-5))(encoded)
encoded = Dense(32, activation ='relu')(encoded)

# Building the Decoder network
# Two further dense layers that keep narrowing (16, then 8 units).
decoded = Dense(16, activation ='relu')(encoded)
decoded = Dense(8, activation ='relu')(decoded)

# Building the Output Layer
# 7 units with softmax activation.
# NOTE(review): the label array for this model is one-hot encoded from the
# de-duplicated frame; confirm it still has 7 columns.
output_layer = Dense(7, activation ='softmax')(decoded)

In [None]:
# Wrap the layer graph in a Model and compile it for multi-class classification.
# `metrics` is passed as a list, matching the other compile() calls in this
# notebook; newer Keras releases reject a bare string here.
autoencoder = Model(input_layer, output_layer)
autoencoder.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model for 30 epochs, holding out 10% of the training arrays for
# validation; the returned History object feeds the plots below.
history = autoencoder.fit(X_train, y_train, validation_split=0.10, epochs = 30)

In [None]:
# Evaluate on the held-out test set; the loss and accuracy are kept for the
# comparison table at the end of the notebook.
lossBert, accuracyBert =autoencoder.evaluate(X_test, y_test)
print("Accuracy: %.2f%%" % (accuracyBert * 100.0))

In [None]:
# Training vs. validation accuracy per epoch for the BERT-token model.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
# Derive the x-axis from the recorded history rather than repeating the epoch
# count from the fit() call, so the plot cannot fail on a length mismatch if
# the number of epochs changes.
epochs = list(range(len(acc)))

plt.plot(epochs, acc, label='train accuracy')
plt.plot(epochs, val_acc, label='val accuracy')
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch; reuses `epochs` from the accuracy cell.
loss = history.history['loss']
val_loss = history.history['val_loss']

for curve, curve_label in ((loss, 'train loss'), (val_loss, 'val loss')):
    plt.plot(epochs, curve, label=curve_label)
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()
plt.show()

# Comparative Analysis

In [None]:
# Side-by-side test metrics of the two feature pipelines (accuracy in percent).
comp = pd.DataFrame({
    'Models': ['MFCC', 'Bert'],
    'Accuracy': [accuracyMFCC * 100.0, accuracyBert * 100.0],
    'Loss': [lossMFCC, lossBert],
})
comp

In [None]:
# Horizontal bar chart of accuracy and loss per model. Use the 'Models' column
# for the bar labels; by default the bars are labelled with the row index (0, 1).
comp.plot(kind='barh', x='Models')