# Speech Text Emotion Recognition

## Text To Features

In [1]:
# Basic imports
import os
import sys
import glob
import pickle
import warnings
import re
from dotenv import load_dotenv

# Data manipulation and numerical processing libraries
import numpy as np
import pandas as pd

# Audio processing libraries
import librosa
import librosa.display
import soundfile as sf
import opensmile

# Visualization library
import matplotlib.pyplot as plt

# Machine learning and evaluation tools
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Progress bar for loops
from tqdm.notebook import tqdm

# Ignore warnings for cleaner output
warnings.filterwarnings("ignore")

# Custom feature extraction modules from psychai (assuming psychai is installed in your environment)
import psychai.feature.feature_extraction.feature_retriever
import psychai.feature.feature_extraction.feature_processor
import psychai.speech_acoustics.feature_extraction.feature_extraction

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

import jieba
import string
import pandas as pd
import os, sys
import psychai.data_visualization.chart
import re

# Use the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Feature processor provided by psychai.
feature_processor = psychai.feature.feature_extraction.feature_processor.FeatureProcessor()

# Absolute path of the parent directory ('../'). Despite its name, `current_dir` points one level
# above the notebook's working directory.
current_dir = os.path.abspath('../')

# Append that parent directory to sys.path so that modules located there can be imported
# (presumably the local_* helper modules imported right below - verify).
sys.path.append(current_dir)

import local_utilities
import local_data_loader
from datetime import datetime
# Set base directories and file paths

# Load the storage locations from the .env file; override=True lets values from .env replace
# variables that are already set in the process environment.
load_dotenv(override=True)
huggingface_cache_location = os.getenv("huggingface_cache_location")
datasets_cache_location = os.getenv("datasets_cache_location")
resutls_location = os.getenv("results_location")  # (sic) variable name kept as spelled in the notebook
resource_path= os.getenv("resources_location")

# Every path below is built from the results directory. Without this check a missing variable
# only surfaces as an opaque TypeError inside os.path.join.
if resutls_location is None:
    raise EnvironmentError(
        "Environment variable 'results_location' is not set; define it in the .env file "
        "before running this notebook."
    )

# Date stamp (YYYYMMDD) embedded in the names of the files written by this notebook.
today = datetime.now().strftime('%Y%m%d')

csv_path = os.path.join(resutls_location, "examples", "paper5_mmer_moral","speech_text","csv")
# Input: a previously saved, date-stamped result file.
record_path =  os.path.join(csv_path,"result_20241107.csv")
# Outputs written by the cells further down.
save_path_speech_text =  os.path.join(csv_path,f"result_speech_text_{today}.csv")
save_path_speech_sentiment =  os.path.join(csv_path,f"result_speech_sentiment_{today}.csv")
save_path_speech_sentiment_concatenated =  os.path.join(csv_path,f"result_speech_sentiment_concatenated_{today}.csv")

data_helper = local_utilities.DataHelper()


In [2]:
def preprocess_text(text):
    """Segment Chinese text with jieba and return the tokens joined by single spaces.

    Tokens that are empty, whitespace-only, or made up solely of punctuation are dropped.
    Punctuation means any character in ``string.punctuation`` or in a Unicode punctuation
    category, so full-width marks such as "，", "。" and "？" are removed as well as ASCII ones.

    Args:
        text: The string to tokenize.

    Returns:
        A space-separated string of the remaining tokens ('' when nothing remains).
    """
    import unicodedata  # local import: keeps this cell self-contained

    def _is_punct(ch):
        # ASCII symbols listed in string.punctuation, plus every Unicode "P*" category character.
        return ch in string.punctuation or unicodedata.category(ch).startswith("P")

    def _is_noise(token):
        stripped = token.strip()
        # An empty/whitespace token carries no content; all() over the stripped token catches
        # multi-character punctuation runs, not just single marks.
        return not stripped or all(_is_punct(ch) for ch in stripped)

    # Tokenize the Chinese text using jieba
    tokens = jieba.lcut(text)

    return ' '.join(token for token in tokens if not _is_noise(token))


In [3]:
# Read the previously saved records, derive 'Processed_Text' from the raw 'text' column with the
# project helper, and renumber the rows 0..n-1.
df_read = pd.read_csv(record_path)
df_read = df_read.assign(
    Processed_Text=df_read["text"].apply(data_helper.clean_llm_returned_text)
).reset_index(drop=True)
df_read

Unnamed: 0.1,Unnamed: 0,user_id,modality,attribute_index,attribute_value,files,file_size,part_0,part_1,part_2,part_3,part_4,part_5,Group,text,Processed_Text
0,0,1,Segment_Audio,2,25,G:\Experiments\Moral Elevation\Disk1_2_combine...,13439054,PS-9,1,25,2022-05-21,125517,1.wav,2,(array([['<|startoftranscript|><|zh|><|transcr...,印象最強烈的部分應該是
1,1,1,Segment_Audio,2,26,G:\Experiments\Moral Elevation\Disk1_2_combine...,5763150,PS-9,1,26,2022-05-21,125627,1.wav,2,(array([['<|startoftranscript|><|zh|><|transcr...,好像有一點點起雞皮疙瘩其實他好像沒有特別明顯的反應
2,2,1,Segment_Audio,2,27,G:\Experiments\Moral Elevation\Disk1_2_combine...,12095566,PS-9,1,27,2022-05-21,125657,1.wav,2,(array([['<|startoftranscript|><|zh|><|transcr...,我的看法是感觉人都是有三个两面的然后可能因为一些因素他可能会产生一些异化就是比如说一些
3,3,1,Segment_Audio,2,28,G:\Experiments\Moral Elevation\Disk1_2_combine...,9789518,PS-9,1,28,2022-05-21,125800,1.wav,2,(array([['<|startoftranscript|><|zh|><|transcr...,我覺得肯定會是想要成為一個更好的人就是要做好自己的事情完成好自己的一些工作從小事做起然後
4,4,2,Segment_Audio,2,25,G:\Experiments\Moral Elevation\Disk1_2_combine...,9404494,PS-9,2,25,2022-05-21,124843,1.wav,1,(array([['<|startoftranscript|><|zh|><|transcr...,我感到印象最強烈的和情緒起伏最大的一點是當那個哥哥他被對手打趴下的時候然後那個小女孩她的哭聲以及
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,399,115,Segment_Audio,2,28,G:\Experiments\Moral Elevation\Disk1_2_combine...,7295054,PS-9,115,28,2022-06-25,100134,1.wav,1,(array([['<|startoftranscript|><|zh|><|transcr...,我想要成为一个更好的人想做的事就是好好陪家人吧陪家人在自己有时间有精力的时候陪他们去做一些事...
400,400,117,Segment_Audio,2,25,G:\Experiments\Moral Elevation\Disk1_2_combine...,7487566,PS-9,117,25,2022-06-25,193725,1.wav,2,(array([['<|startoftranscript|><|zh|><|transcr...,和幼幼回到一起强烈的印象强烈的印象他回到家以后吧走向他妈妈的时候
401,401,117,Segment_Audio,2,26,G:\Experiments\Moral Elevation\Disk1_2_combine...,3838030,PS-9,117,26,2022-06-25,193804,1.wav,2,(array([['<|startoftranscript|><|zh|><|transcr...,没有喉咙感觉有一点起鸡皮疙瘩也有可能是因为冷
402,402,117,Segment_Audio,2,27,G:\Experiments\Moral Elevation\Disk1_2_combine...,12095566,PS-9,117,27,2022-06-25,193824,1.wav,2,(array([['<|startoftranscript|><|nn|><|transcr...,"不是所有人都是善良的。嗯,也……咋说呢?"


In [4]:
# Keep only characters in the range U+4E00-U+9FFF; everything else is deleted
def remove_non_chinese(text):
    """Return `text` with every character outside U+4E00-U+9FFF removed."""
    non_chinese = re.compile(r'[^\u4e00-\u9fff]')
    return non_chinese.sub('', text)

# Reduce every cleaned transcript in 'Processed_Text' to its Chinese characters only
df_read['Processed_Text'] = df_read['Processed_Text'].apply(remove_non_chinese)

# Specific Chinese substrings to delete from 'Processed_Text'.
# The transcripts above no longer contain punctuation, so each fragment is passed through the
# same filter; otherwise a fragment that contains punctuation (the "," in the last entry)
# could never match.
substrings_to_remove = [
    remove_non_chinese(fragment)
    for fragment in (
        "看完短片之后你是否感到这世上的人们实在是很好",
        "看完这篇短你是否想要成为一个更好的人具体描述",
        "问健一让我回答我给我的最强烈的印象和情绪感受",
        "口头影片问卷二,我有一些感到一些生理反应",
    )
]

# Delete each listed substring from a text, one after another, then trim the ends
def remove_substrings(text, substrings):
    """Remove every occurrence of each entry of `substrings` from `text`.

    The entries are processed in the given order, each on the result of the previous removal.
    Leading and trailing whitespace is stripped from the final result.
    """
    cleaned = text
    for unwanted in substrings:
        if unwanted in cleaned:
            cleaned = cleaned.replace(unwanted, "")
    return cleaned.strip()

# Two passes over 'Processed_Text': first delete the unwanted substrings from every row,
# then tokenize the remaining text with preprocess_text.
df_read['Processed_Text'] = (
    df_read['Processed_Text']
    .apply(remove_substrings, args=(substrings_to_remove,))
    .apply(preprocess_text)
)


Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.834 seconds.
Prefix dict has been built successfully.


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with library defaults.
# NOTE(review): the default token_pattern only keeps tokens of two or more word characters, so
# single-character jieba tokens never reach the vocabulary - confirm that this is intended.
tfidf = TfidfVectorizer()

# Learn the vocabulary from the space-separated transcripts and compute the TF-IDF matrix.
fitted_tfidf = tfidf.fit_transform(df_read['Processed_Text'])

# Dense DataFrame view: one row per transcript, one column per vocabulary term.
tfidf_df = pd.DataFrame(
    data=fitted_tfidf.toarray(),
    columns=tfidf.get_feature_names_out(),
)

tfidf_df

Unnamed: 0,一下,一两个,一个,一些,一以,一件,一会儿,一個,一切,一切正常,...,點剛,點困,點在,點悶,點梗嚴,點熱,鼓励,鼓舞,鼻子,鼻涕
0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.00000,0.476231,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.00000,0.163963,0.0,0.0,0.0,0.226818,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.0,0.0,0.12135,0.159460,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
400,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
401,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
402,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### Show only the non-zero TF-IDF entries of one example row

In [6]:
# Take one example row of tfidf_df; the list indexer keeps the result a one-row DataFrame.
# Note: iloc position 1 is the second row (index label 1), not row 0.
first_row_df = tfidf_df.iloc[[1]]

# Keep only the columns (terms) whose weight in that row is non-zero.
has_weight = first_row_df.ne(0).any(axis=0)
first_row_df_non_zero = first_row_df.loc[:, has_weight]

# Print the remaining terms and their weights.
print(first_row_df_non_zero)

        一點點       其實        反應        好像        明顯       沒有        特別  \
1  0.336055  0.29661  0.287838  0.481587  0.336055  0.29661  0.359129   

        疙瘩       起雞皮  
1  0.26754  0.287838  


In [7]:
# Place the TF-IDF feature columns to the right of the df_read columns. df_read gets a fresh
# 0..n-1 index first, so rows line up with tfidf_df.
df_read_with_tfidf = pd.concat(
    [df_read.reset_index(drop=True), tfidf_df],
    axis="columns",
)

In [8]:
# Build the export column order: Group, user_id, attribute_value, then every TF-IDF feature.
# The TF-IDF columns were concatenated to the right of the df_read columns, so they are the last
# tfidf_df.shape[1] columns of df_read_with_tfidf. Computing the boundary avoids a hard-coded
# position that silently breaks when df_read gains or loses a column.
n_feature_columns = tfidf_df.shape[1]
n_all_columns = df_read_with_tfidf.shape[1]
selected_columns = df_read_with_tfidf.columns[n_all_columns - n_feature_columns:]

# Each insert(0, ...) prepends, so the identifiers are listed in reverse of their final order.
for id_column in ("attribute_value", "user_id", "Group"):
    selected_columns = selected_columns.insert(0, id_column)

# Selecting by name raises a KeyError if one of the identifier columns is missing.
df_trimmed_speech_text = df_read_with_tfidf[selected_columns]
# The DataFrame index is written as the first CSV column (to_csv default).
df_trimmed_speech_text.to_csv(save_path_speech_text)
df_trimmed_speech_text

Unnamed: 0,Group,user_id,attribute_value,一下,一两个,一个,一些,一以,一件,一会儿,...,點剛,點困,點在,點悶,點梗嚴,點熱,鼓励,鼓舞,鼻子,鼻涕
0,2,1,25,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,26,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,1,27,0.0,0.0,0.00000,0.476231,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,1,28,0.0,0.0,0.00000,0.163963,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,2,25,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,1,115,28,0.0,0.0,0.12135,0.159460,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
400,2,117,25,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
401,2,117,26,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
402,2,117,27,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Text To Sentiment

In [9]:
# Model identifier passed verbatim to from_pretrained.
model_name = "SchuylerH/bert-multilingual-go-emtions"

# Tokenizer and sequence-classification model share the same cache directory.
hf_load_kwargs = {"cache_dir": huggingface_cache_location}
tokenizer_go_emotions = AutoTokenizer.from_pretrained(model_name, **hf_load_kwargs)
model_go_emotions = AutoModelForSequenceClassification.from_pretrained(model_name, **hf_load_kwargs)

In [10]:
# Classification pipeline around the loaded model and tokenizer, placed on `device`.
# return_all_scores=True makes each prediction carry a score for every emotion label
# rather than only the best one.
classifier_go_emotions = pipeline(
    task="sentiment-analysis",
    model=model_go_emotions,
    tokenizer=tokenizer_go_emotions,
    device=device,
    return_all_scores=True,
)

In [11]:
df_read_sentiment = df_read.copy()

# Quick check on a single input: classify only the first processed transcript and show the
# score returned for every emotion label.
first_transcript = df_read_sentiment['Processed_Text'].tolist()[0]
results = classifier_go_emotions(first_transcript)
results

[[{'label': 'admiration', 'score': 0.9715808033943176},
  {'label': 'amusement', 'score': 0.00048514516674913466},
  {'label': 'anger', 'score': 0.0008834972977638245},
  {'label': 'annoyance', 'score': 0.0016697858227416873},
  {'label': 'approval', 'score': 0.10835801064968109},
  {'label': 'caring', 'score': 0.001295395428314805},
  {'label': 'confusion', 'score': 0.0010936330072581768},
  {'label': 'curiosity', 'score': 0.0005050079780630767},
  {'label': 'desire', 'score': 0.0006678457139059901},
  {'label': 'disappointment', 'score': 0.002155864145606756},
  {'label': 'disapproval', 'score': 0.001520999358035624},
  {'label': 'disgust', 'score': 0.0006233627791516483},
  {'label': 'embarrassment', 'score': 0.0002489907492417842},
  {'label': 'excitement', 'score': 0.0015691425651311874},
  {'label': 'fear', 'score': 0.0004821899929083884},
  {'label': 'gratitude', 'score': 0.0010229941690340638},
  {'label': 'grief', 'score': 0.0002833385078702122},
  {'label': 'joy', 'score': 0.

In [12]:
import pandas as pd

df_read_sentiment = df_read.copy()

# Score every processed transcript in a single pipeline call. With return_all_scores=True each
# result is a list of {'label': ..., 'score': ...} dicts, one per emotion.
results = classifier_go_emotions(list(df_read_sentiment['Processed_Text']), return_all_scores=True)

# Emotion labels, taken from the first result (labels are assumed to be the same for every text)
emotion_labels = [emotion['label'] for emotion in results[0]]

# Build the whole score table at once instead of writing one cell at a time. Rows follow the
# order of `results`, which is the row order of df_read_sentiment. A label missing from a
# result keeps the 0.0 default.
emotion_scores = pd.DataFrame(
    [{emotion['label']: emotion['score'] for emotion in result} for result in results],
    columns=emotion_labels,
).fillna(0.0)

# Assign by position (to_numpy) so the outcome does not depend on df_read_sentiment having
# a 0..n-1 index.
for label in emotion_labels:
    df_read_sentiment[label] = emotion_scores[label].to_numpy()

# Function to determine the dominant emotion based on conditions
def relabel_emotion(row, labels=None, min_score=0.6, min_words=10):
    """Return the highest-scoring emotion label of a row, or "na" when it is not reliable.

    Args:
        row: A pandas Series holding 'Processed_Text' (space-separated tokens) and one score
            entry per emotion label.
        labels: Emotion label names to compare. Defaults to the notebook-level
            `emotion_labels` list.
        min_score: The top score must reach this value, otherwise "na" is returned.
        min_words: 'Processed_Text' must contain at least this many whitespace-separated
            tokens, otherwise "na" is returned.

    Returns:
        The label with the highest score, or the string "na".
    """
    if labels is None:
        labels = emotion_labels

    text = row['Processed_Text']
    # A missing (non-string) text counts as zero words instead of raising on .split().
    word_count = len(text.split()) if isinstance(text, str) else 0

    # A row taken from a mixed-type DataFrame has object dtype; convert the scores to numbers
    # so that idxmax does not depend on object-dtype support.
    scores = pd.to_numeric(row[list(labels)], errors='coerce')

    if word_count < min_words or scores.isna().all():
        return "na"

    max_label = scores.idxmax()  # Find the emotion with the highest score
    if scores[max_label] < min_score:
        return "na"
    return max_label

# Compute the relabelled emotion for each row (axis=1 passes one row at a time) and store it
# in a new 'emotion_relabeled' column.
relabeled = df_read_sentiment.apply(relabel_emotion, axis=1)
df_read_sentiment['emotion_relabeled'] = relabeled


# Print the updated DataFrame
print(df_read_sentiment)


     Unnamed: 0  user_id       modality  attribute_index  attribute_value  \
0             0        1  Segment_Audio                2               25   
1             1        1  Segment_Audio                2               26   
2             2        1  Segment_Audio                2               27   
3             3        1  Segment_Audio                2               28   
4             4        2  Segment_Audio                2               25   
..          ...      ...            ...              ...              ...   
399         399      115  Segment_Audio                2               28   
400         400      117  Segment_Audio                2               25   
401         401      117  Segment_Audio                2               26   
402         402      117  Segment_Audio                2               27   
403         403      117  Segment_Audio                2               28   

                                                 files  file_size part_0  \

In [13]:
# Save the df_read_sentiment table to CSV (the DataFrame index is written as the first
# column by default), then display it.
df_read_sentiment.to_csv(path_or_buf=save_path_speech_sentiment)
df_read_sentiment

Unnamed: 0.1,Unnamed: 0,user_id,modality,attribute_index,attribute_value,files,file_size,part_0,part_1,part_2,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,emotion_relabeled
0,0,1,Segment_Audio,2,25,G:\Experiments\Moral Elevation\Disk1_2_combine...,13439054,PS-9,1,25,...,0.000265,0.013862,0.006310,0.005020,0.000823,0.000347,0.000812,0.000734,0.007360,na
1,1,1,Segment_Audio,2,26,G:\Experiments\Moral Elevation\Disk1_2_combine...,5763150,PS-9,1,26,...,0.000209,0.008647,0.001045,0.076384,0.000915,0.000164,0.000231,0.003745,0.181037,approval
2,2,1,Segment_Audio,2,27,G:\Experiments\Moral Elevation\Disk1_2_combine...,12095566,PS-9,1,27,...,0.000101,0.000839,0.000255,0.002469,0.000257,0.000117,0.000283,0.000296,0.984829,neutral
3,3,1,Segment_Audio,2,28,G:\Experiments\Moral Elevation\Disk1_2_combine...,9789518,PS-9,1,28,...,0.000565,0.299161,0.014443,0.026861,0.001455,0.000561,0.000903,0.004233,0.019600,admiration
4,4,2,Segment_Audio,2,25,G:\Experiments\Moral Elevation\Disk1_2_combine...,9404494,PS-9,2,25,...,0.180183,0.001059,0.006205,0.013740,0.019779,0.002288,0.086570,0.000687,0.004348,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,399,115,Segment_Audio,2,28,G:\Experiments\Moral Elevation\Disk1_2_combine...,7295054,PS-9,115,28,...,0.000268,0.026741,0.001245,0.000998,0.000277,0.001281,0.003526,0.000172,0.566087,na
400,400,117,Segment_Audio,2,25,G:\Experiments\Moral Elevation\Disk1_2_combine...,7487566,PS-9,117,25,...,0.000396,0.001947,0.004033,0.001069,0.000573,0.000587,0.002090,0.000781,0.006972,admiration
401,401,117,Segment_Audio,2,26,G:\Experiments\Moral Elevation\Disk1_2_combine...,3838030,PS-9,117,26,...,0.000113,0.000497,0.000213,0.001298,0.000196,0.000126,0.000240,0.000293,0.991222,neutral
402,402,117,Segment_Audio,2,27,G:\Experiments\Moral Elevation\Disk1_2_combine...,12095566,PS-9,117,27,...,0.003480,0.003002,0.002703,0.024023,0.003735,0.000975,0.000924,0.005248,0.001520,annoyance


In [14]:
# Append the per-emotion score columns to the trimmed TF-IDF table (Group, user_id,
# attribute_value, TF-IDF features). The score columns are selected by name via emotion_labels
# rather than by a hard-coded position; 'emotion_relabeled' is not included.
# Both frames get a fresh 0..n-1 index so rows are matched by position.
df_merged_speech_sentiment = pd.concat(
    [
        df_trimmed_speech_text.reset_index(drop=True),
        df_read_sentiment[emotion_labels].reset_index(drop=True),
    ],
    axis=1,
)
df_merged_speech_sentiment.to_csv(save_path_speech_sentiment_concatenated)
# Display the resulting DataFrame
df_merged_speech_sentiment


Unnamed: 0,Group,user_id,attribute_value,一下,一两个,一个,一些,一以,一件,一会儿,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,2,1,25,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000723,0.000265,0.013862,0.006310,0.005020,0.000823,0.000347,0.000812,0.000734,0.007360
1,2,1,26,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000368,0.000209,0.008647,0.001045,0.076384,0.000915,0.000164,0.000231,0.003745,0.181037
2,2,1,27,0.0,0.0,0.00000,0.476231,0.0,0.0,0.0,...,0.000081,0.000101,0.000839,0.000255,0.002469,0.000257,0.000117,0.000283,0.000296,0.984829
3,2,1,28,0.0,0.0,0.00000,0.163963,0.0,0.0,0.0,...,0.000836,0.000565,0.299161,0.014443,0.026861,0.001455,0.000561,0.000903,0.004233,0.019600
4,1,2,25,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.001373,0.180183,0.001059,0.006205,0.013740,0.019779,0.002288,0.086570,0.000687,0.004348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,1,115,28,0.0,0.0,0.12135,0.159460,0.0,0.0,0.0,...,0.033288,0.000268,0.026741,0.001245,0.000998,0.000277,0.001281,0.003526,0.000172,0.566087
400,2,117,25,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.001015,0.000396,0.001947,0.004033,0.001069,0.000573,0.000587,0.002090,0.000781,0.006972
401,2,117,26,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000094,0.000113,0.000497,0.000213,0.001298,0.000196,0.000126,0.000240,0.000293,0.991222
402,2,117,27,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000576,0.003480,0.003002,0.002703,0.024023,0.003735,0.000975,0.000924,0.005248,0.001520


## Footnote
- Copyright: Ivan Liu
- Last Update: 2024
- Env: psychai241104, env41124
- References: 
    - None