Data preprocessing

In [None]:
import pandas as pd
import time


def read_file_in_chunks_return_head(file_path, sep='\t', compression='bz2', chunksize=1000):
    """Stream a delimited file chunk by chunk and return its first two rows.

    The whole file is iterated so that a read problem anywhere in it is
    surfaced; the head of every chunk after the first is printed along the way.

    Args:
        file_path: Path of the file to read.
        sep: Column separator passed to ``pd.read_csv``.
        compression: Compression passed to ``pd.read_csv``.
        chunksize: Number of rows per chunk.

    Returns:
        The first two rows of the first chunk as a DataFrame, or ``None`` if
        reading failed (the error is printed, not raised).
    """
    chunk_iter = None
    try:
        chunk_iter = pd.read_csv(file_path, sep=sep, compression=compression, chunksize=chunksize)
        first_chunk = next(chunk_iter)
        for chunk in chunk_iter:
            # check the data in each chunk
            print(chunk.head())
        print(f"File {file_path} read successfully in chunks")
        return first_chunk.head(2)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        # Explicit None keeps the failure contract identical to the other
        # chunked readers in this notebook.
        return None
    finally:
        # Release the file handle even when reading stops early on an error.
        if chunk_iter is not None:
            chunk_iter.close()


# Preview the MFCC audio-feature file: the helper walks the whole file and
# hands back only its first two rows.
audio_features_mfcc_head = read_file_in_chunks_return_head('./audio_features/id_compare_mfcc_stats.tsv.bz2')
# NOTE(review): this message is printed even when the read failed; in that case
# the helper has already printed the error and the head below is None.
print("succeed!")
print("Audio Features - MFCC (head):")
print(audio_features_mfcc_head)

print("\nFinish read all files head. ")


In [None]:
import pandas as pd
import time

def process_chunk(chunk, all_chunks):
    """Collect one chunk and report its shape.

    Args:
        chunk: The chunk to keep (anything exposing ``.shape``).
        all_chunks: List that accumulates the chunks; it is mutated in place.
    """
    all_chunks.append(chunk)
    chunk_shape = chunk.shape
    print(f"Processed chunk of size: {chunk_shape}")

def read_file_in_chunks(file_path, sep='\t', compression='bz2', chunksize=10000):
    """Read a delimited file in chunks and return it as a single DataFrame.

    The chunks are collected inline instead of through the module-level
    ``process_chunk`` helper: that name is redefined later in the notebook with
    a different signature, which made this reader fail (and return ``None``)
    whenever it was re-run after the later cell.

    Args:
        file_path: Path of the file to read.
        sep: Column separator passed to ``pd.read_csv``.
        compression: Compression passed to ``pd.read_csv``.
        chunksize: Number of rows per chunk.

    Returns:
        The concatenated DataFrame with a fresh 0..n-1 index, or ``None`` if
        reading failed (the error is printed, not raised).
    """
    chunk_iter = None
    try:
        chunk_iter = pd.read_csv(file_path, sep=sep, compression=compression, chunksize=chunksize)
        all_chunks = []
        for chunk in chunk_iter:
            all_chunks.append(chunk)
            print(f"Processed chunk of size: {chunk.shape}")
        data = pd.concat(all_chunks, ignore_index=True)
        print(f"File {file_path} read successfully in chunks")
        return data
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None
    finally:
        # Release the file handle even when reading stops early on an error.
        if chunk_iter is not None:
            chunk_iter.close()



In [None]:
# Load the full MFCC audio-feature table (bz2-compressed TSV, the reader's defaults).
audio_features_mfcc = read_file_in_chunks('./audio_features/id_compare_mfcc_stats.tsv.bz2')
print("Finish reading audio features!")

# Load the full lyrics VAD bag-of-words table the same way.
# NOTE(review): read_file_in_chunks returns None on failure, so both names
# should be checked before the cells below use them.
lyrics_vad_bow = read_file_in_chunks('./id_vad_bow.tsv.bz2')
print("Finish reading lyrice vad_bow features sentiment!")

In [None]:
def sort_by_id(df):
    """Return a copy of ``df`` ordered by its 'id' column.

    The result carries a fresh 0..n-1 index; ``df`` itself is left untouched.
    """
    return df.sort_values(by='id', ignore_index=True)

# Order both tables by 'id' so the chunk-wise sampling below walks them in
# the same id order. Each name is re-bound to its sorted copy.
audio_features_mfcc = sort_by_id(audio_features_mfcc)
print(audio_features_mfcc.head())

lyrics_vad_bow = sort_by_id(lyrics_vad_bow)
print(lyrics_vad_bow.head())


select data

In [None]:
def extract_chunks(data, chunk_size, extraction_size):
    """Down-sample ``data`` by keeping the first rows of every block of rows.

    ``data`` is cut into consecutive blocks of ``chunk_size`` rows (by
    position) and the first ``extraction_size`` rows of each block are kept.

    Args:
        data: DataFrame to sample from.
        chunk_size: Number of rows per block; must be positive.
        extraction_size: Number of leading rows to keep from each block.

    Returns:
        A DataFrame of the kept rows with a fresh 0..n-1 index. An empty input
        yields an empty frame with the same columns.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    if len(data) == 0:
        # pd.concat refuses an empty list, so short-circuit with an empty
        # frame that keeps the input's columns.
        return data.iloc[0:0].reset_index(drop=True)

    chunks = []
    for chunk_number, start in enumerate(range(0, len(data), chunk_size), start=1):
        # .iloc makes the positional slicing explicit regardless of the index.
        extracted_chunk = data.iloc[start:start + chunk_size].head(extraction_size)
        chunks.append(extracted_chunk)
        print(f"Extracted chunk {chunk_number}: {extracted_chunk.shape}")
    return pd.concat(chunks, ignore_index=True)


# Sampling rate: keep the first 4 rows out of every block of 1000 rows.
chunk_size = 1000
extraction_size = 4
# Down-sample the audio table and store it as a bz2-compressed TSV.
audio_chunks = extract_chunks(audio_features_mfcc, chunk_size, extraction_size)
audio_chunks.to_csv('./audio_features/extracted_audio_features.tsv.bz2', sep='\t', index=False, compression='bz2')
print("Extraction and saving completed.")


# Same sampling for the lyrics table, so both keep the same row positions
# of their id-sorted tables.
lyrics_chunks_vad_bow = extract_chunks(lyrics_vad_bow, chunk_size, extraction_size)
lyrics_chunks_vad_bow.to_csv('./extracted_lyrics_vad_bow.tsv.bz2', sep='\t', index=False, compression='bz2')
print("Extraction and saving completed.")

data cleaning

In [None]:
import pandas as pd

def process_chunk(chunk, chunk_type, all_chunks):
    """Clean one chunk and add it to the running list.

    Rows containing any missing value are removed first, then exact duplicate
    rows are removed.

    Args:
        chunk: DataFrame chunk to clean.
        chunk_type: Label used only in the progress message.
        all_chunks: List that accumulates the cleaned chunks; mutated in place.
    """
    cleaned = chunk.dropna().drop_duplicates()
    print(f"Processed {chunk_type} chunk of size: {cleaned.shape}")
    all_chunks.append(cleaned)

def read_and_process_chunks(file_path, chunk_type, sep=',', compression='gzip', chunksize=10000):
    """Read a delimited file chunk by chunk, cleaning each chunk on the way.

    Every chunk is passed to ``process_chunk`` and the cleaned pieces are
    stacked into one DataFrame with a fresh index.

    Args:
        file_path: Path of the file to read.
        chunk_type: Label forwarded to ``process_chunk`` for its log message.
        sep: Column separator passed to ``pd.read_csv``.
        compression: Compression passed to ``pd.read_csv``.
        chunksize: Number of rows per chunk.

    Returns:
        The combined cleaned DataFrame, or ``None`` if anything failed (the
        error is printed, not raised).
    """
    try:
        cleaned_parts = []
        reader = pd.read_csv(file_path, sep=sep, compression=compression, chunksize=chunksize)
        for part in reader:
            process_chunk(part, chunk_type, cleaned_parts)
        combined = pd.concat(cleaned_parts, ignore_index=True)
        print(f"File {file_path} read and processed successfully in chunks")
    except Exception as err:
        print(f"Error reading {file_path}: {err}")
        return None
    return combined


# Read and clean the down-sampled feature files chunk by chunk.
# The extraction cell saved them as bz2-compressed, tab-separated files, so the
# paths, separator and compression here must match what was written there
# (the previous '.csv.gz' paths are never produced by this notebook).
audio_features = read_and_process_chunks(
    './audio_features/extracted_audio_features.tsv.bz2', 'audio', sep='\t', compression='bz2')
lyrics_vad_bow_features = read_and_process_chunks(
    './extracted_lyrics_vad_bow.tsv.bz2', 'lyrics', sep='\t', compression='bz2')




In [None]:

# Keep only the ids that appear in both the audio and the lyrics tables.
common_ids = set(audio_features['id']).intersection(set(lyrics_vad_bow_features['id']))
audio_features = audio_features[audio_features['id'].isin(common_ids)]
lyrics_vad_bow_features = lyrics_vad_bow_features[lyrics_vad_bow_features['id'].isin(common_ids)]

# Keep a single row per id in each table.
audio_features = audio_features.drop_duplicates(subset=['id'])
lyrics_vad_bow_features = lyrics_vad_bow_features.drop_duplicates(subset=['id'])

# save cleaned data
# The audio table goes to the path that the standardization cell reads as its
# audio input. The file name is misleading but is kept so that cell still finds
# it. The previous code referenced the undefined name
# `lyrics_sentiment_features` here (NameError) and never saved the audio table.
audio_features.to_csv('./cleaned_lyrics_sentiment_features.csv.gz', sep=',', index=False, compression='gzip')
lyrics_vad_bow_features.to_csv('./cleaned_lyrics_vad_bow_features.csv.gz', sep=',', index=False, compression='gzip')
print("Data cleaning and saving completed.")

data standardization

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def standardize_features(input_path, output_path, compression='gzip'):
    """Standardize every non-'id' column of a CSV file and write the result.

    Args:
        input_path: CSV file to read.
        output_path: Where the standardized CSV is written (no index column).
        compression: Compression used for both reading and writing.
    """
    frame = pd.read_csv(input_path, compression=compression)
    # Everything except the identifier column is treated as a feature.
    feature_cols = frame.columns.difference(['id'])
    frame[feature_cols] = StandardScaler().fit_transform(frame[feature_cols])
    frame.to_csv(output_path, index=False, compression=compression)
    print(f"Standardized features from {input_path} and saved to {output_path}")



# Inputs: the cleaned tables saved by the data-cleaning step.
# NOTE(review): the audio input points at a file named
# 'cleaned_lyrics_sentiment_features' - confirm this really holds the audio
# features and not a lyrics table.
audio_input_path = './cleaned_lyrics_sentiment_features.csv.gz'
lyrics_input_path = './cleaned_lyrics_vad_bow_features.csv.gz'

# Outputs: one standardized file per modality.
# NOTE(review): 'lyrice_features' looks like a misspelling of 'lyrics_features';
# the directory must already exist under exactly this name for the write to work.
audio_output_path = './audio_features/standardized_audio_features.csv.gz'
lyrics_output_path = './lyrice_features/standardized_lyrics_features.csv.gz'

standardize_features(audio_input_path, audio_output_path)
standardize_features(lyrics_input_path, lyrics_output_path)

PCA

In [None]:
import pandas as pd
from sklearn.decomposition import PCA

def determine_n_components(input_path, compression='gzip', variance_threshold=0.95):
    """Find how many principal components reach a cumulative-variance target.

    A full PCA is fitted on every column except 'id'. The cumulative explained
    variance is printed component by component until the target is reached.

    Args:
        input_path: CSV file holding an 'id' column plus feature columns.
        compression: Compression passed to ``pd.read_csv``.
        variance_threshold: Cumulative explained-variance ratio to reach
            (default 0.95, the value that used to be hard-coded).

    Returns:
        The smallest number of components whose cumulative explained variance
        is at least ``variance_threshold``. If no prefix reaches it, the total
        number of fitted components is returned instead of ``None``.
    """
    df = pd.read_csv(input_path, compression=compression)
    feature_columns = df.columns.difference(['id'])
    pca = PCA()
    pca.fit(df[feature_columns])
    # calculate cumulative variance
    cumulative_variance = pca.explained_variance_ratio_.cumsum()
    print(f"Cumulative explained variance for {input_path}:")
    for i, variance in enumerate(cumulative_variance):
        print(f"PC{i+1}: {variance}")
        if variance >= variance_threshold:
            print(f"Selected number of components for {input_path}: {i+1}")
            return i+1
    # Threshold never reached (e.g. a target above the attainable total):
    # fall back to all components so callers always receive an integer.
    n_total = len(cumulative_variance)
    print(f"Selected number of components for {input_path}: {n_total}")
    return n_total

# From here on the *_input_path names point at the standardized files,
# replacing the cleaned-file paths they held in the standardization cell.
audio_input_path = './audio_features/standardized_audio_features.csv.gz'
lyrics_input_path = './lyrice_features/standardized_lyrics_features.csv.gz'

# Number of principal components needed per modality to reach the
# cumulative-variance target used inside determine_n_components.
n_components_audio5 = determine_n_components(audio_input_path)
n_components_lyrics5 = determine_n_components(lyrics_input_path)
print(f"Number of components selected for audio features: {n_components_audio5}")
print(f"Number of components selected for lyrics features: {n_components_lyrics5}")



In [None]:
#apply PCA
def apply_pca(input_path, output_path, n_components, compression='gzip'):
    """Project the feature columns of a CSV onto principal components and save them.

    Every column except 'id' is treated as a feature. The output file holds the
    'id' column first, followed by 'PC1' ... 'PCk'.

    Args:
        input_path: CSV file holding an 'id' column plus feature columns.
        output_path: Where the projected CSV is written (no index column).
        n_components: Forwarded unchanged to ``PCA(n_components=...)``. Any
            value PCA accepts works here, not only an integer, because the
            output column names are derived from the fitted result.
        compression: Compression used for both reading and writing.
    """
    df = pd.read_csv(input_path, compression=compression)
    feature_columns = df.columns.difference(['id'])
    pca = PCA(n_components=n_components)
    pca_features = pca.fit_transform(df[feature_columns])

    # Name the columns from the width of the transformed matrix rather than
    # from `n_components`, which need not be an int (range(None) would fail).
    pc_names = [f'PC{i+1}' for i in range(pca_features.shape[1])]
    pca_df = pd.DataFrame(pca_features, columns=pc_names)
    # Attach ids by position so the result does not depend on index alignment.
    pca_df.insert(0, 'id', df['id'].to_numpy())
    pca_df.to_csv(output_path, index=False, compression=compression)
    print(f"Applied PCA on {input_path}, saved to {output_path}, explained variance ratio: {pca.explained_variance_ratio_.sum()}")


# Destination files for the PCA-reduced features.
# NOTE(review): later cells read './pca_audio_features.csv.gz' and
# './pca_vad_bow_features.csv.gz', which are not the paths written here - confirm
# which locations are intended.
audio_output_path = './audio_features/pca_audio_features.csv.gz'
lyrics_output_path = './lyrice_features/pca_lyrics_features.csv.gz'


# Reduce each modality with the component count chosen in the previous cell;
# the inputs are the standardized files bound to the *_input_path names there.
apply_pca(audio_input_path, audio_output_path, n_components=n_components_audio5)
print("finish!")
apply_pca(lyrics_input_path, lyrics_output_path, n_components=n_components_lyrics5)
print("finish!")


merge audio and lyrics features into multimodal features

In [None]:

# Load the PCA-reduced tables from the files written by the PCA step. Nothing
# earlier in the notebook defines these two names, so on a fresh kernel the
# cell used to fail with NameError.
pca_audio_features = pd.read_csv('./audio_features/pca_audio_features.csv.gz', compression='gzip')
pca_vad_bow_features = pd.read_csv('./lyrice_features/pca_lyrics_features.csv.gz', compression='gzip')

# 'segment_id' is not a feature; drop it when present.
pca_audio_features = pca_audio_features.drop(columns=['segment_id'], errors='ignore')
pca_vad_bow_features = pca_vad_bow_features.drop(columns=['segment_id'], errors='ignore')

# Both tables name their columns PC1, PC2, ...; explicit suffixes make the
# origin of each overlapping column unambiguous (instead of the default
# '_x'/'_y').
multimodal_features = pd.merge(
    pca_audio_features, pca_vad_bow_features, on='id', suffixes=('_audio', '_lyrics'))

print("Columns in multimodal_features after merging:", multimodal_features.columns.tolist())
print(f"Shape of multimodal_features: {multimodal_features.shape}")

multimodal_features.to_csv('./multimodal_features.csv.gz', index=False, compression='gzip')

Process metadata and convert valence values into emotion labels


In [None]:
import pandas as pd
import time

def process_chunk(chunk, all_chunks):
    """Append ``chunk`` to ``all_chunks`` (in place) and print its shape.

    This re-binds the notebook-level name ``process_chunk`` to a two-argument
    version.
    """
    all_chunks.append(chunk)
    size = chunk.shape
    print(f"Processed chunk of size: {size}")

def read_file_in_chunks(file_path, sep='\t', chunksize=10000, compression='infer'):
    """Read a delimited file in chunks, time the read, and return one DataFrame.

    This definition replaces the earlier ``read_file_in_chunks``. It accepts
    ``compression`` again (as a trailing parameter, so existing positional
    calls are unaffected), and it collects chunks inline instead of depending
    on whichever ``process_chunk`` is currently bound in the notebook.

    Args:
        file_path: Path of the file to read.
        sep: Column separator passed to ``pd.read_csv``.
        chunksize: Number of rows per chunk.
        compression: Compression passed to ``pd.read_csv``; 'infer' is the
            pandas default, i.e. the behaviour when it was not passed at all.

    Returns:
        The concatenated DataFrame with a fresh 0..n-1 index, or ``None`` if
        reading failed (the error is printed, not raised).
    """
    chunk_iter = None
    try:
        start_time = time.time()
        chunk_iter = pd.read_csv(file_path, sep=sep, compression=compression, chunksize=chunksize)
        all_chunks = []
        for chunk in chunk_iter:
            all_chunks.append(chunk)
            print(f"Processed chunk of size: {chunk.shape}")
        data = pd.concat(all_chunks, ignore_index=True)
        end_time = time.time()
        print(f"File {file_path} read successfully in chunks. Total time: {end_time - start_time:.2f} seconds")
        return data
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None
    finally:
        # Release the file handle even when reading stops early on an error.
        if chunk_iter is not None:
            chunk_iter.close()

# Load the track metadata (read with the reader's default tab separator).
metadata = read_file_in_chunks('./id_metadata.csv')
if metadata is None:
    # The reader prints the underlying error and returns None; stop here with
    # a clear message instead of failing later on `None.shape`.
    raise RuntimeError("Could not read './id_metadata.csv'; see the error printed above.")
# The message used to say "lyrice features", copied from an earlier cell.
print("Finish reading metadata!")
print(metadata.shape)


In [None]:
def sort_by_id(df):
    """Order ``df`` by its 'id' column and renumber the rows from zero.

    A new DataFrame is returned; the input is not modified.
    """
    ordered = df.sort_values(by='id')
    return ordered.reset_index(drop=True)

# Sort the metadata by id, then sample it with the same block/keep sizes used
# for the feature tables.
metadata = sort_by_id(metadata)
print(metadata.head())

chunk_size = 1000
extraction_size = 4

# extract_chunks is the sampling helper defined in the "select data" section.
metadata_chunks = extract_chunks(metadata, chunk_size, extraction_size)
# NOTE(review): this writes a bz2-compressed, tab-separated file under a plain
# '.csv' name; nothing visible in this notebook reads it back - confirm it is
# still needed.
metadata_chunks.to_csv('./metadata.csv', sep='\t', index=False, compression='bz2')
print("Extraction and saving completed.")
# Gzip-compressed, comma-separated copy; this is the file the labelling cell reads.
metadata_chunks.to_csv('metadata.csv.gz', index=False, compression='gzip')


In [None]:
import pandas as pd

# read features files
# The PCA tables are read from the locations the PCA step writes to; the
# previous './pca_audio_features.csv.gz' and './pca_vad_bow_features.csv.gz'
# paths are never produced by this notebook.
pca_audio_features = pd.read_csv('./audio_features/pca_audio_features.csv.gz', compression='gzip')
pca_vad_bow_features = pd.read_csv('./lyrice_features/pca_lyrics_features.csv.gz', compression='gzip')
multimodal_features = pd.read_csv('./multimodal_features.csv.gz', compression='gzip')


audio_ids = set(pca_audio_features['id'])

#read meta data
# Keep only the metadata rows whose id has audio features. The filtered chunks
# are gathered in a list and concatenated once; concatenating inside the loop
# re-copies all previously collected rows on every iteration.
chunksize = 10000
filtered_chunks = [
    chunk[chunk['id'].isin(audio_ids)]
    for chunk in pd.read_csv('./metadata.csv.gz', compression='gzip', chunksize=chunksize)
]
metadata_filtered = pd.concat(filtered_chunks, ignore_index=True)

# only retain 'id' and 'valence' column
metadata_filtered = metadata_filtered[['id', 'valence']]

# function which merge valence column and add emotion column
def add_emotion_column(features_df, metadata_df):
    """Attach a binary 'emotion' label derived from each row's valence.

    The two frames are inner-joined on 'id'. A valence below 0.5 is labelled
    'negative' and anything else 'positive'. The 'valence' column is dropped
    from the result.

    Args:
        features_df: Feature table with an 'id' column.
        metadata_df: Table with 'id' and 'valence' columns.

    Returns:
        A new DataFrame holding the matched rows plus the 'emotion' column.
    """
    def _label(valence):
        if valence < 0.5:
            return 'negative'
        return 'positive'

    labelled = features_df.merge(metadata_df, on='id', how='inner')
    labelled['emotion'] = labelled['valence'].apply(_label)
    return labelled.drop(columns=['valence'])

# Add the 'emotion' label to all three datasets. Each name is re-bound to the
# labelled, inner-joined result, so rows without metadata are dropped.
pca_audio_features = add_emotion_column(pca_audio_features, metadata_filtered)
pca_vad_bow_features = add_emotion_column(pca_vad_bow_features, metadata_filtered)
multimodal_features = add_emotion_column(multimodal_features, metadata_filtered)


# Persist the labelled datasets; the train/test split cell reloads these files.
pca_audio_features.to_csv('./pca_audio_features_with_emotion.csv.gz', index=False, compression='gzip')
pca_vad_bow_features.to_csv('./pca_vad_bow_features_with_emotion.csv.gz', index=False, compression='gzip')
multimodal_features.to_csv('./multimodal_features_with_emotion.csv.gz', index=False, compression='gzip')


# Quick sanity check of each labelled dataset.
print(f"Shape of pca_audio_features_with_emotion: {pca_audio_features.shape}")
print(pca_audio_features.head())
print(f"Shape of pca_vad_bow_features_with_emotion: {pca_vad_bow_features.shape}")
print(pca_vad_bow_features.head())
print(f"Shape of multimodal_features_with_emotion: {multimodal_features.shape}")
print(multimodal_features.head())


Split into training data and testing data

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder

# Reload the labelled datasets written by the previous section.
pca_audio_features = pd.read_csv('./pca_audio_features_with_emotion.csv.gz', compression='gzip')
pca_vad_bow_features = pd.read_csv('./pca_vad_bow_features_with_emotion.csv.gz', compression='gzip')
# NOTE(review): 'muitlmodal' is a misspelling of 'multimodal'; the name is used
# consistently within this cell.
pca_muitlmodal_features = pd.read_csv('./multimodal_features_with_emotion.csv.gz', compression='gzip')


# target column 'emotion'
target_column = 'emotion'

# Split each dataset 80/20 with a fixed seed. The features are every column
# except the target and 'id'. No `stratify` argument is passed, so class
# proportions are not forced to match between train and test.
X_audio = pca_audio_features.drop(columns=[target_column, 'id'])
y_audio = pca_audio_features[target_column]
X_train_audio, X_test_audio, y_train_audio, y_test_audio = train_test_split(X_audio, y_audio, test_size=0.2, random_state=42)

X_lyrics = pca_vad_bow_features.drop(columns=[target_column, 'id'])
y_lyrics = pca_vad_bow_features[target_column]
X_train_lyrics, X_test_lyrics, y_train_lyrics, y_test_lyrics = train_test_split(X_lyrics, y_lyrics, test_size=0.2, random_state=42)

X_multimodal = pca_muitlmodal_features.drop(columns=[target_column, 'id'])
y_multimodal = pca_muitlmodal_features[target_column]
X_train_multimodal, X_test_multimodal, y_train_multimodal, y_test_multimodal = train_test_split(X_multimodal, y_multimodal, test_size=0.2, random_state=42)




baseline


In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Baseline classifiers: one RandomForestClassifier per feature set, with a
# fixed seed and every other hyperparameter left at its default.
model_audio = RandomForestClassifier(random_state=42)
model_lyrics = RandomForestClassifier(random_state=42)
model_multimodal = RandomForestClassifier(random_state=42)

# evaluate the model using cross-validation
def evaluate_model(X, y, model):
    """Score ``model`` with 5-fold cross-validated accuracy and report it.

    Args:
        X: Feature matrix.
        y: Target labels.
        model: Estimator handed to ``cross_val_score``.

    Returns:
        The mean of the five fold accuracies.
    """
    print(f'Before cross-validation: X shape = {X.shape}')
    fold_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f'After cross-validation: X shape = {X.shape}')
    print(f'Cross-validation scores: {fold_scores}')
    mean_score = fold_scores.mean()
    print(f'Mean cross-validation score: {mean_score}')
    return mean_score

# Cross-validate each baseline on its training split only; the test splits
# are not touched here.
print("Audio Features Model Evaluation:")
print(X_train_audio.shape)
audio_accuracy = evaluate_model(X_train_audio, y_train_audio, model_audio)

print("\nLyrics Features Model Evaluation:")
lyrics_accuracy = evaluate_model(X_train_lyrics, y_train_lyrics, model_lyrics)

print("\nMultimodal Features Model Evaluation:")
multimodal_accuracy = evaluate_model(X_train_multimodal, y_train_multimodal, model_multimodal)

# Summary of the mean cross-validation accuracy per feature set.
print(f"\nBaseline Audio Model Accuracy: {audio_accuracy}")
print(f"Baseline Lyrics Model Accuracy: {lyrics_accuracy}")
print(f"Baseline Multimodal Model Accuracy: {multimodal_accuracy}")


Optimization

In [None]:
from skopt import BayesSearchCV
from skopt.space import Integer, Categorical
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification



# Search space for the random forest: the number of trees is an integer
# between 100 and 200, and the tree depth is one of unlimited (None), 10 or 20.
param_distributions = {
    'n_estimators': Integer(100, 200),
    'max_depth': Categorical([None, 10, 20]),
}

# create BayesSearchCV object
def tune_model_bayes(X_train, y_train):
    """Tune a random forest with ``BayesSearchCV`` over ``param_distributions``.

    The search runs 30 iterations with 5-fold cross-validated accuracy, a fixed
    seed for both the forest and the search, and ``n_jobs=-1``.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        A ``(best_params, best_score)`` pair taken from the fitted search.
    """
    search_settings = dict(
        estimator=RandomForestClassifier(random_state=42),
        search_spaces=param_distributions,
        n_iter=30,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        random_state=42,
    )
    search = BayesSearchCV(**search_settings)
    search.fit(X_train, y_train)
    print(X_train.shape)
    return search.best_params_, search.best_score_

In [None]:
# Tune one random forest per feature set on its training split. The best
# parameters are reused by the retraining cells further down.
best_params_audio, best_score_audio = tune_model_bayes(X_train_audio, y_train_audio)
print("finish")
print(f"Best Params (Audio): {best_params_audio}")
print(f"Best Score (Audio): {best_score_audio}")

best_params_lyrics, best_score_lyrics = tune_model_bayes(X_train_lyrics, y_train_lyrics)
print("finish")
print(f"Best Params (Lyrics): {best_params_lyrics}")
print(f"Best Score (Lyrics): {best_score_lyrics}")

best_params_multimodal, best_score_multimodal = tune_model_bayes(X_train_multimodal, y_train_multimodal)
print("finish")
print(f"Best Params (Multimodal): {best_params_multimodal}")
print(f"Best Score (Multimodal): {best_score_multimodal}")


model retraining and evaluation

In [None]:
import numpy as np
# Encode the string labels as integers: 'positive' -> 1, anything else -> 0.
# These cover the full datasets (y_audio etc.), not just the training splits.
y_audio_encoded = np.where(y_audio == 'positive', 1, 0)
y_lyrics_encoded = np.where(y_lyrics == 'positive', 1, 0)
y_multimodal_encoded = np.where(y_multimodal == 'positive', 1, 0)

In [None]:
from sklearn.metrics import accuracy_score

# retrain model with best parameters
def train_and_evaluate_model(X_train, y_train, X_test, y_test, best_params):
    """Fit a random forest with the given parameters and score it on held-out data.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.
        best_params: Keyword arguments for ``RandomForestClassifier``; the
            seed is always fixed to 42 on top of them.

    Returns:
        Accuracy of the fitted model on the test data.
    """
    classifier = RandomForestClassifier(**best_params, random_state=42)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return accuracy_score(y_test, predictions)

In [None]:
# repeat evaluation
def repeated_train_test(X, y, best_params, n_repeats=6, test_size=0.3, random_state=None):
    """Repeat a random train/test split and collect the test accuracy of each run.

    Args:
        X: Feature matrix.
        y: Target labels.
        best_params: Hyperparameters forwarded to ``train_and_evaluate_model``.
        n_repeats: Number of independent splits to evaluate.
        test_size: Fraction of the data held out in each split.
        random_state: Optional integer seed. ``None`` (the default) keeps the
            previous behaviour of unseeded, non-reproducible splits. With an
            integer, repeat ``k`` uses seed ``random_state + k``, so the run is
            reproducible while every repeat still gets a different split.

    Returns:
        A tuple ``(accuracies, mean_accuracy)``: a NumPy array with one test
        accuracy per repeat, and their mean.
    """
    accuracies = []
    for repeat_idx in range(n_repeats):
        split_seed = None if random_state is None else random_state + repeat_idx
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=split_seed)
        accuracy = train_and_evaluate_model(X_train, y_train, X_test, y_test, best_params)
        accuracies.append(accuracy)
    mean_accuracy = np.mean(accuracies)
    return np.array(accuracies), mean_accuracy

In [None]:

# Repeated hold-out evaluation of the tuned audio model on the full audio
# dataset. `mean_accuracy` is a shared name that the lyrics and multimodal
# evaluation cells also assign.
accuracies_audio, mean_accuracy = repeated_train_test(X_audio, y_audio_encoded, best_params_audio)
print(f"Test Accuracies (Audio): {accuracies_audio}")
print(f"mean accuracy: {mean_accuracy}")

In [None]:
# Repeated hold-out evaluation of the tuned lyrics model on the full lyrics
# dataset. `mean_accuracy` is a shared name that the audio and multimodal
# evaluation cells also assign.
accuracies_lyrics, mean_accuracy = repeated_train_test(X_lyrics, y_lyrics_encoded, best_params_lyrics)
print(f"Test Accuracies (Lyrics): {accuracies_lyrics}")
print(f"mean accuracy: {mean_accuracy}")

In [None]:
# Repeated hold-out evaluation of the tuned multimodal model on the full
# multimodal dataset. `mean_accuracy` is a shared name that the audio and
# lyrics evaluation cells also assign.
accuracies_multi, mean_accuracy = repeated_train_test(X_multimodal, y_multimodal_encoded, best_params_multimodal)
print(f"Test Accuracies (Multi): {accuracies_multi}")
print(f"mean accuracy: {mean_accuracy}")

hypothesis test

In [None]:
# test whether the data is normal distribution
def check_normality(data):
    """Run a Shapiro-Wilk test on ``data`` and report whether normality holds.

    Args:
        data: One-dimensional sequence of numeric observations.

    Returns:
        True when the p-value is above 0.05 (normality is not rejected at the
        5% level), False otherwise.
    """
    # Imported here because no cell of the notebook imports scipy.stats, so
    # the bare name `shapiro` raised NameError on a fresh kernel.
    from scipy.stats import shapiro

    stat, p_value = shapiro(data)
    print('Shapiro-Wilk Test: stat=%.3f, p-value=%.3f' % (stat, p_value))
    #stats.probplot(data, dist="norm", plot=plt)
    #plt.show()
    return p_value > 0.05

# choose proper statistic test method
def perform_statistical_tests(data1, data2):
    """Compare two paired samples with a test chosen by a normality check.

    If both samples pass ``check_normality`` a paired t-test is run; otherwise
    the Wilcoxon signed-rank test is used. The statistic and p-value are
    printed; nothing is returned.

    Args:
        data1: First sample of numeric observations.
        data2: Second sample, the same length as ``data1``.
    """
    # Imported here because no cell of the notebook imports scipy.stats, so
    # the bare names `ttest_rel` and `wilcoxon` raised NameError on a fresh
    # kernel.
    from scipy.stats import ttest_rel, wilcoxon

    # NOTE(review): both tests treat element i of data1 and data2 as a matched
    # pair - confirm the two samples really come from matching runs.
    if check_normality(data1) and check_normality(data2):
        t_stat, p_value = ttest_rel(data1, data2)

        print('Paired t-test: t-statistic=%.3f, p-value=%.3f' % (t_stat, p_value))
    else:
        stat, p_value = wilcoxon(data1, data2)

        print('Wilcoxon test: statistic=%.3f, p-value=%.3f' % (stat, p_value))

In [None]:
# Pairwise comparisons of the per-repeat test accuracies:
# multimodal vs audio, multimodal vs lyrics, audio vs lyrics.
# NOTE(review): each accuracy array came from its own repeated_train_test call
# with unseeded splits, so repeat i of one model is not the same split as
# repeat i of another - confirm a paired test is appropriate here.
perform_statistical_tests(accuracies_multi, accuracies_audio)
perform_statistical_tests(accuracies_multi, accuracies_lyrics)
perform_statistical_tests(accuracies_audio, accuracies_lyrics)