In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for i, filename in enumerate(filenames):
        if i > 3:
            break
        print(os.path.join(dirname, filename))

/kaggle/input/predict-human-emotions-from-audio/dataset/sample_submission.csv
/kaggle/input/predict-human-emotions-from-audio/dataset/train.csv
/kaggle/input/predict-human-emotions-from-audio/dataset/test.csv
/kaggle/input/predict-human-emotions-from-audio/dataset/TestAudioFiles/13738.mp3
/kaggle/input/predict-human-emotions-from-audio/dataset/TestAudioFiles/28919.mp3
/kaggle/input/predict-human-emotions-from-audio/dataset/TestAudioFiles/28562.mp3
/kaggle/input/predict-human-emotions-from-audio/dataset/TestAudioFiles/8142.mp3
/kaggle/input/predict-human-emotions-from-audio/dataset/TrainAudioFiles/23694.mp3
/kaggle/input/predict-human-emotions-from-audio/dataset/TrainAudioFiles/11507.mp3
/kaggle/input/predict-human-emotions-from-audio/dataset/TrainAudioFiles/28967.mp3
/kaggle/input/predict-human-emotions-from-audio/dataset/TrainAudioFiles/5313.mp3


In [2]:
train_csv = "/kaggle/input/predict-human-emotions-from-audio/dataset/train.csv"
test_csv = "/kaggle/input/predict-human-emotions-from-audio/dataset/test.csv"
TRAIN_fp = "/kaggle/input/predict-human-emotions-from-audio/dataset/TrainAudioFiles/"
TEST_fp = "/kaggle/input/predict-human-emotions-from-audio/dataset/TestAudioFiles/"

train_files = [fn for fn in os.listdir(TRAIN_fp) if fn.split('.')[-1] in ['mp3', 'wav']]
test_files = [fn for fn in os.listdir(TEST_fp) if fn.split('.')[-1] in ['mp3', 'wav']]
len(train_files), len(test_files)

(5816, 2492)

In [3]:
train_df = pd.read_csv(train_csv)
train_df.to_csv("train.csv", index=False)
train_df.shape, train_df.columns, train_df['emotion'].value_counts()

((5816, 2),
 Index(['filename', 'emotion'], dtype='object'),
 neutral     2630
 joy          967
 surprise     640
 anger        596
 sadness      344
 fear         328
 disgust      311
 Name: emotion, dtype: int64)

In [4]:
test_df = pd.read_csv(test_csv)
test_df.to_csv("test.csv", index=False)
test_df.shape, test_df.columns

((2492, 1), Index(['filename'], dtype='object'))

In [5]:
import librosa.util.utils as util
import librosa
import librosa.display
import audioread

def audioread_load(path, offset=0.0,
                   duration=None, dtype=np.float32):
    
    """
    Load an audio buffer using audioread.
    This loads one block at a time, and then concatenates the results.
    """
    y = []
    with audioread.audio_open(path) as input_file:
        sr_native = input_file.samplerate
        n_channels = input_file.channels

        s_start = int(np.round(sr_native * offset)) * n_channels

        if duration is None:
            s_end = np.inf
        else:
            s_end = s_start + (int(np.round(sr_native * duration)) * n_channels)

        n = 0
        for frame in input_file:
            frame = util.buf_to_float(frame, dtype=dtype)
            n_prev = n
            n = n + len(frame)

            if n < s_start:
                # offset is after the current frame
                # keep reading
                continue

            if s_end < n_prev:
                # we're off the end.  stop reading
                break

            if s_end < n:
                # the end is in this frame.  crop.
                frame = frame[: s_end - n_prev]

            if n_prev <= s_start <= n:
                # beginning is in this frame
                frame = frame[(s_start - n_prev) :]
                
            # tack on the current frame
            y.append(frame)
    if y:
        y = np.concatenate(y)
        if n_channels > 1:
            y = y.reshape((-1, n_channels)).T
    else:
        y = np.empty(0, dtype=dtype)
        
    if n_channels > 1:
        y = librosa.to_mono(y)

    return y, sr_native


sample_fp = f"{TRAIN_fp}/28967.mp3"
# y, sr = audioread_load(sample_fp, duration=5, sr=16000)
y, sr = librosa.load(sample_fp, sr=None)
yt,_ = librosa.effects.trim(y)
y.shape, sr, yt.shape

((44982,), 44100, (44982,))

In [6]:
classes = sorted(train_df['emotion'].unique().tolist())
map_class_to_id = dict(zip(classes, range(0, len(classes))))
map_class_to_id

{'anger': 0,
 'disgust': 1,
 'fear': 2,
 'joy': 3,
 'neutral': 4,
 'sadness': 5,
 'surprise': 6}

In [7]:
filepaths =  train_df['filename'].apply(lambda x:os.path.join(TRAIN_fp,x))
class_ids = train_df['emotion'].apply(lambda n: map_class_to_id[n])
filepaths.head(2), class_ids.head(2)

(0    /kaggle/input/predict-human-emotions-from-audi...
 1    /kaggle/input/predict-human-emotions-from-audi...
 Name: filename, dtype: object,
 0    4
 1    4
 Name: emotion, dtype: int64)

In [8]:
import librosa as lb
import soundfile as sf

def load_audio(fp):
    y, sr = audioread_load(fp, duration=30)
    if sr != 22050:
        y = librosa.resample(y, sr, 22050)
    yt, _ = librosa.effects.trim(y)
    return yt, sr


def noise(data):
    noise_amp = 0.04*np.random.uniform()*np.amax(data)
    data = data + noise_amp*np.random.normal(size=data.shape[0])
    return data

def stretch(data, rate=0.70):
    return librosa.effects.time_stretch(data, rate)

def shift(data):
    shift_range = int(np.random.uniform(low=-5, high = 5)*1000)
    return np.roll(data, shift_range)

def pitch(data, sr, pitch_factor=0.8):
    return librosa.effects.pitch_shift(data, sr, pitch_factor)


def extract_feat(audio, sr):
    mfccs=np.mean(librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40).T, axis=0)
    stft = np.abs(librosa.stft(audio))
    chromas=np.mean(lb.feature.chroma_stft(S=stft, sr=sr).T,axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sr).T,
                       axis=0)
    mels=np.mean(librosa.feature.melspectrogram(audio, sr=sr, 
                                    n_mels=128, fmax=8000).T,axis=0)        
    
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(audio),
                                              sr=sr).T,axis=0)
    
    rmse = np.mean(librosa.feature.rms(y=audio))
    spec_cent = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr))
    # spec_bw = np.mean(librosa.feature.spectral_bandwidth(y=audio, sr=sr))
    # rolloff = np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sr))
    zcr = np.mean(librosa.feature.zero_crossing_rate(audio))
    
    result = np.hstack((mfccs, chromas, mels, contrast, tonnetz,
                        rmse, spec_cent, zcr))
    
    return result

def audio_features(fp, augument=True):
    
    all_augs = []
    audio, sr = load_audio(fp)
    
    all_augs.append(extract_feat(audio, sr))
    
    if not augument:
        return all_augs[0]
    
    noise_data = noise(audio)
    all_augs.append(extract_feat(noise_data, sr))
    pitch_data = pitch(audio, sr)
    all_augs.append(extract_feat(pitch_data, sr))
    strech_data = stretch(audio)
    all_augs.append(extract_feat(strech_data, sr))
    shift_data = shift(audio)
    all_augs.append(extract_feat(shift_data, sr))

    return all_augs

In [9]:
from tqdm.notebook import tqdm

train_data = []
train_labels = []
for fn, label in tqdm(train_df.values[:]):
    path_load = os.path.join(TRAIN_fp,fn)
    audio_feats = audio_features(path_load)
    for feat in audio_feats:
        train_data.append(feat)
        train_labels.append(label)
    
train_feat = pd.DataFrame(train_data)
train_feat['emotion'] = train_labels
train_feat.to_csv("train_features.csv", index=False)
len(train_feat), len(train_labels)

  0%|          | 0/5816 [00:00<?, ?it/s]

(29080, 29080)

In [10]:
test_data = []
test_fns = []
for fn in tqdm(test_df['filename'].values[:]):
    path_load = os.path.join(TEST_fp,fn)
    audio_feat = audio_features(path_load, augument=False)
    test_data.append(audio_feat)
    test_fns.append(fn)

test_feat = pd.DataFrame(test_data)
test_feat.to_csv("test_features.csv", index=False)
len(test_feat)

  0%|          | 0/2492 [00:00<?, ?it/s]

2492

In [11]:
from IPython.display import FileLinks
print(train_feat.shape, test_feat.shape)
FileLinks('.')

(29080, 197) (2492, 196)


In [12]:
# from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import MinMaxScaler

# scaler = StandardScaler()
# X = scaler.fit_transform(train_feat)
# X_test = scaler.transform(test_feat)
# len(X), len(X_test), len(train_labels)

In [13]:
# from sklearn.model_selection import train_test_split # for splitting training and testing
# from sklearn.neural_network import MLPClassifier # multi-layer perceptron model
# from sklearn.metrics import accuracy_score # to measure how good we are
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.impute import SimpleImputer
# from sklearn.model_selection import TimeSeriesSplit
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.model_selection import cross_validate
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import make_scorer

# X_train, X_val, y_train, y_val = train_test_split(train_feat,
#                                                 train_labels, test_size=0.2)

In [14]:
# # Initialize the MLP Classifier and choose parameters we want to keep constant
# model = MLPClassifier(
#     # tune batch size later 
#     batch_size=256,  
#     # keep random state constant to accurately compare subsequent models
#     random_state=69
# )

# param_space = {
#     # It's a bad idea at guessing the number of hidden layers to have
#     # ...but we'll give 2 and 3 hidden layers a shot to reaffirm our suspicions that 1 is best
#     'hidden_layer_sizes': [(180,), (256,),
#                            (100,50,),
#                            (100,50,25),
#                            (128, 64),
#                            (128,64,32)], 
#     'activation': ['relu', 'logistic'],
#     'solver': ['sgd', 'adam'],
#     'alpha': [0.0001, 0.001, 0.01],
#     'epsilon': [1e-07, 1e-05, 1e-03, 0.1],
#     'learning_rate': ['adaptive']
# }

# # scoring=scoring, refit='AUC'
# clf = RandomizedSearchCV(model, param_space, n_iter=49,
#                          random_state=2021, cv=5,
#                          scoring='f1_weighted')
# search = clf.fit(X, train_labels)
# search.best_score_, search.best_params_

In [15]:
# rfc= RandomForestClassifier()

# param = dict(n_estimators=[500, 1000],
#              criterion=['entropy'],
#              max_features=['log2', 'sqrt'],
#              class_weight=["balanced"],
#              min_samples_leaf=range(3,10),
#              min_samples_split=range(3,10),
#              max_depth=[i for i in range(25, 52, 3)] + [None]
#             )

# # scoring=scoring, refit='AUC'
# clf = RandomizedSearchCV(rfc, param, n_iter=16,
#                          random_state=2021, cv=5,
#                          scoring='f1_weighted', n_jobs=-1)
# search = clf.fit(X, train_labels)
# search.best_score_, search.best_params_

In [16]:
# from sklearn.ensemble import GradientBoostingClassifier
# from lightgbm import LGBMClassifier

# lgc = LGBMClassifier()

# param = dict(n_estimators=[500, 1000, 1200, 2000],
#              class_weight=["balanced"],
#              learning_rate=[0.005, 0.001, 0.05, 0.01, 0.1],
#              max_depth=[i for i in range(11, 70, 3)] + [None]
#             )

# # scoring=scoring, refit='AUC'
# clf = RandomizedSearchCV(lgc, param, n_iter=49,
#                          random_state=2021, cv=5,
#                          scoring='f1_weighted', n_jobs=-1)
# search = clf.fit(X, train_labels)
# search.best_score_, search.best_params_

In [17]:
# # y_proba = clf.predict_proba(test_feat)
# pred_df = test_df[['filename']].copy()
# pred_df['emotion'] = clf.predict(X_test)
# pred_df.to_csv("test_pred_v5.csv", index=False)

In [18]:
# mel_spect = librosa.feature.melspectrogram(y=yt, sr=sr, n_fft=2048//2, hop_length=512//2)
# mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
# librosa.display.specshow(mel_spect, y_axis='mel', fmax=9000, x_axis='time');
#plt.title('Mel Spectrogram');
# plt.savefig('x.jpeg')

In [19]:

# def read_audio(fp):
#     with audioread.audio_open(fp) as f:
#         # totalsec contains the length in float
#         totalsec = f.duration
#     return totalsec

# train_durations = []
# for fn, label in tqdm(train_df.values):
#     path_load = os.path.join(TRAIN_fp,fn)
#     train_durations.append(read_audio(path_load))
    
    
# test_durations = []
# for fn in tqdm(os.listdir(TEST_fp)):
#     path_load = os.path.join(TEST_fp,fn)
#     test_durations.append(read_audio(path_load))

# sum(test_durations)/len(test_durations),min(test_durations), max(test_durations)

In [20]:
# from tqdm.notebook import tqdm

# save_path = "/kaggle/working/TrainImgFiles"
# if not os.path.exists(save_path):
#     os.mkdir(save_path)
    
# train_data = []
# for fn, label in tqdm(train_df.values):
#     path_load = os.path.join(TRAIN_fp,fn)
#     y, sr = audioread_load(path_load, duration=5)
#     yt, _ = librosa.effects.trim(y)
#     mel_spect = librosa.feature.melspectrogram(y=yt, sr=sr, n_fft=1024, hop_length=256)
#     mel_spect = librosa.power_to_db(mel_spect, ref=np.max)
#     print(mel_spect.shape)
#     librosa.display.specshow(mel_spect, y_axis='mel', fmax=9000, x_axis='time')
# #     save_dir = os.path.join(save_path,label)
# #     if not os.path.exists(save_dir):
# #         os.mkdir(save_dir)
#     img_fn = f"{fn.split('.')[0]}.png"
#     plt.savefig(os.path.join(save_path, img_fn))
#     train_data.append([img_fn, label])

In [21]:
# import librosa as lb
# import soundfile as sf


# def audio_features(file_title, mfcc, chroma, mel):
#     with sf.SoundFile(file_title) as audio_recording:
#         audio = audio_recording.read(dtype="float32")
#         sample_rate = audio_recording.samplerate
        
#         if chroma:
#             stft=np.abs(lb.stft(audio))
#             result=np.array([])
#         if mfcc:
#             mfccs=np.mean(lb.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40).T, axis=0)
#             result=np.hstack((result, mfccs))
#         if chroma:
#             chroma=np.mean(lb.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
#             result=np.hstack((result, chroma))
#         if mel:
#             mel=np.mean(lb.feature.melspectrogram(audio, sr=sample_rate).T,axis=0)
#             result=np.hstack((result, mel))
#         return result



#         chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
#         rmse = librosa.feature.rmse(y=y)
#         spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
#         spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
#         rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
#         zcr = librosa.feature.zero_crossing_rate(y)
#         mfcc = librosa.feature.mfcc(y=y, sr=sr)
#         to_append = f'{filename} {np.mean(chroma_stft)} {np.mean(rmse)} {np.mean(spec_cent)} {np.mean(spec_bw)} {np.mean(rolloff)} {np.mean(zcr)}'    
#         for e in mfcc:
#             to_append += f' {np.mean(e)}'
#         to_append += f' {g}'