In [None]:
import pandas as pd 
import string
import numpy as np
import os

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download the NLTK resources used in this notebook if they are not already present:
#   - 'stopwords' backs stopwords.words('english')
#   - 'punkt' and 'punkt_tab' back word_tokenize; newer NLTK releases look up
#     'punkt_tab', older ones 'punkt', so both are fetched
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Input locations: the folder holding the audio clips and the CSV holding their captions
music_folder = "data/music"
caption_df = pd.read_csv('data/Mname&Captions.csv')

In [None]:
# Work on the first 10 rows only. .copy() makes this an independent frame, so the
# columns added in later cells are not written into a slice of the full table
# (which is what triggers pandas' SettingWithCopyWarning).
caption_df = caption_df[:10].copy()

In [None]:
def remove_stop_words(text):
    """Return ``text`` with English stop words removed.

    The text is tokenized with ``word_tokenize``; a token is dropped when its
    lowercase form is in NLTK's English stop-word list. Surviving tokens keep
    their original casing and are re-joined with single spaces.
    """
    english_stops = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in english_stops:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
def text_p(text):
    """Lowercase ``text`` and delete every character in ``string.punctuation``."""
    # Mapping each punctuation character to None tells str.translate to drop it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    return text.lower().translate(drop_punctuation)

In [None]:
# Single Porter stemmer instance, created once and reused by stem_text
stemmer = PorterStemmer()

# Stem every token of a text
def stem_text(text):
    """Tokenize ``text``, stem each token with ``stemmer``, and join the stems with spaces."""
    return ' '.join(stemmer.stem(token) for token in word_tokenize(text))

In [None]:
def text_preprocess(text):
    """Run the caption-cleaning pipeline on ``text`` and return the result.

    Steps, applied in order: ``text_p`` (lowercase + strip punctuation),
    ``remove_stop_words``, then ``stem_text``.

    NOTE(review): the output is later looked up word-by-word in the GloVe
    vocabulary; Porter stems (e.g. "happi") may not be present there, in which
    case those words are silently skipped -- confirm stemming is wanted.
    """
    for step in (text_p, remove_stop_words, stem_text):
        text = step(text)
    return text

## What I did in text_preprocess
1. Lowercasing
2. Removing Punctuation
3. Removing Stop Words
4. Stemming

In [None]:
# Store the cleaned version of every caption next to the raw text
caption_df['Preprocessed_Text'] = caption_df["caption"].apply(text_preprocess)

In [None]:
caption_df

### Word embedding using GloVe

In [None]:
# Load GloVe model
def load_glove_model(file_path):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-empty line is expected to hold a token followed by
    whitespace-separated numbers; the numbers become a float32 NumPy array.
    Blank lines (for example a trailing newline at the end of the file) are
    skipped instead of raising ``IndexError``.

    Args:
        file_path: Path of the UTF-8 encoded GloVe text file.

    Returns:
        dict mapping each word to its embedding vector (``np.ndarray``, float32).
    """
    print("Loading GloVe Model...")
    glove_model = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Nothing on this line -- there is no word to index.
                continue
            word, *values = parts
            glove_model[word] = np.array(values, dtype='float32')
    print(f"Done. {len(glove_model)} words loaded!")
    return glove_model

In [None]:
# Path uses '/' like the other data paths in this notebook (valid on Windows and POSIX)
glove_model = load_glove_model("data/glove.6B.50d.txt")

In [None]:
# Function to get sentence embedding
def get_sentence_embedding(sentence, model):
    """Average the vectors of the words in ``sentence`` that ``model`` knows.

    Args:
        sentence: Whitespace-separated text.
        model: Mapping from word to embedding vector (``np.ndarray``).

    Returns:
        The element-wise mean of the known words' vectors. When no word of the
        sentence is in ``model``, a zero vector with the same shape *and dtype*
        as the model's vectors, so every embedding has a consistent dtype.
    """
    word_vectors = [model[word] for word in sentence.split() if word in model]
    if not word_vectors:
        # Use any stored vector as the template for the fallback's shape/dtype.
        reference = next(iter(model.values()))
        return np.zeros(reference.shape, dtype=reference.dtype)
    return np.mean(word_vectors, axis=0)


In [None]:
# One averaged GloVe vector per preprocessed caption
caption_df["embedding"] = caption_df["Preprocessed_Text"].apply(
    lambda sentence: get_sentence_embedding(sentence, glove_model)
)


In [None]:
caption_df

In [None]:
import IPython.display as ipd
# Inline player for the clip named at index label 0 of the 'audio' column
ipd.Audio(f"data/music/{caption_df['audio'][0]}")

In [None]:
import librosa

In [None]:
# Period (T) is the time interval between two successive peaks
# Frequency (f) is inversely related to the period, e.g. longer period == lower frequency and vice versa {f = 1 / T}

# Amplitude (A) is the height of a peak (or the depth of a valley)
# Phase (φ) shifts the wave to the right or to the left
# Simple wave: y(t) = A sin(2π f t + φ)

In [None]:
# The two fundamental properties of a sound wave are frequency and amplitude; they are related to pitch and loudness, i.e.
# Pitch is not a physical feature of the sound (it is how the frequency is perceived)
# Frequency & Pitch :- higher frequency == higher pitch
# Amplitude & Loudness :- larger amplitudes are louder

In [None]:
"""
All sound waves are analog and continuous, and we cannot store them in digital form at infinite resolution, so we need
Analog-to-Digital Conversion (ADC). The steps of ADC are:
1. Signal sampled at uniform time intervals (Sample rate = 44,100 Hz)
2. Amplitude quantized with limited number of bits (Bit depth = 16 bits/channel)
At each sample, the amplitude is projected to the closest quantization level.
"""

In [None]:
"""
Real sound waves are really messy, so to understand them we use the
Fourier Transform :- Decompose a complex periodic sound into a sum of sine waves oscillating at different frequencies.

Complex sound s(t) = A1 sin(2π f1 t + φ1) + A2 sin(2π f2 t + φ2) + ...
The amplitude decides how much a particular sine function contributes to the final complex sound, i.e. the bigger the amplitude, the more it affects the complex sound.
"""

In [None]:
"""
The FFT (Fast Fourier Transform) gives a power spectrum, which is a snapshot of the whole sound wave:
magnitude / power on the Y-axis and frequency on the X-axis.
Because of this we lose the information about the time domain.
The FFT takes us from the time domain to the frequency domain.
There is no info about time.
"""

In [None]:
"""
Solution: Short-Time Fourier Transform (STFT)
- Computes several FFTs at different intervals
- Preserves time info
- Uses a fixed frame size (e.g. 2048 samples): compute the FFT on one frame, then move on and compute the FFT on the next frame of the same size
- Gives us a Spectrogram (Time + Frequency + magnitude)
"""

In [None]:
"""
Mel Frequency Cepstral Coefficients (MFCCs) ## This is the basic differentiator: even if the frequency and amplitude are the same for a piano and a guitar, MFCCs help tell them apart
- Capture timbral / textural aspects of the sound
- Frequency domain feature 
- Approximate the human auditory system
- 13 to 40 coefficients
- Calculated at each frame
"""

In [None]:
# Path of the clip named at index label 0 of the 'audio' column
sound1 = f"data/music/{caption_df['audio'][0]}"

In [None]:
import librosa , librosa.display
import matplotlib.pyplot as plt

In [None]:
# def audio(sound):
#     signal, sample_rate = librosa.load(sound, sr=22050) # signal is a NumPy array with sr * T values (e.g. 22050 * 4) holding the amplitude at each sample
#     # FFT -> Power Spectrum
#     fft = np.fft.fft(signal)
#     magnitude = np.abs(fft)
#     frequency = np.linspace(0, sample_rate, len(magnitude))
    
#     # STFT
#     n_FFT = 2048 # Window for FFT
#     hop_length = 512 # Shift rate
#     STFT = librosa.core.stft(signal, hop_length=hop_length, n_fft=n_FFT)
#     spectrogram = np.abs(STFT) 
#     log_spectrogram = librosa.amplitude_to_db(spectrogram)

#     # Compute MFCCs
#     MFFCs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_fft=n_FFT, hop_length=hop_length, n_mfcc=13)

#     return log_spectrogram, MFFCs

## Some useful Features for Text to Audio
1. Spectral Features
   1. Spectrogram (Frequency spectrum over time.)
   2. Mel-Spectrogram (How do humans hear)
2. Cepstral Features
   1. Mel-Frequency Cepstral Coefficients (MFCCs): STFT-based (Useful for Timbral aspects)
3. Chroma features (12 different pitch classes (like C, C#, D, etc.)) and Harmonic content
4. Rhythmic Features
   1. Tempo : Speed beats occur in the audio
   2. Beat and Onset Detection 
5. Tonnetz (Tonal Centroid Features)
   1. Tonnetz: Describes the harmonic relations in music, like tonality and chordal structure.

In [None]:

def process_audio(sound_path, music_dir="data/music", sr=22050, n_fft=2048, hop_length=512, n_mfcc=13):
    """Load one audio clip and compute its log-spectrogram and MFCC features.

    The defaults reproduce the values that were previously hard-coded, so
    ``process_audio(name)`` behaves as before.

    Args:
        sound_path: File name of the clip, relative to ``music_dir``.
        music_dir: Folder that contains the clips.
        sr: Sample rate passed to ``librosa.load``.
        n_fft: FFT window size, used for both the STFT and the MFCCs.
        hop_length: Hop between successive frames, used for both as well.
        n_mfcc: Number of MFCCs to compute.

    Returns:
        A ``(log_spectrogram, MFFCs)`` tuple: the STFT magnitude converted with
        ``librosa.amplitude_to_db``, and the MFCC matrix transposed relative to
        what ``librosa.feature.mfcc`` returns.
    """
    full_path = f"{music_dir}/{sound_path}"
    # Load the audio file
    signal, sample_rate = librosa.load(full_path, sr=sr)

    # Short-Time Fourier Transform (STFT) -> magnitude -> decibel scale
    stft = librosa.stft(signal, hop_length=hop_length, n_fft=n_fft)
    spectrogram = np.abs(stft)
    log_spectrogram = librosa.amplitude_to_db(spectrogram)

    # Mel-Frequency Cepstral Coefficients (MFCCs), computed with the same framing
    MFFCs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mfcc=n_mfcc)
    MFFCs = MFFCs.T  # Transpose the MFCC matrix

    return log_spectrogram, MFFCs


In [None]:
# librosa.display.waveshow(signal, sr = sample_rate)
# plt.xlabel("Time")
# plt.ylabel("Amplitude")
# plt.show()

In [None]:
# # FFT -> Power Spectrum
# fft = np.fft.fft(signal)
# magnitude = np.abs(fft)
# frequency = np.linspace(0, sample_rate, len(magnitude))

In [None]:
# left_frequency = frequency[:int(len(frequency)/2)]
# left_magnitude = magnitude[:int(len(frequency)/2)]

In [None]:
# # Ploting Power Spectrum
# plt.plot(left_frequency, left_magnitude)
# plt.xlabel("Frequency")
# plt.ylabel("Magnitude")
# plt.show()

In [None]:
# # STFT
# n_FFT = 2048 # Window for FFT
# hop_lenght = 512 # Shift rate
# STFT = librosa.core.stft(signal, hop_length=hop_lenght, n_fft=n_FFT)
# spectrogram = np.abs(STFT) 
# log_spectrogram = librosa.amplitude_to_db(spectrogram)
# librosa.display.specshow(log_spectrogram, sr=sample_rate, hop_length=hop_lenght)

# plt.xlabel("Time")
# plt.ylabel("Frequency")
# plt.colorbar()
# plt.show()

In [None]:
# # Compute MFCCs
# MFFCs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_fft=n_FFT, hop_length=hop_lenght, n_mfcc=13)

# # Display the MFCCs
# librosa.display.specshow(MFFCs, sr=sample_rate, hop_length=hop_lenght, x_axis='time')

# plt.xlabel("Time")
# plt.ylabel("MFCCs")
# plt.colorbar()
# plt.show()

In [None]:
# Run process_audio on every file name in the 'audio' column; wrapping the returned
# (log_spectrogram, MFFCs) tuple in a pd.Series lets apply() spread it over the two new columns
caption_df[["Log_Spectrogram", "MFFCs"]] = caption_df["audio"].apply(lambda x: pd.Series(process_audio(x)))

In [None]:
caption_df