In [1]:
!pip install pydub



In [2]:
# Import the required libraries
import librosa
import pandas as pd
import os


In [3]:
# Function to get the total number of mp3 files in a folder
def count_mp3_files(folder_path):
    """Count the MP3 files located directly inside ``folder_path``.

    The extension check is case-insensitive (``.mp3`` and ``.MP3`` both
    count), matching the lower-casing done by ``count_audio_files``.
    Sub-directories are not descended into, and a directory whose name
    happens to end in ``.mp3`` is not counted as a file.

    Args:
        folder_path: Path of the folder to inspect.

    Returns:
        int: Number of regular files in the folder with an ``.mp3`` extension.

    Raises:
        OSError: If ``folder_path`` does not exist or cannot be listed.
    """
    # scandir exposes the entry type directly, so no extra path join is
    # needed to tell files from directories.
    with os.scandir(folder_path) as entries:
        return sum(
            1
            for entry in entries
            if entry.is_file() and entry.name.lower().endswith('.mp3')
        )

# Function to get the total number of mp3 files in all folders
def total_mp3_files_in_folders(root_folder):
    """Sum the MP3 counts of every immediate sub-directory of ``root_folder``.

    Only first-level sub-directories are visited; each one is handed to
    ``count_mp3_files``. Files sitting directly in ``root_folder`` are not
    counted.

    Args:
        root_folder: Path of the folder whose sub-directories are counted.

    Returns:
        int: Combined MP3 count over all first-level sub-directories.
    """
    candidate_paths = (
        os.path.join(root_folder, entry) for entry in os.listdir(root_folder)
    )
    return sum(
        count_mp3_files(path) for path in candidate_paths if os.path.isdir(path)
    )


# Root folder of the dataset; total_mp3_files_in_folders counts the .mp3 files
# in each of its immediate sub-folders.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
root_folder_path = 'E:\\personal files\\dataset\\archive_8\\Language Detection Dataset'

# Total over all first-level sub-folders (files directly in the root are not included).
total_mp3_files = total_mp3_files_in_folders(root_folder_path)

print(f'Total number of .mp3 files in all folders: {total_mp3_files}')


Total number of .mp3 files in all folders: 256833


In [4]:
# verify if any other formats are available other than .mp3

import os
from collections import Counter

def count_files_by_type(folder_path):
    """Tally every file under ``folder_path`` by its lower-cased extension.

    The folder is walked recursively. Extensions keep their leading dot
    (e.g. ``".mp3"``); files without an extension are tallied under ``""``.

    Args:
        folder_path: Root of the directory tree to scan.

    Returns:
        collections.Counter: Mapping of extension to number of files.
    """
    return Counter(
        os.path.splitext(file_name)[1].lower()
        for _root, _dirs, file_names in os.walk(folder_path)
        for file_name in file_names
    )

def print_file_counts(folder_path):
    """Print a per-extension file count report for ``folder_path``.

    The counts come from ``count_files_by_type``; one ``extension: count``
    line is printed per extension after a header line naming the folder.

    Args:
        folder_path: Root of the directory tree to report on.
    """
    # Compute first so nothing is printed if the scan itself fails.
    counts_by_extension = count_files_by_type(folder_path)

    print(f"File counts in {folder_path}:")
    report_lines = [
        f"{extension}: {total}" for extension, total in counts_by_extension.items()
    ]
    for line in report_lines:
        print(line)

# Example usage: report which file extensions occur anywhere under the dataset root,
# to verify that the dataset contains nothing but .mp3 files.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
folder_path = "E:\\personal files\\dataset\\archive_8\\Language Detection Dataset"
print_file_counts(folder_path)




File counts in E:\personal files\dataset\archive_8\Language Detection Dataset:
.mp3: 256833


In [5]:
# Number of audio files in each folder
import os
# Module-level mapping of folder base name -> number of .mp3 files in that folder.
# Filled in by count_audio_files and read by later top-level code.
d={}
def count_audio_files(folder_path):
    """Count .mp3 files under ``folder_path`` and record per-folder counts in ``d``.

    The tree is walked recursively. For every folder below the top level the
    count is printed and stored in the module-level dict ``d`` under the
    folder's base name (so two nested folders sharing a base name overwrite
    each other). The top-level folder itself is never recorded in ``d``,
    whether or not ``folder_path`` ends with a path separator; any .mp3 files
    placed directly in it still contribute to the returned total.

    Args:
        folder_path: Root of the directory tree to scan.

    Returns:
        int: Total number of .mp3 files found in the whole tree.
    """
    audio_files_count = 0
    # Normalised once so the root can be recognised with or without a
    # trailing separator; previously only a trailing separator kept the root
    # (with its 0 files) out of `d`.
    top_level = os.path.normpath(folder_path)

    for root, dirs, files in os.walk(folder_path):
        audio_files = [file for file in files if file.lower().endswith(".mp3")]
        audio_files_count += len(audio_files)

        # The dataset root is a container, not a language folder: skip it so
        # it cannot show up as the folder with the fewest files.
        if os.path.normpath(root) == top_level:
            continue

        folder_name = os.path.basename(root)
        print(f"Folder: {folder_name}, Audio Files: {len(audio_files)}")
        d[folder_name]=len(audio_files)
    return audio_files_count

def print_total_audio_files(folder_path):
    """Print the grand total of audio files found under ``folder_path``.

    Delegates the scan to ``count_audio_files``, which also prints the
    per-folder lines before this total is printed.

    Args:
        folder_path: Root of the directory tree to scan.
    """
    grand_total = count_audio_files(folder_path)
    print(f"\nTotal Audio Files across all folders: {grand_total}")

# Example usage: print the per-folder counts followed by the overall total.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
folder_path = "E:\\personal files\\dataset\\archive_8\\Language Detection Dataset\\"
print_total_audio_files(folder_path)


# min_value = min(d.values())
# print("\nThe minimum no of audio files is:- ",min_value)

# Find the folder (language) with the fewest audio files.
# `d` is the dict filled by count_audio_files during the call above;
# min() raises ValueError if it is still empty.
min_key = min(d, key=d.get)
min_value = d[min_key]

print("\nThe minimum number of audio files is:", min_value, "for the language:", min_key)

Folder: Bengali, Audio Files: 27258
Folder: Gujarati, Audio Files: 26441
Folder: Hindi, Audio Files: 25462
Folder: Kannada, Audio Files: 22208
Folder: Malayalam, Audio Files: 24044
Folder: Marathi, Audio Files: 25379
Folder: Punjabi, Audio Files: 26229
Folder: Tamil, Audio Files: 24196
Folder: Telugu, Audio Files: 23656
Folder: Urdu, Audio Files: 31960

Total Audio Files across all folders: 256833

The minimum number of audio files is: 22208 for the language: Kannada


In [6]:
# function to extract the relevant features of an audio file
def extract_features(audio_file):
    """Summarise one audio file as a flat list of mean feature values.

    The file is loaded with ``librosa.load`` using librosa's defaults (no
    explicit sampling rate is passed). Each feature below is computed per
    frame and then averaged:

    * chroma_stft        - chromagram of the waveform
    * rms                - root-mean-square energy
    * spectral_centroid  - frequency around which the spectrum's energy is centred
    * spectral_bandwidth - spread of the spectrum around its centroid
    * spectral_rolloff   - frequency below which a given share of the energy lies
    * mfcc               - mel-frequency cepstral coefficients (spectral envelope shape)

    Args:
        audio_file: Path of the audio file to analyse.

    Returns:
        list: The five scalar means in the order listed above, followed by
        one mean per MFCC coefficient (librosa's default ``n_mfcc`` is used).
    """
    # `sample_rate` is the number of samples per second of the loaded signal.
    signal, sample_rate = librosa.load(audio_file)

    # One overall mean per spectral descriptor, in the order the CSV columns expect.
    scalar_means = [
        librosa.feature.chroma_stft(y=signal, sr=sample_rate).mean(),
        librosa.feature.rms(y=signal).mean(),
        librosa.feature.spectral_centroid(y=signal, sr=sample_rate).mean(),
        librosa.feature.spectral_bandwidth(y=signal, sr=sample_rate).mean(),
        librosa.feature.spectral_rolloff(y=signal, sr=sample_rate).mean(),
    ]

    # MFCCs are averaged over time (axis=1), giving one value per coefficient.
    mfcc_means = librosa.feature.mfcc(y=signal, sr=sample_rate).mean(axis=1).tolist()

    return scalar_means + mfcc_means



In [7]:
# function to access the audio files stored locally and then extract the features of each audio file using the extract_features function
def process_audio_files(root_folder, max_files_per_folder=20000):
    """Extract features from the .mp3 files of every sub-folder of ``root_folder``.

    Each immediate sub-folder is treated as one class; its name is appended
    to every feature row as the label. Folders and files are visited in
    sorted order so the selected subset is reproducible. Entries that are not
    .mp3 files (case-insensitive) are skipped rather than ending the folder.
    Files whose feature extraction fails are reported and skipped; they do
    not count towards the per-folder cap.

    Args:
        root_folder: Folder whose immediate sub-folders hold the audio files.
        max_files_per_folder: Maximum number of successfully extracted files
            per sub-folder. ``None`` disables the cap.

    Returns:
        tuple: ``(audio_path, features_list)`` where ``audio_path`` is the
        last file path that was attempted (``None`` if no file was attempted)
        and ``features_list`` holds one ``features + [folder_name]`` row per
        successfully processed file.

    Raises:
        OSError: If ``root_folder`` or one of its sub-folders cannot be listed.
    """
    features_list = []
    # Initialised up front so the return below is valid even when no .mp3
    # file is ever reached (empty root, folders without audio, ...).
    audio_path = None

    for folder_name in sorted(os.listdir(root_folder)): # iterate through all the folders in the root folder
        folder_path = os.path.join(root_folder, folder_name)
        if not os.path.isdir(folder_path):
            continue
        print(folder_path)

        extracted = 0  # successful extractions in this folder
        for file_name in sorted(os.listdir(folder_path)): # iterate through all the contents/files in the folder
            if max_files_per_folder is not None and extracted >= max_files_per_folder:
                break  # cap reached for this folder
            if not file_name.lower().endswith(".mp3"):
                continue  # ignore stray non-audio entries instead of abandoning the folder

            audio_path = os.path.join(folder_path, file_name)
            print("audio_path:- ",audio_path)
            try:
                features = extract_features(audio_path) # extract the features of the audio file
            except Exception as error:
                # `Exception` (not a bare except) keeps Ctrl-C / kernel
                # interrupts working during this very long loop.
                print("Cannot open the file {} from {} audio folder as it is corrupt".format(file_name,folder_name))
                print("Reason: {!r}".format(error))
                continue

            features_list.append(features + [folder_name])
            print("Audio Foder: {} and File Name: {} is successfully extracted".format(folder_name,file_name))
            extracted += 1

    return audio_path, features_list



In [8]:
# Function to save the extracted features into csv file
def save_features_to_csv(features_list, csv_file):
    """Write the extracted feature rows to ``csv_file`` with a fixed header.

    Every row must hold 26 values: five spectral means, twenty MFCC means
    and the class label, in that order.

    Args:
        features_list: List of feature rows (one list per audio file).
        csv_file: Destination path of the CSV file; written without an index column.
    """
    spectral_columns = ["chroma_stft", "rmse", "spectral_centroid",
                        "spectral_bandwidth", "spectral_rolloff"]
    mfcc_columns = list(map("mfcc_{}".format, range(1, 21)))
    header = spectral_columns + mfcc_columns + ["target"]

    pd.DataFrame(features_list, columns=header).to_csv(csv_file, index=False)



In [None]:
# define the root folder
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
root_folder = "E:\\personal files\\dataset\\archive_8\\Language Detection Dataset"

# define the csv file (a relative path, so it is resolved against the current working directory)
csv_file = "Indian_language_dataset.csv"

# Extract the features for every folder first, then write them out in one go.
# Nothing is written to disk until process_audio_files has returned, so
# interrupting this cell discards everything extracted so far.
audio_path, features_list = process_audio_files(root_folder)
save_features_to_csv(features_list, csv_file)


E:\personal files\dataset\archive_8\Language Detection Dataset\Bengali
audio_path:-  E:\personal files\dataset\archive_8\Language Detection Dataset\Bengali\0.mp3
Audio Foder: Bengali and File Name: 0.mp3 is successfully extracted
audio_path:-  E:\personal files\dataset\archive_8\Language Detection Dataset\Bengali\1.mp3
Audio Foder: Bengali and File Name: 1.mp3 is successfully extracted
audio_path:-  E:\personal files\dataset\archive_8\Language Detection Dataset\Bengali\10.mp3
Audio Foder: Bengali and File Name: 10.mp3 is successfully extracted
audio_path:-  E:\personal files\dataset\archive_8\Language Detection Dataset\Bengali\100.mp3
Audio Foder: Bengali and File Name: 100.mp3 is successfully extracted
audio_path:-  E:\personal files\dataset\archive_8\Language Detection Dataset\Bengali\1000.mp3
Audio Foder: Bengali and File Name: 1000.mp3 is successfully extracted
audio_path:-  E:\personal files\dataset\archive_8\Language Detection Dataset\Bengali\10000.mp3
Audio Foder: Bengali and Fi

  return pitch_tuning(


Audio Foder: Bengali and File Name: 13618.mp3 is successfully extracted
audio_path:-  E:\personal files\dataset\archive_8\Language Detection Dataset\Bengali\13619.mp3
Audio Foder: Bengali and File Name: 13619.mp3 is successfully extracted
audio_path:-  E:\personal files\dataset\archive_8\Language Detection Dataset\Bengali\1362.mp3
Audio Foder: Bengali and File Name: 1362.mp3 is successfully extracted
audio_path:-  E:\personal files\dataset\archive_8\Language Detection Dataset\Bengali\13620.mp3
Audio Foder: Bengali and File Name: 13620.mp3 is successfully extracted
audio_path:-  E:\personal files\dataset\archive_8\Language Detection Dataset\Bengali\13621.mp3
Audio Foder: Bengali and File Name: 13621.mp3 is successfully extracted
audio_path:-  E:\personal files\dataset\archive_8\Language Detection Dataset\Bengali\13622.mp3
Audio Foder: Bengali and File Name: 13622.mp3 is successfully extracted
audio_path:-  E:\personal files\dataset\archive_8\Language Detection Dataset\Bengali\13623.mp3
