In [3]:
%pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pydub import AudioSegment
from pydub.silence import split_on_silence

def split_audio_on_silence(input_file, output_folder, silence_thresh=-30, min_silence_len=2000, keep_silence=100):
    # Load audio file
    audio = AudioSegment.from_file(input_file, format="wav")

    # Split audio based on silence
    segments = split_on_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence
    )

    segment_durations = [len(segment) / 1000.0 for segment in segments]

    # Export segments and print their lengths
    for i, segment in enumerate(segments):
        
        # Calculate and print segment duration in seconds
        segment_duration = segment_durations[i]  # Convert milliseconds to seconds
        print(f"Segment {i}: Duration = {segment_duration:.2f} seconds")

    return segments, segment_durations



In [3]:
import os
import glob

def process_folder(folder_path, folder_name):
    # Create a folder to store the segmented audio files
    output_folder = os.path.join(".", 'segmented')
    os.makedirs(output_folder, exist_ok=True)

    # Get a list of all .wav files in the folder
    audio_files = glob.glob(os.path.join(folder_path, '*.wav'))

    # Loop through each audio file
    for idx, audio_file in enumerate(audio_files):
        audio = AudioSegment.from_wav(audio_file)
        total_duration = audio.duration_seconds  # Get the duration of the original file

        segments, segment_durations = split_audio_on_silence(audio_file, output_folder, silence_thresh=-30, min_silence_len=2000, keep_silence=100)

        cumulative_start_time = 0  # Keep track of the cumulative start time
        cumulative_end_time = 0  # Keep track of the cumulative end time

        for segment_idx, (segment, segment_duration) in enumerate(zip(segments, segment_durations)):
            segment_name = f"{folder_name}_{idx}_{segment_idx}.wav"
            segment_path = os.path.join(output_folder, segment_name)
            segment.export(segment_path, format="wav")

            # Calculate cumulative start and end times
            start_time = cumulative_start_time
            end_time = cumulative_end_time + segment_duration

            # Store start and end times in a text file
            time_info_file = os.path.splitext(segment_path)[0] + '.txt'
            with open(time_info_file, 'w') as f:
                f.write(f"Start time: {start_time} seconds\nEnd time: {end_time} seconds\n(Out of {total_duration} seconds)")

            cumulative_start_time = end_time
            cumulative_end_time = end_time

In [4]:
folder_paths = [
    os.path.join(".", "CodeMixed")
]

folder_names = ["CodeMixed"]

for folder_path, folder_name in zip(folder_paths, folder_names):
    process_folder(folder_path, folder_name)

Segment 0: Duration = 13.78 seconds
Segment 1: Duration = 10.98 seconds
Segment 2: Duration = 24.69 seconds
Segment 3: Duration = 13.85 seconds
Segment 4: Duration = 35.90 seconds
Segment 5: Duration = 4.55 seconds
Segment 6: Duration = 29.03 seconds
Segment 7: Duration = 4.43 seconds
Segment 8: Duration = 5.67 seconds
Segment 9: Duration = 22.59 seconds
Segment 10: Duration = 47.57 seconds
Segment 11: Duration = 29.79 seconds
Segment 12: Duration = 44.25 seconds
Segment 13: Duration = 10.18 seconds
Segment 14: Duration = 45.47 seconds
Segment 15: Duration = 45.76 seconds
Segment 0: Duration = 41.57 seconds
Segment 1: Duration = 26.48 seconds
Segment 2: Duration = 81.57 seconds
Segment 3: Duration = 55.05 seconds
Segment 4: Duration = 26.50 seconds
Segment 5: Duration = 30.23 seconds
Segment 6: Duration = 10.08 seconds
Segment 7: Duration = 9.04 seconds
Segment 8: Duration = 2.43 seconds
Segment 9: Duration = 45.37 seconds
Segment 10: Duration = 8.67 seconds
Segment 11: Duration = 26.4

In [1]:
%pip install librosa

Collecting librosa
  Downloading librosa-0.10.1-py3-none-any.whl.metadata (8.3 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting numba>=0.51.0 (from librosa)
  Downloading numba-0.59.1-cp39-cp39-win_amd64.whl.metadata (2.8 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Downloading soundfile-0.12.1-py2.py3-none-win_amd64.whl.metadata (14 kB)
Collecting pooch>=1.0 (from librosa)
  Downloading pooch-1.8.1-py3-none-any.whl.metadata (9.5 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-0.3.7-cp39-cp39-win_amd64.whl.metadata (5.7 kB)
Collecting lazy-loader>=0.1 (from librosa)
  Downloading lazy_loader-0.4-py3-none-any.whl.metadata (7.6 kB)
Collecting msgpack>=1.0 (from librosa)
  Downloading msgpack-1.0.8-cp39-cp39-win_amd64.whl.metadata (9.4 kB)
Collecting llvmlite<0.43,>=0.42.0dev0 (from numba>=0.51.0->librosa)
  Downloading llvmlite-0.42.0-cp39-cp39-win_amd64.whl.metadata (4.9 kB)
Downloading librosa-0

In [24]:
import librosa
import pandas as pd

def process_segmented_files(segmented_folder_path):
    data = []

    segmented_files = glob.glob(os.path.join(segmented_folder_path, '*.wav'))

    for segmented_file in segmented_files:

        print(segmented_file)
        print("\n\n")

        y, sample_rate = librosa.core.load(segmented_file, sr=None, mono=True)
        segments = librosa.effects.split(y, top_db=20)
        segment_count = len(segments)
        segment_list = []
        
        for i, (start, end) in enumerate(segments):
            segment = y[start:end]
            segment_list.append(segment)
            
            # Calculate and print segment duration in seconds
            segment_duration = (end - start) / sample_rate
            print(f"Segment {i}: Duration = {segment_duration:.2f} seconds")
                    

        # Extract the label from the file name
        file_name = os.path.basename(segmented_file)
        label = file_name.split('_')[0]

        # Append the features and label to the data list
        data.append({'segment_list': segment_list, 'label': label,'sample_rate':sample_rate})

    # Create a DataFrame from the data list
    df = pd.DataFrame(data)

    return df

dataset = process_segmented_files("C:/Users/aksha/OneDrive/Documents/Programming/LanguageIdentification/segmented")

C:/Users/aksha/OneDrive/Documents/Programming/LanguageIdentification/segmented\CodeMixed_0_0.wav



Segment 0: Duration = 1.38 seconds
Segment 1: Duration = 1.15 seconds
Segment 2: Duration = 0.77 seconds
Segment 3: Duration = 0.26 seconds
Segment 4: Duration = 1.18 seconds
Segment 5: Duration = 0.58 seconds
Segment 6: Duration = 0.22 seconds
Segment 7: Duration = 1.28 seconds
Segment 8: Duration = 0.70 seconds
Segment 9: Duration = 0.70 seconds
Segment 10: Duration = 0.26 seconds
Segment 11: Duration = 0.77 seconds
Segment 12: Duration = 1.17 seconds
C:/Users/aksha/OneDrive/Documents/Programming/LanguageIdentification/segmented\CodeMixed_0_1.wav



Segment 0: Duration = 2.02 seconds
Segment 1: Duration = 0.64 seconds
Segment 2: Duration = 0.80 seconds
Segment 3: Duration = 0.80 seconds
Segment 4: Duration = 0.70 seconds
Segment 5: Duration = 0.22 seconds
Segment 6: Duration = 1.06 seconds
Segment 7: Duration = 0.22 seconds
Segment 8: Duration = 0.38 seconds
Segment 9: Duration = 0.90 

In [25]:
print(dataset['segment_list'][0])

[array([-0.10708618, -0.0710144 , -0.08306885, ..., -0.0078125 ,
       -0.00427246, -0.0098877 ], dtype=float32), array([ 0.1307373 ,  0.14105225, -0.01507568, ..., -0.00198364,
       -0.00094604, -0.00289917], dtype=float32), array([ 0.00234985,  0.00177002,  0.00180054, ..., -0.00823975,
       -0.01000977, -0.0098877 ], dtype=float32), array([ 0.00289917,  0.003479  ,  0.00299072, ...,  0.04116821,
       -0.0489502 ,  0.078125  ], dtype=float32), array([-0.00317383,  0.00152588, -0.00512695, ..., -0.00817871,
       -0.00900269, -0.00976562], dtype=float32), array([0.00906372, 0.00741577, 0.00772095, ..., 0.00411987, 0.03277588,
       0.01635742], dtype=float32), array([ 0.0118103 , -0.05474854, -0.01876831, ...,  0.00854492,
        0.00894165,  0.0027771 ], dtype=float32), array([-0.0020752 , -0.00125122, -0.00094604, ..., -0.00613403,
       -0.00732422, -0.00772095], dtype=float32), array([-0.00244141, -0.0017395 ,  0.00015259, ..., -0.02468872,
       -0.0123291 , -0.008178

In [21]:
for segment in dataset["segment_list"][0]:
    print(segment)

[-0.10708618 -0.0710144  -0.08306885 ... -0.0078125  -0.00427246
 -0.0098877 ]
[ 0.1307373   0.14105225 -0.01507568 ... -0.00198364 -0.00094604
 -0.00289917]
[ 0.00234985  0.00177002  0.00180054 ... -0.00823975 -0.01000977
 -0.0098877 ]
[ 0.00289917  0.003479    0.00299072 ...  0.04116821 -0.0489502
  0.078125  ]
[-0.00317383  0.00152588 -0.00512695 ... -0.00817871 -0.00900269
 -0.00976562]
[0.00906372 0.00741577 0.00772095 ... 0.00411987 0.03277588 0.01635742]
[ 0.0118103  -0.05474854 -0.01876831 ...  0.00854492  0.00894165
  0.0027771 ]
[-0.0020752  -0.00125122 -0.00094604 ... -0.00613403 -0.00732422
 -0.00772095]
[-0.00244141 -0.0017395   0.00015259 ... -0.02468872 -0.0123291
 -0.00817871]
[ 0.00393677  0.00143433  0.00167847 ... -0.00158691 -0.00061035
 -0.00030518]
[-0.00424194 -0.00488281 -0.00500488 ...  0.         -0.00454712
  0.00228882]
[ 0.00735474  0.00720215  0.00958252 ... -0.00073242 -0.00064087
 -0.00091553]
[-0.00018311 -0.00027466 -0.00067139 ...  0.00375366  0.00524

In [29]:
dataset.head()

Unnamed: 0,segment_list,label,sample_rate
0,"[[-0.10708618, -0.071014404, -0.08306885, -0.0...",CodeMixed,16000
1,"[[0.0016479492, 0.001739502, 0.0016174316, 0.0...",CodeMixed,16000
2,"[[0.002380371, 0.0025634766, 0.0025939941, 0.0...",CodeMixed,16000
3,"[[-0.007904053, -0.0082092285, -0.007293701, -...",CodeMixed,16000
4,"[[0.004852295, 0.0023498535, -3.0517578e-05, 0...",CodeMixed,16000


In [32]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False
)

In [33]:
from transformers import Wav2Vec2Model

model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
audio_data = dataset["segment_list"][0][0]

In [35]:
import torch

# Assuming `input_audio` is your audio data as a PyTorch tensor
with torch.no_grad():
    inputs = feature_extractor(audio_data, return_tensors="pt", sampling_rate=16000)
    outputs = model(**inputs)

print(outputs)