In [2]:
import os
import pandas as pd
from pydub import AudioSegment

In [None]:
file_path = r"D:\AIDS\3rd year\sem2\Bee Hive Condition Projecct\archive\Beehive-Health-Detection-System\bee_data_split_audio.xlsx"

# Load the Excel workbook into a DataFrame (the file is .xlsx, not CSV)
df = pd.read_excel(file_path)

# Visual separator printed between the report sections below
section_break = "\n" + "="*50 + "\n"

# Display the first few rows of the dataset
print("First 5 rows of the dataset:")
print(df.head())
print(section_break)

# Display basic info about the dataset (data types, non-null counts).
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() -- doing so appends a stray "None" line to the output.
print("Dataset Info:")
df.info()
print(section_break)

# Check for missing values (count of NaN per column)
print("Missing Values per Column:")
missing_values = df.isnull().sum()
print(missing_values)
print(section_break)

# Percentage of missing values per column, derived from the counts above
# instead of scanning the frame for nulls a second time
print("Percentage of Missing Values per Column:")
missing_percentage = (missing_values / len(df)) * 100
print(missing_percentage.round(2))  # Rounded to 2 decimal places
print(section_break)

# Check column data types and unique value counts (to spot potential errors)
print("Column Data Types and Unique Values:")
for column in df.columns:
    values = df[column]
    print(f"Column: {column}")
    print(f"  Data Type: {values.dtype}")
    print(f"  Unique Values: {values.nunique()}")
    # Display sample values (first 5 unique non-null values) to inspect for errors
    print(f"  Sample Values: {values.dropna().unique()[:5]}")
    print("-"*30)

In [None]:
# Input Excel file and output directory
excel_path = r"D:\AIDS\3rd year\sem2\Bee Hive Condition Projecct\archive\bee_data_with_features.xlsx"
output_dir = r"D:\AIDS\3rd year\sem2\Bee Hive Condition Projecct\archive\sound_files\split_audio"
audio_dir = r"D:\AIDS\3rd year\sem2\Bee Hive Condition Projecct\archive\sound_files\sound_files"

# Column name in Excel that contains file names
audio_column = 'file name'  # Change this if your column name is different

# Splitting parameters, defined once because they never change inside the loops
segments_per_recording = 6   # looks for <base>__segment0.wav .. <base>__segment5.wav
chunk_length_ms = 10 * 1000  # 10 seconds

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Read the Excel file
df = pd.read_excel(excel_path)

# One entry per exported chunk: a copy of the source row with the file name
# replaced by the chunk's file name
new_rows = []

# Process each row in the Excel
for idx, row in df.iterrows():
    original_file_name = row[audio_column]

    # A blank cell is read as NaN (a float); calling .replace on it would
    # abort the whole run, so such rows are reported and skipped instead.
    if not isinstance(original_file_name, str):
        print(f"Skipping row {idx}: invalid file name {original_file_name!r}")
        continue

    base_name = original_file_name.replace(".raw", "")

    # Loop through all expected segments of this recording
    for segment_index in range(segments_per_recording):
        segment_file = os.path.join(audio_dir, f"{base_name}__segment{segment_index}.wav")

        if not os.path.exists(segment_file):
            print(f"File not found: {segment_file}")
            continue

        audio = AudioSegment.from_wav(segment_file)
        duration_ms = len(audio)
        # Same for every chunk of this segment, so computed once per segment
        segment_base_name = os.path.splitext(os.path.basename(segment_file))[0]

        # Split each segment into 10-second parts; the last part holds whatever
        # audio remains and may therefore be shorter than 10 seconds
        for chunk_index, start_ms in enumerate(range(0, duration_ms, chunk_length_ms)):
            chunk = audio[start_ms:start_ms + chunk_length_ms]
            chunk_filename = f"{segment_base_name}_part{chunk_index + 1}.wav"
            chunk_path = os.path.join(output_dir, chunk_filename)

            # Export the chunk
            chunk.export(chunk_path, format="wav")

            # Append to new dataset with updated filename
            new_row = row.copy()
            new_row[audio_column] = chunk_filename
            new_rows.append(new_row)

# Create a new DataFrame and save to Excel. When nothing was exported, keep the
# original column headers so the output sheet still has the expected schema.
if new_rows:
    new_df = pd.DataFrame(new_rows)
else:
    new_df = pd.DataFrame(columns=df.columns)
new_excel_path = os.path.join(os.path.dirname(excel_path), "bee_data_split_audio.xlsx")
new_df.to_excel(new_excel_path, index=False)

print("✅ Audio splitting complete and new Excel file created!")


File not found: D:\AIDS\3rd year\sem2\Bee Hive Condition Projecct\archive\sound_files\sound_files\2022-06-12--22-58-10_2__segment5.wav
File not found: D:\AIDS\3rd year\sem2\Bee Hive Condition Projecct\archive\sound_files\sound_files\2022-06-14--20-15-04_2__segment5.wav
File not found: D:\AIDS\3rd year\sem2\Bee Hive Condition Projecct\archive\sound_files\sound_files\2022-06-14--22-14-39_2__segment1.wav
File not found: D:\AIDS\3rd year\sem2\Bee Hive Condition Projecct\archive\sound_files\sound_files\2022-06-14--22-14-39_2__segment2.wav
File not found: D:\AIDS\3rd year\sem2\Bee Hive Condition Projecct\archive\sound_files\sound_files\2022-06-14--22-14-39_2__segment3.wav
File not found: D:\AIDS\3rd year\sem2\Bee Hive Condition Projecct\archive\sound_files\sound_files\2022-06-14--22-14-39_2__segment4.wav
File not found: D:\AIDS\3rd year\sem2\Bee Hive Condition Projecct\archive\sound_files\sound_files\2022-06-14--22-14-39_2__segment5.wav
File not found: D:\AIDS\3rd year\sem2\Bee Hive Conditio

In [6]:
file_path = r"D:\AIDS\3rd year\sem2\Bee Hive Condition Projecct\archive\Beehive-Health-Detection-System\bee_data_split_audio1.xlsx"


# Load the Excel file into a DataFrame (the path points to an .xlsx workbook, not a CSV)
df = pd.read_excel(file_path)

In [7]:
import os
import librosa
import numpy as np
import pandas as pd

# Strip leading/trailing whitespace from the column headers.
# NOTE: nothing is loaded here -- `df` must already exist from an earlier cell.
df.columns = df.columns.str.strip()

# Path to the directory containing audio files
audio_dir = r"D:\AIDS\3rd year\sem2\Bee Hive Condition Projecct\archive\sound_files\split_audio"

# Function to extract audio features (e.g., MFCCs)
def extract_audio_features(file_path, n_mfcc=13):
    """Return the mean MFCC vector of one audio file.

    Args:
        file_path: Path of the audio file to load with librosa.
        n_mfcc: Number of MFCC coefficients to compute (default 13).

    Returns:
        The MFCC matrix averaged over its second axis, i.e. one value per
        coefficient (presumably the time/frame axis -- see librosa docs).
    """
    # Load the file, requesting a 22050 Hz sampling rate
    signal, sample_rate = librosa.load(file_path, sr=22050)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
    # Transposing and reducing over axis 0 collapses the original second axis
    return np.mean(coefficients.T, axis=0)

# Number of MFCC coefficients per file. Defined once so the extractor call,
# the NaN placeholder row and the output column names cannot drift apart.
N_MFCC = 13
mfcc_columns = [f"mfcc_{i+1}" for i in range(N_MFCC)]

# Loop over each file and extract features.
# `audio_features` receives exactly one entry per row of `df`, in row order;
# `failed_files` records every file name that was missing or raised an error.
audio_features = []
failed_files = []

for fname in df['file name']:
    # Placeholder used when the file is missing or cannot be processed
    features = [np.nan] * N_MFCC

    try:
        segment_path = os.path.join(audio_dir, fname)
        if os.path.exists(segment_path):
            features = extract_audio_features(segment_path, n_mfcc=N_MFCC)
        else:
            print(f"Segment missing: {segment_path}")
            failed_files.append(fname)

    except Exception as e:
        # Best-effort: report the failure and keep going with a NaN row
        print(f"❌ Error processing {fname}: {e}")
        failed_files.append(fname)

    audio_features.append(features)


# Add audio features to dataframe.
# Reusing df.index keeps the feature rows aligned with their metadata rows even
# if `df` does not carry a default 0..n-1 index (concat with axis=1 aligns on
# index labels, not on position).
mfcc_df = pd.DataFrame(audio_features, columns=mfcc_columns, index=df.index)
# NOTE(review): dropna() removes rows with a missing value in ANY column,
# metadata included -- confirm this is intended rather than dropping only the
# rows whose feature extraction failed.
df_combined = pd.concat([df, mfcc_df], axis=1).dropna()


In [8]:
# Save the combined metadata + MFCC table to an Excel file without the index
# column. The path is relative, so the file lands in the kernel's current
# working directory rather than next to the other (absolute-path) data files.
df_combined.to_excel("bee_data_with_features1.xlsx", index=False)
