In [None]:
import pandas as pd
import numpy as np
import librosa
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
import os

# Walk the Kaggle input tree and print, for each directory visited, its path,
# its immediate sub-directories and its files, followed by a separator line.
for walk_entry in os.walk("/kaggle/input"):
    # os.walk yields (dirpath, dirnames, filenames); pair each with its label.
    for tag, listing in zip(("ROOT:", "DIRS:", "FILES:"), walk_entry):
        print(tag, listing)
    print("------------")

In [None]:
def extract_features(file_path, duration=60, n_mfcc=13):
    """Return the time-averaged MFCC vector of an audio file.

    The file is loaded with ``librosa.load`` (no explicit ``sr`` is passed, so
    librosa's default sampling-rate handling applies), MFCCs are computed with
    ``librosa.feature.mfcc`` and then averaged so that each coefficient is
    summarised by a single number.

    Parameters
    ----------
    file_path : str or path-like
        Location of the audio file to read.
    duration : float, optional
        Maximum number of seconds of audio to load. Defaults to 60.
    n_mfcc : int, optional
        Number of MFCC coefficients to compute. Defaults to 13.

    Returns
    -------
    numpy.ndarray
        The mean of the MFCC matrix taken along its second axis; with librosa's
        ``(n_mfcc, frames)`` layout this is one value per coefficient.

    Raises
    ------
    Exception
        Whatever ``librosa.load`` or ``librosa.feature.mfcc`` raises (for
        example on a missing or unreadable file) is propagated unchanged.
    """
    y, sr = librosa.load(file_path, duration=duration)

    # MFCC matrix for the loaded signal.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # Transpose, then average over the leading axis: collapses the frame
    # dimension and keeps one mean per coefficient.
    mfcc_mean = np.mean(mfcc.T, axis=0)

    return mfcc_mean

In [None]:
import os

# List the contents of /kaggle/input directory by directory: path,
# sub-directories, files, then a separator.
for folder, subfolders, filenames in os.walk("/kaggle/input"):
    print(
        f"Root: {folder}",
        f"Dirs: {subfolders}",
        f"Files: {filenames}",
        "------------",
        sep="\n",
    )

In [None]:
# Load train.csv into a DataFrame and preview its first rows.
train_csv_path = "/kaggle/input/competitions/shl-audio-scoring-challenge/dataset/csvs/train.csv"
train_df = pd.read_csv(train_csv_path)
train_df.head()

In [None]:
# Build the training feature matrix X and label vector y: one MFCC-mean vector
# (via extract_features) per row of train_df whose audio file can be processed.
audio_base_path = "/kaggle/input/competitions/shl-audio-scoring-challenge/dataset/audios/train/"

features = []
labels = []
skipped_files = []  # (file name, exception) for every clip that failed

# Only two columns are needed, so iterate over them directly.
for file_stem, label in zip(train_df["filename"], train_df["label"]):
    # ".wav" is appended here, so the CSV presumably stores names without the
    # extension -- verify against the dataset if files go missing.
    file_name = file_stem + ".wav"
    file_path = os.path.join(audio_base_path, file_name)

    try:
        mfcc_mean = extract_features(file_path)
    except Exception as exc:
        # Best-effort: keep going, but make the skipped file visible instead
        # of dropping it silently. Catching Exception (not a bare except)
        # lets KeyboardInterrupt stop this long-running loop.
        skipped_files.append((file_name, exc))
        print(f"Skipping {file_name}: {exc!r}")
        continue

    # Append feature and label together so the two lists stay aligned.
    features.append(mfcc_mean)
    labels.append(label)

X = np.array(features)
y = np.array(labels)

if skipped_files:
    print(f"Skipped {len(skipped_files)} of {len(train_df)} files")

print("Feature shape:", X.shape)
print("Label shape:", y.shape)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import numpy as np

# Hold out 20% of the samples for validation. The split uses the X / y arrays
# built from the extracted features (rather than the raw Python lists), so
# every split is an ndarray; the fixed random_state keeps it reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 200-tree random-forest regressor on the training portion.
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Predict the held-out samples.
y_pred = model.predict(X_val)

# Validation metrics: root-mean-squared error and Pearson correlation between
# true and predicted labels (the p-value returned by pearsonr is discarded).
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
pearson_corr, _ = pearsonr(y_val, y_pred)

print("Validation RMSE:", rmse)
print("Validation Pearson Correlation:", pearson_corr)

In [None]:
# Score the fitted model on the same samples it was trained on, as a
# comparison point for the validation metrics.
train_pred = model.predict(X_train)

# Root-mean-squared error on the training split.
train_rmse = np.sqrt(mean_squared_error(y_true=y_train, y_pred=train_pred))
print("Training RMSE:", train_rmse)

# Pearson correlation on the training split; only the coefficient is kept.
train_pearson = pearsonr(y_train, train_pred)[0]
print("Training Pearson Correlation:", train_pearson)

In [None]:
# Score the test clips with the trained model and write submission.csv.
test_df = pd.read_csv("/kaggle/input/competitions/shl-audio-scoring-challenge/dataset/csvs/test.csv")
test_audio_dir = "/kaggle/input/competitions/shl-audio-scoring-challenge/dataset/audios/test/"

# One feature vector per CSV row, in CSV order, computed with the same
# extract_features used for training so train and test features cannot drift.
# Errors are deliberately NOT swallowed here: skipping a clip would leave the
# predictions misaligned with test_df["filename"].
test_features = np.array([
    extract_features(os.path.join(test_audio_dir, file_name + ".wav"))
    for file_name in test_df["filename"]
])

# Predict
test_predictions = model.predict(test_features)

# Create submission file: filename column as read from test.csv, plus the
# predicted label for each row.
submission = pd.DataFrame({
    "filename": test_df["filename"],
    "label": test_predictions
})

submission.to_csv("submission.csv", index=False)

print("Submission file created successfully!")