<a href="https://colab.research.google.com/github/Adi30TyaDTU/LinguaGrade/blob/main/LinguaGrade.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

shl_intern_hiring_assessment_path = kagglehub.competition_download('shl-intern-hiring-assessment')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import pandas as pd
import numpy as np
import librosa
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import AutoTokenizer, AutoModel
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility (NumPy and PyTorch RNGs)
np.random.seed(42)
torch.manual_seed(42)

# Single root for every dataset path so the location only has to be changed
# in one place (e.g. when running outside Kaggle).
DATASET_DIR = '/kaggle/input/shl-intern-hiring-assessment/dataset'

# Load the competition tables
train_df = pd.read_csv(f'{DATASET_DIR}/train.csv')
test_df = pd.read_csv(f'{DATASET_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATASET_DIR}/sample_submission.csv')

# Directories holding the audio clips referenced by the 'filename' column
train_audio_path = f'{DATASET_DIR}/audios_train'
test_audio_path = f'{DATASET_DIR}/audios_test'

# Quick sanity check of what was loaded
print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(train_df.head())

In [None]:
# Load ASR model
# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load model directly
# NOTE(review): AutoProcessor / AutoModelForCTC are imported here but the
# lines below use the Wav2Vec2-specific classes imported earlier.
from transformers import AutoProcessor, AutoModelForCTC


# Processor (feature extraction + CTC decoding) and model share one checkpoint;
# the model is moved to `device`, so inputs must be moved there too.
asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self").to(device)

from functools import lru_cache


# The pipeline asks for the transcript of the same file more than once
# (process_files transcribes it, then extract_acoustic_features transcribes it
# again for the speaking-rate feature). A small LRU cache keyed on the path
# avoids running the large ASR model twice per clip.
@lru_cache(maxsize=32)
def transcribe_audio(audio_path):
    """Transcribe one audio file with the wav2vec2 CTC model.

    Parameters:
    -----------
    audio_path : str
        Path of the audio file; it is loaded with librosa and resampled
        to 16 kHz. Must be hashable because results are cached per path.

    Returns:
    --------
    str
        Greedy (argmax) CTC transcription of the whole clip.
    """
    # Load audio file, resampled to the 16 kHz rate passed to the processor
    speech_array, _ = librosa.load(audio_path, sr=16000)

    # Process for ASR
    inputs = asr_processor(speech_array, sampling_rate=16000, return_tensors="pt", padding=True)

    # Move every tensor to the device the model lives on
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: no gradients needed
    with torch.no_grad():
        logits = asr_model(**inputs).logits

    # Greedy decode: most likely token per frame, then collapse via the processor
    predicted_ids = torch.argmax(logits, dim=-1)
    return asr_processor.batch_decode(predicted_ids)[0]

# Example transcription (we'll process all files in batches later)
sample_audio = os.path.join(train_audio_path, train_df.iloc[0]['filename'])
print(f"Sample transcription: {transcribe_audio(sample_audio)}")

In [None]:
# Load language model for embedding
# Tokenizer and encoder come from the same "bert-base-uncased" checkpoint;
# the encoder is placed on `device`, so tokenized inputs must be moved there.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
text_model = AutoModel.from_pretrained("bert-base-uncased").to(device)

def get_text_features(text):
    """Build text features for one transcript.

    Combines simple whitespace-token statistics with a sentence embedding
    obtained by averaging the encoder's last hidden state.

    Parameters:
    -----------
    text : str
        Transcript to featurize (may be empty).

    Returns:
    --------
    dict
        'num_words', 'num_chars', 'avg_word_length', 'unique_word_ratio'
        plus one 'text_embed_<i>' entry per embedding dimension.
    """
    # Tokenize on whitespace once and reuse the result for every statistic
    words = text.split()
    num_words = len(words)

    # Basic text stats (ratios fall back to 0 for an empty transcript)
    features = {
        'num_words': num_words,
        'num_chars': len(text),
        'avg_word_length': np.mean([len(word) for word in words]) if num_words > 0 else 0,
        'unique_word_ratio': len(set(words)) / num_words if num_words > 0 else 0,
    }

    # Get BERT embeddings; inputs longer than 512 tokens are truncated
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = text_model(**inputs)

    # Use mean pooling over dim 1 of the last hidden state as sentence embedding
    mean_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    features.update({f'text_embed_{i}': val for i, val in enumerate(mean_embedding)})

    return features

# Example features
sample_text = "This is a sample sentence to demonstrate feature extraction."
print(get_text_features(sample_text))

In [None]:
def extract_acoustic_features(audio_path):
    """Compute hand-crafted acoustic features for one audio file.

    Parameters:
    -----------
    audio_path : str
        Path of the audio file; loaded with librosa at 16 kHz.

    Returns:
    --------
    dict
        Mean/std of 13 MFCCs, pitch mean/std, RMS energy mean/std,
        'speaking_rate' (transcribed words per second) and 'pause_ratio'.
    """
    # Load audio file
    y, sr = librosa.load(audio_path, sr=16000)
    num_samples = len(y)

    features = {}

    # MFCC features: per-coefficient mean and standard deviation over time
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    for i in range(13):
        features[f'mfcc_{i}_mean'] = np.mean(mfcc[i])
        features[f'mfcc_{i}_std'] = np.std(mfcc[i])

    # Pitch features: keep only bins where piptrack reported a non-zero pitch
    pitches, _ = librosa.piptrack(y=y, sr=sr)
    pitches = pitches[pitches > 0]
    if len(pitches) > 0:
        features['pitch_mean'] = np.mean(pitches)
        features['pitch_std'] = np.std(pitches)
    else:
        features['pitch_mean'] = 0
        features['pitch_std'] = 0

    # Energy features
    rms = librosa.feature.rms(y=y)
    features['energy_mean'] = np.mean(rms)
    features['energy_std'] = np.std(rms)

    # Speaking rate approximation: transcribed words per second of audio
    speech_duration = num_samples / sr
    num_words = len(transcribe_audio(audio_path).split())
    features['speaking_rate'] = num_words / speech_duration if speech_duration > 0 else 0

    # Pause features: fraction of samples removed by librosa.effects.trim.
    # NOTE(review): per the librosa docs trim only strips leading/trailing
    # silence, so pauses inside the utterance are not counted - confirm intent.
    y_trimmed, _ = librosa.effects.trim(y, top_db=20)
    # Guard the division the same way speaking_rate is guarded above
    features['pause_ratio'] = (num_samples - len(y_trimmed)) / num_samples if num_samples > 0 else 0

    return features

# Example features
print(extract_acoustic_features(sample_audio))

In [None]:
def process_files(df, audio_path, is_train=True):
    """Extract one feature row per audio file listed in ``df``.

    Parameters:
    -----------
    df : pandas.DataFrame
        Table with a 'filename' column (and a 'label' column when is_train).
    audio_path : str
        Directory containing the audio files.
    is_train : bool
        When True the row's 'label' is copied into the feature row.

    Returns:
    --------
    pandas.DataFrame
        One row per successfully processed file. Files that raise during
        processing are reported on stdout and left out of the result.
    """
    records = []

    for _, sample in tqdm(df.iterrows(), total=len(df)):
        name = sample['filename']
        wav_path = os.path.join(audio_path, name)

        try:
            # Transcript first: the text features are derived from it
            transcript = transcribe_audio(wav_path)

            # Text features, then acoustic features layered on top
            record = dict(get_text_features(transcript))
            record.update(extract_acoustic_features(wav_path))

            # Target only exists for the training table
            if is_train:
                record['label'] = sample['label']

            record['filename'] = name
            records.append(record)
        except Exception as e:
            # Best effort: report the failing file and move on to the next one
            print(f"Error processing {sample['filename']}: {str(e)}")

    return pd.DataFrame(records)

# Process training data
# (is_train defaults to True, so each row also carries its 'label')
print("Processing training data...")
train_features = process_files(train_df, train_audio_path)

# Process test data
# NOTE(review): process_files drops files that fail, so test_features can have
# fewer rows than test_df - confirm the submission still covers every file.
print("\nProcessing test data...")
test_features = process_files(test_df, test_audio_path, is_train=False)

# Save features to avoid recomputing
# (written to the current working directory; the next cell reads them back)
train_features.to_csv('train_features.csv', index=False)
test_features.to_csv('test_features.csv', index=False)

In [None]:
# Reload the cached feature tables from disk. This always re-reads the CSVs,
# replacing any train_features / test_features already in memory.
train_features = pd.read_csv('train_features.csv')
test_features = pd.read_csv('test_features.csv')

# Prepare data: everything except the target and the file identifier is a feature
X = train_features.drop(['label', 'filename'], axis=1)
y = train_features['label']

# Split into train and validation (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features: statistics are fitted on the training split only and then
# applied to the validation split. The outputs are plain NumPy arrays, so the
# column names of X are not carried into the DMatrix objects below.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Train XGBoost model
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'eta': 0.01,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42,
    'n_jobs': -1
}

dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dval = xgb.DMatrix(X_val_scaled, label=y_val)

# Up to 1000 rounds with early stopping after 50 rounds without improvement;
# per the XGBoost docs the last entry of `evals` ('val') drives early stopping.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtrain, 'train'), (dval, 'val')],
    early_stopping_rounds=50,
    verbose_eval=50
)

# Evaluate on validation set (Pearson correlation between labels and predictions)
val_preds = model.predict(dval)
pearson = pearsonr(y_val, val_preds)[0]
print(f"\nValidation Pearson Correlation: {pearson:.4f}")

# Feature importance (top 20 features)
fig, ax = plt.subplots(figsize=(12, 8))
xgb.plot_importance(model, max_num_features=20, ax=ax)
plt.title('Feature Importance')
plt.show()

In [None]:
# Prepare test data: same columns as training minus the identifier, scaled
# with the scaler that was fitted on the training split.
X_test = test_features.drop(['filename'], axis=1)
X_test_scaled = scaler.transform(X_test)
dtest = xgb.DMatrix(X_test_scaled)

# Make predictions
test_preds = model.predict(dtest)

# Clip predictions to [1, 5] range
# Round to 1 decimal place
# (test_preds is overwritten, so later cells see the clipped/rounded values)
test_preds = np.round(np.clip(test_preds, 1, 5), 1)

# Create submission
submission = pd.DataFrame({
    'filename': test_features['filename'],
    'label': test_preds
})

# Save submission (written to the current working directory)
submission.to_csv('xg_submission.csv', index=False)
print("Submission file created!")

In [None]:
submission

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Define model
def create_model(input_shape):
    """Build and compile the fully connected regression network.

    Architecture: Dense(256) -> BN -> Dropout(0.3) -> Dense(128) -> BN ->
    Dropout(0.2) -> Dense(64) -> BN -> Dense(1, linear).

    Parameters:
    -----------
    input_shape : int
        Number of input features per sample.

    Returns:
    --------
    tensorflow.keras.Model
        Model compiled with the Adam optimizer, MSE loss and an MAE metric.
    """
    inputs = Input(shape=(input_shape,))

    # (units, dropout rate) per hidden stage; None means no dropout layer
    hidden_stages = ((256, 0.3), (128, 0.2), (64, None))

    hidden = inputs
    for units, drop_rate in hidden_stages:
        hidden = Dense(units, activation='relu')(hidden)
        hidden = BatchNormalization()(hidden)
        if drop_rate is not None:
            hidden = Dropout(drop_rate)(hidden)

    # Single linear unit: unbounded regression output
    outputs = Dense(1, activation='linear')(hidden)

    network = Model(inputs=inputs, outputs=outputs)
    network.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return network

# Create and train model (input width = number of scaled feature columns)
nn_model = create_model(X_train_scaled.shape[1])
# Stop after 20 epochs without improvement and restore the best weights;
# cut the learning rate by 10x after 10 epochs without improvement.
# Neither callback sets `monitor`, so both use the Keras default.
callbacks = [
    EarlyStopping(patience=20, restore_best_weights=True),
    ReduceLROnPlateau(factor=0.1, patience=10)
]

history = nn_model.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=200,
    batch_size=32,
    callbacks=callbacks,
    verbose=1
)

# Plot training history: loss on the left, MAE on the right
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.title('Loss Evolution')

plt.subplot(1, 2, 2)
plt.plot(history.history['mae'], label='Train MAE')
plt.plot(history.history['val_mae'], label='Validation MAE')
plt.legend()
plt.title('MAE Evolution')
plt.show()

# Evaluate: flatten() turns the model output into a 1-D array for pearsonr
nn_val_preds = nn_model.predict(X_val_scaled).flatten()
nn_pearson = pearsonr(y_val, nn_val_preds)[0]
print(f"\nNeural Network Validation Pearson Correlation: {nn_pearson:.4f}")

# Make test predictions (X_test_scaled comes from the XGBoost prediction cell)
nn_test_preds = nn_model.predict(X_test_scaled).flatten()
# Clip to [1, 5] and round to 1 decimal place
nn_test_preds = np.round(np.clip(nn_test_preds, 1, 5), 1)

# Create NN submission
nn_submission = pd.DataFrame({
    'filename': test_features['filename'],
    'label': nn_test_preds
})

nn_submission.to_csv('nn_submission.csv', index=False)

In [None]:
# Delete the standalone XGBoost submission file. The existence check keeps the
# cell re-runnable: a bare os.remove raises FileNotFoundError once the file is
# gone (or when the notebook is not running under /kaggle/working).
xgb_submission_path = "/kaggle/working/xg_submission.csv"
if os.path.exists(xgb_submission_path):
    os.remove(xgb_submission_path)

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr, spearmanr

def evaluate_model(y_true, y_pred, model_name="Model"):
    """
    Comprehensive evaluation of regression model performance.

    Prints a formatted table of the metrics and returns them.

    Parameters:
    -----------
    y_true : array-like
        Ground truth target values (must support ``y_true - y_pred``)
    y_pred : array-like
        Predicted target values
    model_name : str
        Name of the model for reporting

    Returns:
    --------
    dict
        Dictionary containing all calculated metrics, plus an 'errors'
        entry holding the raw residuals ``y_true - y_pred``
    """
    # Calculate metrics on the unrounded predictions
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    pearson_corr, pearson_p = pearsonr(y_true, y_pred)
    spearman_corr, spearman_p = spearmanr(y_true, y_pred)

    # Residuals are computed once and reused by every error-based metric
    errors = y_true - y_pred
    abs_errors = np.abs(errors)

    # MAPE; the denominator is floored at 0.001 to avoid division by zero
    mape = np.mean(abs_errors / np.maximum(np.abs(y_true), 0.001)) * 100

    # Share of predictions within 0.5 and 1.0 of the true value
    within_half = np.mean(abs_errors < 0.5) * 100
    within_one = np.mean(abs_errors < 1.0) * 100

    # Store metrics in dictionary
    metrics = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MAPE (%)': mape,
        'R²': r2,
        'Pearson Correlation': pearson_corr,
        'Pearson p-value': pearson_p,
        'Spearman Correlation': spearman_corr,
        'Spearman p-value': spearman_p,
        'Within 0.5 Accuracy (%)': within_half,
        'Within 1.0 Accuracy (%)': within_one
    }

    # Print metrics in a formatted table
    print(f"\n{model_name} Performance Metrics:")
    print("-" * 50)
    for metric, value in metrics.items():
        # np.floating also covers NumPy scalars that do not subclass float
        # (e.g. float32), so they get the same 4-decimal formatting
        if isinstance(value, (float, np.floating)):
            print(f"{metric:25}: {value:.4f}")
        else:
            print(f"{metric:25}: {value}")

    # Keep the residuals for the error visualizations
    metrics['errors'] = errors

    return metrics

# Example usage for XGBoost model evaluation
xgb_val_metrics = evaluate_model(y_val, val_preds, "XGBoost (Validation)")

# Example usage for Neural Network model evaluation
nn_val_metrics = evaluate_model(y_val, nn_val_preds, "Neural Network (Validation)")

# Visualize error distributions
def plot_error_analysis(metrics_dict1, metrics_dict2=None, model1_name="Model 1", model2_name="Model 2",
                        y_true=None):
    """
    Plot error analysis visualizations for one or two models

    Draws four panels: error histogram, predicted-vs-actual scatter,
    residuals vs actual values, and a box plot of the first model's errors
    per actual-value bin.

    Parameters:
    -----------
    metrics_dict1 : dict
        Dictionary with metrics from first model (needs an 'errors' entry
        holding ``y_true - y_pred``)
    metrics_dict2 : dict, optional
        Dictionary with metrics from second model
    model1_name : str
        Name of first model
    model2_name : str
        Name of second model
    y_true : array-like, optional
        Ground truth values the errors were computed against. Defaults to
        the notebook-level ``y_val`` when omitted.
    """
    if y_true is None:
        # Backward-compatible default: the validation targets of the notebook
        y_true = y_val

    plt.figure(figsize=(15, 10))

    # Plot error distributions
    plt.subplot(2, 2, 1)
    sns.histplot(metrics_dict1['errors'], kde=True, color='blue', alpha=0.6, label=model1_name)
    if metrics_dict2:
        sns.histplot(metrics_dict2['errors'], kde=True, color='red', alpha=0.6, label=model2_name)
    plt.axvline(x=0, color='black', linestyle='--')
    plt.title('Error Distribution')
    plt.xlabel('Error (True - Predicted)')
    plt.legend()

    # Plot prediction vs actual scatter plot
    plt.subplot(2, 2, 2)

    # errors = y_true - y_pred, so the predictions are y_true - errors and the
    # x-axis is simply the true values themselves
    y_pred1 = y_true - metrics_dict1['errors']
    plt.scatter(y_true, y_pred1, alpha=0.5, color='blue', label=model1_name)

    # For second model if provided
    if metrics_dict2:
        y_pred2 = y_true - metrics_dict2['errors']
        plt.scatter(y_true, y_pred2, alpha=0.5, color='red', label=model2_name)

    # Add perfect prediction line across the range of actual values
    min_val = min(y_true)
    max_val = max(y_true)
    plt.plot([min_val, max_val], [min_val, max_val], 'k--')
    plt.title('Predictions vs Actual Values')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.legend()

    # Plot residuals
    plt.subplot(2, 2, 3)
    plt.scatter(y_true, metrics_dict1['errors'], alpha=0.5, color='blue', label=model1_name)
    if metrics_dict2:
        plt.scatter(y_true, metrics_dict2['errors'], alpha=0.5, color='red', label=model2_name)
    plt.axhline(y=0, color='black', linestyle='--')
    plt.title('Residual Plot')
    plt.xlabel('Actual Values')
    plt.ylabel('Residuals')
    plt.legend()

    # Plot error boxplot by actual value range (first model only)
    plt.subplot(2, 2, 4)
    bins = pd.cut(y_true, bins=5)

    boxplot_data = []
    labels = []

    for bin_name, bin_group in pd.DataFrame({'bin': bins, 'error1': metrics_dict1['errors']}).groupby('bin'):
        boxplot_data.append(bin_group['error1'].values)
        labels.append(f"{bin_name.left:.1f}-{bin_name.right:.1f}")

    plt.boxplot(boxplot_data, labels=labels)
    plt.title('Error by Actual Value Range')
    plt.xlabel('Actual Value Range')
    plt.ylabel('Error')
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

# Visualize errors for both models
plot_error_analysis(xgb_val_metrics, nn_val_metrics, "XGBoost", "Neural Network")

# Model Comparison
def compare_models(models_metrics, model_names):
    """
    Draw one bar chart per key metric, with one bar per model

    Parameters:
    -----------
    models_metrics : list
        Metric dictionaries, one per model
    model_names : list
        Display names, in the same order as models_metrics
    """
    key_metrics = ['RMSE', 'MAE', 'R²', 'Pearson Correlation', 'Within 0.5 Accuracy (%)', 'Within 1.0 Accuracy (%)']
    # These metrics can be negative, so their y-axis is not pinned at zero
    signed_metrics = ('R²', 'Pearson Correlation')

    # Long-format table: one row per (model, metric) pair
    comparison_df = pd.DataFrame([
        {'Model': model_names[idx], 'Metric': metric, 'Value': scores[metric]}
        for idx, scores in enumerate(models_metrics)
        for metric in key_metrics
    ])

    plt.figure(figsize=(15, 10))
    for panel, metric in enumerate(key_metrics, start=1):
        plt.subplot(2, 3, panel)
        rows_for_metric = comparison_df[comparison_df['Metric'] == metric]
        sns.barplot(x='Model', y='Value', data=rows_for_metric)
        plt.title(metric)
        lower_limit = None if metric in signed_metrics else 0
        plt.ylim(bottom=lower_limit)

    plt.tight_layout()
    plt.show()

# Compare models
compare_models([xgb_val_metrics, nn_val_metrics], ["XGBoost", "Neural Network"])

# Ensemble the models
def create_ensemble(y_pred1, y_pred2, weights=(0.5, 0.5)):
    """
    Create an ensemble of two models using weighted averaging

    Parameters:
    -----------
    y_pred1 : array-like
        Predictions from first model
    y_pred2 : array-like
        Predictions from second model
    weights : sequence of two numbers
        Weights for averaging (weight1, weight2). The default is an
        immutable tuple so it cannot be modified between calls; lists are
        still accepted. The weights are used as given (not normalized).

    Returns:
    --------
    array-like
        Ensemble predictions: weights[0] * y_pred1 + weights[1] * y_pred2
    """
    return weights[0] * y_pred1 + weights[1] * y_pred2

# Try different ensemble weights to find the optimal combination
def optimize_ensemble_weights(y_true, y_pred1, y_pred2, metric='pearson'):
    """
    Find optimal weights for ensemble

    Scans the weight of the first model over 0, 0.05, ..., 1 (the second
    model gets 1 - weight), scores every blend and plots score vs weight.

    Parameters:
    -----------
    y_true : array-like
        Ground truth values
    y_pred1 : array-like
        Predictions from first model
    y_pred2 : array-like
        Predictions from second model
    metric : str
        Metric to optimize ('pearson', 'rmse', 'mae'). Pearson is
        maximized, the two error metrics are minimized.

    Returns:
    --------
    tuple
        (best_weight, best_score)

    Raises:
    -------
    ValueError
        If metric is not one of the supported names.
    """
    # metric name -> (scoring function, True when higher is better)
    scorers = {
        'pearson': (lambda truth, pred: pearsonr(truth, pred)[0], True),
        'rmse': (lambda truth, pred: np.sqrt(mean_squared_error(truth, pred)), False),
        'mae': (lambda truth, pred: mean_absolute_error(truth, pred), False),
    }
    if metric not in scorers:
        # Fail early with a clear message instead of an unbound `score` later
        raise ValueError(f"Unsupported metric {metric!r}; expected one of {sorted(scorers)}")
    score_fn, higher_is_better = scorers[metric]

    weights = np.linspace(0, 1, 21)  # Try weights from 0 to 1 with 0.05 step
    best_score = -np.inf if higher_is_better else np.inf
    best_weight = 0.5

    scores = []

    for w in weights:
        ensemble_pred = create_ensemble(y_pred1, y_pred2, [w, 1-w])
        score = score_fn(y_true, ensemble_pred)

        # Strict comparison: ties keep the earliest weight, NaN never wins
        improved = score > best_score if higher_is_better else score < best_score
        if improved:
            best_score = score
            best_weight = w

        scores.append((w, score))

    # Plot weight optimization
    plt.figure(figsize=(10, 6))
    weights_list, scores_list = zip(*scores)
    plt.plot(weights_list, scores_list, 'o-')
    plt.axvline(x=best_weight, color='red', linestyle='--')
    plt.xlabel('Weight for Model 1')
    plt.ylabel(f'{metric.upper()} Score')
    plt.title(f'Ensemble Weight Optimization (Best weight: {best_weight:.2f})')
    plt.grid(True)
    plt.show()

    return best_weight, best_score

# Optimize ensemble weights on the validation predictions.
# NOTE(review): the weight is tuned on the same split that is then used to
# report the ensemble's metrics, so those metrics are optimistic.
best_weight, best_score = optimize_ensemble_weights(y_val, val_preds, nn_val_preds, 'pearson')
print(f"Best ensemble weight: {best_weight:.2f} for XGBoost, {1-best_weight:.2f} for Neural Network")
print(f"Best Pearson correlation: {best_score:.4f}")

# Create ensemble predictions
ensemble_val_preds = create_ensemble(val_preds, nn_val_preds, [best_weight, 1-best_weight])
ensemble_val_metrics = evaluate_model(y_val, ensemble_val_preds, "Ensemble (Validation)")

# Create final ensemble prediction for test set
ensemble_test_preds = create_ensemble(test_preds, nn_test_preds, [best_weight, 1-best_weight])
# Clip to the valid [1, 5] range and round to 1 decimal place, matching the
# XGBoost and NN submissions. Casting to int would truncate the fractional
# part of every score (e.g. 3.9 -> 3).
ensemble_test_preds = np.round(np.clip(ensemble_test_preds, 1, 5), 1)

# Create ensemble submission
ensemble_submission = pd.DataFrame({
    'filename': test_features['filename'],
    'label': ensemble_test_preds
})

ensemble_submission.to_csv('submission.csv', index=False)
print("Ensemble submission file created!")

def plot_xgb_learning_curve(model, train_data, val_data, params=None):
    """
    Plot learning curve for XGBoost model

    The per-iteration metric history is not read from ``model``; a fresh
    booster is trained for ``best_iteration + 1`` rounds while recording the
    evaluation metric on both data sets, and that history is plotted.

    Parameters:
    -----------
    model : xgb.Booster
        Trained XGBoost model (used for its best iteration)
    train_data : xgb.DMatrix
        Training data
    val_data : xgb.DMatrix
        Validation data
    params : dict, optional
        Training parameters to retrain with. Pass the dict used to train
        ``model``: a raw Booster does not expose ``get_params()``, so without
        this argument the curve is produced with XGBoost's default settings
        rather than the model's own.
    """
    results = {}

    if params is None:
        # Only sklearn-style wrappers offer get_params(); a Booster falls
        # back to an empty dict, i.e. library defaults
        params = model.get_params() if hasattr(model, 'get_params') else {}

    # Get the best iteration or use default
    best_iteration = getattr(model, 'best_iteration', 100)

    # Train model with evaluation at each iteration
    xgb.train(
        params,
        train_data,
        num_boost_round=best_iteration + 1,
        evals=[(train_data, 'train'), (val_data, 'val')],
        evals_result=results,
        verbose_eval=False
    )

    # Prefer RMSE; otherwise plot whichever metric the run recorded first
    metric_name = 'rmse' if 'rmse' in results['train'] else next(iter(results['train']))
    metric_label = metric_name.upper()

    # Plot learning curves
    plt.figure(figsize=(10, 5))
    plt.plot(results['train'][metric_name], label=f'Train {metric_label}')
    plt.plot(results['val'][metric_name], label=f'Validation {metric_label}')
    plt.axvline(x=best_iteration, color='red', linestyle='--',
                label=f'Best iteration: {best_iteration}')
    plt.xlabel('Number of Trees')
    plt.ylabel(metric_label)
    plt.title('XGBoost Learning Curve')
    plt.legend()
    plt.grid(True)
    plt.show()

# Plot XGBoost learning curve, retraining with the same params as the model
plot_xgb_learning_curve(model, dtrain, dval, params=params)

# Feature importance analysis with more detail
def analyze_feature_importance(model, feature_names):
    """
    Detailed analysis of feature importance

    Plots the top 20 features and the total importance per feature family,
    and prints the top 10 features and the per-family shares.

    Parameters:
    -----------
    model : xgb.Booster
        Trained XGBoost model
    feature_names : list
        Names of features, in training column order. Used to translate the
        placeholder keys 'f0', 'f1', ... that the model reports when it was
        trained on data without feature names. May be None to skip this.

    Returns:
    --------
    pandas.DataFrame
        One row per feature with 'Feature', 'Importance' (gain),
        'Normalized', 'Cumulative' and 'Type' columns, sorted by importance.
        Only 'Feature' and 'Importance' are present when the model reports
        no importances at all.
    """
    # Get importance scores
    importance_dict = model.get_score(importance_type='gain')

    # Translate positional placeholder keys back to real column names. This is
    # only done when every key is a placeholder and none is a real name, so a
    # model that already carries feature names is left untouched.
    if feature_names is not None:
        real_names = list(feature_names)
        known_names = set(real_names)
        placeholder_to_real = {f'f{idx}': name for idx, name in enumerate(real_names)}
        uses_placeholders = (
            all(key in placeholder_to_real for key in importance_dict)
            and not any(key in known_names for key in importance_dict)
        )
        if uses_placeholders:
            importance_dict = {placeholder_to_real[key]: gain for key, gain in importance_dict.items()}

    # Convert to DataFrame for easier manipulation
    importance_df = pd.DataFrame({
        'Feature': list(importance_dict.keys()),
        'Importance': list(importance_dict.values())
    })

    if importance_df.empty:
        # Nothing to normalize or plot (no feature was used in any split)
        print("\nNo feature importance scores available.")
        return importance_df

    # Sort by importance
    importance_df = importance_df.sort_values('Importance', ascending=False).reset_index(drop=True)

    # Calculate normalized and cumulative importance
    total_gain = importance_df['Importance'].sum()
    importance_df['Normalized'] = importance_df['Importance'] / total_gain
    importance_df['Cumulative'] = importance_df['Normalized'].cumsum()

    # Group features by type (substring of the feature name -> family label)
    feature_types = {
        'text_embed': 'Text Embedding',
        'mfcc': 'MFCC Audio',
        'pitch': 'Pitch',
        'energy': 'Energy',
        'speaking_rate': 'Speaking Rate',
        'pause': 'Pauses',
        'num_words': 'Text Stats',
        'num_chars': 'Text Stats',
        'avg_word': 'Text Stats',
        'unique_word': 'Text Stats'
    }

    # Assign type to each feature: first matching substring wins
    def get_feature_type(feature_name):
        for key, feature_type in feature_types.items():
            if key in feature_name.lower():
                return feature_type
        return 'Other'

    importance_df['Type'] = importance_df['Feature'].apply(get_feature_type)

    # Plot importance by feature
    plt.figure(figsize=(12, 8))
    plt.subplot(2, 1, 1)
    sns.barplot(x='Importance', y='Feature', data=importance_df.head(20))
    plt.title('Top 20 Features by Importance')
    plt.tight_layout()

    # Plot importance by feature type
    type_importance = importance_df.groupby('Type')['Importance'].sum().reset_index()
    type_importance = type_importance.sort_values('Importance', ascending=False)

    plt.subplot(2, 1, 2)
    sns.barplot(x='Importance', y='Type', data=type_importance)
    plt.title('Feature Importance by Type')
    plt.tight_layout()
    plt.show()

    # Print top features (index was reset after sorting, so i is the rank - 1)
    print("\nTop 10 Most Important Features:")
    for i, row in importance_df.head(10).iterrows():
        print(f"{i+1}. {row['Feature']}: {row['Normalized']:.4f} ({row['Normalized']*100:.2f}%)")

    # Print importance by type as a share of the total gain
    print("\nFeature Importance by Type:")
    for _, row in type_importance.iterrows():
        total_importance = row['Importance'] / total_gain
        print(f"{row['Type']}: {total_importance:.4f} ({total_importance*100:.2f}%)")

    return importance_df

# Analyze feature importance
feature_importance = analyze_feature_importance(model, X.columns)