In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from pathlib import Path
from sklearn.feature_selection import VarianceThreshold

In [10]:
data_folder_path = Path('../../../data/')
data_all = pd.read_csv(data_folder_path / 'data_train_continuous_data.csv')
# Identify target and predictor variables
target_variables_list = ['decade', 'main_genre', 'spotify_popularity']
predictor_variables_list = [
    'drone_ratio',
    'average_overlap',
    'average_2overlap',
    'average_3overlap',
    'average_4overlap',
    'average_5overlap',
    'maj_triad_ratio',
    'min_triad_ratio',
    'unique_5gram_density',
    'unique_chord_density'
]

In [8]:
def select_features_by_variance(predictors, threshold=0.01):
    """Select features above variance threshold."""
    variances = predictors.var()
    print("Variance of each predictor:")
    print(variances.sort_values())

    selector = VarianceThreshold(threshold=threshold)
    selected_features = selector.fit_transform(predictors)

    selected_names = predictors.columns[selector.get_support()].tolist()
    removed_names = predictors.columns[~selector.get_support()].tolist()

    print(f"\nSelected: {len(selected_names)}, Removed: {len(removed_names)}")
    print(f"Selected features: {selected_names}")
    print(f"Removed features: {removed_names}")

    return pd.DataFrame(selected_features, columns=selected_names), variances

def plot_variance_distribution(variances, threshold=0.01):
    """Plot variance distribution with threshold line."""
    plt.figure(figsize=(10, 6))
    variances.sort_values().plot(kind='barh')
    plt.xlabel('Variance')
    plt.title('Variance of Predictor Variables')
    plt.axvline(x=threshold, color='r', linestyle='--', label=f'Threshold={threshold}')
    plt.legend()
    plt.tight_layout()
    plt.show()

In [9]:
# Remove very low variance features
threshold=0.01
predictors_filtered, variances = select_features_by_variance(predictors, threshold=threshold)
plot_variance_distribution(variances, threshold=threshold)

AttributeError: 'list' object has no attribute 'var'

In [None]:
# Remove low variance features
threshold=0.05
predictors_filtered, variances = select_features_by_variance(predictors, threshold=threshold)
plot_variance_distribution(variances, threshold=threshold)

In [None]:
# Remove variance features with more aggressive filter
threshold=0.1
predictors_filtered, variances = select_features_by_variance(predictors, threshold=threshold)
plot_variance_distribution(variances, threshold=threshold)