In [None]:
import numpy as np
import pickle
import pandas as pd

# Load preprocessed MIDI features.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it was produced by a trusted preprocessing step.
with open("processed_midi_data.pkl", "rb") as f:
    midi_features = pickle.load(f)

if not midi_features:
    # Every statistic below (min/max, percentiles, ratios) is undefined for an
    # empty dataset; fail with a clear message instead of an opaque
    # "min() arg is an empty sequence".
    raise ValueError("processed_midi_data.pkl contains no MIDI feature records")

# Extract one list per feature; all lists share the iteration order of
# midi_features.
tempos = [feat["tempo"] for feat in midi_features.values()]
keys = [feat["key"] for feat in midi_features.values()]
note_densities = [feat["note_density"] for feat in midi_features.values()]
velocities = [feat["avg_velocity"] for feat in midi_features.values()]

print("Dataset statistics:")
print(f"Tempo range: {min(tempos):.1f} to {max(tempos):.1f}, mean: {np.mean(tempos):.1f}")
print(f"Key range: {min(keys)} to {max(keys)}, mean: {np.mean(keys):.1f}")
print(f"Note density range: {min(note_densities):.1f} to {max(note_densities):.1f}, mean: {np.mean(note_densities):.1f}")
print(f"Velocity range: {min(velocities):.1f} to {max(velocities):.1f}, mean: {np.mean(velocities):.1f}")

# Quartiles (25th/50th/75th percentile) for each continuous feature. They are
# computed once here and reused when the emotion ranges are defined below.
tempo_25, tempo_50, tempo_75 = np.percentile(tempos, [25, 50, 75])
note_25, note_50, note_75 = np.percentile(note_densities, [25, 50, 75])
vel_25, vel_50, vel_75 = np.percentile(velocities, [25, 50, 75])

print("\nPercentiles:")
print(f"Tempo 25/50/75: {tempo_25:.1f}/{tempo_50:.1f}/{tempo_75:.1f}")
print(f"Note density 25/50/75: {note_25:.1f}/{note_50:.1f}/{note_75:.1f}")
print(f"Velocity 25/50/75: {vel_25:.1f}/{vel_50:.1f}/{vel_75:.1f}")

# Count major vs minor keys: key values below 12 are counted as major,
# 12 and above as minor.
major_keys = sum(1 for k in keys if k < 12)
minor_keys = sum(1 for k in keys if k >= 12)
print(f"Major keys: {major_keys} ({major_keys/len(keys)*100:.1f}%)")
print(f"Minor keys: {minor_keys} ({minor_keys/len(keys)*100:.1f}%)")

# Define emotion mapping based on the actual data distribution.
#
# Each entry describes one emotion:
#   "tempo", "note_density", "velocity": (low, high) range for that feature,
#       built from the dataset minimum/maximum and quartiles.
#   "key": "major", "minor", or "both" (either mode is accepted).
#   "weight": multiplier applied to the emotion's combined score by
#       classify_emotion_with_fuzzy_membership.

# Dataset extremes, computed once instead of rescanning the feature lists for
# every range that starts at the minimum or ends at the maximum.
tempo_min, tempo_max = min(tempos), max(tempos)
note_min, note_max = min(note_densities), max(note_densities)
vel_min, vel_max = min(velocities), max(velocities)

emotion_mapping = {
    "Happy": {
        "tempo": (tempo_50, tempo_max),
        "key": "major",
        "note_density": (note_50, note_max),
        "velocity": (vel_50, vel_max),
        "weight": 0.7,
    },
    "Sad": {
        "tempo": (tempo_min, tempo_25),
        "key": "minor",
        "note_density": (note_min, note_50),
        "velocity": (vel_min, vel_25),
        "weight": 1.2,
    },
    "Calm": {
        "tempo": (tempo_min, tempo_50),
        "key": "both",
        "note_density": (note_min, note_25),
        "velocity": (vel_min, vel_50),
        "weight": 1.2,
    },
    "Energetic": {
        "tempo": (tempo_75, tempo_max),
        "key": "both",
        "note_density": (note_75, note_max),
        "velocity": (vel_75, vel_max),
        "weight": 0.65,
    },
    "Romantic": {
        "tempo": (tempo_25, tempo_75),
        "key": "both",
        "note_density": (note_25, note_75),
        "velocity": (vel_25, vel_75),
        "weight": 1.1,
    },
    "Fearful": {
        "tempo": (tempo_25, tempo_75),
        "key": "minor",
        "note_density": (note_50, note_max),
        "velocity": (vel_25, vel_75),
        "weight": 1.3,
    },
    "Angry": {
        "tempo": (tempo_50, tempo_max),
        "key": "minor",
        "note_density": (note_75, note_max),
        "velocity": (vel_75, vel_max),
        "weight": 1.2,
    },
    "Mysterious": {
        "tempo": (tempo_min, tempo_50),
        "key": "minor",
        "note_density": (note_25, note_75),
        "velocity": (vel_min, vel_50),
        "weight": 1.3,
    },
}

# Relative importance of each feature in the combined score; key (mode) is
# weighted highest for emotional impact.
_FEATURE_WEIGHTS = {
    "tempo": 1.2,
    "key": 1.5,
    "note": 1.0,
    "velocity": 1.1,
}
# Sum of the feature weights, used to normalize the combined score.
_MAX_FEATURE_SCORE = sum(_FEATURE_WEIGHTS.values())


def _triangular_membership(value, bounds):
    """Return the triangular fuzzy membership of *value* in *bounds*.

    The membership is 0 at or outside either bound, rises linearly to 1 at
    the midpoint of ``(bounds[0], bounds[1])``, and falls linearly back to 0.
    """
    low, high = bounds[0], bounds[1]
    if value <= low or value >= high:
        return 0
    middle = (low + high) / 2
    if value <= middle:
        return (value - low) / (middle - low)
    return (high - value) / (high - middle)


def classify_emotion_with_fuzzy_membership(features, mapping=None, min_score=0.4):
    """
    Classify emotion using fuzzy membership functions instead of hard boundaries.
    This allows pieces to partially belong to multiple emotion categories.

    Args:
        features: Mapping with the keys "tempo", "key", "note_density" and
            "avg_velocity". A "key" value below 12 is treated as major,
            otherwise as minor.
        mapping: Emotion definitions keyed by emotion name; each value holds
            "tempo", "note_density" and "velocity" (low, high) ranges, a
            "key" of "major"/"minor"/"both", and a "weight" multiplier.
            Defaults to the module-level ``emotion_mapping``.
        min_score: Minimum normalized score the best emotion must reach;
            below it the piece is labelled "Unknown".

    Returns:
        A ``(label, scores)`` tuple where ``label`` is the highest-scoring
        emotion (or "Unknown") and ``scores`` maps every emotion name to its
        normalized score.
    """
    if mapping is None:
        mapping = emotion_mapping

    key_category = "major" if features["key"] < 12 else "minor"

    scores = {}
    for emotion, params in mapping.items():
        # NOTE(review): the triangular function scores 0 at both ends of a
        # range, so for ranges bounded by the dataset min/max the most extreme
        # piece contributes nothing for that feature - confirm this is intended.
        tempo_score = _triangular_membership(features["tempo"], params["tempo"])
        note_score = _triangular_membership(
            features["note_density"], params["note_density"]
        )
        velocity_score = _triangular_membership(
            features["avg_velocity"], params["velocity"]
        )

        # Key score - binary membership: full score when the emotion accepts
        # both modes or the piece's mode matches.
        key_score = 1.0 if params["key"] in ("both", key_category) else 0.0

        weighted_score = (
            tempo_score * _FEATURE_WEIGHTS["tempo"] +
            key_score * _FEATURE_WEIGHTS["key"] +
            note_score * _FEATURE_WEIGHTS["note"] +
            velocity_score * _FEATURE_WEIGHTS["velocity"]
        ) * params["weight"]

        # The emotion weight is applied before normalization, so a weight
        # above 1 can push the normalized score above 1.
        scores[emotion] = weighted_score / _MAX_FEATURE_SCORE

    if not scores:
        # No emotions defined: nothing to choose from.
        return "Unknown", scores

    best_emotion = max(scores, key=scores.get)
    if scores[best_emotion] >= min_score:
        return best_emotion, scores
    return "Unknown", scores

# Process all MIDI files: keep both the chosen label and the full per-emotion
# score table for each file (the scores are needed for rebalancing below).
results = {}
emotion_scores = {}

for file, features in midi_features.items():
    emotion, scores = classify_emotion_with_fuzzy_membership(features)
    results[file] = emotion
    emotion_scores[file] = scores

# Create DataFrame
emotion_df = pd.DataFrame.from_dict(results, orient="index", columns=["Emotion"])

# Check distribution
emotion_counts = emotion_df["Emotion"].value_counts()

# Balance distribution if needed
MAX_PERCENTAGE = 30.0  # Maximum percentage for any emotion
REASSIGN_MIN_SCORE = 0.25  # Second-best emotion must score above this to be used
total_files = len(emotion_df)

# NOTE(review): emotion_counts is a snapshot taken before any reassignment, so
# files moved into an emotion during this loop are not reflected in its count.
for emotion, count in emotion_counts.items():
    if emotion == "Unknown" or count / total_files * 100 <= MAX_PERCENTAGE:
        continue

    # Number of files above the allowed share for this emotion
    excess = int(count - (MAX_PERCENTAGE * total_files / 100))

    # Files currently labelled with this emotion, weakest match first
    weakest_first = sorted(
        (name for name, label in results.items() if label == emotion),
        key=lambda name: emotion_scores[name][emotion],
    )

    # Reassign the lowest scoring files to their second-best emotion
    for file_to_reassign in weakest_first[:excess]:
        file_scores = emotion_scores[file_to_reassign]
        ranked = sorted(file_scores.items(), key=lambda item: item[1], reverse=True)
        if len(ranked) < 2:
            # Only one emotion was scored, so there is no alternative label.
            continue

        second_best, second_score = ranked[1]

        # Reassign only when the alternative is a real, sufficiently strong match
        if second_best != "Unknown" and second_score > REASSIGN_MIN_SCORE:
            results[file_to_reassign] = second_best

# Update DataFrame with adjusted results
emotion_df = pd.DataFrame.from_dict(results, orient="index", columns=["Emotion"])

# Save to CSV; report success only after the file has actually been written
output_file = "midi_emotion_labels.csv"
emotion_df.to_csv(output_file)
print("Emotions tagged successfully")
midi_emotion_labels = results