In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score

from sklearn.model_selection import KFold, cross_val_score
import numpy as np

# Create a synthetic dataset
# We'll simplify audio features for demonstration
data = {
    'tempo': np.random.uniform(60, 180, 200),  # beats per minute
    'key': np.random.randint(0, 12, 200),      # musical key (0-11 representing notes)
    'instrumentation_complexity': np.random.uniform(1, 10, 200), # e.g., number of instruments
    'rhythmic_density': np.random.uniform(1, 10, 200), # e.g., complexity of rhythm
    'genre': np.random.choice(['Pop', 'Rock', 'Electronic', 'Classical'], 200)
}

df = pd.DataFrame(data)

# Prepare data for the model
X = df[['tempo', 'key', 'instrumentation_complexity', 'rhythmic_density']]
y = df['genre']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

# Precision (macro average to account for all classes)
precision = precision_score(y_test, y_pred, average='macro')
print(f"Precision: {precision:.4f}")



# Add some null values to the DataFrame
# Let's add nulls to 'tempo' and 'instrumentation_complexity'
np.random.seed(42)
for col in ['tempo', 'instrumentation_complexity']:
    null_indices = np.random.choice(df.index, size=int(len(df) * 0.1), replace=False) # Add 10% nulls
    df.loc[null_indices, col] = np.nan

print("DataFrame with null values:")
print(df.isnull().sum()) # Check null counts

# Handle missing values (Simple Imputation: fill with mean for numerical columns)
X = df[['tempo', 'key', 'instrumentation_complexity', 'rhythmic_density']]
y = df['genre']

# Impute missing values before cross-validation
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Convert imputed X back to DataFrame for easier handling (optional but good practice)
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns)

# Define the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Define k-fold cross-validation
n_splits = 5 # Number of folds
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(model, X_imputed_df, y, cv=kf, scoring='accuracy')

print(f"\nAccuracy scores for each fold ({n_splits}-fold cross-validation):")
print(cv_scores)

print(f"\nMean Cross-Validation Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation of Cross-Validation Accuracy: {cv_scores.std():.4f}")

# You can also perform cross-validation for other metrics like precision
cv_precision_scores = cross_val_score(model, X_imputed_df, y, cv=kf, scoring='precision_macro')
print(f"\nMean Cross-Validation Precision (macro): {cv_precision_scores.mean():.4f}")
print(f"Standard Deviation of Cross-Validation Precision: {cv_precision_scores.std():.4f}")


Confusion Matrix:
[[3 6 6 2]
 [4 1 5 2]
 [5 3 8 2]
 [3 4 4 2]]

Accuracy: 0.2333
Precision: 0.2173
DataFrame with null values:
tempo                         20
key                            0
instrumentation_complexity    20
rhythmic_density               0
genre                          0
dtype: int64

Accuracy scores for each fold (5-fold cross-validation):
[0.275 0.175 0.25  0.175 0.3  ]

Mean Cross-Validation Accuracy: 0.2350
Standard Deviation of Cross-Validation Accuracy: 0.0515

Mean Cross-Validation Precision (macro): 0.2320
Standard Deviation of Cross-Validation Precision: 0.0544


In [2]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Train an ID3 (Decision Tree) classifier
# For simplicity, we'll use scikit-learn's DecisionTreeClassifier which implements ID3-like algorithms
# based on entropy or Gini impurity. Here we'll use entropy for ID3.
dt_model = DecisionTreeClassifier(criterion='entropy', random_state=42)
dt_model.fit(X_train, y_train)

# Make predictions with the Decision Tree
y_pred_dt = dt_model.predict(X_test)

# Evaluate the Decision Tree model
# Accuracy
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"\nDecision Tree Accuracy (using Entropy): {accuracy_dt:.4f}")

# Confusion Matrix for Decision Tree
cm_dt = confusion_matrix(y_test, y_pred_dt)
print("\nDecision Tree Confusion Matrix:")
print(cm_dt)

# Precision for Decision Tree
precision_dt = precision_score(y_test, y_pred_dt, average='macro')
print(f"Decision Tree Precision: {precision_dt:.4f}")
np.random.seed(42) # for reproducibility
for col in ['tempo', 'instrumentation_complexity']:
    # Add 15% nulls to each selected column
    null_indices = np.random.choice(df.index, size=int(len(df) * 0.15), replace=False)
    df.loc[null_indices, col] = np.nan

print("DataFrame with null values:")
print(df.isnull().sum()) # Check null counts

# Handle missing values before K-Fold Cross-Validation
# Use SimpleImputer to fill missing values (e.g., with the mean)
imputer = SimpleImputer(strategy='mean')

# Separate features (X) and target (y)
# Update X and y based on the DataFrame with added nulls
X = df[['tempo', 'key', 'instrumentation_complexity', 'rhythmic_density']]
y = df['genre']

# Impute the missing values in X
# Use fit_transform on the entire dataset X before cross-validation
X_imputed = imputer.fit_transform(X)

# Convert the imputed numpy array back to a DataFrame (optional, but good practice)
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns)

# --- Perform K-Fold Cross-Validation for the Decision Tree model ---
# Define the Decision Tree model (ID3-like with entropy criterion)
dt_model = DecisionTreeClassifier(criterion='entropy', random_state=42)

# Define k-fold cross-validation
n_splits = 5 # Number of folds
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

print(f"\nPerforming {n_splits}-fold cross-validation for Decision Tree (Entropy):")

# Perform cross-validation for accuracy
cv_accuracy_scores_dt = cross_val_score(dt_model, X_imputed_df, y, cv=kf, scoring='accuracy')
print(f"Accuracy scores for each fold: {cv_accuracy_scores_dt}")
print(f"Mean Cross-Validation Accuracy: {cv_accuracy_scores_dt.mean():.4f}")
print(f"Standard Deviation of Cross-Validation Accuracy: {cv_accuracy_scores_dt.std():.4f}")

# Perform cross-validation for precision (macro average)
cv_precision_scores_dt = cross_val_score(dt_model, X_imputed_df, y, cv=kf, scoring='precision_macro')
print(f"\nPrecision scores for each fold (macro): {cv_precision_scores_dt}")
print(f"Mean Cross-Validation Precision (macro): {cv_precision_scores_dt.mean():.4f}")
print(f"Standard Deviation of Cross-Validation Precision (macro): {cv_precision_scores_dt.std():.4f}")



Decision Tree Accuracy (using Entropy): 0.2500

Decision Tree Confusion Matrix:
[[3 4 6 4]
 [3 5 2 2]
 [3 8 5 2]
 [4 2 5 2]]
Decision Tree Precision: 0.2429
DataFrame with null values:
tempo                         30
key                            0
instrumentation_complexity    30
rhythmic_density               0
genre                          0
dtype: int64

Performing 5-fold cross-validation for Decision Tree (Entropy):
Accuracy scores for each fold: [0.325 0.25  0.25  0.125 0.2  ]
Mean Cross-Validation Accuracy: 0.2300
Standard Deviation of Cross-Validation Accuracy: 0.0660

Precision scores for each fold (macro): [0.29056187 0.23690476 0.32937063 0.12916667 0.16666667]
Mean Cross-Validation Precision (macro): 0.2305
Standard Deviation of Cross-Validation Precision (macro): 0.0745
