### Simple SVM model


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Load and preprocess the data
data = pd.read_csv('final_dataset.csv')
X = data.iloc[:, :-1].values / 255.0
y = data.iloc[:, -1].values

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the SVM model
svm_model = SVC(kernel='rbf', random_state=42)

# Perform cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize lists to store results
accuracies = []
all_y_true = []
all_y_pred = []

start_time = time.time()

for fold, (train_index, val_index) in enumerate(skf.split(X_train_scaled, y_train), 1):
    X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    
    svm_model.fit(X_train_fold, y_train_fold)
    y_pred = svm_model.predict(X_val_fold)
    
    accuracy = accuracy_score(y_val_fold, y_pred)
    accuracies.append(accuracy)
    
    all_y_true.extend(y_val_fold)
    all_y_pred.extend(y_pred)
    
    print(f"Fold {fold}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-Score: {f1_score(y_val_fold, y_pred, average='weighted'):.4f}")
    print()

end_time = time.time()

# Print overall results
print("\nSimple SVM Model Results:")
print(f"Mean Accuracy: {np.mean(accuracies):.4f} (+/- {np.std(accuracies):.4f})")

# Calculate and print F1-score, precision, and recall
f1 = f1_score(all_y_true, all_y_pred, average='weighted')
precision = precision_score(all_y_true, all_y_pred, average='weighted')
recall = recall_score(all_y_true, all_y_pred, average='weighted')
print(f"F1-score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

print(f"Training time: {end_time - start_time:.2f} seconds")

# Generate classification report
class_names = label_encoder.classes_
print("\nClassification Report:")
print(classification_report(all_y_true, all_y_pred, target_names=class_names))

# Generate confusion matrix
plt.figure(figsize=(20, 16))
cm = confusion_matrix(all_y_true, all_y_pred)
sns.heatmap(cm, annot=False, cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix (Simple SVM)')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()
plt.show()

# Train the final model on the entire training set
svm_model.fit(X_train_scaled, y_train)

# Evaluate on the test set
y_test_pred = svm_model.predict(X_test_scaled)

print("\nTest Set Evaluation:")
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")
print(f"Test F1-score: {f1_score(y_test, y_test_pred, average='weighted'):.4f}")
print(f"Test Precision: {precision_score(y_test, y_test_pred, average='weighted'):.4f}")
print(f"Test Recall: {recall_score(y_test, y_test_pred, average='weighted'):.4f}")

# Print model complexity (number of support vectors)
n_support_vectors = svm_model.n_support_.sum()
print(f"\nNumber of support vectors: {n_support_vectors}")

Simple SVM Model Results:
Train accuracy: 0.5655
Test accuracy: 0.4841
Train precision: 0.5778
Test precision: 0.4903
Train recall: 0.5660
Test recall: 0.4859

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.69      0.85      0.76        13
           1       0.50      0.57      0.53        14
           2       0.44      0.85      0.58        13
           3       0.44      0.57      0.50        14
           4       0.73      0.62      0.67        13
           5       0.18      0.21      0.19        14
           6       0.64      0.69      0.67        13
           7       0.48      0.77      0.59        13
           8       0.86      0.86      0.86        14
           9       0.91      0.71      0.80        14
          10       0.81      1.00      0.90        13
          11       0.33      0.85      0.48        13
          12       0.64      0.69      0.67        13
          13       0.50      0.69      0.58      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Improved SVM model


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

# Load and preprocess the data
data = pd.read_csv('final_dataset.csv')
X = data.iloc[:, :-1].values / 255.0
y = data.iloc[:, -1].values

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Create a pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),  # Keep 95% of variance
    ('svm', SVC(random_state=42))
])

# Define hyperparameters for grid search
param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': ['scale', 'auto', 0.1, 0.01, 0.001],
    'svm__kernel': ['rbf', 'poly'],
    'svm__degree': [2, 3, 4],  # Only used by poly kernel
    'svm__class_weight': [None, 'balanced'],
}

# Perform grid search
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
    scoring=['accuracy', 'precision_macro', 'recall_macro'],
    refit='accuracy'
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Get the number of features retained by PCA
n_features_retained = best_model.named_steps['pca'].n_components_

# Perform cross-validation on the best model
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean():.4f}")
print(f"CV score standard deviation: {cv_scores.std():.4f}")

# Train the best model on the entire training set
best_model.fit(X_train, y_train)

# Make predictions on train and test sets
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Calculate metrics for train and test sets
def calculate_metrics(y_true, y_pred):
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='macro'),
        'recall': recall_score(y_true, y_pred, average='macro'),
        'f1': f1_score(y_true, y_pred, average='macro')
    }

train_metrics = calculate_metrics(y_train, y_train_pred)
test_metrics = calculate_metrics(y_test, y_test_pred)

# Print results
print("\nImproved SVM Model Results:")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Number of features retained after PCA: {n_features_retained}")
print("\nMetrics:")
for metric in ['accuracy', 'precision', 'recall', 'f1']:
    print(f"{metric.capitalize()}:")
    print(f"  Train: {train_metrics[metric]:.4f}")
    print(f"  Test:  {test_metrics[metric]:.4f}")
    print(f"  Difference: {train_metrics[metric] - test_metrics[metric]:.4f}")

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))

print("\nConfusion Matrix (Test Set):")
print(confusion_matrix(y_test, y_test_pred))

# Function to predict emotion for new RGB values
def predict_emotion(rgb_values):
    rgb_array = np.array(rgb_values).reshape(1, -1) / 255.0
    prediction_encoded = best_model.predict(rgb_array)
    prediction = label_encoder.inverse_transform(prediction_encoded)
    return prediction[0]

# Example usage
new_rgb = [229, 0, 13, 225, 225, 255, 253, 166, 74]
predicted_emotion = predict_emotion(new_rgb)
print(f"\nPredicted Emotion: {predicted_emotion}")

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
Cross-validation scores: [0.72362326 0.73957797 0.746138   0.73789907 0.73429454]
Mean CV score: 0.7363
CV score standard deviation: 0.0074

Improved SVM Model Results:
Best parameters: {'svm__C': 100, 'svm__class_weight': 'balanced', 'svm__degree': 2, 'svm__gamma': 'auto', 'svm__kernel': 'rbf'}
Number of features retained after PCA: 7

Metrics:
Accuracy:
  Train: 0.9489
  Test:  0.7707
  Difference: 0.1782
Precision:
  Train: 0.9510
  Test:  0.7765
  Difference: 0.1745
Recall:
  Train: 0.9490
  Test:  0.7714
  Difference: 0.1776
F1:
  Train: 0.9488
  Test:  0.7644
  Difference: 0.1843

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.75      0.92      0.83        13
           1       0.80      0.57      0.67        14
           2       1.00      0.85      0.92        13
           3       0.64      0.64      0.64        14
           4       0.47      0.62    

### Even more improved by experiment with Regularazation and Error Analysis


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Load and preprocess the data
data = pd.read_csv('final_dataset.csv')
X = data.iloc[:, :-1].values / 255.0
y = data.iloc[:, -1].values

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Create a pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('svm', SVC(random_state=42, probability=True))
])

# Define hyperparameters for grid search
param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': ['scale', 'auto', 0.1, 0.01, 0.001],
    'svm__kernel': ['rbf', 'poly'],
    'svm__degree': [2, 3, 4],
    'svm__class_weight': [None, 'balanced'],
    'svm__coef0': [0.0, 0.1, 0.5, 1.0],  # Regularization for polynomial kernel
}

# Perform grid search
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
    refit='f1_macro'  # Changed to optimize for F1-score
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Get the number of features retained by PCA
n_features_retained = best_model.named_steps['pca'].n_components_

# Perform cross-validation on the best model
cv_results = grid_search.cv_results_
print(f"Cross-validation scores:")
print(f"Accuracy: {cv_results['mean_test_accuracy'][grid_search.best_index_]:.4f} (+/- {cv_results['std_test_accuracy'][grid_search.best_index_]:.4f})")
print(f"Precision: {cv_results['mean_test_precision_macro'][grid_search.best_index_]:.4f} (+/- {cv_results['std_test_precision_macro'][grid_search.best_index_]:.4f})")
print(f"Recall: {cv_results['mean_test_recall_macro'][grid_search.best_index_]:.4f} (+/- {cv_results['std_test_recall_macro'][grid_search.best_index_]:.4f})")
print(f"F1-score: {cv_results['mean_test_f1_macro'][grid_search.best_index_]:.4f} (+/- {cv_results['std_test_f1_macro'][grid_search.best_index_]:.4f})")

# Train the best model on the entire training set
best_model.fit(X_train, y_train)

# Make predictions on train and test sets
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Calculate metrics for train and test sets
def calculate_metrics(y_true, y_pred):
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='macro'),
        'recall': recall_score(y_true, y_pred, average='macro'),
        'f1': f1_score(y_true, y_pred, average='macro')
    }

train_metrics = calculate_metrics(y_train, y_train_pred)
test_metrics = calculate_metrics(y_test, y_test_pred)

# Print results
print("\nImproved SVM Model Results:")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Number of features retained after PCA: {n_features_retained}")
print("\nMetrics:")
for metric in ['accuracy', 'precision', 'recall', 'f1']:
    print(f"{metric.capitalize()}:")
    print(f"  Train: {train_metrics[metric]:.4f}")
    print(f"  Test:  {test_metrics[metric]:.4f}")
    print(f"  Difference: {train_metrics[metric] - test_metrics[metric]:.4f}")

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred, target_names=label_encoder.classes_))

# Confusion Matrix
cm = confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(20, 16))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=label_encoder.classes_, 
            yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.close()

# Learning Curves
train_sizes, train_scores, test_scores = learning_curve(
    best_model, X, y_encoded, cv=5, n_jobs=-1, 
    train_sizes=np.linspace(0.1, 1.0, 10), scoring='f1_macro'
)

train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
plt.xlabel("Training examples")
plt.ylabel("F1-score")
plt.title("Learning Curves")
plt.legend(loc="best")
plt.tight_layout()
plt.savefig('learning_curves.png')
plt.close()

# Class distribution analysis
class_distribution = pd.Series(y).value_counts().sort_index()
plt.figure(figsize=(12, 6))
class_distribution.plot(kind='bar')
plt.title('Class Distribution')
plt.xlabel('Emotion Class')
plt.ylabel('Number of Samples')
plt.tight_layout()
plt.savefig('class_distribution.png')
plt.close()

# Function to predict emotion for new RGB values
def predict_emotion(rgb_values):
    rgb_array = np.array(rgb_values).reshape(1, -1) / 255.0
    prediction_encoded = best_model.predict(rgb_array)
    prediction_proba = best_model.predict_proba(rgb_array)
    prediction = label_encoder.inverse_transform(prediction_encoded)
    return prediction[0], prediction_proba[0]

# Example usage
new_rgb = [229, 0, 13, 225, 225, 255, 253, 166, 74]
predicted_emotion, prediction_proba = predict_emotion(new_rgb)
print(f"\nPredicted Emotion: {predicted_emotion}")
print("Prediction Probabilities:")
for emotion, prob in zip(label_encoder.classes_, prediction_proba):
    print(f"{emotion}: {prob:.4f}")

# Error analysis
misclassified = X_test[y_test != y_test_pred]
misclassified_true = y_test[y_test != y_test_pred]
misclassified_pred = y_test_pred[y_test != y_test_pred]

print("\nMisclassified Samples Analysis:")
for i in range(min(10, len(misclassified))):  # Print first 10 misclassifications
    true_label = label_encoder.inverse_transform([misclassified_true[i]])[0]
    pred_label = label_encoder.inverse_transform([misclassified_pred[i]])[0]
    print(f"Sample {i+1}:")
    print(f"  True label: {true_label}")
    print(f"  Predicted label: {pred_label}")
    print(f"  RGB values: {misclassified[i]}")
    print()

Fitting 5 folds for each of 960 candidates, totalling 4800 fits


KeyboardInterrupt: 

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Load and preprocess the data
data = pd.read_csv('final_dataset.csv')
X = data.iloc[:, :-1].values / 255.0
y = data.iloc[:, -1].values

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# Create a pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('svm', SVC(random_state=42, probability=True))
])

# Define hyperparameters for grid search
param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': ['scale', 'auto', 0.1, 0.01, 0.001],
    'svm__kernel': ['rbf', 'poly'],
    'svm__degree': [2, 3, 4],
    'svm__class_weight': [None, 'balanced'],
    'svm__coef0': [0.0, 0.1, 0.5, 1.0],  # Regularization for polynomial kernel
}

# Perform grid search
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
    refit='f1_macro'  # Changed to optimize for F1-score
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Get the number of features retained by PCA
n_features_retained = best_model.named_steps['pca'].n_components_

# Perform cross-validation on the best model
cv_results = grid_search.cv_results_
print(f"Cross-validation scores:")
print(f"Accuracy: {cv_results['mean_test_accuracy'][grid_search.best_index_]:.4f} (+/- {cv_results['std_test_accuracy'][grid_search.best_index_]:.4f})")
print(f"Precision: {cv_results['mean_test_precision_macro'][grid_search.best_index_]:.4f} (+/- {cv_results['std_test_precision_macro'][grid_search.best_index_]:.4f})")
print(f"Recall: {cv_results['mean_test_recall_macro'][grid_search.best_index_]:.4f} (+/- {cv_results['std_test_recall_macro'][grid_search.best_index_]:.4f})")
print(f"F1-score: {cv_results['mean_test_f1_macro'][grid_search.best_index_]:.4f} (+/- {cv_results['std_test_f1_macro'][grid_search.best_index_]:.4f})")

# Train the best model on the entire training set
best_model.fit(X_train, y_train)

# Make predictions on train and test sets
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

# Calculate metrics for train and test sets
def calculate_metrics(y_true, y_pred):
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='macro'),
        'recall': recall_score(y_true, y_pred, average='macro'),
        'f1': f1_score(y_true, y_pred, average='macro')
    }

train_metrics = calculate_metrics(y_train, y_train_pred)
test_metrics = calculate_metrics(y_test, y_test_pred)

# Print results
print("\nImproved SVM Model Results:")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Number of features retained after PCA: {n_features_retained}")
print("\nMetrics:")
for metric in ['accuracy', 'precision', 'recall', 'f1']:
    print(f"{metric.capitalize()}:")
    print(f"  Train: {train_metrics[metric]:.4f}")
    print(f"  Test:  {test_metrics[metric]:.4f}")
    print(f"  Difference: {train_metrics[metric] - test_metrics[metric]:.4f}")

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred, target_names=label_encoder.classes_))

# Confusion Matrix
cm = confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(20, 16))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=label_encoder.classes_, 
            yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()
plt.savefig('confusion_matrix.png')
plt.close()

# Learning Curves
train_sizes, train_scores, test_scores = learning_curve(
    best_model, X, y_encoded, cv=5, n_jobs=-1, 
    train_sizes=np.linspace(0.1, 1.0, 10), scoring='f1_macro'
)

train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
plt.xlabel("Training examples")
plt.ylabel("F1-score")
plt.title("Learning Curves")
plt.legend(loc="best")
plt.tight_layout()
plt.savefig('learning_curves.png')
plt.close()

# Class distribution analysis
class_distribution = pd.Series(y).value_counts().sort_index()
plt.figure(figsize=(12, 6))
class_distribution.plot(kind='bar')
plt.title('Class Distribution')
plt.xlabel('Emotion Class')
plt.ylabel('Number of Samples')
plt.tight_layout()
plt.savefig('class_distribution.png')
plt.close()

# Function to predict emotion for new RGB values
def predict_emotion(rgb_values):
    rgb_array = np.array(rgb_values).reshape(1, -1) / 255.0
    prediction_encoded = best_model.predict(rgb_array)
    prediction_proba = best_model.predict_proba(rgb_array)
    prediction = label_encoder.inverse_transform(prediction_encoded)
    return prediction[0], prediction_proba[0]

# Example usage
new_rgb = [229, 0, 13, 225, 225, 255, 253, 166, 74]
predicted_emotion, prediction_proba = predict_emotion(new_rgb)
print(f"\nPredicted Emotion: {predicted_emotion}")
print("Prediction Probabilities:")
for emotion, prob in zip(label_encoder.classes_, prediction_proba):
    print(f"{emotion}: {prob:.4f}")

# Error analysis
misclassified = X_test[y_test != y_test_pred]
misclassified_true = y_test[y_test != y_test_pred]
misclassified_pred = y_test_pred[y_test != y_test_pred]

print("\nMisclassified Samples Analysis:")
for i in range(min(10, len(misclassified))):  # Print first 10 misclassifications
    true_label = label_encoder.inverse_transform([misclassified_true[i]])[0]
    pred_label = label_encoder.inverse_transform([misclassified_pred[i]])[0]
    print(f"Sample {i+1}:")
    print(f"  True label: {true_label}")
    print(f"  Predicted label: {pred_label}")
    print(f"  RGB values: {misclassified[i]}")
    print()