<a href="https://colab.research.google.com/github/DevClare/Automatic-Scoring-for-Oral-Presentation-using-ML-and-DL/blob/main/SVM%26RFmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import all necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             classification_report, confusion_matrix)
import joblib
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Ask the user for the two input files: human-evaluated scores first, then extracted features
from google.colab import files


def _prompt_and_upload(prompt):
    """Print `prompt`, open Colab's file-upload widget, and return whatever it returns."""
    print(prompt)
    return files.upload()


uploaded_scores = _prompt_and_upload("Upload the human evaluated score file:")

uploaded_features = _prompt_and_upload("Upload the extracted features file:")

Upload the human evaluated score file:


Saving AudioScoreLatest.csv to AudioScoreLatest (2).csv
Upload the extracted features file:


Saving aggregated_results_mean.csv to aggregated_results_mean (2).csv


NameError: name 'uploades_scores' is not defined

In [None]:
import io

import pandas as pd

# A cancelled upload leaves an empty mapping; fail here with a clear message
# instead of an opaque IndexError on the [0] lookups below.
if not uploaded_scores or not uploaded_features:
    raise ValueError(
        "Both uploads are required: re-run the upload cell and choose a file for each prompt."
    )

# Get filenames from the uploaded files (first entry of each upload)
scores_filename   = list(uploaded_scores.keys())[0]
features_filename = list(uploaded_features.keys())[0]

# Parse the CSVs from the uploaded content itself rather than re-opening a path.
# Colab renames files on disk when the name already exists (e.g. it reported
# saving "AudioScoreLatest.csv" to "AudioScoreLatest (2).csv"), so a path built
# from the key may point at an older copy or at nothing at all.
# NOTE(review): relies on files.upload() mapping each name to the file's raw bytes.
scores_df   = pd.read_csv(io.BytesIO(uploaded_scores[scores_filename]))
features_df = pd.read_csv(io.BytesIO(uploaded_features[features_filename]))

print("Scores shape:", scores_df.shape)
print("Features shape:", features_df.shape)
print("Scores columns:", scores_df.columns.tolist())
print("Features columns:", features_df.columns.tolist())


Scores shape: (72, 9)
Features shape: (72, 11)
Scores columns: ['Name', 'Volume', 'Pace', 'Tone', 'Pronounciation', 'Vocal Variety', 'Vocal Control', 'Fluency', 'Total']
Features columns: ['MFCC_mean_combined', 'MFCC_std_combined', 'Chroma_mean_combined', 'Chroma_std_combined', 'Spectral Centroid Mean', 'Zero-Crossing Rate Mean', 'Tempo', 'Pitch', 'Intensity', 'Name', 'Scores']


In [None]:
# Merge the two datasets on their common column (the participant's Name).
# The join is an inner join: only participants present in BOTH files are kept.
merged_df = pd.merge(features_df, scores_df, on="Name", how="inner")

# An inner join drops unmatched rows silently, so report how many rows of each
# file found no partner. Only counts are printed (names stay out of the saved
# output); inspect `features_only_names` / `scores_only_names` to see which ones.
features_only_names = features_df.loc[~features_df["Name"].isin(scores_df["Name"]), "Name"].tolist()
scores_only_names = scores_df.loc[~scores_df["Name"].isin(features_df["Name"]), "Name"].tolist()
if features_only_names or scores_only_names:
    print("Feature rows without a matching score row:", len(features_only_names))
    print("Score rows without a matching feature row:", len(scores_only_names))

print("Merged shape:", merged_df.shape)
print("Merged columns:", merged_df.columns.tolist())

Merged shape: (57, 19)
Merged columns: ['MFCC_mean_combined', 'MFCC_std_combined', 'Chroma_mean_combined', 'Chroma_std_combined', 'Spectral Centroid Mean', 'Zero-Crossing Rate Mean', 'Tempo', 'Pitch', 'Intensity', 'Name', 'Scores', 'Volume', 'Pace', 'Tone', 'Pronounciation', 'Vocal Variety', 'Vocal Control', 'Fluency', 'Total']


In [None]:
# Column groups used for modelling. They are defined here because no earlier cell
# defines them, so a fresh "Restart & Run All" would otherwise stop with a NameError.
# NOTE(review): both lists are reconstructed from the column names printed by the
# loading cell; confirm they match the selection originally used.
# "Scores" (features file) and "Total" (scores file) are deliberately left out:
# presumably they are aggregate ratings rather than inputs / per-criterion scores.
feature_cols = [
    "MFCC_mean_combined",
    "MFCC_std_combined",
    "Chroma_mean_combined",
    "Chroma_std_combined",
    "Spectral Centroid Mean",
    "Zero-Crossing Rate Mean",
    "Tempo",
    "Pitch",
    "Intensity",
]
HUMAN_SCORE_COLUMNS = [
    "Volume",
    "Pace",
    "Tone",
    "Pronounciation",  # spelled exactly as in the scores file header
    "Vocal Variety",
    "Vocal Control",
    "Fluency",
]

# Drop rows with a missing value in any feature or human-score column.
# .copy() gives an independent frame so the column added below does not touch merged_df.
df_clean = merged_df.dropna(subset=feature_cols + HUMAN_SCORE_COLUMNS).copy()

# Compute average score: row-wise mean over the human score columns
df_clean["AvgScore"] = df_clean[HUMAN_SCORE_COLUMNS].mean(axis=1)

# Class ids: 0 = bad, 1 = neutral, 2 = very good
def map_score_to_class(s):
    """Bucket an average score into a class id.

    Returns 0 when ``s <= 2``, 1 when ``2 < s <= 3.5``, and 2 for anything else.
    """
    if s <= 2:
        return 0
    # Here s is already known not to satisfy s <= 2, so only the upper bound remains.
    return 1 if s <= 3.5 else 2

# Attach a class label to every cleaned row, derived from its average score
df_clean["Label"] = df_clean["AvgScore"].map(map_score_to_class)

# Final arrays for modelling: y holds the labels, X the feature columns
y = df_clean["Label"].to_numpy()
X = df_clean[feature_cols].to_numpy()

# How many samples fall into each class
label_counts = pd.Series(y).value_counts()
print("Class distribution:\n", label_counts)


Class distribution:
 1    37
2    17
0     3
Name: count, dtype: int64


In [None]:
# Hold out 20% of the samples for testing; stratify on y and fix the seed
split_parts = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Training size:", len(X_train))
print("Testing size:", len(X_test))


Training size: 45
Testing size: 12


In [None]:
# SVM model: standardize the features, then fit an RBF-kernel SVC.
# Scaling lives inside the pipeline so it is fitted on the training split only.
svm_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC(kernel="rbf", class_weight="balanced", random_state=42))
])

svm_pipeline.fit(X_train, y_train)
y_pred_svm = svm_pipeline.predict(X_test)

# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred_svm)
print("Confusion Matrix:\n", cm)

# Scores with 2 decimals.
# Macro averaging is used here because the Random Forest cell in this notebook
# reports macro-averaged metrics; using the same averaging for both models keeps
# their precision / recall / F1 directly comparable.
acc = round(accuracy_score(y_test, y_pred_svm), 2)
prec = round(precision_score(y_test, y_pred_svm, average="macro"), 2)
rec = round(recall_score(y_test, y_pred_svm, average="macro"), 2)
f1 = round(f1_score(y_test, y_pred_svm, average="macro"), 2)

print("\n--- SVM Results ---")
print(f"Accuracy: {acc}")
print(f"Precision: {prec}")
print(f"Recall: {rec}")
print(f"F1-score: {f1}")

Confusion Matrix:
 [[1 0 0]
 [1 5 2]
 [0 0 3]]

--- SVM Results ---
Accuracy: 0.75
Precision: 0.86
Recall: 0.75
F1-score: 0.76


In [None]:
# Random Forest model: scaler followed by a 100-tree forest with balanced class weights
rf_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("rf", RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42)),
])

# fit() returns the pipeline itself, so training and prediction can be chained
y_pred_rf = rf_pipeline.fit(X_train, y_train).predict(X_test)

# Confusion matrix for the held-out test split
cm = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix:\n", cm)

# Accuracy plus macro-averaged precision / recall / F1, each rounded to 2 decimals
acc = round(accuracy_score(y_test, y_pred_rf), 2)
prec, rec, f1 = (
    round(metric(y_test, y_pred_rf, average="macro"), 2)
    for metric in (precision_score, recall_score, f1_score)
)

print("\n--- Random Forest Results ---")
print(f"Accuracy: {acc}")
print(f"Precision: {prec}")
print(f"Recall: {rec}")
print(f"F1-score: {f1}")


Confusion Matrix:
 [[1 0 0]
 [1 7 0]
 [0 0 3]]

--- Random Forest Results ---
Accuracy: 0.92
Precision: 0.83
Recall: 0.96
F1-score: 0.87
