In [2]:
!pip install librosa

Collecting librosa
  Downloading librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Downloading soundfile-0.13.1-py2.py3-none-win_amd64.whl.metadata (16 kB)
Collecting pooch>=1.1 (from librosa)
  Downloading pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-0.5.0.post1-cp312-abi3-win_amd64.whl.metadata (5.6 kB)
Downloading librosa-0.11.0-py3-none-any.whl (260 kB)
Downloading audioread-3.0.1-py3-none-any.whl (23 kB)
Downloading pooch-1.8.2-py3-none-any.whl (64 kB)
Downloading soundfile-0.13.1-py2.py3-none-win_amd64.whl (1.0 MB)
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   ---------- ----------------------------- 0.3/1.0 MB ? eta -:--:--
   -------------------- ------------------- 0.5/1.0 MB

In [3]:
import speech_recognition as sr
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
def speech_to_text(duration=5):
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print(f"Recording for {duration} seconds...")
        recognizer.adjust_for_ambient_noise(source)
        audio = recognizer.listen(source, timeout=duration)
    
    try:
        text = recognizer.recognize_google(audio)
        print(f"Recognized Text: {text}")
        return text
    except sr.UnknownValueError:
        print("Could not understand the audio.")
        return None
    except sr.RequestError:
        print("Error in Speech Recognition API.")
        return None


In [5]:
def extract_features(audio_path):
    y, sr = librosa.load(audio_path, sr=22050)
    mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)
    return mfcc

# Sample Text Emotion Dataset (Replace with actual dataset)
text_data = pd.DataFrame({
    'Text': ['I am very happy today', 'I feel so sad', 'This is frustrating', 'I love this moment', 'I am angry'],
    'Emotion': ['Happy', 'Sad', 'Angry', 'Happy', 'Angry']
})


In [6]:
label_encoder = LabelEncoder()
text_data['Encoded_Emotion'] = label_encoder.fit_transform(text_data['Emotion'])


In [7]:
X_text = text_data['Text'].values
y_text = text_data['Encoded_Emotion']
vectorizer = lambda x: np.array([len(x), sum(ord(c) for c in x)])  # Simple text feature
X_text_features = np.array([vectorizer(txt) for txt in X_text])


In [8]:
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(X_text_features, y_text, test_size=0.2, random_state=42)
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train_text, y_train_text)

In [9]:
def predict_text_emotion(text):
    text_features = vectorizer(text).reshape(1, -1)
    emotion_index = knn_model.predict(text_features)[0]
    return label_encoder.inverse_transform([emotion_index])[0]

In [10]:
audio_features = np.random.rand(50, 13)  # Dummy 50 audio samples with 13 features each
audio_labels = np.random.choice(['Happy', 'Sad', 'Angry', 'Neutral'], 50)
audio_df = pd.DataFrame(audio_features)
audio_df['Emotion'] = audio_labels
audio_df['Encoded_Emotion'] = label_encoder.fit_transform(audio_df['Emotion'])

In [11]:
X_audio = audio_df.iloc[:, :-2].values
y_audio = audio_df['Encoded_Emotion'].values
X_train_audio, X_test_audio, y_train_audio, y_test_audio = train_test_split(X_audio, y_audio, test_size=0.2, random_state=42)


In [12]:
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train_audio, y_train_audio)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [13]:
def predict_speech_emotion(audio_path):
    features = extract_features(audio_path).reshape(1, -1)
    emotion_index = xgb_model.predict(features)[0]
    probabilities = xgb_model.predict_proba(features)[0]
    emotion_prob = {label_encoder.inverse_transform([i])[0]: round(prob * 100, 2) for i, prob in enumerate(probabilities)}
    return label_encoder.inverse_transform([emotion_index])[0], emotion_prob

In [14]:
def compare_models():
    y_pred_text = knn_model.predict(X_test_text)
    y_pred_audio = xgb_model.predict(X_test_audio)

    print("Classification Report for Text Emotion Prediction:")
    print(classification_report(y_test_text, y_pred_text, target_names=label_encoder.classes_))

    print("Classification Report for Speech Emotion Prediction:")
    print(classification_report(y_test_audio, y_pred_audio, target_names=label_encoder.classes_))

    plt.figure(figsize=(12, 5))
    
    plt.subplot(1, 2, 1)
    sns.heatmap(confusion_matrix(y_test_text, y_pred_text), annot=True, fmt="d", cmap="Blues")
    plt.title("Confusion Matrix - Text Emotion")

    plt.subplot(1, 2, 2)
    sns.heatmap(confusion_matrix(y_test_audio, y_pred_audio), annot=True, fmt="d", cmap="Oranges")
    plt.title("Confusion Matrix - Speech Emotion")

    plt.show()

In [17]:
# Running the workflow
if __name__ == "__main__":
    # Step 1: Get speech input
    text_input = speech_to_text(duration=5)
    if text_input:
        # Step 2: Predict emotion from text
        text_emotion = predict_text_emotion(text_input)

        # Step 3: Predict emotion from speech (assuming an audio file is provided)
        audio_emotion, emotion_probs = predict_speech_emotion(r"C:\sandhiya\25-317\sample-0.mp3")  # Replace with actual audio path

        # Step 4: Display final results
        print("\nFinal Emotion Prediction:")
        print(f"From Text: {text_emotion}")
        print(f"From Speech: {audio_emotion}")
        print(f"Mixed Emotion Probabilities: {emotion_probs}")

        # Step 5: Model comparison
        compare_models()

Recording for 5 seconds...
Recognized Text: hi how are you hello

Final Emotion Prediction:
From Text: Happy
From Speech: Happy
Mixed Emotion Probabilities: {'Angry': 1.51, 'Happy': 56.43, 'Neutral': 34.57, 'Sad': 7.49}
Classification Report for Text Emotion Prediction:


ValueError: Number of classes, 2, does not match size of target_names, 4. Try specifying the labels parameter