# In [10]:
import cv2
import librosa
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Function to extract speech features (MFCCs) from audio file
def extract_speech_features(file_path):
    """Return the time-averaged MFCC vector of an audio file.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.

    Returns:
        The 13 MFCC coefficients, each averaged over all analysis frames
        (``np.mean`` along axis 1 of the MFCC matrix).
    """
    # sr=None keeps the file's native sampling rate, so that same rate (not a
    # hard-coded 16 kHz) has to be handed to the MFCC computation; otherwise
    # the mel filterbank is built for the wrong frequency range.
    audio, sample_rate = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13)
    return np.mean(mfccs, axis=1)

# Function to extract facial features (dummy function, replace with your facial expression analysis)
def extract_facial_features(frame):
    """Placeholder facial-feature extractor.

    The ``frame`` argument is currently ignored: the function only returns
    five uniformly random values so the rest of the demo has something to
    consume. Swap the body for real facial expression analysis.

    Args:
        frame: Video frame to analyse (unused by this placeholder).

    Returns:
        1-D NumPy array of 5 random floats in [0, 1).
    """
    feature_count = 5  # must match the width of the facial training data
    return np.random.rand(feature_count)

# Synthetic stand-in data for the demo; swap in real dataset loading here.
speech_data = np.random.random_sample((100, 13))  # 100 samples x 13 speech features
facial_data = np.random.random_sample((100, 5))   # 100 samples x 5 facial-expression features
labels = np.random.randint(0, 2, size=100)        # one binary label (0 or 1) per sample

# Hold out 20% of the samples for testing. Splitting all three arrays in a
# single call keeps the speech rows, facial rows and labels aligned.
(
    X_speech_train,
    X_speech_test,
    X_facial_train,
    X_facial_test,
    y_train,
    y_test,
) = train_test_split(speech_data, facial_data, labels, test_size=0.2, random_state=42)

# Fit one logistic-regression classifier per modality on the shared training
# labels. fit() returns the estimator itself, so construction and training
# are chained into a single expression.
speech_model = LogisticRegression().fit(X_speech_train, y_train)
facial_model = LogisticRegression().fit(X_facial_train, y_train)

def _fuse_predictions(speech_features, facial_features):
    """Fuse both modalities by averaging their class probabilities.

    Averaging the hard 0/1 labels and truncating with ``int`` behaves like a
    logical AND (any disagreement collapses to class 0). Averaging the
    models' predicted probabilities instead lets the more confident model
    decide when the two disagree.

    Args:
        speech_features: 2-D array-like, one row of 13 speech features per sample.
        facial_features: 2-D array-like, one row of 5 facial features per sample.

    Returns:
        1-D NumPy array holding one fused class label per sample.
    """
    speech_proba = speech_model.predict_proba(speech_features)
    facial_proba = facial_model.predict_proba(facial_features)
    mean_proba = (speech_proba + facial_proba) / 2
    # Both models were fitted on y_train, so their classes_ (and therefore
    # the column order of predict_proba) are identical.
    return speech_model.classes_[np.argmax(mean_proba, axis=1)]


# Simulate real-time processing (replace this loop with your video/audio streaming code)
for i, speech_features in enumerate(X_speech_test):
    # Simulate real-time facial feature extraction for the current frame
    frame = np.random.rand(480, 640, 3)  # Replace this with your video frame capture code
    facial_features = extract_facial_features(frame)

    # Per-modality predictions, kept available for inspection/debugging
    speech_prediction = speech_model.predict([speech_features])[0]
    facial_prediction = facial_model.predict([facial_features])[0]

    # Fused prediction for this single sample (probability averaging)
    combined_prediction = int(_fuse_predictions([speech_features], [facial_features])[0])

    # Print or use the combined prediction for further processing
    #print(f"Combined Prediction: {combined_prediction}, Ground Truth: {y_test[i]}")

# Evaluate the combined model on the test set in one vectorized pass instead
# of calling predict() once per sample.
combined_predictions = _fuse_predictions(X_speech_test, X_facial_test).tolist()

accuracy = accuracy_score(y_test, combined_predictions)
#print(f"Combined Model Accuracy: {accuracy * 100:.2f}%")