In [1]:
# CELL 1: Install Libraries
import subprocess
import sys

# Every third-party package imported anywhere in this notebook.
# numpy and sounddevice are imported by later cells, so they are installed
# here as well instead of relying on them already being present.
REQUIRED_PACKAGES = ["mediapipe", "opencv-python", "matplotlib", "numpy", "sounddevice"]

# Run pip through the interpreter that backs this kernel so the packages land
# in the environment the notebook actually imports from.
pip_run = subprocess.run(
    [sys.executable, "-m", "pip", "install", *REQUIRED_PACKAGES],
    capture_output=True,
    text=True,
)
print(pip_run.stdout)
if pip_run.stderr:
    print(pip_run.stderr)

# Stop here on failure: the success message below must only appear when pip
# really succeeded, otherwise later import errors are hard to trace back.
if pip_run.returncode != 0:
    raise RuntimeError(f"pip install failed with exit code {pip_run.returncode}")

print("‚úÖ Installation Complete.")

‚úÖ Installation Complete.



[notice] A new release of pip available: 22.2.2 -> 25.3
[notice] To update, run: c:\Users\preda\.pyenv\pyenv-win\versions\3.10.7\python.exe -m pip install --upgrade pip


In [2]:
# CELL 2: Imports
# Shared imports for the download, text, vision and audio cells below.
import mediapipe as mp
import cv2
import numpy as np
import urllib.request
import os

# Define the "Tasks" library (The new v0.10.31 way of doing things)
# `python` supplies BaseOptions; `vision`, `text` and `audio` supply the
# FaceLandmarker, TextClassifier and AudioClassifier classes used later.
from mediapipe.tasks import python
from mediapipe.tasks.python import vision, text, audio

# Show which MediaPipe version is installed in this kernel.
print(f"‚úÖ Ready. Using MediaPipe Version: {mp.__version__}")

‚úÖ Ready. Using MediaPipe Version: 0.10.31


In [3]:
# CELL 3: Download Text Model
text_model_url = "https://storage.googleapis.com/mediapipe-models/text_classifier/bert_classifier/float32/1/bert_classifier.tflite"
text_model_file = "bert_classifier.tflite"

if not os.path.exists(text_model_file):
    print("üì• Downloading Text Brain...")
    # Download under a temporary name and rename only once the transfer has
    # finished. Writing straight to the final name would leave a truncated
    # file behind after an interrupted download, and the exists() check above
    # would then accept that broken file on the next run.
    text_model_partial = text_model_file + ".part"
    try:
        urllib.request.urlretrieve(text_model_url, text_model_partial)
        os.replace(text_model_partial, text_model_file)
    finally:
        # Only present if the download or the rename failed.
        if os.path.exists(text_model_partial):
            os.remove(text_model_partial)
    print("‚úÖ Text Model Downloaded.")
else:
    print("‚ö° Text Model already exists.")

üì• Downloading Text Brain...
‚úÖ Text Model Downloaded.


In [4]:
# CELL 4: Download Vision Model
vision_model_url = "https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/1/face_landmarker.task"
vision_model_file = "face_landmarker.task"

if not os.path.exists(vision_model_file):
    print("üì• Downloading Vision Brain...")
    # Download under a temporary name and rename only once the transfer has
    # finished. Writing straight to the final name would leave a truncated
    # file behind after an interrupted download, and the exists() check above
    # would then accept that broken file on the next run.
    vision_model_partial = vision_model_file + ".part"
    try:
        urllib.request.urlretrieve(vision_model_url, vision_model_partial)
        os.replace(vision_model_partial, vision_model_file)
    finally:
        # Only present if the download or the rename failed.
        if os.path.exists(vision_model_partial):
            os.remove(vision_model_partial)
    print("‚úÖ Vision Model Downloaded.")
else:
    print("‚ö° Vision Model already exists.")

‚ö° Vision Model already exists.


In [6]:
# CELL 5: Download Audio Model
audio_model_url = "https://storage.googleapis.com/mediapipe-models/audio_classifier/yamnet/float32/1/yamnet.tflite"
audio_model_file = "yamnet.tflite"

if not os.path.exists(audio_model_file):
    print("üì• Downloading Audio Brain...")
    # Download under a temporary name and rename only once the transfer has
    # finished. Writing straight to the final name would leave a truncated
    # file behind after an interrupted download, and the exists() check above
    # would then accept that broken file on the next run.
    audio_model_partial = audio_model_file + ".part"
    try:
        urllib.request.urlretrieve(audio_model_url, audio_model_partial)
        os.replace(audio_model_partial, audio_model_file)
    finally:
        # Only present if the download or the rename failed.
        if os.path.exists(audio_model_partial):
            os.remove(audio_model_partial)
    print("‚úÖ Audio Model Downloaded.")
else:
    print("‚ö° Audio Model already exists.")

‚ö° Audio Model already exists.


In [7]:
# CELL 6: Setup Text AI
# Step 1: point the base options at the text model file from the download cell.
base_options = python.BaseOptions(
    model_asset_path=text_model_file,
)

# Step 2: wrap the base options in the classifier-specific options object.
options = text.TextClassifierOptions(
    base_options=base_options,
)

# Step 3: build the classifier that the test cells call.
text_classifier = text.TextClassifier.create_from_options(
    options,
)
print("‚úÖ Text Classifier is Awake.")

‚úÖ Text Classifier is Awake.


In [10]:
# CELL 7: Test Happy Input
my_text = "I am feeling nice for now, atleast"
classification = text_classifier.classify(my_text)

# The result holds a list of categories, each with its own label name and
# score. A list position says nothing about which label it is, so pick the
# winner by score explicitly instead of assuming index 0 is the best guess.
top_result = max(
    classification.classifications[0].categories,
    key=lambda category: category.score,
)

print(f"Input: '{my_text}'")
print(f"AI thinks this is: {top_result.category_name} (Score: {top_result.score:.2f})")

Input: 'I am feeling nice for now, atleast'
AI thinks this is: positive (Score: 0.99)


In [11]:
# CELL 8: Test Sad Input
sad_text = "I am feeling very down and lonely."
classification = text_classifier.classify(sad_text)

# Pick the winning label by score explicitly instead of assuming that the
# first entry of the category list is the best guess.
top_result = max(
    classification.classifications[0].categories,
    key=lambda category: category.score,
)

print(f"Input: '{sad_text}'")
print(f"AI thinks this is: {top_result.category_name} (Score: {top_result.score:.2f})")

Input: 'I am feeling very down and lonely.'
AI thinks this is: negative (Score: 0.97)


In [12]:
# CELL 9: Setup Vision AI
# Base options carry the path of the face landmarker model file.
base_options = python.BaseOptions(
    model_asset_path=vision_model_file,
)

# Landmarker settings: a single face, video running mode, and blendshape
# output switched on (the blendshapes are what lets us see smiles).
options = vision.FaceLandmarkerOptions(
    num_faces=1,
    running_mode=vision.RunningMode.VIDEO,
    output_face_blendshapes=True,
    base_options=base_options,
)

face_landmarker = vision.FaceLandmarker.create_from_options(
    options,
)
print("‚úÖ Face AI is Awake.")

‚úÖ Face AI is Awake.


In [13]:
# CELL 10: Camera Access
# DirectShow is a Windows-only capture backend. Request it only on Windows and
# let OpenCV choose a backend by itself on every other operating system, so
# the notebook also opens the camera on Linux and macOS.
camera_backend = cv2.CAP_DSHOW if os.name == "nt" else cv2.CAP_ANY
cap = cv2.VideoCapture(0, camera_backend)

if cap.isOpened():
    print("‚úÖ Camera is ready.")
else:
    print("‚ùå Camera failed. Check your settings.")

‚úÖ Camera is ready.


In [14]:
# CELL 11: Real-time Emotion Detector
import time

print("Controls: Smile to be Happy. Press 'q' to stop.")

# Timestamp (ms) handed to the previous detect_for_video call.
last_timestamp_ms = -1

# try/finally guarantees that the webcam and the preview window are released
# even when a frame fails to process or the kernel is interrupted mid-loop;
# otherwise the camera stays locked until the kernel is restarted.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Convert to MediaPipe format (OpenCV delivers BGR, MediaPipe wants RGB)
        mp_image = mp.Image(
            image_format=mp.ImageFormat.SRGB,
            data=cv2.cvtColor(frame, cv2.COLOR_BGR2RGB),
        )

        # Video mode expects timestamps that keep increasing. The wall clock
        # can jump backwards and two fast frames can share one millisecond,
        # so use the monotonic clock and force each value above the last one.
        timestamp = max(int(time.monotonic() * 1000), last_timestamp_ms + 1)
        last_timestamp_ms = timestamp

        # Detect
        result = face_landmarker.detect_for_video(mp_image, timestamp)

        # LOGIC: Check for Smile
        status = "Neutral / Sad"
        color = (0, 0, 255) # Red

        if result.face_blendshapes:
            # Access the first face detected
            shapes = result.face_blendshapes[0]

            # Look the two smile blendshapes up by name rather than by index.
            smile_score = 0
            for shape in shapes:
                if shape.category_name in ('mouthSmileLeft', 'mouthSmileRight'):
                    smile_score += shape.score

            # Average the two sides
            smile_score /= 2

            if smile_score > 0.5:
                status = "HAPPY / SMILING"
                color = (0, 255, 0) # Green

        cv2.putText(frame, f"Emotion: {status}", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2)
        cv2.imshow('Emotion Detector', frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

Controls: Smile to be Happy. Press 'q' to stop.


In [33]:
# CELL 12: Setup Audio AI
from mediapipe.tasks.python.audio.core import audio_record
from mediapipe.tasks.python.components.containers import audio_data as audio_data_module
from mediapipe.tasks import python
from mediapipe.tasks.python import audio

# Configure the model
# Reuse the path variable set by the audio download cell instead of repeating
# the file name here, matching how the text and vision setup cells load
# their models.
base_options = python.BaseOptions(model_asset_path=audio_model_file)
options = audio.AudioClassifierOptions(
    base_options=base_options,
    max_results=3)

# Create the tool
audio_classifier = audio.AudioClassifier.create_from_options(options)
print("‚úÖ Audio AI is Awake and ready to listen.")

‚úÖ Audio AI is Awake and ready to listen.


In [34]:
# CELL: Find My Microphone
import sounddevice as sd

# Show every audio device that sounddevice can see.
print("üéß Scanning Audio Devices...")
device_table = sd.query_devices()
print(device_table)

# Guidance for choosing the right entry from the table printed above.
picking_hints = (
    "\n------------------------------------------------",
    "üëâ LOOK AT THE LIST ABOVE.",
    "Find the number (Index) next to your actual Microphone.",
    "Ignore 'Microsoft Sound Mapper' or 'Primary Sound Capture'.",
    "Look for hardware names like 'Microphone Array (Realtek)' or 'Headset'.",
)
for hint in picking_hints:
    print(hint)

üéß Scanning Audio Devices...
   0 Microsoft Sound Mapper - Input, MME (2 in, 0 out)
>  1 Headset Microphone (Oculus Virt, MME (1 in, 0 out)
   2 Microphone Array (Intel¬Æ Smart , MME (4 in, 0 out)
   3 Microsoft Sound Mapper - Output, MME (0 in, 2 out)
<  4 Speakers (Realtek(R) Audio), MME (0 in, 2 out)
   5 Headphones (Oculus Virtual Audi, MME (0 in, 2 out)
   6 Primary Sound Capture Driver, Windows DirectSound (2 in, 0 out)
   7 Headset Microphone (Oculus Virtual Audio Device), Windows DirectSound (1 in, 0 out)
   8 Microphone Array (Intel¬Æ Smart Sound Technology for Digital Microphones), Windows DirectSound (4 in, 0 out)
   9 Primary Sound Driver, Windows DirectSound (0 in, 2 out)
  10 Speakers (Realtek(R) Audio), Windows DirectSound (0 in, 2 out)
  11 Headphones (Oculus Virtual Audio Device), Windows DirectSound (0 in, 2 out)
  12 Speakers (Realtek(R) Audio), Windows WASAPI (0 in, 2 out)
  13 Headphones (Oculus Virtual Audio Device), Windows WASAPI (0 in, 2 out)
  14 Headset Mic

In [48]:
# CELL 13 (FIXED): Recorder with Device Selector
import sounddevice as sd
import numpy as np
import time

# --- CONFIGURATION ---
# CHANGE THIS NUMBER to the index you found in Step 1!
# If you are not sure, try 1, then 2, then 3.
MY_MIC_INDEX = 2
# ---------------------

def record_audio(duration=5, sample_rate=16000, device_id=MY_MIC_INDEX):
    """Record a mono clip from one input device and report its peak volume.

    Args:
        duration: Length of the recording in seconds.
        sample_rate: Samples per second requested from the device.
        device_id: sounddevice index of the input device to record from.

    Returns:
        A float32 array of shape (int(duration * sample_rate), 1). When the
        recording fails for any reason, the error is printed and an all-zero
        array of the same shape and dtype is returned instead, so callers
        always receive the same kind of array.
    """
    print(f"üé§ Recording for {duration} seconds using Device {device_id}...")
    frame_count = int(duration * sample_rate)

    try:
        # blocking=True makes the call return only after the clip is complete.
        recording = sd.rec(frame_count,
                           samplerate=sample_rate,
                           channels=1,
                           dtype='float32',
                           device=device_id,
                           blocking=True)

        # np.max raises on an empty array, so a zero-length capture is treated
        # as silence instead of being misreported as a device failure.
        vol = float(np.max(np.abs(recording))) if recording.size else 0.0
        print(f"üìä Volume captured: {vol:.4f}")

        if vol == 0.0:
            print("‚ùå ERROR: Still hearing Silence (0.0). Try a different Device Index!")
        else:
            print("‚úÖ Success! Audio captured.")

        return recording

    except Exception as e:
        # Best-effort fallback: report the problem and hand back silence with
        # the same dtype as a real recording (float32).
        print(f"‚ùå Device Failed: {e}")
        return np.zeros((frame_count, 1), dtype=np.float32)

# Test it immediately
test_audio = record_audio()

üé§ Recording for 5 seconds using Device 2...


Exception ignored from cffi callback <function _StreamBase.__init__.<locals>.finished_callback_wrapper at 0x000002743AB0FF40>:
Traceback (most recent call last):
  File "c:\Users\preda\.pyenv\pyenv-win\versions\3.10.7\lib\site-packages\sounddevice.py", line 940, in finished_callback_wrapper
    return finished_callback()
  File "c:\Users\preda\.pyenv\pyenv-win\versions\3.10.7\lib\site-packages\sounddevice.py", line 2652, in finished_callback
    del self.data
AttributeError: data


üìä Volume captured: 0.1018
‚úÖ Success! Audio captured.


In [55]:
# CELL 14 (DEBUG MODE): Audio Analysis with "Top 3" & Volume Check
import numpy as np

# One sample rate for both recording and classification, so the two can
# never drift apart.
ANALYSIS_SAMPLE_RATE = 16000

# Sound labels that the simple emotion logic maps to an emotion.
HAPPY_SOUNDS = ("Laughter", "Giggle", "Snicker")
SAD_SOUNDS = ("Crying, sobbing", "Whimper")


def summarize_top_categories(results, limit=3):
    """Merge the per-window classifier results into one ranked label list.

    The classifier returns a list of results for one clip. Looking only at
    the first entry ignores every other result, so a laugh later in the clip
    would be missed. This keeps the best score each label reached anywhere
    in the list.

    Args:
        results: Iterable of result objects; each has a `classifications`
            list whose first entry has a `categories` list of objects with
            `category_name` and `score`.
        limit: Maximum number of labels to return.

    Returns:
        A list of (category_name, score) tuples sorted by descending score,
        at most `limit` long. Empty when there is nothing to summarize.
    """
    best_by_name = {}
    for window in results:
        if not window.classifications:
            continue
        for category in window.classifications[0].categories:
            name = category.category_name
            if name not in best_by_name or category.score > best_by_name[name]:
                best_by_name[name] = category.score

    ranked = sorted(best_by_name.items(), key=lambda item: item[1], reverse=True)
    return ranked[:limit]


try:
    # 1. Record Audio
    raw_audio = record_audio(sample_rate=ANALYSIS_SAMPLE_RATE) # Uses the function from Cell 13

    # --- DEBUG 1: VOLUME CHECK ---
    # We check how loud the sound was (0.0 to 1.0)
    volume = np.max(np.abs(raw_audio))
    print(f"üìä Volume Level: {volume:.4f}")

    if volume < 0.01:
        print("‚ö†Ô∏è WARNING: Audio is too quiet! Move closer to the mic.")
    # -----------------------------

    # 2. Prepare Data (flatten the single-channel column into a 1-D signal)
    mono_audio = raw_audio.reshape(-1)
    mp_audio = audio_data_module.AudioData.create_from_array(
        mono_audio.astype(float), ANALYSIS_SAMPLE_RATE)

    # 3. Classify
    results = audio_classifier.classify(mp_audio)

    # 4. Show Top 3 Results across the whole clip (To see what's hiding)
    top_categories = summarize_top_categories(results or [])
    if top_categories:
        print("\nüèÜ Top 3 Guesses:")
        print("-" * 30)

        for rank, (name, score) in enumerate(top_categories, start=1):
            print(f"{rank}. {name} (Confidence: {score:.2f})")

            # Simple Emotion Logic based on the specific category
            if name in HAPPY_SOUNDS:
                print(f"   >>> EMOTION DETECTED: HAPPY üòä (Rank {rank})")
            elif name in SAD_SOUNDS:
                print(f"   >>> EMOTION DETECTED: SAD üò¢ (Rank {rank})")

    else:
        print("No sound detected.")

except Exception as e:
    # Debug cell: report the problem instead of stopping the notebook.
    print(f"Error: {e}")

üé§ Recording for 5 seconds using Device 2...


Exception ignored from cffi callback <function _StreamBase.__init__.<locals>.finished_callback_wrapper at 0x000002743AB94D30>:
Traceback (most recent call last):
  File "c:\Users\preda\.pyenv\pyenv-win\versions\3.10.7\lib\site-packages\sounddevice.py", line 940, in finished_callback_wrapper
    return finished_callback()
  File "c:\Users\preda\.pyenv\pyenv-win\versions\3.10.7\lib\site-packages\sounddevice.py", line 2652, in finished_callback
    del self.data
AttributeError: data


üìä Volume captured: 0.1601
‚úÖ Success! Audio captured.
üìä Volume Level: 0.1601

üèÜ Top 3 Guesses:
------------------------------
1. Grunt (Confidence: 0.41)
2. Roar (Confidence: 0.26)
3. Roaring cats (lions, tigers) (Confidence: 0.26)


In [18]:
# CELL 15: Conclusion
# Closing recap: one line per detector built in this notebook.
closing_lines = (
    "üéâ COURSE COMPLETE!",
    "1. We built a Text AI that knows if you write happy words.",
    "2. We built a Vision AI that sees if you smile.",
    "3. We built an Audio AI that hears if you laugh.",
    "This is Multimodal AI!",
)
for closing_line in closing_lines:
    print(closing_line)

üéâ COURSE COMPLETE!
1. We built a Text AI that knows if you write happy words.
2. We built a Vision AI that sees if you smile.
3. We built an Audio AI that hears if you laugh.
This is Multimodal AI!
