### Reading image files using openCV

In [75]:
import cv2 as cv
import sys

### Displaying image using opencv, image window won't close until you press 's'

In [2]:
# Load the image
img = cv.imread(cv.samples.findFile("lambo.jpg"))

# Check if the image was loaded successfully
if img is None:
    sys.exit('Could not read the image.')

# Display the image in a window
cv.imshow('Display window', img)

# Wait for a key press
k = cv.waitKey(0)

# If 's' is pressed, save the image and close the window
if k == ord('s'):
    cv.imwrite('lambo.jpg', img)
    cv.destroyAllWindows()  # Close the window after saving

: 

### Extracts frames from video

In [76]:
import os
# def extract_frames(video_path, output_folder):
#     video = cv.VideoCapture(video_path)
#     count = 0
#     while True:
#         ret, frame = video.read()
#         if not ret:
#             break
#         cv.imwrite(f"{output_folder}/frame_{count}.jpg", frame)
#         count += 1
#     video.release()

def extract_frames(video_path, output_folder):
    # Create the output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Load the video
    video = cv.VideoCapture(video_path)

    frame_paths = []
    frame_index = 0
    
    while True:
        ret, frame = video.read()
        if not ret:
            break  # Break if no more frames are available

        # Define the path to save the frame
        frame_filename = os.path.join(output_folder, f"frame_{frame_index}.jpg")
        cv.imwrite(frame_filename, frame)  # Save the frame as a JPEG file
        frame_paths.append(frame_filename)  # Store the path of the saved frame
        
        frame_index += 1

    video.release()
    
    return frame_paths  # Return the list of frame file paths

In [55]:
extract_frames('test_video.mp4', 'frames/')

['frames/frame_0.jpg',
 'frames/frame_1.jpg',
 'frames/frame_2.jpg',
 'frames/frame_3.jpg',
 'frames/frame_4.jpg',
 'frames/frame_5.jpg',
 'frames/frame_6.jpg',
 'frames/frame_7.jpg',
 'frames/frame_8.jpg',
 'frames/frame_9.jpg',
 'frames/frame_10.jpg',
 'frames/frame_11.jpg',
 'frames/frame_12.jpg',
 'frames/frame_13.jpg',
 'frames/frame_14.jpg',
 'frames/frame_15.jpg',
 'frames/frame_16.jpg',
 'frames/frame_17.jpg',
 'frames/frame_18.jpg',
 'frames/frame_19.jpg',
 'frames/frame_20.jpg',
 'frames/frame_21.jpg',
 'frames/frame_22.jpg',
 'frames/frame_23.jpg',
 'frames/frame_24.jpg',
 'frames/frame_25.jpg',
 'frames/frame_26.jpg',
 'frames/frame_27.jpg',
 'frames/frame_28.jpg',
 'frames/frame_29.jpg',
 'frames/frame_30.jpg',
 'frames/frame_31.jpg',
 'frames/frame_32.jpg',
 'frames/frame_33.jpg',
 'frames/frame_34.jpg',
 'frames/frame_35.jpg',
 'frames/frame_36.jpg',
 'frames/frame_37.jpg',
 'frames/frame_38.jpg',
 'frames/frame_39.jpg',
 'frames/frame_40.jpg',
 'frames/frame_41.jpg',
 '

### Classifies objects in a scene using RestNet50

In [77]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
import numpy as np

model = ResNet50(weights='imagenet')

def classify_frame(frame_path):
    img = image.load_img(frame_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    preds = model.predict(x)
    return decode_predictions(preds, top=3)[0]


In [80]:
classify_frame('frames1/frame_50.jpg')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step


[('n03877845', 'palace', 0.9050565),
 ('n02817516', 'bearskin', 0.012761978),
 ('n04005630', 'prison', 0.0058205742)]

### Maps face to known faces

In [81]:
import face_recognition

def recognize_faces(known_faces_encodings, frame_path):
    frame = face_recognition.load_image_file(frame_path)
    face_locations = face_recognition.face_locations(frame)
    face_encodings = face_recognition.face_encodings(frame, face_locations)

    for face_encoding in face_encodings:
        matches = face_recognition.compare_faces(known_faces_encodings, face_encoding)
        if True in matches:
            match_index = matches.index(True)
            # Return matched face details

### Extracts audio from video file

In [95]:
from moviepy.editor import VideoFileClip

def extract_audio(video_path, aud_path):
    # Load the video
    video = VideoFileClip(video_path)
    
    # Check if the video has an audio track
    if not video.audio:
        print("No audio track found in the video.")
        return None
    
    # Save the audio as a WAV file
    video.audio.write_audiofile(aud_path, codec='pcm_s16le')
    
    return aud_path

In [96]:
extract_audio('test_video2.mp4', 'test_audio2.wav')

No audio track found in the video.


### Speech recognition algorithm

In [83]:
import speech_recognition as sr

def transcribe_audio(audio_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand the audio")
        return None
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service; {e}")
        return None

In [19]:
transcribe_audio('test_audio.wav')

Google Speech Recognition could not understand the audio


### The audio has a lot of noise so this is a cleaning algorithm that transforms the audio to a numpy aray and denoises it

In [21]:
from pydub import AudioSegment
import numpy as np
import scipy.signal

def load_audio(audio_path):
    # Load audio file
    audio = AudioSegment.from_file(audio_path)
    
    # Convert audio to a NumPy array
    audio_array = np.array(audio.get_array_of_samples())
    
    # If stereo, take the first channel (mono)
    if audio.channels > 1:
        audio_array = audio_array[::audio.channels]
    
    return audio_array, audio.frame_rate

audio_array, sample_rate = load_audio("test_audio.wav")

In [25]:
audio_array.dtype.itemsize

2

In [29]:
def reduce_noise(audio_array, sample_rate):
    # Design a low-pass filter to remove noise
    nyquist = 0.5 * sample_rate
    low_cutoff = 100.0  # 300 Hz
    high_cutoff = 300.0  # 3000 Hz

    # Design the filter
    b, a = scipy.signal.butter(1, [low_cutoff / nyquist, high_cutoff / nyquist], btype="band")
    
    # Apply the filter
    filtered_audio = scipy.signal.lfilter(b, a, audio_array)
    
    return filtered_audio

cleaned_audio = reduce_noise(audio_array, sample_rate)


In [30]:
def save_audio(audio_array, sample_rate, output_path):
    # Ensure audio_array has a valid dtype for pydub
    if audio_array.dtype == np.float32:
        audio_array = (audio_array * 32767).astype(np.int16)  # Convert float32 to int16
    elif audio_array.dtype == np.float64:
        audio_array = (audio_array * 32767).astype(np.int16)  # Convert float64 to int16
    elif audio_array.dtype == np.int32:
        audio_array = (audio_array // 2).astype(np.int16)  # Convert int32 to int16
    elif audio_array.dtype == np.uint8:
        audio_array = (audio_array - 128).astype(np.int16)  # Convert uint8 to int16
    
    sample_width = audio_array.dtype.itemsize  # This should now be valid (1, 2, 3, or 4 bytes)

    # Convert NumPy array back to AudioSegment
    cleaned_audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sample_rate,
        sample_width=sample_width,
        channels=1
    )
    
    # Export the cleaned audio
    cleaned_audio_segment.export(output_path, format="wav")

In [31]:
cleaned_audio = np.random.randn(44100)  # Example audio array
save_audio(cleaned_audio, 44100, "cleaned_test_audio.wav")

### This method is very time constraint as i have to tweek the numbers manually until i have a very clear audio...


### Trying out the 'noisereduce' library

In [32]:
import noisereduce as nr

def advanced_noise_reduction(audio_array, sample_rate):
    # Apply noise reduction
    reduced_noise_audio = nr.reduce_noise(y=audio_array, sr=sample_rate)
    return reduced_noise_audio

cleaned_audio = advanced_noise_reduction(audio_array, sample_rate)
save_audio(cleaned_audio, sample_rate, "cleaned_test_audio_advanced.wav")

Exception ignored in: <function Wave_write.__del__ at 0x1514cd080>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/wave.py", line 465, in __del__
    self.close()
  File "/opt/anaconda3/lib/python3.12/wave.py", line 583, in close
    self._ensure_header_written(0)
  File "/opt/anaconda3/lib/python3.12/wave.py", line 603, in _ensure_header_written
    raise Error('sample width not specified')
wave.Error: sample width not specified


### Attempting the speech recognition again

In [33]:
transcribe_audio('cleaned_test_audio_advanced.wav')

Google Speech Recognition could not understand the audio


In [84]:
from keras.layers import Conv3D, MaxPooling3D, Reshape, LSTM, Dense
from keras.models import Sequential

def build_action_recognition_model():
    model = Sequential()
    
    # 3D Convolution Layer
    model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=(frames, height, width, channels)))
    
    # 3D MaxPooling Layer
    model.add(MaxPooling3D(pool_size=(2, 2, 2)))
    
    # Reshape Layer to flatten height and width dimensions
    model.add(Reshape((-1, 32)))  # Flatten spatial dimensions, keep channels
    
    # LSTM Layer
    model.add(LSTM(100, activation='relu'))
    
    # Dense Layer
    model.add(Dense(1, activation='sigmoid'))
    
    return model

# Example usage
frames = 16  # Number of frames in a sequence
height = 112  # Height of each frame
width = 112  # Width of each frame
channels = 3  # Number of color channels (e.g., RGB)

model = build_action_recognition_model()
model.summary()

In [85]:
from ultralytics import YOLO

# Load YOLOv8 model
model = YOLO('yolov8n.pt')


In [86]:
# Example: Detect objects in a single frame
results = model('frames1/frame_100.jpg')


image 1/1 /Users/user/Documents/Data Science Projects/Context Understanding from Videos/frames1/frame_100.jpg: 384x640 1 person, 559.8ms
Speed: 57.6ms preprocess, 559.8ms inference, 101.6ms postprocess per image at shape (1, 3, 384, 640)


In [87]:
import torch
from PIL import Image

def detect_objects(frames):
    # Load YOLOv5 model from Ultralytics
    model = torch.hub.load('ultralytics/yolov5', 'yolov5n', pretrained=True)
    
    detected_objects = {}
    
    for frame_path in frames:
        img = Image.open(frame_path)
        results = model(img)
        detected_objects[frame_path] = results.pandas().xyxy[0].to_dict(orient="records")
    
    return detected_objects

In [88]:
import face_recognition

def recognize_faces_in_frames(frames):
    recognized_faces = {}
    
    for frame_path in frames:
        image = face_recognition.load_image_file(frame_path)
        face_locations = face_recognition.face_locations(image)
        face_encodings = face_recognition.face_encodings(image, face_locations)
        recognized_faces[frame_path] = {
            'locations': face_locations,
            'encodings': face_encodings
        }
    
    return recognized_faces

In [89]:
from fer import FER

def detect_emotions(frames):
    detector = FER()  # Initialize FER detector
    detected_emotions = {}
    
    for frame_path in frames:
        # Read the image from the frame path
        img = cv.imread(frame_path)
        
        # Convert image from BGR (OpenCV format) to RGB (FER format)
        img_rgb = cv.cvtColor(img, cv.COLOR_BGR2RGB)
        
        # Detect emotions in the image
        result = detector.detect_emotions(img_rgb)
        
        # Extract the dominant emotion
        if result:
            emotions = result[0]['emotions']
            dominant_emotion = max(emotions, key=emotions.get)
        else:
            dominant_emotion = "unknown"
        
        detected_emotions[frame_path] = dominant_emotion
    
    return detected_emotions

In [90]:
def recognize_faces_in_frame(known_face_encodings, frame):
    face_locations = face_recognition.face_locations(frame)
    face_encodings = face_recognition.face_encodings(frame, face_locations)
    matches = face_recognition.compare_faces(known_face_encodings, face_encodings)
    return matches

In [91]:
import torch
import torchvision.transforms as transforms
from torchvision.models.video import r3d_18  # ResNet3D model

def recognize_actions(frames):
    # Load a pre-trained ResNet3D model for action recognition
    model = r3d_18(pretrained=True)
    model.eval()
    
    # Define the transformation to apply to each frame
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((112, 112)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    
    # Prepare a list to store the transformed frames
    processed_frames = []
    
    for frame_path in frames:
        img = cv.imread(frame_path)
        img = cv.cvtColor(img, cv.COLOR_BGR2RGB)  # Convert BGR to RGB
        transformed_frame = transform(img)
        processed_frames.append(transformed_frame)
    
    # Stack frames along the depth dimension (to match [C, T, H, W])
    clip = torch.stack(processed_frames, dim=1).unsqueeze(0)  # Shape: [1, C, T, H, W]
    
    # Predict actions in the sequence of frames
    with torch.no_grad():
        outputs = model(clip)
    
    # Get the predicted action
    _, predicted = outputs.max(1)
    action_label = predicted.item()  # Convert tensor to integer
    
    return action_label


In [92]:
def analyze_context(detected_objects, recognized_faces, detected_emotions, actions, transcribed_text):
    # Generate a text-based summary
    
    # Extract object names from detected_objects
    object_names = []
    for objs in detected_objects.values():
        for obj in objs:
            if isinstance(obj, dict):
                object_names.append(obj.get('name', 'Unknown Object'))  # Handling if 'name' key is missing
            else:
                object_names.append(obj)  # In case obj is a string directly

    summary = f"Actions: {actions}\n"
    summary += f"Objects Detected: {', '.join(set(object_names))}\n"
    
    # Extract face names from recognized_faces
    face_names = []
    for faces in recognized_faces.values():
        for face in faces:
            if isinstance(face, dict):
                face_names.append(face.get('name', 'Unknown Face'))  # Handling if 'name' key is missing
            else:
                face_names.append(face)  # In case face is a string directly
    
    summary += f"Faces Recognized: {', '.join(set(face_names))}\n"
    summary += f"Emotions Detected: {', '.join(set(detected_emotions.values()))}\n"
    summary += f"Transcribed Speech: {transcribed_text}\n"
    
    return summary

In [103]:
def analyze_video(video_path):
    # Step 1: Extract frames and audio
    frames = extract_frames(video_path, 'images/frames/')
    audio = extract_audio(video_path, 'audio/test_audio.wav')
    
    # Step 2: Detect objects, faces, emotions in each frame
    detected_objects = detect_objects(frames)
    recognized_faces = recognize_faces_in_frames(frames)
    detected_emotions = detect_emotions(frames)
    
    # Step 3: Recognize actions in the sequence of frames
    actions = recognize_actions(frames)
    
    # Step 4: Transcribe speech in the audio
    transcribed_text = transcribe_audio(audio)
    
    # Step 5: Contextual analysis (Combine all information)
    context = analyze_context(detected_objects, recognized_faces, detected_emotions, actions, transcribed_text)
    
    # Step 6: Output the context
    return context

context_summary = analyze_video('video/test_video.mp4')
print(context_summary)


MoviePy - Writing audio in audio/test_audio.wav


                                                                    

MoviePy - Done.


Using cache found in /Users/user/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2024-8-21 Python-3.12.4 torch-2.2.2 CPU

Fusing layers... 
YOLOv5n summary: 213 layers, 1867405 parameters, 0 gradients, 4.5 GFLOPs
Adding AutoShape... 


Google Speech Recognition could not understand the audio
Actions: 378
Objects Detected: motorcycle, person, dog, car, skateboard, bicycle
Faces Recognized: encodings, locations
Emotions Detected: happy, fear, unknown, angry, sad
Transcribed Speech: None

