In [None]:
!pip install ffmpeg-python openai-whisper


Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidi

In [None]:
import os
import random
import numpy as np
import ffmpeg as ff
import cv2
import librosa
import torch
import whisper
from typing import Tuple, List, Dict
from transformers import BertTokenizer, BertModel
import tensorflow as tf

# Load models once, at module level, so every call below reuses the same instances.

# BERT tokenizer and encoder; both are read by get_text_embeddings().
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
# Whisper speech-to-text model; predict_personality() uses it to transcribe the clip.
whisper_model = whisper.load_model("base.en")
# Keras model that predict_personality() feeds with [audio, video, text] inputs.
# NOTE(review): safe_mode=False turns off Keras' safe-deserialization checks, so
# loading can execute code embedded in the file; only load a model file you trust.
# NOTE(review): the path presumably lives on a mounted Google Drive; the mount step
# is not shown in this notebook, so confirm it exists before this cell runs.
predictor = tf.keras.models.load_model('/content/drive/MyDrive/first_impressions_model.keras', safe_mode=False)


def extract_audio_from_video(file_path: str) -> np.ndarray:
    """Decode the audio track of ``file_path`` into a flat float32 sample array.

    ffmpeg is asked to write raw little-endian 32-bit float PCM to stdout,
    mixed down to a single channel and resampled to 44100 Hz. The captured
    bytes are then reinterpreted as ``np.float32`` samples.

    Args:
        file_path: Path of the media file handed to ffmpeg.

    Returns:
        One-dimensional ``np.float32`` array of samples. It is a read-only view
        over the captured bytes, so copy it before modifying in place.

    Raises:
        Whatever the ffmpeg wrapper's ``run`` raises when the command fails;
        nothing is caught here.
    """
    ffmpeg_cmd = ff.input(file_path).output(
        '-',
        format='f32le',
        acodec='pcm_f32le',
        ac=1,
        ar='44100',
    )
    # Both streams are captured; only stdout (the PCM payload) is used.
    stdout_bytes, _stderr_bytes = ffmpeg_cmd.run(capture_stdout=True, capture_stderr=True)
    return np.frombuffer(stdout_bytes, dtype=np.float32)

def preprocess_audio_series(audio_raw):
    """Force a sample array to exactly 24 * 1319 values and shape it (24, 1319, 1).

    Inputs longer than 31656 samples keep only their first 31656 values;
    shorter inputs are right-padded with zeros. The input and output shapes
    are printed.

    NOTE(review): when fed by extract_audio_from_video (44100 Hz) this keeps
    only about the first 0.72 s of audio. Confirm that this matches how the
    downstream model was trained.

    Args:
        audio_raw: NumPy array of audio samples (expected to be 1-D).

    Returns:
        Array of shape ``(24, 1319, 1)``.
    """
    print(f"Original audio shape: {audio_raw.shape}")

    n_rows, n_cols = 24, 1319
    required = n_rows * n_cols
    shortfall = required - audio_raw.size

    if shortfall < 0:
        # Too many samples: drop everything past the required length.
        audio_raw = audio_raw[:required]
    elif shortfall > 0:
        # Too few samples: append zeros at the end.
        audio_raw = np.pad(audio_raw, (0, shortfall), 'constant')

    audio_raw = np.reshape(audio_raw, (n_rows, n_cols, 1))

    print(f"Processed audio shape: {audio_raw.shape}")
    return audio_raw



def get_number_of_frames(file_path: str) -> int:
    """Return the number of frames in the video at ``file_path``.

    The count is taken from the container metadata (``CAP_PROP_FRAME_COUNT``).
    When that is not positive, the video is decoded frame by frame and the
    frames are counted. A one-line summary (duration, FPS, frame count) is
    printed.

    Args:
        file_path: Path to the video file.

    Returns:
        The positive number of frames.

    Raises:
        ValueError: If the file is missing, empty, cannot be opened, or no
            frame count can be determined. Any other failure is wrapped in
            ``ValueError`` too, with the original exception chained as its cause.
    """
    cap = None
    try:
        # Cheap filesystem checks first, so the messages are specific.
        if not os.path.exists(file_path):
            raise ValueError("Video file does not exist")
        if os.path.getsize(file_path) == 0:
            raise ValueError("Video file is empty")

        cap = cv2.VideoCapture(file_path)
        if not cap.isOpened():
            raise ValueError("Could not open video file")

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)

        # Metadata gave no usable count: decode the stream and count the frames.
        if total_frames <= 0:
            print("Could not get frame count directly, counting frames...")
            total_frames = 0
            while True:
                ret, _ = cap.read()
                if not ret:
                    break
                total_frames += 1

        duration = total_frames / fps if fps > 0 else 0
        print(f"Video properties - Duration: {duration:.2f}s, FPS: {fps:.2f}, Total frames: {total_frames}")

        if total_frames <= 0:
            raise ValueError("Could not determine number of frames in video")

        return total_frames

    except Exception as e:
        print(f"Error details: {str(e)}")
        raise ValueError(f"Error processing video file: {str(e)}") from e
    finally:
        # Release the capture on every path, including the error paths.
        if cap is not None:
            cap.release()

def extract_N_video_frames(file_path: str, number_of_samples: int = 6) -> List[np.ndarray]:
    """Return ``number_of_samples`` evenly spaced frames of a video, as RGB arrays.

    Frame indices are spread uniformly over ``[0, total_frames - 1]``, so the
    first and last samples are the first and last frames of the video. If a
    requested frame cannot be decoded, the nearest frame within five positions
    is used instead.

    Args:
        file_path: Path to the video file.
        number_of_samples: How many frames to return. A value of 1 returns the
            first frame; a value <= 0 returns an empty list.

    Returns:
        List of frames converted from OpenCV's BGR order to RGB.

    Raises:
        ValueError: If the video cannot be opened, has fewer frames than
            requested, or a sample frame (and its neighbours) cannot be read.
            Any other failure is wrapped in ``ValueError`` as well, with the
            original exception chained as its cause.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(file_path)
        if not cap.isOpened():
            raise ValueError("Could not open video file")

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)

        # Metadata gave no usable count: decode the stream and count the frames.
        if total_frames <= 0:
            print("Could not get frame count directly, counting frames...")
            total_frames = 0
            while True:
                ret, _ = cap.read()
                if not ret:
                    break
                total_frames += 1
            print(f"Counted {total_frames} frames")

        duration = total_frames / fps if fps > 0 else 0
        print(f"Video properties - Duration: {duration:.2f}s, FPS: {fps:.2f}, Total frames: {total_frames}")

        if total_frames < number_of_samples:
            raise ValueError(f"Video is too short. Expected at least {number_of_samples} frames, got {total_frames}")

        if number_of_samples <= 0:
            return []

        # Work in frame indices, not timestamps: the valid range is
        # 0 .. total_frames - 1, and a timestamp equal to the full duration
        # would map one frame past the end.
        last_index = total_frames - 1
        if number_of_samples == 1:
            frame_positions = [0]
        else:
            frame_positions = [
                round(i * last_index / (number_of_samples - 1))
                for i in range(number_of_samples)
            ]
        print(f"Sampling at frame positions: {frame_positions}")

        video_frames = []
        for frame_pos in frame_positions:
            time_point = frame_pos / fps if fps > 0 else 0
            print(f"Attempting to read frame at time {time_point:.2f}s (position {frame_pos})")

            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_pos)
            res, frame = cap.read()

            if not res:
                print(f"Warning: Could not read frame at position {frame_pos}")
                # Try neighbours nearest-first (-1, +1, -2, +2, ...) so the
                # substitute is as close as possible to the requested frame.
                for offset in sorted(range(-5, 6), key=abs):
                    candidate = frame_pos + offset
                    if offset == 0 or not 0 <= candidate < total_frames:
                        continue
                    cap.set(cv2.CAP_PROP_POS_FRAMES, candidate)
                    res, frame = cap.read()
                    if res:
                        print(f"Successfully read frame at offset {offset}")
                        break

                if not res:
                    raise ValueError(f"Could not read any frames near position {frame_pos}")

            video_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        return video_frames
    except Exception as e:
        raise ValueError(f"Error extracting video frames: {str(e)}") from e
    finally:
        # Release the capture on every path, including the error paths.
        if cap is not None:
            cap.release()

def resize_image(image: np.ndarray, new_size: Tuple[int, int]) -> np.ndarray:
    """Resize ``image`` to ``new_size`` using OpenCV's area interpolation.

    Args:
        image: Image array accepted by ``cv2.resize``.
        new_size: Target size, passed unchanged as ``cv2.resize``'s size
            argument. OpenCV reads it as ``(width, height)``.

    Returns:
        The resized image.
    """
    resized = cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)
    return resized

def crop_image_window(image: np.ndarray, training: bool = False, window_size: int = 128) -> np.ndarray:
    """Return the centred ``window_size`` x ``window_size`` crop of ``image``.

    Args:
        image: Array whose first two axes are height and width; any trailing
            axes (e.g. colour channels) are kept as they are.
        training: Accepted for interface compatibility; currently ignored.
        window_size: Side length of the square crop, 128 by default.

    Returns:
        A view of ``image`` with height and width equal to ``window_size``.

    Raises:
        ValueError: If the image is smaller than the window in either
            dimension. Without this check the start index would go negative
            and slicing would silently return a wrongly sized crop.
    """
    height, width = image.shape[:2]
    if height < window_size or width < window_size:
        raise ValueError(
            f"Image of size {height}x{width} is smaller than the "
            f"{window_size}x{window_size} crop window"
        )
    top = (height - window_size) // 2
    left = (width - window_size) // 2
    return image[top:top + window_size, left:left + window_size]

def get_text_embeddings(text: str) -> np.ndarray:
    """Embed ``text`` as a single vector by mean-pooling BERT's last hidden state.

    The text is tokenized with the module-level ``tokenizer`` (with truncation
    enabled), run through the module-level ``bert_model`` without gradient
    tracking, and the last hidden state is averaged over the token axis.

    Args:
        text: Text to embed. An empty or ``None`` value yields a zero vector.

    Returns:
        1-D array of length 768. The zero vector is float32 so that both code
        paths return the same dtype (assuming ``bert_model`` runs in its
        default float32 precision).
    """
    embedding_dim = 768  # length of the zero vector returned for empty text
    if not text:
        return np.zeros((embedding_dim,), dtype=np.float32)
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = bert_model(**tokens)
    # Shape (1, tokens, hidden) -> mean over tokens -> (1, hidden); drop only
    # the batch axis.
    return outputs.last_hidden_state.mean(dim=1).squeeze(0).numpy()

def predict_personality(file_path: str) -> Dict[str, float]:
    """Run the audio + video + text pipeline on one clip and return trait scores.

    Steps, in order: validate the file, transcribe it with ``whisper_model``,
    build the audio tensor, sample and normalise six video frames, embed the
    transcription, then call ``predictor`` on the three batched inputs.

    Args:
        file_path: Path to the video file.

    Returns:
        Dict mapping each trait name to the model output at the same position.
        NOTE(review): assumes the model's output order matches the trait list
        below; confirm against the training code.

    Raises:
        ValueError: For a missing or empty file. Any exception from a later
            step is also re-raised as ``ValueError`` with an
            "Error processing video: " prefix.
    """
    trait_names = ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'openness']
    try:
        # Fail fast on an unusable path before any model is invoked.
        if not os.path.exists(file_path):
            raise ValueError("Video file does not exist")
        if os.path.getsize(file_path) == 0:
            raise ValueError("Video file is empty")

        transcript_text = whisper_model.transcribe(file_path)['text']
        print(f"Transcription: {transcript_text}")
        print(f"Predicting personality for file: {file_path}")

        # Audio branch: raw samples -> fixed (24, 1319, 1) tensor.
        audio_features = preprocess_audio_series(extract_audio_from_video(file_path))

        # Video branch: 6 frames -> resize -> centre crop -> scale to [0, 1].
        frames = extract_N_video_frames(file_path, number_of_samples=6)
        frames_resized = [resize_image(frame, (248, 140)) for frame in frames]
        frames_scaled = [crop_image_window(frame) / 255.0 for frame in frames_resized]
        video_features = np.stack(frames_scaled)

        # Text branch: transcription -> embedding vector.
        text_features = get_text_embeddings(transcript_text)

        # Each input gets a leading batch axis of size 1.
        model_inputs = [
            np.expand_dims(features, axis=0)
            for features in (audio_features, video_features, text_features)
        ]
        scores = predictor.predict(model_inputs)[0]

        return {name: score for name, score in zip(trait_names, scores.tolist())}
    except Exception as e:
        raise ValueError(f"Error processing video: {str(e)}")


In [None]:
# The path is passed literally so this cell runs on a fresh kernel: no other
# cell in this notebook assigns a `video_path` variable.
predict_personality('/content/whyHire1.mp4')



Transcription:  that sir you must be having a lot of option for this position to select but I don't know about the rest what I believe that I meet with the minimum requirement for this position but I'm able to do my job for its best requirement and apart from this my words cannot justify my action until I don't get the right opportunity to perform so I will be very privileged if you avail me this opportunity sir.
Predicting personality for file: /content/whyHire1.mp4
Original audio shape: (1219584,)
Processed audio shape: (24, 1319, 1)
Video properties - Duration: 27.63s, FPS: 30.01, Total frames: 829
Sampling at time points: [0.0, 5.52552, 11.05104, 16.57656, 22.10208, 27.6276]
Attempting to read frame at time 0.00s (position 0)
Attempting to read frame at time 5.53s (position 165)
Attempting to read frame at time 11.05s (position 331)
Attempting to read frame at time 16.58s (position 497)
Attempting to read frame at time 22.10s (position 663)
Attempting to read frame at time 27.63s (

{'extraversion': 1.010462760925293,
 'neuroticism': -13.511393547058105,
 'agreeableness': -11.11119270324707,
 'conscientiousness': 16.780467987060547,
 'openness': -0.36275890469551086}

In [None]:
# Score this clip with predict_personality; the returned trait dict is the cell's displayed value.
# NOTE(review): hard-coded absolute path; consider a shared data-directory constant.
predict_personality('/content/WhatsApp Video 2025-04-18 at 10.08.43 PM.mp4')



Transcription:  sort of not. I've just been really really busy school wise and friend wise and other stuff. But I am back. I'm working on some stuff, video ideas.
Predicting personality for file: /content/WhatsApp Video 2025-04-18 at 10.08.43 PM.mp4
Original audio shape: (674816,)
Processed audio shape: (24, 1319, 1)
Video properties - Duration: 15.30s, FPS: 30.00, Total frames: 459
Sampling at time points: [0.0, 3.06, 6.12, 9.180000000000001, 12.24, 15.3]
Attempting to read frame at time 0.00s (position 0)
Attempting to read frame at time 3.06s (position 91)
Attempting to read frame at time 6.12s (position 183)
Attempting to read frame at time 9.18s (position 275)
Attempting to read frame at time 12.24s (position 367)
Attempting to read frame at time 15.30s (position 459)
Successfully read frame at offset -5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


{'extraversion': -0.340932697057724,
 'neuroticism': -1.0145142078399658,
 'agreeableness': 1.6222896575927734,
 'conscientiousness': -0.23875777423381805,
 'openness': 1.057740330696106}

In [None]:
# Score this clip with predict_personality; the returned trait dict is the cell's displayed value.
# NOTE(review): hard-coded absolute path; consider a shared data-directory constant.
predict_personality('/content/WhatsApp Video 2025-04-18 at 9.42.17 PM.mp4')



Transcription:  Contains monologue video type, 10,000 clips of 5 to 10 seconds across 2000 participants. Multimodal data combines audio, visual and text data for robust analysis. Big-fi annotations with data sets include labels for the ocean personality traits based on participants' responses. Big-fi annotations include ethnicity, gender and age annotations providing rich data for personality traits analysis.
Predicting personality for file: /content/WhatsApp Video 2025-04-18 at 9.42.17 PM.mp4
Original audio shape: (1680269,)
Processed audio shape: (24, 1319, 1)
Video properties - Duration: 38.27s, FPS: 29.47, Total frames: 1128
Sampling at time points: [0.0, 7.654600977198697, 15.309201954397395, 22.963802931596092, 30.61840390879479, 38.27300488599349]
Attempting to read frame at time 0.00s (position 0)
Attempting to read frame at time 7.65s (position 225)
Attempting to read frame at time 15.31s (position 451)
Attempting to read frame at time 22.96s (position 676)
Attempting to read 

{'extraversion': -0.5545593500137329,
 'neuroticism': -1.5340781211853027,
 'agreeableness': 1.7984704971313477,
 'conscientiousness': -0.1540166586637497,
 'openness': 1.2759473323822021}

In [None]:
# Score this clip with predict_personality; the returned trait dict is the cell's displayed value.
# NOTE(review): hard-coded absolute path; consider a shared data-directory constant.
predict_personality('/content/WhatsApp Video 2025-04-19 at 12.11.12 AM.mp4')



Transcription:  Hyamie for four reasons. The first reason is because I am a great fit for the job description, meaning you won't have to spend your valuable time training and supervising me. The second reason you should hire me is because I am a positive person who will bring fresh ideas to your team to help you grow. The third reason you should hire me is because I have a track record of success, which I will replicate for you in at this position. For example, in my last role, I helped the company increase sales by delivering excellent customer service. And the fourth reason you should hire me is because I am positive about change and will support you.
Predicting personality for file: /content/WhatsApp Video 2025-04-19 at 12.11.12 AM.mp4
Original audio shape: (1846272,)
Processed audio shape: (24, 1319, 1)
Video properties - Duration: 41.80s, FPS: 25.02, Total frames: 1046
Sampling at time points: [0.0, 8.36, 16.72, 25.08, 33.44, 41.8]
Attempting to read frame at time 0.00s (position 

{'extraversion': 0.2852325439453125,
 'neuroticism': 0.4350805878639221,
 'agreeableness': 0.4812329113483429,
 'conscientiousness': 0.5682945251464844,
 'openness': 0.41324323415756226}