## Importing the necessary libraries

In [1]:
# Standard libraries
import os
import io
import re
import time
import pickle
import threading
import subprocess
from collections import Counter
import sys

# Data manipulation and analysis
import numpy as np
import pandas as pd

# Image and video processing
import cv2
from PIL import Image, ImageTk, ImageDraw, ImageFont
import moviepy.editor as mp
from moviepy.video.io.VideoFileClip import VideoFileClip

# Audio processing
import pyaudio
import wave
import pygame

# Machine learning and deep learning
import tensorflow as tf
from tensorflow import keras
from keras.models import model_from_json

# Computer vision
import face_recognition

# Speech recognition and synthesis
import speech_recognition as sr
from pyht import Client, TTSOptions, Format
from elevenlabs import Voice, VoiceSettings
from elevenlabs.client import ElevenLabs

# GUI
import tkinter as tk
from tkinter import ttk, filedialog

# AI and language models
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts.prompt import PromptTemplate
from langchain.memory import ConversationSummaryBufferMemory
from langchain.chains import ConversationChain

## Configuration for Audio Recording, Text-to-Speech, and Google API Integration


In [2]:
# Audio recording parameters
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024
AUDIO_OUTPUT = "output.wav"
VIDEO_OUTPUT = "output.avi"
FINAL_OUTPUT = "final_output.mp4"

# Global variables
# `recording`, `cap` and `out` are reset/released by exit_application().
recording = False
audio_frames = []
cap = None
out = None
selected_microphone = None

# TTS Client
# Credentials are read from the environment when present so real secrets never
# have to be written into the notebook; the original placeholders remain as the
# fallback, so behaviour is unchanged when the variables are not set.
client = Client(
    os.environ.get("PLAY_HT_USER_ID", "User ID"),
    os.environ.get("PLAY_HT_API_KEY", "API Code"),
)

# Google API Key setup
# setdefault() keeps a key that is already exported in the environment instead
# of overwriting it with the placeholder string.
os.environ.setdefault("GOOGLE_API_KEY", 'GOOGLE_API_KEY')
gemini_model = ChatGoogleGenerativeAI(model="models/gemini-1.5-pro-latest", temperature=0.6)

# Prompt used by llm_function(). It exposes two variables: `history` (the
# conversation memory) and `input` (the combined face-emotion + question text).
# The "Response:" and "Rank the emotion of the human chat input:" markers in the
# template are the same markers response_extract() searches for in the model
# output, so the wording of those two lines must stay in sync with its regexes.
prompt_template = PromptTemplate(
    input_variables=['history', 'input'],
    template='''You are a nice friend and imagine you have the ability to remember the past conversation and the ability to see the face of the conversation's human. Use the past conversation if it's needed. Based on the conversation chat emotion and the human face emotion, alter the responses and rank the human chat response within these emotions: sad, fearful, angry, love, embarrassed, happy, neutral.
Use the following format:
Response: response here in a beautiful manner
Rank the emotion of the human chat input: only mention the one ranked emotion value here nothing more needed here
.\n\nCurrent conversation:\n{history}\nHuman: {input}\nAI:'''
)


## Load emotion detection model

In [3]:
# Load emotion detection model
# The architecture is rebuilt from its JSON description and the trained weights
# are then loaded from the accompanying HDF5 file. Both files are resolved
# relative to the current working directory.
with open('emotion_model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
emotion_model = model_from_json(loaded_model_json)
# `emotion_model` is the module-level model used by emotion_detection().
emotion_model.load_weights("emotion_model.h5")

## Function to Find Face Encodings 

In [4]:
def find_encodings(file):
    """Return the face encoding of the first face found in an image file.

    Args:
        file: Path of the image to read with ``cv2.imread``.

    Returns:
        The first element of ``face_recognition.face_encodings`` for the image.

    Raises:
        FileNotFoundError: If the image cannot be read (``cv2.imread`` returned
            ``None``), e.g. because the path is wrong or the file is corrupt.
        IndexError: If no face is detected in the image. The exception type is
            the same one the bare ``[0]`` lookup raised before, but it now
            carries a message naming the offending file.
    """
    img = cv2.imread(file)
    if img is None:
        # imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image file: {file!r}")

    # OpenCV loads images as BGR; the encoder is fed the RGB version.
    rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    encodings = face_recognition.face_encodings(rgb_img)
    if len(encodings) == 0:
        raise IndexError(f"No face found in image: {file!r}")
    return encodings[0]

## Script for Face Encoding, Chat History Management, Audio Processing, and Emotion Detection


In [5]:
# Finding face encodings for specific images
# 'chris.jpg' is resolved relative to the working directory and is read at
# cell-execution time.
enc_face_img2 = find_encodings('chris.jpg')

# Sample chat history
# Seed text stored as the prior conversation summary for the face above.
chat1 = '''Elon Musk is a rich person.'''

# Creating a DataFrame as a database
# Column 0 ('encode') holds one face encoding per known person and column 1
# ('chat_history') holds that person's conversation summary. chat_history() and
# llm_function() address the summary by position (column index 1), so the
# column order matters.
df = pd.DataFrame(columns=['encode', 'chat_history'])
new_row2 = {"encode": enc_face_img2, "chat_history": chat1}
# Append as a new row labelled with the current row count.
df.loc[len(df)] = new_row2

# Function to separate audio from video and extract text
def separate_audio(video_file, audio_path):
    """Extract the audio track of a video and transcribe it to text.

    The audio is written to ``audio_path`` and then sent to the Google Web
    Speech recogniser via ``speech_recognition``.

    Args:
        video_file: Path of the video to read.
        audio_path: Path of the audio file to write (overwritten if present).

    Returns:
        The recognised text.

    Raises:
        ValueError: If the video has no audio track (previously this surfaced
            as an ``AttributeError`` on ``None``).
        Exception: Errors raised by ``recognize_google`` (for example when the
            speech is unintelligible or the service is unreachable) propagate
            unchanged.
    """
    video = VideoFileClip(video_file)
    try:
        audio = video.audio
        if audio is None:
            raise ValueError(f"Video has no audio track: {video_file!r}")
        audio.write_audiofile(audio_path)
    finally:
        # Always release the resources held by the clip, even on failure.
        video.close()

    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio_data = recognizer.record(source)
    return recognizer.recognize_google(audio_data)

# Setting important variables
# Maps the index of the highest-scoring model output to an emotion label;
# used by emotion_detection() on the argmax of emotion_model.predict().
emotion_dict = {
    0: "Angry", 1: "Disgusted", 2: "Fearful", 
    3: "Happy", 4: "Neutral", 5: "Sad", 6: "Surprised"
}

# Read by Transcribing() as the destination of the extracted audio track.
audio_path = "output_audio.wav"  # Temporary file path to save the audio

# Regular expression patterns
# NOTE(review): response_extract() defines its own local `response_pattern` and
# `emotion_pattern`, so these module-level patterns are not used by any code
# visible in this notebook — confirm before relying on or removing them.
response_pattern = r'Response: (.*) Rank the emotion'
emotion_pattern = r'Rank the emotion of the human chat input: (\w+)'

# Function to separate audio from video and extract text

In [6]:
def Transcribing(file_path):
    """Transcribe the speech contained in a video file.

    This is a thin wrapper around ``separate_audio`` (defined in an earlier
    cell), which performs exactly the steps this function used to duplicate:
    write the video's audio track to a file and run Google speech recognition
    on it. The audio is written to the module-level ``audio_path``.

    Args:
        file_path: Path of the video to transcribe.

    Returns:
        The recognised text, as returned by ``separate_audio``.
    """
    return separate_audio(file_path, audio_path)
    

# Face recognition

In [7]:
def face_match(db,frame):
    """Compare every face found in ``frame`` with every encoding stored in ``db``.

    Args:
        db: Object exposing an iterable ``encode`` attribute of known face
            encodings (the notebook passes its face DataFrame).
        frame: Image handed to ``face_recognition`` for detection and encoding.

    Returns:
        A ``(match, face_encodings)`` tuple. ``match`` holds one
        ``compare_faces`` result (a one-element list) per (detected face, known
        encoding) pair, ordered face by face; ``face_encodings`` is the list of
        encodings detected in the frame.
    """
    locations = face_recognition.face_locations(frame)
    detected = face_recognition.face_encodings(frame, locations)

    results = []
    for candidate in detected:
        for known in db.encode:
            # Each comparison is made against a single known encoding, so
            # every appended entry is a list with exactly one boolean.
            results.append(
                face_recognition.compare_faces([known], candidate, tolerance=0.5)
            )
    return results, detected

# Emotion detection

In [8]:
def emotion_detection(frame):
    """Return the most frequent emotion label among the faces in ``frame``.

    Faces are located with the Haar cascade stored in
    'haarcascade_frontalface_default.xml'; each face crop is converted to
    grayscale, resized to 48x48 and classified with the module-level
    ``emotion_model``. The predicted class index is mapped to a label through
    ``emotion_dict``.

    Args:
        frame: BGR image (it is converted with ``cv2.COLOR_BGR2GRAY``).

    Returns:
        The most common label across all detected faces, or ``None`` when no
        face is detected. Previously the no-face case raised ``IndexError``
        from ``most_common(1)[0]`` on an empty counter.
    """
    face_detector = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    faces = face_detector.detectMultiScale(gray_frame, scaleFactor=1.3, minNeighbors=5)

    labels = []
    for (x, y, w, h) in faces:
        roi_gray_frame = gray_frame[y:y + h, x:x + w]
        # Add trailing channel and leading batch axes: (48, 48) -> (1, 48, 48, 1).
        cropped_img = np.expand_dims(np.expand_dims(cv2.resize(roi_gray_frame, (48, 48)), -1), 0)
        emotion_prediction = emotion_model.predict(cropped_img)
        max_index = int(np.argmax(emotion_prediction))
        labels.append(emotion_dict[max_index])
        print(labels)

    if not labels:
        # No face in this frame: report "nothing detected" instead of crashing.
        return None

    most_common_value = Counter(labels).most_common(1)[0][0]
    print(f"Most common emotion: {most_common_value}")
    return most_common_value


# Function to handle chat history

In [9]:
def chat_history(db,match,face_encodings,text,most_common_value):
    """Locate (or create) the speaker's row in ``db`` and build the LLM input.

    Args:
        db: DataFrame whose column 0 holds face encodings and column 1 holds
            the stored conversation summary. A new row is appended in place
            when the speaker is unknown.
        match: Comparison results in the layout produced by ``face_match``:
            for every detected face, one one-element list per row of ``db``.
            Any falsy value means "no comparison data".
        face_encodings: Encodings of the faces detected in the frame; the
            first one is stored when a new row has to be created.
        text: Transcribed user utterance.
        most_common_value: Emotion label detected on the face.

    Returns:
        ``(combined_input, chat, index)`` where ``combined_input`` is the text
        sent to the LLM, ``chat`` is the stored summary for the speaker (empty
        for a new speaker) and ``index`` is the positional row in ``db``.

    Raises:
        ValueError: If the speaker is unknown and ``face_encodings`` is empty,
            so no row can be created. (This used to be an ``IndexError``, which
            ``generate_response`` does not catch.)
    """
    index = None
    if match:
        flat_list = [item for sublist in match for item in sublist]
        try:
            flat_position = flat_list.index(True)
        except ValueError:
            print("No match found in existing data")
        else:
            # The flattened list contains len(db) results per detected face,
            # so the position must be folded back onto a row of db. Using the
            # raw position pointed at the wrong row (or past the end) whenever
            # the matching face was not the first one in the frame.
            index = flat_position % len(db)
            print("Match found")

    if index is None:
        if len(face_encodings) == 0:
            raise ValueError("No face encoding available to create a chat history entry")
        print("No history, creating new entry")
        db.loc[len(db)] = {"encode": face_encodings[0], "chat_history": ''}
        index = len(db) - 1

    combined_input = f"human face emotion: {most_common_value} question: {text}"
    chat = str(db.iloc[index, 1])
    return combined_input,chat,index


# Function to handle LLM

In [10]:
def llm_function(chat,combined_input,db,index):
    """Run one conversational turn through the LLM and persist the new summary.

    A ``ConversationChain`` is built around the module-level ``gemini_model``
    and ``prompt_template``. Its summary-buffer memory is seeded with ``chat``
    (the stored summary for this speaker). The memory now reuses
    ``gemini_model`` for summarisation instead of constructing a second client
    on every call, which also removes the inconsistent model identifier
    between the chat model and the summariser.

    Args:
        chat: Previously stored conversation summary for the speaker.
        combined_input: Text sent as the human turn.
        db: DataFrame whose column 1 stores conversation summaries; updated in
            place.
        index: Positional row of the speaker in ``db``.

    Returns:
        The raw text produced by the chain.
    """
    conversation_with_summary = ConversationChain(
        llm=gemini_model,
        prompt=prompt_template,
        memory=ConversationSummaryBufferMemory(
            llm=gemini_model,
            # NOTE(review): a limit this small presumably forces nearly every
            # exchange to be folded into the running summary — confirm intent.
            max_token_limit=30,
            moving_summary_buffer=chat,
        ),
    )
    print("The LLM is running")
    # Running the chain to generate output
    output = conversation_with_summary.predict(input=combined_input)
    print(output)

    # Store the memory's running summary back into the speaker's row.
    memory = conversation_with_summary.memory.moving_summary_buffer
    db.iloc[index, 1] = memory
    print('Summary updated successfully')
    print(f"Updated summary: {db.iloc[index, 1]}")

    return output

# Extracting response and emotion from the output

In [11]:
def response_extract(output):
    """Split the raw LLM output into the spoken response and the ranked emotion.

    The model is prompted to answer as::

        Response: <text>
        Rank the emotion of the human chat input: <emotion>

    The response text may span several lines; everything between ``Response:``
    and the ``Rank the emotion ...`` marker (or the end of the output when the
    marker is missing) is returned. Previously only the first line of the
    response was captured.

    Args:
        output: Raw text returned by the LLM.

    Returns:
        ``(response, final_emotion)`` where ``response`` is the stripped
        response text or ``None`` when no ``Response:`` section exists, and
        ``final_emotion`` is a one-element list holding the emotion word, or
        ``['NO Emotion']`` when it cannot be found.
    """
    # Lazy match up to the emotion marker (or end of text); DOTALL lets the
    # response contain newlines.
    response_pattern = r'Response:\s*(.+?)(?=\s*Rank the emotion of the human chat input:|\Z)'
    emotion_pattern = r'Rank the emotion of the human chat input:\s*(\w+)'

    response_match = re.search(response_pattern, output, flags=re.DOTALL)
    response = response_match.group(1).strip() if response_match else None

    emotion_match = re.search(emotion_pattern, output)
    emotion_word = emotion_match.group(1) if emotion_match else 'NO Emotion'

    final_emotion = [emotion_word]
    return response, final_emotion

# Function to handle TTS

In [12]:
def play_tts(response, final_emotion):
    """Synthesise ``response`` with the TTS client and play it through pygame.

    The speaking speed is lowered to 0.7 when the first entry of
    ``final_emotion`` is one of 'sad', 'fearful' or 'angry' (case-insensitive)
    and is 0.9 otherwise.

    Args:
        response: Text to speak. When it is ``None`` or empty nothing is
            synthesised or played (previously ``None`` was sent to the TTS
            service).
        final_emotion: List of emotion words; only the first one is used. An
            empty list selects the default speed instead of raising.

    Returns:
        None. The call blocks until playback has finished.
    """
    if not response:
        print("No response text to speak; skipping TTS")
        return

    emotions = {'sad', 'fearful', 'angry'}
    final_emotion = [x.lower() for x in final_emotion]
    print(f"Processed input: {final_emotion}")

    base_options = {
        "voice": "s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",
        "sample_rate": 8000,
        "format": Format.FORMAT_MP3,
    }

    slow_down = bool(final_emotion) and final_emotion[0] in emotions
    options = TTSOptions(**base_options, speed=0.7 if slow_down else 0.9)

    # Collect the streamed audio chunks into an in-memory buffer.
    audio_buffer = io.BytesIO()
    for chunk in client.tts(text=[response], voice_engine="PlayHT2.0-turbo", options=options):
        audio_buffer.write(chunk)

    # Reset the buffer position to the beginning
    audio_buffer.seek(0)

    # The mixer is initialised only once audio is available and is always shut
    # down again, even if loading or playback fails.
    pygame.mixer.init()
    try:
        pygame.mixer.music.load(audio_buffer)
        pygame.mixer.music.play()

        # Poll roughly ten times per second until playback has finished; the
        # clock is created once instead of once per iteration.
        clock = pygame.time.Clock()
        while pygame.mixer.music.get_busy():
            clock.tick(10)
    finally:
        pygame.mixer.quit()

# The main process code

In [13]:

def jessica():
    """Ask the user for a video and process it on a background thread.

    The pipeline is: transcribe the speech, analyse faces and emotions,
    generate the LLM response and play it back. Progress is reported through
    ``update_status``. Nothing happens when the file dialog is cancelled.

    Returns:
        None. The function returns as soon as the worker thread is started.
    """
    # Patterns are passed as separate entries; the previous ';'-joined string
    # is not understood as two patterns by Tk on every platform.
    file_path = filedialog.askopenfilename(
        filetypes=[("Video files", ("*.mp4", "*.avi"))]
    )
    if not file_path:
        return

    def process_video():
        try:
            update_status("Transcribing video...")
            text = Transcribing(file_path)

            update_status("Detecting emotion and performing face recognition...")
            face_data = analyze_video(file_path)

            update_status("Generating response...")
            response, final_emotion = generate_response(face_data, text)

            update_status("Playing response...")
            play_tts(response, final_emotion)

            update_status("Process completed!")
        except Exception as exc:
            # Without this the worker dies silently and the status label stays
            # on the last step forever; surface the failure to the user.
            print(f"Error while processing video: {exc}")
            update_status(f"Error: {exc}")

    # Daemon thread so that a still-running job cannot keep the process alive
    # after the application has been asked to exit.
    threading.Thread(target=process_video, daemon=True).start()

def analyze_video(file_path, frame_step=20):
    """Sample frames of a video for face recognition and emotion detection.

    Every ``frame_step``-th frame is passed to ``face_match`` (against the
    module-level ``db``) and to ``emotion_detection``.

    Changes compared with the previous implementation:

    * frames are converted from OpenCV's BGR order to RGB before face
      recognition, matching what ``find_encodings`` does for stored faces;
    * the emotion returned is the most frequent one across all sampled frames
      rather than whatever the last sampled frame showed;
    * face data from a sampled frame without any face no longer overwrites
      data from an earlier frame that did contain one;
    * the capture is released even if processing fails.

    Args:
        file_path: Path of the video to analyse.
        frame_step: Sample every n-th frame (default 20, the previous fixed
            value).

    Returns:
        ``(match, face_encodings, most_common_emotion)``. ``match`` and
        ``face_encodings`` come from the last sampled frame in which a face
        was found (empty lists if there was none); ``most_common_emotion`` is
        an empty string when no emotion was detected.
    """
    match = []
    face_encodings = []
    emotion_counts = Counter()

    vid = cv2.VideoCapture(file_path)
    try:
        frame_count = 0
        while True:
            ret, frame = vid.read()
            if not ret:
                break

            frame_count += 1
            if frame_count % frame_step != 0:
                continue

            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame_match, frame_encodings = face_match(db, rgb_frame)
            if len(frame_encodings) > 0:
                match, face_encodings = frame_match, frame_encodings

            # A frame without a detectable face is skipped, whether that is
            # signalled by a falsy result or by an IndexError.
            try:
                current_emotion = emotion_detection(frame)
            except IndexError:
                current_emotion = None
            if current_emotion:
                emotion_counts[current_emotion] += 1
                print(current_emotion)
    finally:
        vid.release()
        cv2.destroyAllWindows()

    # Ties are resolved in favour of the emotion that was seen first.
    most_common_emotion = emotion_counts.most_common(1)[0][0] if emotion_counts else ""
    return match, face_encodings, most_common_emotion

def generate_response(face_data, text):
    """Turn the video analysis and transcript into a spoken reply.

    Args:
        face_data: ``(match, face_encodings, most_common_emotion)`` as returned
            by ``analyze_video``.
        text: Transcribed user utterance.

    Returns:
        ``(response, final_emotion)``: the text to speak and a one-element
        list with the emotion word. When the model output does not contain a
        recognisable ``Response:`` section, the whole stripped output is used
        as the response instead of ``None``. On a handled error an apology
        message and ``["neutral"]`` are returned.
    """
    try:
        match, face_encodings, most_common_emotion = face_data
        combined_input, chat,index = chat_history(db, match, face_encodings, text, most_common_emotion)
        output = llm_function(chat, combined_input, db, index)
        response, final_emotion = response_extract(output)
    except (ValueError, IndexError) as e:
        # IndexError covers the "no face encoding available" case raised while
        # looking up / creating the speaker's history row.
        print(f"Error in generate_response: {e}")
        return "I'm sorry, there was an error processing the response.", ["neutral"]

    if not response:
        # The model ignored the requested format; speak its raw answer.
        response = output.strip()
    return response, final_emotion



# `db` is the name the pipeline functions above (analyze_video,
# generate_response) read at call time. It is bound to the same DataFrame
# object as `df` (no copy is made), so rows appended through `db` are visible
# through `df` as well.
db = df

## Exit Function

In [14]:
def exit_application():
    """Release media resources, close the window and terminate the program.

    Steps, in order: release the module-level ``cap`` and ``out`` handles when
    they exist and are not ``None``, close OpenCV windows, clear the
    ``recording`` flag, shut down the pygame mixer if it is initialised,
    destroy the Tk window and finally call ``sys.exit()``.
    """
    global recording

    # Capture / writer handles are optional module-level globals.
    for handle_name in ('cap', 'out'):
        handle = globals().get(handle_name)
        if handle is not None:
            handle.release()
    cv2.destroyAllWindows()

    # Signal any recording loop to stop.
    recording = False

    mixer_ready = pygame.mixer.get_init()
    if mixer_ready:
        pygame.mixer.quit()

    # Tear down the GUI, then leave the interpreter.
    window.destroy()
    sys.exit()

## Status Label Update Function

In [15]:
def update_status(message):
    """Show ``message`` in the status label and process pending idle tasks.

    Relies on the module-level ``status_label`` and ``window`` created in the
    GUI setup cell, so that cell must have run before this is called.

    NOTE(review): jessica() calls this from its worker thread; Tk widgets are
    normally expected to be updated from the thread running the main loop —
    confirm this is safe in the target environment.
    """
    status_label.config(text=message)
    # Redraw now so the new text is visible while long-running work continues.
    window.update_idletasks()


## The Upload Function

In [16]:
# Callback bound to the "Upload Video" button in the GUI setup cell.
def upload_video():
    """Start the video-processing flow by delegating to ``jessica()``."""
    jessica()



## Setup GUI


In [None]:
# Setup GUI
# Builds the full-screen Tk window, its ttk styles and widgets, then enters the
# Tk main loop. `window` and `status_label` defined here are the module-level
# objects used by update_status() and exit_application().
window = tk.Tk()
window.title("Jessica")
window.attributes('-fullscreen', True)

# Get screen dimensions
screen_width = window.winfo_screenwidth()
screen_height = window.winfo_screenheight()

# Create and configure styles
style = ttk.Style()
style.theme_use('clam')

# Style for the primary action button ("Upload Video").
style.configure('Modern.TButton', 
                font=('Helvetica', 12, 'bold'),
                foreground='white',
                background='#FF4547',
                padding=(20, 10),
                borderwidth=0,
                relief='flat')
style.map('Modern.TButton',
          foreground=[('active', 'white')],
          background=[('active', '#D70003')])

# Shared dark theme for labels, frames and comboboxes.
style.configure('TLabel', font=('Helvetica', 12), background='#1E1E1E', foreground='white')
style.configure('TFrame', background='#1E1E1E')
style.configure('TCombobox', 
                selectbackground='#3E3E3E',
                fieldbackground='#3E3E3E',
                background='#3E3E3E',
                foreground='white',
                arrowcolor='white')
style.map('TCombobox', fieldbackground=[('readonly', '#3E3E3E')])

# Style for the "Exit" button.
style.configure('Exit.TButton', 
                font=('Helvetica', 12, 'bold'),
                foreground='white',
                background='#FE8645',
                padding=(20, 10),
                borderwidth=0,
                relief='flat')
style.map('Exit.TButton',
          foreground=[('active', 'white')],
          background=[('active', '#B23F00')])

# Create overlay
# A frame covering the whole window; every other widget is placed inside it.
overlay = ttk.Frame(window, style='TFrame')
overlay.place(relwidth=1, relheight=1)

# Set background image
# "jes2.jpg" is resolved relative to the working directory and stretched to the
# screen size. `bg_photo` stays bound to a module-level name while the label
# displays it.
bg_image = Image.open("jes2.jpg")
bg_image = bg_image.resize((screen_width, screen_height), Image.LANCZOS)
bg_photo = ImageTk.PhotoImage(bg_image)
bg_label = tk.Label(overlay, image=bg_photo)
bg_label.place(relwidth=1, relheight=1)

# Create frames and widgets
# NOTE(review): no widget is added to `input_frame` in the visible code.
input_frame = ttk.Frame(overlay, style='TFrame')
input_frame.place(relx=0.98, rely=0.02, anchor='ne')

# Status line near the bottom centre; its text is set by update_status().
status_label = ttk.Label(overlay, text="", style='TLabel', font=('Helvetica', 14))
status_label.place(relx=0.5, rely=0.9, anchor='center')

button_frame = ttk.Frame(overlay, style='TFrame')
button_frame.place(relx=0.98, rely=0.98, anchor='se')

upload_button = ttk.Button(button_frame, text="Upload Video", command=upload_video, style='Modern.TButton')
upload_button.pack(side=tk.TOP, padx=5, pady=5)

exit_button = ttk.Button(overlay, text="Exit", command=exit_application, style='Exit.TButton')
exit_button.place(relx=0.02, rely=0.02, anchor='nw')

# Start the Tkinter main loop
# This call blocks the cell until the window is closed.
window.mainloop()

MoviePy - Writing audio in output_audio.wav


                                                                                                                       

MoviePy - Done.
['Surprised']
Most common emotion: Surprised
Surprised
['Sad']
Most common emotion: Sad
Sad
['Neutral']
Most common emotion: Neutral
Neutral
['Fearful']
Most common emotion: Fearful
Fearful
['Surprised']
Most common emotion: Surprised
Surprised
['Neutral']
Most common emotion: Neutral
Neutral
['Neutral']
Most common emotion: Neutral
Neutral
['Surprised']
Most common emotion: Surprised
Surprised
No match found in existing data
No history, creating new entry
The LLM is running
Response: It's lovely to meet you, Ahmed! 😊 
Rank the emotion of the human chat input: neutral. 

Summary updated successfully
Updated summary: Current summary:


New lines of conversation:
Human: human face emotion: Surprised question: I am Ahmed

New summary:
A human, looking surprised, introduces themself as Ahmed. 

Processed input: ['neutral']
