-
Notifications
You must be signed in to change notification settings - Fork 0
/
Live video.py
155 lines (128 loc) · 5.18 KB
/
Live video.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import base64
import json
import collections
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model, load_model
from flask import Flask, render_template, request
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image
import numpy as np
import pyttsx3
app = Flask(__name__)
model=load_model('model_weights_new-500/model_199.h5')
from keras.applications import ResNet50
model2=ResNet50(weights="imagenet",input_shape=(224,224,3))
from keras import Model
new_model=Model(model2.input,model2.layers[-2].output)
def preprocess_img(img):
img=image.load_img(img,target_size=(224,224))
img=image.img_to_array(img)
img=np.expand_dims(img,axis=0)#for reshaping
#Normalization
img=preprocess_input(img)
return img
def encode_image(img):
img = preprocess_img(img)
feature_vector = new_model.predict(img)
feature_vector = feature_vector.reshape((-1,))
return feature_vector
def get_caption(photo,model):
descriptions = None
with open('descriptions_created-500.txt', 'r') as f:
descriptions = f.read()
json_string = descriptions.replace("'", "\"")
descriptions = json.loads(json_string)
total_words = []
for key in descriptions.keys():
[total_words.append(i) for des in descriptions[key] for i in des.split()]
counter = collections.Counter(total_words)
freq_count = dict(counter)
sorted_freq_count = sorted(freq_count.items(), reverse=True, key=lambda x: x[1])
threshold = 10
sorted_freq_count = [x for x in sorted_freq_count if x[1] > threshold]
total_words = [x[0] for x in sorted_freq_count]
word_to_idx = {}
idx_to_word = {}
for i, word in enumerate(total_words):
word_to_idx[word] = i + 1
idx_to_word[i + 1] = word
idx_to_word[177] = "startseq"
idx_to_word[178] = "endseq"
word_to_idx["startseq"] = 177
word_to_idx['endseq'] = 178
vocab_size = len(word_to_idx) + 1
maxlen=19
in_text='startseq'
for i in range(maxlen):
sequence=[word_to_idx[w] for w in in_text.split() if w in word_to_idx]
sequence=pad_sequences([sequence],maxlen=maxlen,padding='post')
ypred=model.predict([photo,sequence])
ypred=ypred.argmax()#gives word with maximum probability--greedy sampling
word=idx_to_word[ypred]
in_text+=' '+word
if word=='endseq':
break
final_caption=in_text.split()[1:-1]
final_caption=' '.join(final_caption)
return final_caption
import cv2
import pyttsx3
import threading
import time
# Function to perform video processing
# Function to capture video frames and process them
def process_video(video_capture, frames_with_captions, interval, output_folder, model):
# Set the camera properties (you can adjust these as needed)
video_capture.set(cv2.CAP_PROP_FRAME_WIDTH, 1000)
video_capture.set(cv2.CAP_PROP_FRAME_HEIGHT, 900)
frame_count = 0
start_time = time.time()
while True:
ret, frame = video_capture.read(0)
if not ret:
print("Failed to read a frame.")
break
elapsed_time = time.time() - start_time
if elapsed_time >= interval:
frame_path = os.path.join(output_folder, f"frame_{frame_count}.jpg")
cv2.imwrite(frame_path, frame)
photo_2048 = encode_image(frame_path).reshape((1, 2048))
caption = get_caption(photo_2048, model)
start_time = time.time()
frame_count += 1
frames_with_captions.append((frame, caption))
# Function to perform audio insertion
def insert_audio(frames_with_captions):
engine = pyttsx3.init()
while True:
if frames_with_captions:
frame, caption = frames_with_captions.pop(0)
cv2.putText(frame, caption, (10, frame.shape[0] - 10),
cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2, cv2.LINE_AA)
engine.say(caption)
engine.runAndWait()
cv2.imshow("Live Video", frame)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
cv2.destroyAllWindows()
4
if __name__ == "__main__":
output_folder = "frames_output"
os.makedirs(output_folder, exist_ok=True)
frames_with_captions = []
interval = 6 # Adjust the interval as needed for your desired frame rate
video_capture = cv2.VideoCapture(0)
if not video_capture.isOpened():
print("Failed to open video capture.")
exit()
# Load your captioning model here (replace 'your_model.h5' with your actual model path)
model = load_model('model_weights_new-500/model_199.h5')
# Create and start the video processing thread
video_thread = threading.Thread(target=process_video,
args=(video_capture, frames_with_captions, interval, output_folder, model))
video_thread.start()
# Start the audio insertion function in the main thread
insert_audio(frames_with_captions)
# Wait for the video processing thread to finish
video_thread.join()