# Konfigürasyon

In [None]:
# Path of the input video; the runtime cells pass it to extract_sound_file and VideoFileClip.
FILENAME = "demo.mp4"

# Gerekli Kütüphane ve Program Kurulumları

In [None]:
# NOTE(review): srt_file_translator is installed but never imported in the visible cells — confirm it is still needed.
!pip install --quiet srt_file_translator

# ipython-autotime is installed and then loaded as an IPython extension.
!pip install --quiet ipython-autotime
%load_ext autotime

# Pinned versions of the video, audio and transcription dependencies.
!pip install --quiet moviepy==2.0.0.dev2
!pip install --quiet imageio==2.25.1
!pip install --quiet ffmpeg-python==0.2.0
!pip install --quiet faster-whisper==0.7.0
# python-docx supplies the `docx` module imported further down (version not pinned).
!pip install --quiet python-docx

In [None]:
!apt install  imagemagick
!cat /etc/ImageMagick-6/policy.xml | sed 's/none/read,write/g'> /etc/ImageMagick-6/policy.xml

# Core Functions


In [None]:
from faster_whisper import WhisperModel
import ffmpeg
import json
from docx import Document
import re
from moviepy.editor import TextClip, CompositeVideoClip, concatenate_videoclips,VideoFileClip, ColorClip
import numpy as np

In [None]:
#
def extract_sound_file(fileName):
  """Extract the audio stream of a media file into an MP3 file.

  The output name is the input name with its extension replaced by ".mp3".
  The extension is swapped with os.path.splitext, so inputs that do not end
  in a lowercase ".mp4" (e.g. ".MP4", ".mov") still get a distinct output
  path, and ".mp4" occurring elsewhere in the path is left untouched.

  Args:
    fileName: Path of the input media file.

  Returns:
    The path of the audio file handed to ffmpeg as output.
  """
  import os

  base_name, _ = os.path.splitext(fileName)
  audiofilename = base_name + '.mp3'

  # Build the ffmpeg graph: input -> audio stream -> output (overwriting any existing file).
  input_stream = ffmpeg.input(fileName)
  audio = input_stream.audio
  output_stream = ffmpeg.output(audio, audiofilename)
  output_stream = ffmpeg.overwrite_output(output_stream)

  ffmpeg.run(output_stream)
  return audiofilename

#
def load_model(model_size="medium"):
  """Create a WhisperModel for the given model size (default "medium")."""
  whisper_model = WhisperModel(model_size)
  return whisper_model

#
def create_segments(model, audiofilename):
  """Transcribe an audio file and return its segments with word timestamps.

  The result of model.transcribe is materialised into a list. faster-whisper
  produces segments lazily, so the raw result can only be iterated once;
  returning a list lets several consumers (process, print_segments,
  segments_to_srt) walk the same transcription.

  Args:
    model: Object exposing transcribe(path, word_timestamps=...) that
      returns a (segments, info) pair.
    audiofilename: Path of the audio file to transcribe.

  Returns:
    A list of the transcribed segments.
  """
  segments, _info = model.transcribe(audiofilename, word_timestamps=True)
  return list(segments)

#
def print_segments(segments):
  """Print one "[start -> end] word" line for every word of every segment."""
  all_words = (entry for seg in list(segments) for entry in seg.words)
  for entry in all_words:
    print("[%.2fs -> %.2fs] %s" % (entry.start, entry.end, entry.word))

#
def process(segments):
  """Flatten segments into a list of {'word', 'start', 'end'} dicts, one per word."""
  return [
    {'word': entry.word, 'start': entry.start, 'end': entry.end}
    for seg in list(segments)
    for entry in seg.words
  ]

#
def dump_json(data, fileName='data.json'):
  """Write ``data`` as indented JSON.

  Args:
    data: JSON-serialisable object to store.
    fileName: Destination path. Defaults to 'data.json', the same default
      that read_json uses, so the two functions mirror each other.
  """
  with open(fileName, 'w') as f:
    json.dump(data, f, indent=4)

#
def read_json(fileName='data.json'):
  """Load and return the JSON content of ``fileName`` (default 'data.json')."""
  with open(fileName, 'r') as handle:
    return json.load(handle)

#
def split_text_into_lines(data, max_chars=30, max_duration=2.5, max_gap=1.5):
    """Group word-level timestamps into subtitle lines.

    A line is closed when, after adding a word, its joined text is longer
    than ``max_chars`` or the summed duration of its words exceeds
    ``max_duration``. A silence longer than ``max_gap`` between two
    consecutive words closes the current line *before* the next word is
    added, so a line never spans the pause.

    Args:
        data: Sequence of dicts with "word", "start" and "end" keys.
        max_chars: Maximum length of the space-joined line text.
        max_duration: Maximum summed word duration of a line, in seconds.
        max_gap: Silence, in seconds, that forces a line break.

    Returns:
        A list of dicts with keys "word" (joined text), "start", "end" and
        "textcontents" (the word dicts that make up the line).
    """

    def _build_line(words):
        # Package a non-empty list of word dicts as one subtitle line.
        return {
            "word": " ".join(item["word"] for item in words),
            "start": words[0]["start"],
            "end": words[-1]["end"],
            "textcontents": words,
        }

    subtitles = []
    line = []
    line_duration = 0

    for idx, word_data in enumerate(data):
        # Break on a long silence before the word is added, so the word that
        # follows the pause starts a fresh line.
        if idx > 0 and line:
            gap = word_data["start"] - data[idx - 1]["end"]
            if gap > max_gap:
                subtitles.append(_build_line(line))
                line = []
                line_duration = 0

        line.append(word_data)
        line_duration += word_data["end"] - word_data["start"]
        line_chars = len(" ".join(item["word"] for item in line))

        # The word that crosses a limit stays on the line it overflowed.
        if line_duration > max_duration or line_chars > max_chars:
            subtitles.append(_build_line(line))
            line = []
            line_duration = 0

    if line:
        subtitles.append(_build_line(line))

    return subtitles

#
def create_caption(textJSON, framesize, font="Helvetica", color='white', highlight_color='yellow', stroke_color='black', stroke_width=1.5):
    """Build the text clips for one subtitle line, with per-word highlights.

    Words are laid out left to right starting at (0, 0). A word that would
    push the running line width past 80% of the frame width wraps to a new
    row. Each word gets a base clip and a trailing space clip shown for the
    whole line, plus a highlight-coloured clip shown only while the word is
    spoken.

    Args:
        textJSON: Line dict with "start", "end" and "textcontents" (a list
            of word dicts with "word", "start", "end").
        framesize: (width, height) of the target video frame.
        font, color, highlight_color, stroke_color, stroke_width: Styling
            forwarded to TextClip.

    Returns:
        (clips, positions): the list of clips (base and space clips per
        word, followed by the highlight clips) and one layout dict per word
        with x_pos, y_pos, width, height, word, start, end and duration.
    """
    words = textJSON['textcontents']
    full_duration = textJSON['end'] - textJSON['start']

    frame_width = framesize[0]
    frame_height = framesize[1]

    # Keep a 10% margin on each side of the frame.
    side_margin = frame_width * 1 / 10
    max_line_width = frame_width - 2 * (side_margin)

    fontsize = int(frame_height * 0.075)  # 7.5 percent of video height

    clips = []
    layout = []
    cursor_x = 0
    cursor_y = 0
    used_width = 0  # width consumed on the current row

    for entry in words:
        spoken_for = entry['end'] - entry['start']
        base_clip = TextClip(entry['word'], font=font, fontsize=fontsize, color=color, stroke_color=stroke_color, stroke_width=stroke_width).set_start(textJSON['start']).set_duration(full_duration)
        gap_clip = TextClip(" ", font=font, fontsize=fontsize, color=color).set_start(textJSON['start']).set_duration(full_duration)
        word_w, word_h = base_clip.size
        gap_w, gap_h = gap_clip.size

        fits = used_width + word_w + gap_w <= max_line_width
        if not fits:
            # Wrap: restart at the left edge, one row (plus 10px) lower.
            cursor_x = 0
            cursor_y = cursor_y + word_h + 10
            used_width = 0

        # Remember where the word sits so its highlight can overlay it.
        layout.append({
            "x_pos": cursor_x,
            "y_pos": cursor_y,
            "width": word_w,
            "height": word_h,
            "word": entry['word'],
            "start": entry['start'],
            "end": entry['end'],
            "duration": spoken_for
        })

        clips.append(base_clip.set_position((cursor_x, cursor_y)))
        clips.append(gap_clip.set_position((cursor_x + word_w, cursor_y)))

        cursor_x = cursor_x + word_w + gap_w
        used_width = used_width + word_w + gap_w

    # Highlight clips go last so they render on top of the base clips.
    for spot in layout:
        highlight = TextClip(spot['word'], font=font, fontsize=fontsize, color=highlight_color, stroke_color=stroke_color, stroke_width=stroke_width).set_start(spot['start']).set_duration(spot['duration'])
        clips.append(highlight.set_position((spot['x_pos'], spot['y_pos'])))

    return clips, layout

# Dönüşüm Fonksiyonları

In [None]:
#
def segments_to_srt(segments, output_filename):
    """Write one SRT cue per transcribed word.

    Cues are numbered consecutively across all segments (1, 2, 3, ...).
    Numbering by segment index would give every word of a segment the same
    cue number. Timestamps are formatted with seconds_to_srt_time and the
    word text is stripped of surrounding whitespace, matching format_to_srt.

    Args:
        segments: Iterable of objects whose ``words`` attribute is an
            iterable of objects with ``start``, ``end`` and ``word``.
        output_filename: Path of the SRT file to write (UTF-8).
    """
    with open(output_filename, 'w', encoding='utf-8') as file:
        cue_number = 0
        for segment in segments:
            for word in segment.words:
                cue_number += 1
                # Format the cue times as SRT timestamps.
                start_srt = seconds_to_srt_time(word.start)
                end_srt = seconds_to_srt_time(word.end)
                file.write(f"{cue_number}\n")
                file.write(f"{start_srt} --> {end_srt}\n")
                file.write(f"{word.word.strip()}\n\n")

#
def srt_to_docx(srt_file_path="transcription.srt", docx_file_path="transcription.docx"):
    """Copy the subtitle text of an SRT file into a single-paragraph DOCX file.

    The SRT content is split on blank lines; the first two lines of every
    chunk (cue number and timestamps) are dropped and the remaining lines
    are joined with spaces. Each chunk contributes its text followed by one
    space.
    """
    doc = Document()

    with open(srt_file_path, 'r', encoding='utf-8') as srt_file:
        chunks = re.split(r'\n\n+', srt_file.read())

    # Skip the cue-number and timestamp lines of every chunk.
    pieces = [' '.join(chunk.split('\n')[2:]) + ' ' for chunk in chunks]

    doc.add_paragraph(''.join(pieces))
    doc.save(docx_file_path)

#
def seconds_to_srt_time(seconds):
    """Format a time in seconds as an SRT timestamp "HH:MM:SS,mmm".

    The value is rounded to whole milliseconds before it is split into
    fields. Truncating the fractional part instead loses a millisecond to
    floating-point error (1.001 would format as "00:00:01,000"). Rounding
    first also carries correctly into seconds, minutes and hours.

    Args:
        seconds: Non-negative time in seconds (int or float).

    Returns:
        The formatted timestamp string.
    """
    total_ms = int(round(seconds * 1000))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"

def format_to_srt(data):
    """Render word dicts ("word", "start", "end") as SRT text, one cue per item.

    Cues are numbered from 1; each cue is the number, the
    "start --> end" timestamp line, the stripped word and a blank line.
    """
    cues = []
    for number, entry in enumerate(data, start=1):
        begin = seconds_to_srt_time(entry['start'])
        finish = seconds_to_srt_time(entry['end'])
        text = entry['word'].strip()
        cues.append(f"{number}\n{begin} --> {finish}\n{text}\n\n")
    return "".join(cues)

def json_to_srt(json_file="data.json", output_filename="transcription.srt"):
  """Convert a JSON file of word-level timestamps into an SRT file.

  Both files are opened as UTF-8. The SRT readers in this notebook open
  the file with encoding='utf-8', so writing it with the platform default
  encoding could corrupt non-ASCII words on systems where that default
  differs.

  Args:
    json_file: Path of the JSON file holding a list of dicts with "word",
      "start" and "end" keys.
    output_filename: Path of the SRT file to write.
  """
  with open(json_file, 'r', encoding='utf-8') as file:
    data = json.load(file)

  srt_content = format_to_srt(data)

  with open(output_filename, 'w', encoding='utf-8') as srt_file:
    srt_file.write(srt_content)

# Extra

In [None]:
#
def find_word_in_srt(search_word, srt_file_path="transcription.srt"):
    """Return the stripped lines of an SRT file containing ``search_word``.

    Matching is case-insensitive (both sides are lower-cased) and limited
    to whole words via ``\\b`` boundaries.
    """
    pattern = re.compile(r'\b' + re.escape(search_word.lower()) + r'\b')

    with open(srt_file_path, 'r', encoding='utf-8') as srt_file:
        return [raw.strip() for raw in srt_file if pattern.search(raw.lower())]

#
def find_word_and_timestamp_in_srt(search_word, srt_file_path="transcription.srt"):
    """Return (timestamp line, text) pairs for SRT cues containing ``search_word``.

    The file is split into cues on blank lines; chunks with fewer than
    three lines are ignored. Matching is case-insensitive and whole-word.
    The returned text keeps the cue's original case and line breaks.
    """
    pattern = re.compile(r'\b' + re.escape(search_word.lower()) + r'\b')

    with open(srt_file_path, 'r', encoding='utf-8') as srt_file:
        cues = re.split(r'\n\n+', srt_file.read())

    found = []
    for cue in cues:
        rows = cue.split('\n')
        if len(rows) >= 3:
            text_rows = rows[2:]
            if pattern.search(' '.join(text_rows).lower()):
                found.append((rows[1], '\n'.join(text_rows)))

    return found

#
def time_to_seconds(time_str):
    """Convert an "HH:MM:SS" timestamp into a whole number of seconds.

    Only the first three colon-separated fields are read, and each must
    parse as an integer.
    """
    hours, minutes, seconds = (int(field) for field in time_str.split(':')[0:3])
    return seconds + (60 * minutes + 3600 * hours)

#
def find_word_and_seconds_in_srt(search_word, srt_file_path="transcription.srt"):
    """Return the start time, in whole seconds, of each SRT cue containing ``search_word``.

    The file is split into cues on blank lines; chunks with fewer than
    three lines are ignored. Matching is case-insensitive and whole-word.
    The millisecond part of the start timestamp is discarded before it is
    converted with time_to_seconds.
    """
    pattern = re.compile(r'\b' + re.escape(search_word.lower()) + r'\b')

    with open(srt_file_path, 'r', encoding='utf-8') as srt_file:
        cues = re.split(r'\n\n+', srt_file.read())

    starts = []
    for cue in cues:
        rows = cue.split('\n')
        if len(rows) < 3:
            continue
        if not pattern.search(' '.join(rows[2:]).lower()):
            continue
        # "HH:MM:SS,mmm --> HH:MM:SS,mmm": keep the left stamp without milliseconds.
        start_stamp = rows[1].split(' --> ')[0]
        starts.append(time_to_seconds(start_stamp.split(',')[0]))

    return starts

# Runtime

In [None]:
# Extract the audio track of FILENAME and keep the name of the resulting audio file.
soundFile = extract_sound_file(FILENAME)

In [None]:
# Instantiate WhisperModel with load_model's default size ("medium").
model = load_model()

In [None]:
# Transcribe the extracted audio with word-level timestamps enabled.
segments = create_segments(model, soundFile)

In [None]:
# Flatten the segments into a list of {'word', 'start', 'end'} dicts (word-level info).
wli = process(segments)

In [None]:
# Write the word-level list to data.json.
dump_json(wli)

In [None]:
# Read the word-level list back from data.json (read_json's default path).
data = read_json()

In [None]:
# Group the words into subtitle lines (dicts with word, start, end and textcontents).
subtitles = split_text_into_lines(data)

In [None]:
# Pretty-print every subtitle line as indented JSON for inspection.
for line in subtitles:
  print(json.dumps(line, indent=4))

In [None]:
# Burn the word-highlighted captions into the input video and write output.mp4.
input_video = VideoFileClip(FILENAME)
frame_size = input_video.size

all_linelevel_splits = []

for line in subtitles:
  out_clips, positions = create_caption(line, frame_size)

  # Bounding box of the laid-out words; stays 0 when there are none.
  max_width = max([0] + [spot['x_pos'] + spot['width'] for spot in positions])
  max_height = max([0] + [spot['y_pos'] + spot['height'] for spot in positions])

  # Semi-transparent grey backdrop, 10% larger than the text, shown for the line's duration.
  color_clip = (
      ColorClip(size=(int(max_width*1.1), int(max_height*1.1)), color=(64, 64, 64))
      .set_opacity(.6)
      .set_start(line['start'])
      .set_duration(line['end']-line['start'])
  )

  # Stack the backdrop and the word clips, then anchor the group at the bottom.
  clip_to_overlay = CompositeVideoClip([color_clip] + out_clips).set_position("bottom")

  all_linelevel_splits.append(clip_to_overlay)

input_video_duration = input_video.duration

# Overlay all caption groups on the video and reuse the original audio track.
final_video = CompositeVideoClip([input_video] + all_linelevel_splits).set_audio(input_video.audio)

# Save the final clip as a video file with the audio included
final_video.write_videofile("output.mp4", fps=24, codec="libx264", audio_codec="aac")

# Extra Features

In [None]:
# Write a word-level SRT straight from the transcription segments.
# segments_to_srt is the writer defined in this notebook; no function named
# convert_to_srt exists here, so calling that name raises NameError.
# NOTE(review): `segments` must still be iterable at this point. If it is a
# one-shot iterator that process() already consumed, this writes an empty
# file; json_to_srt() in the next cell rewrites transcription.srt from
# data.json either way.
segments_to_srt(segments, "transcription.srt")

In [None]:
# Rebuild transcription.srt from data.json (json_to_srt's default arguments).
json_to_srt()

# SRT to Docs

In [None]:
# Copy the subtitle text of transcription.srt into transcription.docx (srt_to_docx's defaults).
srt_to_docx()

# Find

In [None]:
# Show every line of the SRT file that contains the search word.
search_word = "pentagon"
matches = find_word_in_srt(search_word)

if not matches:
    print(f"'{search_word}' kelimesi bulunamadı.")
else:
    print(f"'{search_word}' kelimesinin bulunduğu satırlar:")
    for match in matches:
        print(match)

In [None]:
# Show the timestamp line and text of every cue that contains the search word.
search_word = 'pentagon'  # word to search for
matches = find_word_and_timestamp_in_srt(search_word)

if not matches:
    print(f"'{search_word}' kelimesi bulunamadı.")
else:
    print(f"'{search_word}' kelimesinin bulunduğu zamanlar ve satırlar:")
    for time_info, match in matches:
        print(f"Zaman: {time_info}")
        print(f"Metin: {match}\n")

In [None]:
# Show the start second of every cue that contains the search word.
search_word = 'pentagon'
seconds_list = find_word_and_seconds_in_srt(search_word)

if not seconds_list:
    print(f"'{search_word}' kelimesi bulunamadı.")
else:
    print(f"'{search_word}' kelimesinin bulunduğu saniyeler:")
    for seconds in seconds_list:
        print(seconds)