In [7]:
# --- Imports ---
import os
import uuid
import pandas as pd
from pathlib import Path
from moviepy.editor import VideoFileClip
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
import whisper

# --- Parameters ---
collection_name = "video_chunks"
chunk_size = 40
overlap = 10
clip_duration = 10  # seconds
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
qdrant = QdrantClient(host="localhost", port=6333)
os.makedirs("video_segments", exist_ok=True)

# --- Helper: Chunking ---
def chunk_text(words, chunk_size, overlap):
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = words[i:i + chunk_size]
        chunks.append(" ".join(chunk))
    return chunks

# --- Main Processing ---
def process_video(video_file):
    print(f"Processing video: {video_file}")

    # Step 1: Transcription
    whisper_model = whisper.load_model("base")
    video = VideoFileClip(video_file)
    audio_path = "temp.wav"
    video.audio.write_audiofile(audio_path, verbose=False, logger=None)

    result = whisper_model.transcribe(audio_path)
    transcript_text = result["text"]
    words = transcript_text.strip().split()

    chunks = chunk_text(words, chunk_size, overlap)
    print(f"Transcript contains {len(words)} words → {len(chunks)} text chunks.")

    # Step 2: Embedding
    vectors = embedding_model.encode(chunks)

    # Step 3: Segment saving + Qdrant point prep
    video_basename = Path(video_file).stem
    points = []

    for i, (text_chunk, vector) in enumerate(zip(chunks, vectors)):
        start_time = i * clip_duration
        end_time = start_time + clip_duration
        video_path = f"video_segments/{video_basename}_chunk{i}.mp4"

        try:
            clip = video.subclip(start_time, min(end_time, video.duration))
            clip.write_videofile(video_path, codec="libx264", audio_codec="aac", verbose=False, logger=None)
        except Exception as e:
            print(f"Clip {i} failed: {e}")
            continue

        payload = {
            "text": text_chunk,
            "video_path": video_path,
            "video_id": video_basename,
            "chunk_id": i,
            "start": int(start_time),
            "topic": -1
        }

        points.append({
            "id": str(uuid.uuid4()),
            "vector": vector.tolist(),
            "payload": payload
        })

    # Step 4: Upload to Qdrant
    if points:
        qdrant.upsert(collection_name=collection_name, points=points)
        print(f"📤 Uploaded {len(points)} chunks to Qdrant.")
    else:
        print(" No valid chunks to upload.")

    # Step 5: Return for debugging
    df = pd.DataFrame([p["payload"] for p in points])
    print(df.head())  # Show a preview
    return df

🎥 Processing video: C:\Users\algba\FinalAIDLCVProject - SegmentVC - forsubmission -562025\Climate Change.mp4
🧠 Transcript contains 48 words → 2 text chunks.
📤 Uploaded 2 chunks to Qdrant.
                                                text  \
0  Is global warming real? Is it man-made? This v...   
1  to subscribe, like this video, and share it wi...   

                                 video_path        video_id  chunk_id  start  \
0  video_segments/Climate Change_chunk0.mp4  Climate Change         0      0   
1  video_segments/Climate Change_chunk1.mp4  Climate Change         1     10   

   topic  
0     -1  
1     -1  


In [None]:
# --- Example Run ---
video_path = r"C:\Users\algba\FinalAIDLCVProject - SegmentVC - forsubmission -562025\Climate Change.mp4"
df = process_video(video_path)

In [8]:
# import os
# from moviepy.editor import VideoFileClip
# from qdrant_client import QdrantClient
# import pandas as pd

# # --- Directories ---
# video_dir = "youtube_videos"
# segment_dir = "video_segments"
# os.makedirs(segment_dir, exist_ok=True)

# # --- Connect to Qdrant ---
# qdrant = QdrantClient(host="localhost", port=6333)
# collection_name = "video_chunks"

# # --- Load all payloads ---
# response = qdrant.scroll(collection_name=collection_name, with_payload=True, limit=1000)
# records = response[0]

# rows = []
# for r in records:
#     payload = r.payload
#     if "video_id" in payload and "chunk_id" in payload and "start" in payload:
#         rows.append(payload)

# df = pd.DataFrame(rows)

# # --- Rebuild missing clips ---
# print(f" Rebuilding {len(df)} segments...")

# rebuilt_count = 0
# for _, row in df.iterrows():
#     video_id = row["video_id"]
#     chunk_id = row["chunk_id"]
#     start = int(row["start"])
#     end = start + 10  # 10s per chunk

#     full_video_path = os.path.join(video_dir, f"{video_id}.mp4")
#     output_path = os.path.join(segment_dir, f"{video_id}_chunk{chunk_id}.mp4")

#     if not os.path.exists(full_video_path):
#         print(f"⚠ Skipping: full video not found → {full_video_path}")
#         continue

#     if os.path.exists(output_path):
#         print(f" Already exists: {output_path}")
#         continue

#     try:
#         clip = VideoFileClip(full_video_path).subclip(start, end)
#         clip.write_videofile(output_path, codec="libx264", audio_codec="aac", verbose=False, logger=None)
#         rebuilt_count += 1
#         print(f" Rebuilt: {output_path}")
#     except Exception as e:
#         print(f" Failed for {video_id} chunk {chunk_id}: {e}")

# print(f"\n Done. Rebuilt {rebuilt_count} new clip(s).")


 Rebuilding 1000 segments...
⚠ Skipping: full video not found → youtube_videos\kVYyDO0B6xo.mp4
 Rebuilt: video_segments\4WHFcqq1ErA_chunk147.mp4
⚠ Skipping: full video not found → youtube_videos\E4pEBbF9CkU.mp4
⚠ Skipping: full video not found → youtube_videos\OAJ6M40YoGA.mp4
⚠ Skipping: full video not found → youtube_videos\LwAiOhW9VAk.mp4
⚠ Skipping: full video not found → youtube_videos\aPfDU2TwaxM.mp4
⚠ Skipping: full video not found → youtube_videos\C4sloVtHSwo.mp4
⚠ Skipping: full video not found → youtube_videos\Ii5DAx3noCA.mp4
⚠ Skipping: full video not found → youtube_videos\hSvsjlFbn2w.mp4
⚠ Skipping: full video not found → youtube_videos\.mp4
⚠ Skipping: full video not found → youtube_videos\C4sloVtHSwo.mp4
⚠ Skipping: full video not found → youtube_videos\eiGC3e78JVw.mp4
⚠ Skipping: full video not found → youtube_videos\Dj11ENNeGVY.mp4
⚠ Skipping: full video not found → youtube_videos\eFgkZKhNUdM.mp4
⚠ Skipping: full video not found → youtube_videos\z6nmn6lQqr4.mp4
⚠ Skippi

In [12]:
# from moviepy.editor import VideoFileClip
# import pandas as pd
# import os

# # Connect to Qdrant
# qdrant = QdrantClient(host="localhost", port=6333)
# collection_name = "video_chunks"

# # These are the clips that failed in Part 8
# missing_clips = [
#     "UbCq8ZnLHxY_chunk243.mp4",
#     "a-ePQQZgA0E_chunk94.mp4",
#     "hSvsjlFbn2w_chunk1271.mp4",
#     "rCVlIVKqqGE_chunk159.mp4",
#     "eFgkZKhNUdM_chunk345.mp4",
#     "oLP7v-jCGxM_chunk274.mp4",
#     "eQ6UE968Xe4_chunk672.mp4",
#     "4WHFcqq1ErA_chunk319.mp4"
# ]

# # Fetch full metadata from Qdrant
# records, _ = qdrant.scroll(collection_name=collection_name, with_payload=True, limit=10000)
# df = pd.DataFrame([r.payload for r in records if r.payload])

# rebuilt = 0
# for fname in missing_clips:
#     try:
#         vid, chunk = fname.replace(".mp4", "").split("_chunk")
#         chunk_id = int(chunk)
#         match = df[(df["video_id"] == vid) & (df["chunk_id"] == chunk_id)]
#         if match.empty:
#             print(f" Metadata not found for {fname}")
#             continue

#         row = match.iloc[0]
#         start = int(row["start"])
#         end = start + 10
#         full_path = f"youtube_videos/{vid}.mp4"
#         out_path = f"video_segments/{fname}"

#         if not os.path.exists(full_path):
#             print(f" Full video not found: {full_path}")
#             continue

#         clip = VideoFileClip(full_path).subclip(start, end)
#         clip.write_videofile(out_path, codec="libx264", audio_codec="aac", verbose=False, logger=None)
#         print(f" Rebuilt: {out_path}")
#         rebuilt += 1
#     except Exception as e:
#         print(f" Failed to rebuild {fname}: {e}")

# print(f"\n🎉 Done. Rebuilt {rebuilt} missing clip(s).")



 Metadata not found for UbCq8ZnLHxY_chunk243.mp4
 Rebuilt: video_segments/a-ePQQZgA0E_chunk94.mp4
 Metadata not found for hSvsjlFbn2w_chunk1271.mp4
 Rebuilt: video_segments/rCVlIVKqqGE_chunk159.mp4
 Rebuilt: video_segments/eFgkZKhNUdM_chunk345.mp4
 Metadata not found for oLP7v-jCGxM_chunk274.mp4
 Metadata not found for eQ6UE968Xe4_chunk672.mp4
 Metadata not found for 4WHFcqq1ErA_chunk319.mp4

🎉 Done. Rebuilt 3 missing clip(s).


In [13]:
# # List the remaining missing clips
# final_missing = [
#     "UbCq8ZnLHxY",
#     "hSvsjlFbn2w",
#     "oLP7v-jCGxM",
#     "eQ6UE968Xe4",
#     "4WHFcqq1ErA"
# ]

# # Process only the missing full videos
# for vid in final_missing:
#     video_path = f"youtube_videos/{vid}.mp4"
#     if os.path.exists(video_path):
#         print(f"🎥 Re-processing: {vid}")
#         process_video(video_path)
#     else:
#         print(f"❌ Missing file: {video_path}")


🎥 Re-processing: UbCq8ZnLHxY
🎥 Processing video: youtube_videos/UbCq8ZnLHxY.mp4
🧠 Transcript contains 2138 words → 72 text chunks.
📤 Uploaded 72 chunks to Qdrant.
                                                text  \
0  Obviously, the previous example we have seen i...   
1  of vacuum propagation. Obviously, we need to g...   
2  of the basics. Out of this section, this kind ...   
3  of neural kind of structure. In fact, I'll go ...   
4  prop. And of course, we can actually go throug...   

                              video_path     video_id  chunk_id  start  topic  
0  video_segments/UbCq8ZnLHxY_chunk0.mp4  UbCq8ZnLHxY         0      0     -1  
1  video_segments/UbCq8ZnLHxY_chunk1.mp4  UbCq8ZnLHxY         1     10     -1  
2  video_segments/UbCq8ZnLHxY_chunk2.mp4  UbCq8ZnLHxY         2     20     -1  
3  video_segments/UbCq8ZnLHxY_chunk3.mp4  UbCq8ZnLHxY         3     30     -1  
4  video_segments/UbCq8ZnLHxY_chunk4.mp4  UbCq8ZnLHxY         4     40     -1  
🎥 Re-processing: hSv

KeyboardInterrupt: 