In [1]:
# ‚úÖ STEP-BY-STEP GOOGLE COLAB NOTEBOOK FOR YOUR PROJECT
# Pose-to-Text from COIN Dataset (multi-video set)

# --- SETUP ---
!pip install yt-dlp opencv-python mediapipe==0.10.9 torch torchvision torchaudio keras --quiet

import os, json, pickle
from pathlib import Path
import numpy as np
import cv2
import mediapipe as mp
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Masking
from sklearn.metrics import accuracy_score

# --- CONFIGURATION ---
JSON_FILE = "coin_500.json"  # Upload your 500-video JSON file to Colab
VIDEO_DIR = "videos"
CACHE_FILE = "pose_cache.pkl"
Path(VIDEO_DIR).mkdir(exist_ok=True)

# --- DOWNLOAD VIDEOS FROM JSON USING yt-dlp ---
with open(JSON_FILE) as f:
    data = json.load(f)["database"]

video_list = list(data.items())[:500]  # ‚Üê use more videos

for vid_id, meta in video_list:
    url = meta["video_url"].replace("/embed/", "/watch?v=")
    out_dir = os.path.join(VIDEO_DIR, str(meta["recipe_type"]))
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    out_path = os.path.join(out_dir, f"{vid_id}.mp4")
    if not os.path.exists(out_path):
        print(f"‚¨áÔ∏è Downloading: {vid_id}")
        !yt-dlp -f "best[height<=480]" -o "{out_path}" "{url}"
    else:
        print(f"‚úÖ Already downloaded: {vid_id}")

# --- LOAD POSE DATA FROM CACHE IF AVAILABLE ---
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, 'rb') as f:
        pose_data = pickle.load(f)
    print("‚úÖ Loaded cached pose data.")
else:
    # --- EXTRACT 2D POSE KEYPOINTS USING MEDIAPIPE ---
    mp_pose = mp.solutions.pose
    pose = mp_pose.Pose(static_image_mode=False)

    pose_data = []

    for vid_id, meta in video_list:
        file_path = os.path.join(VIDEO_DIR, str(meta["recipe_type"]), f"{vid_id}.mp4")
        print(f"\nüéûÔ∏è Processing video: {vid_id}")
        cap = cv2.VideoCapture(file_path)
        keypoints = []
        frame_count = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            if frame_count % 10 != 0:
                continue
            if frame_count % 100 == 0:
                print(f"üß© Frame {frame_count}...")

            frame = cv2.resize(frame, (480, 360))
            results = pose.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            if results.pose_landmarks:
                pose_vec = []
                for lm in results.pose_landmarks.landmark:
                    pose_vec.extend([lm.x, lm.y])
                keypoints.append(pose_vec)

        cap.release()
        print(f"‚úÖ Done: {vid_id} with {len(keypoints)} pose frames")

        pose_data.append({
            "id": vid_id,
            "pose_seq": keypoints,
            "labels": [x["label"] for x in meta["annotation"]]
        })

    with open(CACHE_FILE, 'wb') as f:
        pickle.dump(pose_data, f)
    print("üíæ Cached pose data saved.")

# --- PREPARE DATA FOR LSTM TRAINING ---
all_labels = [label for sample in pose_data for label in sample["labels"]]
le = LabelEncoder()
le.fit(all_labels)

X, y = [], []

for sample in pose_data:
    for label in sample["labels"]:
        if sample["pose_seq"]:
            X.append(sample["pose_seq"])
            y.append(le.transform([label])[0])

if not X:
    raise ValueError("‚ùå No pose sequences found. Check if MediaPipe processed any frames.")

X = pad_sequences(X, maxlen=100, dtype='float32', padding='post', truncating='post')
y = np.array(y)

print(f"\n‚úÖ Prepared {len(X)} sequences for training.")

# --- TRAIN/TEST SPLIT ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- DEFINE AND TRAIN LSTM MODEL ---
input_dim = X.shape[2]

model = Sequential([
    Masking(mask_value=0.0, input_shape=(100, input_dim)),
    LSTM(64),
    Dense(len(le.classes_), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print("\nüöÄ Training model...")
model.fit(X_train, y_train, epochs=20, batch_size=4, verbose=1)

# --- EVALUATE ON TEST SET ---
y_pred = model.predict(X_test, verbose=0)
y_pred_labels = np.argmax(y_pred, axis=1)

accuracy = accuracy_score(y_test, y_pred_labels)
print(f"\n‚úÖ Test Accuracy: {accuracy * 100:.2f}%")

# --- INFER TEXT FROM VIDEO POSE DATA ---
print("\nüß† POSE TO TEXT OUTPUT\n")

for sample in pose_data:
    if not sample["pose_seq"]:
        print(f"‚ö†Ô∏è Skipping {sample['id']} (no pose found)")
        continue

    seq = pad_sequences([sample["pose_seq"]], maxlen=100, dtype='float32', padding='post', truncating='post')
    pred = model.predict(seq, verbose=0)
    decoded_label = le.inverse_transform([np.argmax(pred)])
    print(f"üé¨ Video ID: {sample['id']} ‚Üí üìù Predicted: {decoded_label[0]}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
üß© Frame 2400...
üß© Frame 2500...
üß© Frame 2600...
üß© Frame 2700...
üß© Frame 2800...
üß© Frame 2900...
üß© Frame 3000...
üß© Frame 3100...
üß© Frame 3200...
üß© Frame 3300...
üß© Frame 3400...
üß© Frame 3500...
üß© Frame 3600...
üß© Frame 3700...
üß© Frame 3800...
üß© Frame 3900...
üß© Frame 4000...
üß© Frame 4100...
üß© Frame 4200...
üß© Frame 4300...
üß© Frame 4400...
üß© Frame 4500...
üß© Frame 4600...
üß© Frame 4700...
üß© Frame 4800...
üß© Frame 4900...
üß© Frame 5000...
üß© Frame 5100...
üß© Frame 5200...
üß© Frame 5300...
‚úÖ Done: rt3SzABuQLs with 207 pose frames

üéûÔ∏è Processing video: 0rjV6GiJelQ
üß© Frame 100...
üß© Frame 200...
üß© Frame 300...
üß© Frame 400...
üß© Frame 500...
üß© Frame 600...
üß© Frame 700...
üß© Frame 800...
üß© Frame 900...
üß© Frame 1000...
üß© Frame 1100...
üß© Frame 1200...
üß© Frame 1300...
üß© Frame 1400...
üß© Frame 1500...
üß© Fr

  super().__init__(**kwargs)



üöÄ Training model...
Epoch 1/20
[1m332/332[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m15s[0m 35ms/step - accuracy: 0.0076 - loss: 6.3214
Epoch 2/20
[1m332/332[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m20s[0m 35ms/step - accuracy: 0.0245 - loss: 5.9564
Epoch 3/20
[1m332/332[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m20s[0m 34ms/step - accuracy: 0.0329 - loss: 5.6625
Epoch 4/20
[1m332/332[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m21s[0m 35ms/step - accuracy: 0.0504 - loss: 5.3382
Epoch 5/20
[1m332/332[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m20s[0m 35ms/step - accuracy: 0.0620 - loss: 5.1136
Epoch 6/20
[1m332/332[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m20s[0m 33ms/step - accuracy: 0.0787 - loss: 4.7768
Epoc