<a href="https://colab.research.google.com/github/Bandi-Lavanya/Indian_Sign_Language_Detection/blob/main/SLD_video_to_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install torch torchvision transformers opencv-python numpy



In [None]:
from transformers import pipeline
from PIL import Image
import cv2

# Build the image-classification pipeline once, backed by the pre-trained
# Indian Sign Language checkpoint, so later calls can reuse it.
sign_language_model = pipeline(
    task="image-classification",
    model="hemg/Indian-sign-language-classification",
)

def recognize_sign_from_frame(frame):
    """Classify one OpenCV frame and return the predicted sign label.

    Args:
        frame: Image array in BGR channel order, as produced by ``cv2.imread``
            or ``cv2.VideoCapture.read``.

    Returns:
        The ``'label'`` value of the first entry in the pipeline's result list.

    Raises:
        ValueError: If ``frame`` is ``None`` (for example, ``cv2.imread`` could
            not read the requested file).
    """
    # Fail early with a clear message instead of an opaque OpenCV assertion.
    if frame is None:
        raise ValueError(
            "frame is None; the image or video frame could not be read"
        )

    # OpenCV stores channels as BGR; convert to RGB before building the PIL image.
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_frame)

    # The pipeline returns a list of {'label', 'score'} dicts; use the first one.
    result = sign_language_model(pil_image)
    return result[0]['label']

# Example: read one still image and recognize the sign it shows.
image_path = '/content/sign1.jpeg'
frame = cv2.imread(image_path)
# cv2.imread returns None (it does not raise) when the file is missing or
# cannot be decoded, so check before handing the frame to the classifier.
if frame is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
sign = recognize_sign_from_frame(frame)
print(f"Recognized sign: {sign}")


Device set to use cpu


Recognized sign: C


In [None]:
def process_video(video_path, interval_seconds=1.0):
    """Classify frames sampled from a video at a fixed interval.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        interval_seconds: Spacing between classified frames in seconds of video
            time. The default of 1.0 samples every ``int(fps)``-th frame.

    Returns:
        The labels returned by ``recognize_sign_from_frame`` for the sampled
        frames, in frame order. The list is empty when the video cannot be
        opened or yields no frames.
    """
    import math

    cap = cv2.VideoCapture(video_path)
    sign_sequence = []
    try:
        if not cap.isOpened():
            print(f"Could not open video: {video_path}")
            return sign_sequence

        frame_rate = cap.get(cv2.CAP_PROP_FPS)  # Frame rate reported by the video

        # The reported FPS may be 0, NaN or below 1; truncating it would give a
        # zero modulus. In that case fall back to classifying every frame.
        raw_step = frame_rate * interval_seconds
        if math.isfinite(raw_step) and raw_step >= 1:
            step = int(raw_step)
        else:
            step = 1

        # Count frames locally (1-based count of frames read so far) so the
        # sampling does not depend on the backend reporting its position.
        frames_read = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frames_read += 1

            if frames_read % step == 0:
                sign = recognize_sign_from_frame(frame)
                print(f"Frame {frames_read}: Recognized sign = {sign}")  # Print frame and sign
                sign_sequence.append(sign)
    finally:
        # Release the capture even if classification raises part-way through.
        cap.release()

    return sign_sequence


# Run the frame sampler on an uploaded recording and show the collected labels.
video_path = '/content/WIN_20250215_22_10_36_Pro.mp4'
signs = process_video(video_path=video_path)
print(f"Detected sign sequence: {signs}")


Frame 30: Recognized sign = T
Frame 60: Recognized sign = T
Frame 90: Recognized sign = T
Frame 120: Recognized sign = T
Detected sign sequence: ['T', 'T', 'T', 'T']


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Fetch the pretrained "t5-small" tokenizer and its conditional-generation model;
# both are used for text generation.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

def generate_text_from_sign_sequence(sign_sequence):
    """Generate text from a list of sign tokens.

    Uses the globals ``model`` and ``tokenizer`` as bound at call time. The
    tokens are space-joined and appended to a fixed instruction prefix; the
    result is generated with beam search (4 beams, ``max_length=50``) and the
    first returned sequence is decoded with special tokens stripped.

    Args:
        sign_sequence: Iterable of strings, one per recognized sign.

    Returns:
        The decoded output string.
    """
    prompt = "translate sign language to text: " + " ".join(sign_sequence)
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    generated = model.generate(
        input_ids,
        max_length=50,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Demo: pass a hand-written token list through the T5 helper.
sign_sequence = [
    "hello",
    "how",
    "are",
    "you",
]
sentence = generate_text_from_sign_sequence(sign_sequence)
print(f"Generated sentence: {sentence}")


Generated sentence: sign language to text: hello how are you


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load GPT-2 under its own names. Reusing `model` / `tokenizer` here would rebind
# the globals that generate_text_from_sign_sequence reads at call time, silently
# switching that helper from T5 to GPT-2 once this cell has run.
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

def generate_text_from_sign_sequence_gpt2(sign_sequence):
    """Continue a space-joined sign sequence with GPT-2.

    Args:
        sign_sequence: Iterable of strings, one per recognized sign.

    Returns:
        The decoded first generated sequence (prompt plus continuation), or an
        empty string when the joined prompt is empty or whitespace only.
    """
    sign_sequence_text = " ".join(sign_sequence)
    # An empty prompt tokenizes to zero input ids, leaving nothing to continue.
    if not sign_sequence_text.strip():
        return ""

    # Calling the tokenizer (rather than .encode) also returns the attention mask.
    encoded = gpt2_tokenizer(sign_sequence_text, return_tensors="pt")
    outputs = gpt2_model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        # Pass the pad id explicitly; otherwise generate() falls back to
        # eos_token_id on its own and emits a warning.
        pad_token_id=gpt2_tokenizer.eos_token_id,
        max_length=50,
        num_beams=4,
        early_stopping=True,
    )
    result = gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result

# Demo: let GPT-2 continue the same hand-written token list.
sign_sequence = [
    "hello",
    "how",
    "are",
    "you",
]
sentence = generate_text_from_sign_sequence_gpt2(sign_sequence)
print(f"Generated sentence: {sentence}")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated sentence: hello how are you doing?"

"I'm fine. I'm fine. I'm fine. I'm fine. I'm fine. I'm fine. I'm fine. I'm fine. I'm fine. I'm fine. I
