**AIR WRITING**

In [1]:
import cv2
import numpy as np
import mediapipe as mp
import math

# Function to get the index finger tip and thumb tip positions
def get_finger_tips(frame, hand_landmarks):
    """Return the pixel positions of the index-finger tip and the thumb tip.

    Args:
        frame: Image whose ``shape`` is ``(height, width, channels)``; only the
            shape is read, to scale the normalized landmark coordinates.
        hand_landmarks: A MediaPipe hand-landmark result exposing a
            ``landmark`` sequence, or a falsy value when no hand was found.

    Returns:
        ``(index_finger_tip_px, thumb_tip_px)`` where each item is an
        ``(x, y)`` tuple of ints, or ``(None, None)`` when ``hand_landmarks``
        is falsy.
    """
    # Guard clause: nothing to locate without a detected hand.
    if not hand_landmarks:
        return None, None

    landmark_ids = mp.solutions.hands.HandLandmark
    index_tip = hand_landmarks.landmark[landmark_ids.INDEX_FINGER_TIP]
    thumb = hand_landmarks.landmark[landmark_ids.THUMB_TIP]
    frame_h, frame_w, _ = frame.shape

    def to_pixels(landmark):
        # Landmark x/y are scaled by the frame width/height and truncated to int.
        return (int(landmark.x * frame_w), int(landmark.y * frame_h))

    return to_pixels(index_tip), to_pixels(thumb)

# Function to calculate distance between two points
def calculate_distance(point1, point2):
    """Return the Euclidean distance between two 2-D points.

    Only the first two components (index 0 and 1) of each point are used.
    """
    delta_x = point1[0] - point2[0]
    delta_y = point1[1] - point2[1]
    squared_length = delta_x ** 2 + delta_y ** 2
    return math.sqrt(squared_length)

# Function to perform smoothing
def smooth_line(new_point, prev_points, smoothing_factor=0.5):
    """Blend a new point with the previous one to smooth a drawn stroke.

    Args:
        new_point: The latest ``(x, y)`` position.
        prev_points: The previous ``(x, y)`` position, or ``None`` / an empty
            sequence when there is no previous position yet.
        smoothing_factor: Weight given to ``new_point``; the previous point
            gets ``1 - smoothing_factor``.

    Returns:
        ``new_point`` unchanged when there is no previous position; otherwise
        a tuple of plain Python ints holding the rounded weighted average.
    """
    # An empty sequence is treated like None: blending a 2-element point with
    # an empty array cannot be broadcast and would raise ValueError.
    if prev_points is None or len(prev_points) == 0:
        return new_point

    blended = (
        smoothing_factor * np.asarray(new_point, dtype=float)
        + (1 - smoothing_factor) * np.asarray(prev_points, dtype=float)
    )
    # Convert to built-in ints (not NumPy scalars) so the tuple can be handed
    # directly to drawing calls that expect ordinary integer coordinates.
    return tuple(int(value) for value in np.round(blended))

def main():
    """Run the air-writing loop: track one hand on the webcam and draw on a canvas.

    Behaviour per frame:
      * Index tip and thumb tip at least 50 px apart: the pen is down and the
        index-finger tip draws a black stroke on the white canvas.
      * Tips closer than 50 px (pinch): the pen is up. A pinch inside the top
        50 px strip triggers a toolbar action chosen by the horizontal third
        the index tip is in: clear the canvas, save it to 'air_canvas.png',
        or quit.
      * No hand detected: the pen is lifted so the stroke does not continue
        from a stale position.

    Pressing 'q' while an OpenCV window has focus also quits. The camera, the
    MediaPipe hand tracker and the windows are released on every exit path.
    """
    pinch_threshold = 50  # px between index tip and thumb tip; below = pen up
    toolbar_height = 50   # px; pinching above this line triggers a toolbar action

    cap = cv2.VideoCapture(0)
    mp_hands = mp.solutions.hands
    mp_drawing = mp.solutions.drawing_utils

    canvas = None       # created lazily so it always matches the camera frame size
    drawing = False
    prev_point = None   # last point of the current stroke; None = no stroke in progress

    try:
        with mp_hands.Hands(max_num_hands=1) as hands:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                # Mirror the image so on-screen motion matches the user's hand.
                frame = cv2.flip(frame, 1)
                width = frame.shape[1]

                if canvas is None:
                    # White canvas with the same shape as the frame, so the
                    # bitwise_and overlay below never hits a size mismatch.
                    canvas = np.full(frame.shape, 255, dtype=np.uint8)

                # Toolbar is split into three equal-width sections.
                clear_end = width // 3
                save_end = 2 * width // 3

                # MediaPipe expects RGB input; OpenCV delivers BGR.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = hands.process(rgb_frame)

                detected_hands = results.multi_hand_landmarks
                if detected_hands:
                    for hand_landmarks in detected_hands:
                        mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                index_finger_tip, thumb_tip = get_finger_tips(
                    frame, detected_hands[0] if detected_hands else None
                )

                if index_finger_tip is None or thumb_tip is None:
                    # Hand lost: lift the pen instead of drawing with a missing point.
                    drawing = False
                elif calculate_distance(index_finger_tip, thumb_tip) < pinch_threshold:
                    drawing = False
                    if index_finger_tip[1] < toolbar_height:
                        if index_finger_tip[0] < clear_end:
                            canvas[:] = 255  # Clear canvas
                        elif index_finger_tip[0] < save_end:
                            cv2.imwrite('air_canvas.png', canvas)  # Save canvas as image
                        else:
                            break  # Quit program
                else:
                    drawing = True

                if drawing:
                    cv2.circle(frame, index_finger_tip, 5, (0, 0, 0), -1)  # Pen cursor
                    if prev_point is not None:
                        # Force built-in ints so the point is a valid OpenCV coordinate
                        # regardless of the numeric type smooth_line hands back.
                        smoothed_point = tuple(
                            int(value) for value in smooth_line(index_finger_tip, prev_point)
                        )
                        cv2.line(canvas, prev_point, smoothed_point, (0, 0, 0), 4)
                        prev_point = smoothed_point
                    else:
                        prev_point = index_finger_tip
                else:
                    prev_point = None

                # Display canvas
                cv2.imshow('Canvas', canvas)

                # White canvas pixels leave the frame untouched; black strokes
                # zero it out, which overlays the drawing onto the camera image.
                frame_with_drawing = cv2.bitwise_and(frame, canvas)

                # Each label sits inside the toolbar section that triggers it.
                cv2.putText(frame_with_drawing, "Clear", (20, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
                cv2.putText(frame_with_drawing, "Save", (clear_end + 20, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                cv2.putText(frame_with_drawing, "Quit", (save_end + 20, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

                # Display frame
                cv2.imshow('Frame', frame_with_drawing)

                # Check for key press
                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'):
                    break
    finally:
        # Always free the camera and close the windows, even after an error.
        cap.release()
        cv2.destroyAllWindows()

# Entry point: start the air-writing loop when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

In [3]:
from transformers import pipeline

# Read the image that main() saves as 'air_canvas.png' in the working directory.
# A relative path keeps the notebook portable and avoids embedding a
# user-specific absolute location.
image_path = "air_canvas.png"
# Create the image-to-text pipeline using the TrOCR model
pipe = pipeline("image-to-text", model="microsoft/trocr-base-handwritten")
prediction = pipe(image_path)
prediction

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.weight', 'encoder.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


[{'generated_text': 'hello world.'}]

In [4]:
# Second recognition attempt on the same saved canvas image, this time with
# the "jinhybr/OCR-Donut-CORD" model instead of TrOCR.
# NOTE(review): the recorded output for this cell contains <s_menu>/<s_price>
# markup rather than plain text, so this model looks like a poor fit for
# free-form handwriting — confirm before relying on it.
from transformers import pipeline
image_path = "air_canvas.png"
# `pipe` and `prediction` are rebound here, replacing the TrOCR objects
# created by the previous cell.
pipe = pipeline("image-to-text", model="jinhybr/OCR-Donut-CORD")
prediction = pipe(image_path)
# Bare last expression so the notebook displays the result.
prediction

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


[{'generated_text': '<s_cord-v2><s_menu><s_nm> Aawir</s_nm><s_price> Aawir</s_nm><s_price>'}]