In [1]:
! pip install torch

    jupyter-core!=~5.0,>=4.12
                ^[0m[33m
[0m

In [2]:
import pandas as pd
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from albumentations import Compose, HorizontalFlip, Rotate, RandomBrightnessContrast
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torchvision import transforms
import torch
import torch.nn as nn
import torch.nn.functional as F


In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("grassknoted/asl-alphabet")

print("Path to dataset files:", path)

Path to dataset files: /Users/arpo/.cache/kagglehub/datasets/grassknoted/asl-alphabet/versions/1


In [6]:
! ls '{path}/asl_alphabet_train'

[34masl_alphabet_train[m[m


In [7]:
! ls '{path}/asl_alphabet_train/asl_alphabet_train'

[34mA[m[m       [34mD[m[m       [34mG[m[m       [34mJ[m[m       [34mM[m[m       [34mP[m[m       [34mS[m[m       [34mV[m[m       [34mY[m[m       [34mnothing[m[m
[34mB[m[m       [34mE[m[m       [34mH[m[m       [34mK[m[m       [34mN[m[m       [34mQ[m[m       [34mT[m[m       [34mW[m[m       [34mZ[m[m       [34mspace[m[m
[34mC[m[m       [34mF[m[m       [34mI[m[m       [34mL[m[m       [34mO[m[m       [34mR[m[m       [34mU[m[m       [34mX[m[m       [34mdel[m[m


In [4]:
classes = [f for f in os.listdir(f'{path}/asl_alphabet_train/asl_alphabet_train') 
           if not f.startswith('.')]
classes = sorted(classes)
classes

['A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'del',
 'nothing',
 'space']

In [5]:
# Сохраняем все изображения в DataFrame
df = pd.DataFrame(columns=['img_path', 'symbol', 'id_symbol'])

for id_symb, symbol in enumerate(classes):
    symbol_path = f'{path}/asl_alphabet_train/asl_alphabet_train/{symbol}'
    for img_name in os.listdir(symbol_path):
        img_path = f'{symbol_path}/{img_name}'
        img = cv2.imread(img_path)
        img = cv2.resize(img, (200, 200))
        
        df = pd.concat([
            df, 
            pd.DataFrame([{
                'img_path': img_path,
                'symbol': symbol,
                'id_symbol': id_symb
            }])
        ], ignore_index=True)

In [6]:
df

Unnamed: 0,img_path,symbol,id_symbol
0,/Users/arpo/.cache/kagglehub/datasets/grasskno...,A,0
1,/Users/arpo/.cache/kagglehub/datasets/grasskno...,A,0
2,/Users/arpo/.cache/kagglehub/datasets/grasskno...,A,0
3,/Users/arpo/.cache/kagglehub/datasets/grasskno...,A,0
4,/Users/arpo/.cache/kagglehub/datasets/grasskno...,A,0
...,...,...,...
86995,/Users/arpo/.cache/kagglehub/datasets/grasskno...,space,28
86996,/Users/arpo/.cache/kagglehub/datasets/grasskno...,space,28
86997,/Users/arpo/.cache/kagglehub/datasets/grasskno...,space,28
86998,/Users/arpo/.cache/kagglehub/datasets/grasskno...,space,28


In [10]:
# Аугментация
path_aug = '/Users/arpo/Desktop/code/small_proj'
os.makedirs(f'{path_aug}/augmented_images', exist_ok=True)

augmentation = Compose([
    HorizontalFlip(p=0.5),
    Rotate(limit=20, p=0.7),
    RandomBrightnessContrast(p=0.5),
])

augmented_rows = []

for _, row in df.iterrows():
    img = cv2.imread(row['img_path'])
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    
    for aug_num in range(3):
        augmented = augmentation(image=img)
        augmented_img = augmented['image']
        
        # Сохраняем новое изображение
        base_name = os.path.basename(row['img_path']).split('.')[0]
        aug_path = f'{path_aug}/augmented_images/{base_name}_aug{aug_num}.jpg'
        cv2.imwrite(aug_path, cv2.cvtColor(augmented_img, cv2.COLOR_RGB2BGR))
        
        augmented_rows.append({
            'img_path': aug_path,
            'symbol': row['symbol'],
            'id_symbol': row['id_symbol']
        })

augmented_df = pd.concat([df, pd.DataFrame(augmented_rows)], ignore_index=True)


In [11]:
train_df, temp_df = train_test_split(augmented_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [12]:
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((200, 200)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

class ASLDataset(torch.utils.data.Dataset):
    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img = cv2.imread(row['img_path'])
        if img is None:
            raise FileNotFoundError(f"Image not found at {row['img_path']}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        
        if self.transform:
            img = self.transform(img)
            
        return img, row['id_symbol']
train_loader = DataLoader(ASLDataset(train_df, transform), batch_size=5096)
val_loader = DataLoader(ASLDataset(val_df, transform), batch_size=5096)

In [13]:
num_batches = len(train_loader)
print(f"Количество батчей: {num_batches}")

Количество батчей: 48


In [14]:
for images, labels in train_loader:
    print(images.shape, labels.shape)
    break

KeyboardInterrupt: 

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ASLModel(nn.Module):
    def __init__(self, num_classes=29):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1, stride=2)
        self.conv2 =nn.Conv2d(32, 64, kernel_size=3, padding=1, stride=2)
        self.pool = nn.MaxPool2d(2, 2)
        
        # Автоматический расчет размера для полносвязного слоя
        self._to_linear = None
        with torch.no_grad():  
            test_input = torch.randn(1, 3, 200, 200)
            test_output = self._get_conv_output(test_input)
            self._to_linear = test_output[0].numel() 
        
        # Полносвязные слои
        self.fc1 = nn.Linear(self._to_linear, 512)  # Увеличил размер скрытого слоя
        self.fc2 = nn.Linear(512, num_classes)
        self.dropout = nn.Dropout(0.5)

    def _get_conv_output(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        return x
        
    def forward(self, x):
        x = self._get_conv_output(x)
        x = x.view(x.size(0), -1)  # Преобразуем в [batch_size, features]
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)  

NameError: name 'model' is not defined

In [18]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

NameError: name 'model' is not defined

In [56]:

correct = 0
total = 0
for epoch in range(10):  
    model.train() 
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)  
        
        optimizer.zero_grad()
        outputs = model(inputs)  
        loss = criterion(outputs, labels)  
        loss.backward()  
        optimizer.step()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}, Accuracy: {accuracy:.2f}%')

Epoch 1, Loss: 1.8395, Accuracy: 23.90%
Epoch 2, Loss: 1.0581, Accuracy: 39.72%
Epoch 3, Loss: 0.7325, Accuracy: 50.22%
Epoch 4, Loss: 0.5753, Accuracy: 57.36%
Epoch 5, Loss: 0.4478, Accuracy: 62.51%
Epoch 6, Loss: 0.3709, Accuracy: 66.46%
Epoch 7, Loss: 0.3291, Accuracy: 69.58%
Epoch 8, Loss: 0.2746, Accuracy: 72.09%
Epoch 9, Loss: 0.2522, Accuracy: 74.20%
Epoch 10, Loss: 0.2312, Accuracy: 75.94%


In [57]:
torch.save(model, 'asl_model.pth')

In [None]:
import cv2

cap = cv2.VideoCapture(0)

while True:
    ret, image = cap.read()
    cv2.imshow("camera", image)
    if cv2.waitKey(10) == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()



: 

In [3]:
model = torch.load('asl_model.pth', weights_only=False, map_location='cpu')
model.eval()

AttributeError: Can't get attribute 'ASLModel' on <module '__main__'>

In [None]:
import cv2
import torch
import numpy as np
from torchvision import transforms



transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((200, 200)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def predict_gesture(frame):
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame_tensor = transform(frame_rgb).unsqueeze(0) 
    
    with torch.no_grad():
        output = model(frame_tensor)
        _, predicted = torch.max(output, 1)
        return id_to_label[predicted.item()]

# --- 5. Захват видео с веб-камеры ---
cap = cv2.VideoCapture(0)  # 0 = веб-камера по умолчанию
text = ""  # Здесь будет накапливаться текст

while True:
    ret, frame = cap.read()
    if not ret:
        break
    
    # --- Детекция руки (можно использовать MediaPipe) ---
    # Пример: обрезка области с рукой (упрощённо)
    h, w = frame.shape[:2]
    roi = frame[50:300, 50:300]  # Область интереса (рука)
    
    # --- Классификация жеста ---
    gesture = predict_gesture(roi)
    
    # --- Накопление текста (если жест изменился) ---
    if len(text) == 0 or gesture != text[-1]:
        text += gesture
    
    # --- Вывод текста на экран ---
    cv2.rectangle(frame, (50, 50), (300, 300), (0, 255, 0), 2)  # ROI
    cv2.putText(frame, f"Gesture: {gesture}", (50, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
    cv2.putText(frame, f"Text: {text}", (50, h - 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
    
    cv2.imshow("ASL Translator", frame)
    
    if cv2.waitKey(1) & 0xFF == ord('q'):  # Выход по 'q'
        break

cap.release()
cv2.destroyAllWindows()



: 

In [14]:
! pip install mediapipe

Collecting mediapipe
  Downloading mediapipe-0.10.21-cp311-cp311-macosx_11_0_universal2.whl.metadata (9.9 kB)
Collecting absl-py (from mediapipe)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting flatbuffers>=2.0 (from mediapipe)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting jax (from mediapipe)
  Downloading jax-0.7.0-py3-none-any.whl.metadata (13 kB)
Collecting jaxlib (from mediapipe)
  Downloading jaxlib-0.7.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (1.3 kB)
Collecting opencv-contrib-python (from mediapipe)
  Downloading opencv_contrib_python-4.12.0.88-cp37-abi3-macosx_13_0_arm64.whl.metadata (19 kB)
Collecting protobuf<5,>=4.25.3 (from mediapipe)
  Downloading protobuf-4.25.8-cp37-abi3-macosx_10_9_universal2.whl.metadata (541 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.5.2-py3-none-macosx_10_6_x86_64.macosx_10_6_universal2.whl.metadata (1.6 kB)
Collecting sentencepiece (from me

In [20]:
id_to_label = {
    0: "A",
    1: "B",
    2: "C",
    3: "D",
    4: "E",
    5: "F",
    6: "G",
    7: "H",
    8: "I",
    9: "J",
    10: "K",
    11: "L",
    12: "M",
    13: "N",
    14: "O",
    15: "P",
    16: "Q",
    17: "R",
    18: "S",
    19: "T",
    20: "U",
    21: "V",
    22: "W",
    23: "X",
    24: "Y",
    25: "Z",
    26: "del",
    27: "nothing", 
    28: "space" 
}


In [None]:
import cv2
import mediapipe as mp
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_hands = mp.solutions.hands

# For static images:
IMAGE_FILES = []
with mp_hands.Hands(
    static_image_mode=True,
    max_num_hands=2,
    min_detection_confidence=0.5) as hands:
  for idx, file in enumerate(IMAGE_FILES):
    # Read an image, flip it around y-axis for correct handedness output (see
    # above).
    image = cv2.flip(cv2.imread(file), 1)
    # Convert the BGR image to RGB before processing.
    results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    # Print handedness and draw hand landmarks on the image.
    print('Handedness:', results.multi_handedness)
    if not results.multi_hand_landmarks:
      continue
    image_height, image_width, _ = image.shape
    annotated_image = image.copy()
    for hand_landmarks in results.multi_hand_landmarks:
      print('hand_landmarks:', hand_landmarks)
      print(
          f'Index finger tip coordinates: (',
          f'{hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].x * image_width}, '
          f'{hand_landmarks.landmark[mp_hands.HandLandmark.INDEX_FINGER_TIP].y * image_height})'
      )
      mp_drawing.draw_landmarks(
          annotated_image,
          hand_landmarks,
          mp_hands.HAND_CONNECTIONS,
          mp_drawing_styles.get_default_hand_landmarks_style(),
          mp_drawing_styles.get_default_hand_connections_style())
    cv2.imwrite(
        '/tmp/annotated_image' + str(idx) + '.png', cv2.flip(annotated_image, 1))
    
    cv2.imshow(annotated_image) ###

    # Draw hand world landmarks.
    if not results.multi_hand_world_landmarks:
      continue
    for hand_world_landmarks in results.multi_hand_world_landmarks:
      mp_drawing.plot_landmarks(
        hand_world_landmarks, mp_hands.HAND_CONNECTIONS, azimuth=5)

# For webcam input:
cap = cv2.VideoCapture(0)
with mp_hands.Hands(
    model_complexity=0,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5) as hands:
  while cap.isOpened():
    success, image = cap.read()
    if not success:
      print("Ignoring empty camera frame.")
      # If loading a video, use 'break' instead of 'continue'.
      continue

    # To improve performance, optionally mark the image as not writeable to
    # pass by reference.
    image.flags.writeable = False
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = hands.process(image)

    # Draw the hand annotations on the image.
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    if results.multi_hand_landmarks:
      for hand_landmarks in results.multi_hand_landmarks:
        mp_drawing.draw_landmarks(
            image,
            hand_landmarks,
            mp_hands.HAND_CONNECTIONS,
            mp_drawing_styles.get_default_hand_landmarks_style(),
            mp_drawing_styles.get_default_hand_connections_style())
    # Flip the image horizontally for a selfie-view display.
    cv2.imshow('MediaPipe Hands', cv2.flip(image, 1))
    if cv2.waitKey(5) & 0xFF == ord('q'):
      break
cap.release()

I0000 00:00:1754562679.551740 6102939 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M1 Pro
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1754562679.561332 6103268 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754562679.567017 6103266 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1754562681.002293 6102939 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M1 Pro
W0000 00:00:1754562681.009385 6103335 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754562681.012586 6103335 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for fe

: 

In [None]:
import cv2
import mediapipe as mp
import torch
import numpy as np
from torchvision import transforms

# Инициализация MediaPipe
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils



# Загрузка обученной модели
model = torch.load('asl_model.pth', weights_only=False, map_location='cpu')
model.eval()

# Трансформы для изображения (должны совпадать с обучением)
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((200, 200)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


def process_frame(image):
    # Конвертация и предсказание
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_tensor = transform(image_rgb).unsqueeze(0)
    
    with torch.no_grad():
        outputs = model(image_tensor)
        _, predicted = torch.max(outputs, 1)
        return id_to_label[predicted.item()]

def main():
    cap = cv2.VideoCapture(0)
    recognized_text = ""
    last_gesture = None
    hands = mp_hands.Hands(
        min_detection_confidence=0.7,
        min_tracking_confidence=0.5,
        max_num_hands=1)

    while cap.isOpened():
        success, frame = cap.read()
        if not success:
            continue

        # Обнаружение руки
        frame.flags.writeable = False
        results = hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        frame.flags.writeable = True

        if results.multi_hand_landmarks:
            # Получение bounding box руки
            hand_landmarks = results.multi_hand_landmarks[0]
            h, w = frame.shape[:2]
            x_coords = [lm.x * w for lm in hand_landmarks.landmark]
            y_coords = [lm.y * h for lm in hand_landmarks.landmark]
            
            # Обрезка области с рукой + padding
            margin = 50
            x_min, x_max = int(min(x_coords)) - margin, int(max(x_coords)) + margin
            y_min, y_max = int(min(y_coords)) - margin, int(max(y_coords)) + margin
            
            # Проверка границ
            x_min, y_min = max(0, x_min), max(0, y_min)
            x_max, y_max = min(w, x_max), min(h, y_max)
            
            hand_roi = frame[y_min:y_max, x_min:x_max]
            
            if hand_roi.size > 0:
                # Классификация жеста
                current_gesture = process_frame(hand_roi)
                
                # Логика обновления текста
                if current_gesture != last_gesture:
                    if current_gesture == "del":
                        recognized_text = recognized_text[:-1]
                    elif current_gesture == "space":
                        recognized_text += " "
                    else:
                        recognized_text += current_gesture
                    last_gesture = current_gesture

                # Отрисовка
                cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                mp_drawing.draw_landmarks(
                    frame,
                    hand_landmarks,
                    mp_hands.HAND_CONNECTIONS)

        # Отображение результатов
        frame = cv2.flip(frame, 1)
        cv2.putText(frame, recognized_text, (30, 50), 
                   cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        
        cv2.imshow('ASL Translator', frame)
        if cv2.waitKey(5) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

if __name__ == "__main__":
    main()

I0000 00:00:1754563298.658117 6105116 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M1 Pro
W0000 00:00:1754563298.667804 6112586 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1754563298.675019 6112586 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


: 