### Text To Speech
The text is passed to the gTTS library to produce an .mp3 file, which is then converted into a .wav file. The .wav file is played with pygame.mixer, and the temporary audio files are removed afterwards.

In [None]:
import os
from gtts import gTTS
from pygame import mixer
from pydub import AudioSegment

# Synthesize `mytext` (expected to be defined by an earlier cell) to speech.
myobj = gTTS(text=mytext, lang='en', slow=False)
myobj.save("sign_lang.mp3")
# Convert the MP3 to WAV before handing it to pygame's music player.
sound = AudioSegment.from_mp3("sign_lang.mp3")
sound.export("sign_lang.wav", format="wav")

mixer.init()
mixer.music.load('sign_lang.wav')
mixer.music.play()
# Clean up the temporary audio files with os.remove rather than shelling out
# to `rm`, which only exists on POSIX systems. Removal stays best-effort: a
# failure is reported but does not abort the cell.
for tmp_audio in ("sign_lang.wav", "sign_lang.mp3"):
    try:
        os.remove(tmp_audio)
    except OSError as err:
        print(f"Could not remove {tmp_audio}: {err}")

### MediaPipe Hands
We use the hand-tracking module of the MediaPipe framework to extract 21 hand landmarks.

In [None]:
import cv2
import mediapipe as mp
# Shorthand handles to the MediaPipe sub-modules used below.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
drawing_styles = mp.solutions.drawing_styles

# Live webcam preview with hand landmarks drawn on every frame. Press Esc to quit.
cap = cv2.VideoCapture(0)
try:
    with mp_hands.Hands(min_detection_confidence=0.7, min_tracking_confidence=0.5) as hands:
        while cap.isOpened():
            success, image = cap.read()
            if not success:
                print("Ignoring empty camera frame.")
                continue

            # Mirror the frame for a selfie view and convert BGR -> RGB for MediaPipe.
            image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
            # Mark the frame read-only while it is being processed, then
            # make it writeable again so landmarks can be drawn on it.
            image.flags.writeable = False
            results = hands.process(image)
            image.flags.writeable = True

            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            if results.multi_hand_landmarks:
                for coords in results.multi_hand_landmarks:
                    # Plural style-helper names, matching the ones already
                    # used in the "Code Workshop" cell of this notebook.
                    mp_drawing.draw_landmarks(
                        image, coords, mp_hands.HAND_CONNECTIONS,
                        drawing_styles.get_default_hand_landmarks_style(),
                        drawing_styles.get_default_hand_connections_style())
            cv2.imshow('MediaPipe Hands', image)
            if cv2.waitKey(5) & 0xFF == 27:  # 27 == Esc
                break
finally:
    # Always free the camera and close the preview window, even if the loop raised.
    cap.release()
    cv2.destroyAllWindows()

### Train Validation Split
Split the data into train & validation data of specified ratios.

In [None]:
import os
import random

import shutil

# Number of images moved from each class folder into the validation set.
VALID_PER_CLASS = 350

path = 'Downloads/ASL2/'
train, valid = path+'train/', path+'valid/'
if not os.path.exists(valid):
    dirs = os.listdir(train)
    # Create the validation root and one sub-folder per class. os.makedirs
    # replaces the shell `mkdir` calls, so names containing spaces or shell
    # metacharacters are handled and failures raise instead of passing silently.
    os.makedirs(valid)
    for d in dirs:
        os.makedirs(valid+d, exist_ok=True)

    for d in dirs:
        sfol, dfol1 = train+d+'/', valid+d+'/'
        # random.sample raises ValueError when a class folder holds fewer
        # than VALID_PER_CLASS files.
        val = random.sample(os.listdir(sfol), VALID_PER_CLASS)
        print("Moving Files...")
        for x in val:
            shutil.move(sfol+x, dfol1+x)
        print("Done !")

else:
    print("Folders created already....")

### Extract Hand Coordinates
Extract the 21 hand-landmark coordinates for each training image using the existing MediaPipe model and store them in a .csv file.

In [None]:
import os
import cv2
import numpy as np
import mediapipe as mp
from PIL import Image
# Shorthand handles to MediaPipe's drawing utilities and drawing styles.
mp_drawing = mp.solutions.drawing_utils
drawing_styles = mp.solutions.drawing_styles
# One shared detector, created once at module level: static-image mode,
# at most one hand per image, minimum detection confidence of 0.65.
hands = mp.solutions.hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.65)

def extract_labels(img):     #image_path
    """Return the flattened hand-landmark coordinates found in one image file.

    Args:
        img: Path of the image file to read.

    Returns:
        A flat list ``[x0, y0, z0, x1, y1, z1, ...]`` with the x, y and z
        values of every landmark of the detected hand, or ``None`` when the
        image cannot be read or no hand is detected.
    """
    image = cv2.imread(img)
    if image is None:
        # cv2.imread returns None for missing or unreadable files; treat
        # that like "no hand found" so the caller simply skips the file.
        return None
    # Mirror the image and convert BGR -> RGB before running the detector.
    image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
    # The detector must be given the image (the call previously had no argument).
    results = hands.process(image)
    if not results.multi_hand_landmarks:
        return None
    # Use the last detected hand, as the original loop did. The landmark
    # overlay is no longer drawn: the annotated image was never returned
    # or displayed, so drawing it had no observable effect.
    coords = results.multi_hand_landmarks[-1]
    lst = []
    for crd in coords.landmark:
        lst.extend((crd.x, crd.y, crd.z))
    return lst

# Which split to process and where to write it; switch to the commented
# values to build the training CSV instead.
# NOTE(review): the testing cell reads 'test_65.csv' while this writes
# 'test.csv' - confirm which filename is intended.
fldr = 'valid'  #'train'
csvf, mode = 'test.csv', 'w'  #'train.csv', 'w'

path = 'Downloads/ASL2/'+f'{fldr}/'
labels = os.listdir(path)
if not os.path.exists(csvf):
    # The context manager closes the CSV even if extraction fails part-way.
    with open(csvf, mode) as file:
        for lab in labels:
            fol = path+lab+'/'
            trn = os.listdir(fol)
            print(lab)
            for img in trn:
                coords = extract_labels(fol+img)
                # Images without a detected hand yield None and are skipped.
                if coords:
                    # One row per image: the coordinates followed by the class label.
                    coords.append(lab)
                    file.write(','.join(map(str,coords))+'\n')
    print('Job Done...!')

else:
    print('Data already extracted....')

### Training our Model
Train an ML model on our coordinate data file and save the trained model.

In [None]:
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
from xgboost import XGBClassifier as XGB
from sklearn.preprocessing import LabelEncoder

# Columns 0-62 hold the landmark coordinates; column 63 holds the class label.
df = pd.read_csv('train.csv',header=None)
# Encode the labels as integers for the classifier; the fitted encoder is
# saved below so predictions can be mapped back to the original labels.
le = LabelEncoder()
df[63] = le.fit_transform(df[63])
X, Y = np.asarray(df.iloc[:,:63]), np.asarray(df[63])

xgb = XGB()
xgb.fit(X,Y)

# Persist the model and the encoder; `with` guarantees both files are
# flushed and closed.
with open('XGBoost.sav', 'wb') as model_file:
    pickle.dump(xgb, model_file)
with open('XEncoder1.sav', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)

### Testing our Model
Testing our model with test data and calculating model accuracy score

In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
# Load the trained classifier and its label encoder, closing each file after use.
# NOTE: pickle executes code on load - only open .sav files you created yourself.
with open('XGBoost.sav','rb') as model_file:
    model = pickle.load(model_file)
with open('XEncoder1.sav','rb') as encoder_file:
    le = pickle.load(encoder_file)

# Columns 0-62 hold the landmark coordinates; column 63 holds the true label.
df2 = pd.read_csv('test_65.csv',header=None)
tst_X, tst_Y = np.asarray(df2.iloc[:,:63]), np.asarray(df2[63])
# Predict encoded classes, then map them back to the original labels
# before comparing against the true ones.
yhat = le.inverse_transform(model.predict(tst_X))
score = accuracy_score(tst_Y, yhat)
print(f"Score : {score*100:.3f}%")

### Code Workshop
Here we combine our modules and our pre-trained ML models to perform the sign-language-to-speech process.
A few optimizations are still in progress...

In [None]:
import os
import cv2
import time
import pickle
import numpy as np
from gtts import gTTS
import mediapipe as mp
from pygame import mixer
from pydub import AudioSegment

# Shorthand handles to the MediaPipe drawing utilities, hands solution and drawing styles.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
drawing_styles = mp.solutions.drawing_styles

def play_voice(mytext):
    """Speak ``mytext`` aloud using gTTS and pygame's music player.

    The text is synthesized to ``sign_lang.mp3``, converted to
    ``sign_lang.wav``, handed to ``mixer.music`` for playback, and both
    temporary files are then removed.

    Args:
        mytext: English text to speak. Empty text is ignored.

    Returns:
        None.
    """
    if not mytext:
        # Nothing to say; gTTS rejects empty text, so return early.
        return
    myobj = gTTS(text=mytext, lang='en', slow=False)
    myobj.save("sign_lang.mp3")
    sound = AudioSegment.from_mp3("sign_lang.mp3")
    sound.export("sign_lang.wav", format="wav")

    mixer.init()
    mixer.music.load('sign_lang.wav')
    mixer.music.play()
    # Use os.remove rather than shelling out to `rm`, which only exists on
    # POSIX systems. Removal stays best-effort: a failure is reported but
    # does not interrupt the caller.
    for tmp_audio in ("sign_lang.wav", "sign_lang.mp3"):
        try:
            os.remove(tmp_audio)
        except OSError as err:
            print(f"Could not remove {tmp_audio}: {err}")

# Load the trained classifier and its label encoder, closing each file after use.
# NOTE: pickle executes code on load - only open .sav files you created yourself.
with open('XGBoost.sav', 'rb') as model_file:
    mod = pickle.load(model_file)
with open('XEncoder1.sav', 'rb') as encoder_file:
    le = pickle.load(encoder_file)

txt = ''  # text recognized so far
cap = cv2.VideoCapture(0)
prev = time.time()
try:
    with mp_hands.Hands(min_detection_confidence=0.7, min_tracking_confidence=0.5) as hands:
        while cap.isOpened():
            curr = time.time()
            success, image = cap.read()
            if not success:
                print("Ignoring empty camera frame.")
                continue

            # Mirror the frame for a selfie view and convert BGR -> RGB for MediaPipe.
            image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
            image.flags.writeable = False
            results = hands.process(image)

            image.flags.writeable = True
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            # Classify at most once every 2 seconds so a held sign is not
            # appended on every frame.
            if (curr-prev) >= 2:
                prev = time.time()
                if results.multi_hand_landmarks:
                    for coords in results.multi_hand_landmarks:
                        mp_drawing.draw_landmarks(
                            image, coords, mp_hands.HAND_CONNECTIONS,
                            drawing_styles.get_default_hand_landmarks_style(),
                            drawing_styles.get_default_hand_connections_style())
                    # Build the feature row from the last detected hand
                    # (the same hand the original post-loop code used).
                    hand = results.multi_hand_landmarks[-1]
                    lst = []
                    for crd in hand.landmark:
                        lst.extend((crd.x, crd.y, crd.z))
                    lst = np.asarray(lst).reshape(1,-1)
                    tmp = mod.predict(lst)
                    # Take the decoded label directly instead of stripping
                    # brackets and quotes from the array's string form.
                    txt += str(le.inverse_transform(tmp)[0])
                    #play_voice(str(txt))
            print(txt)
            #cv2.putText(image, str(txt), (25, 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,0,), 2, cv2.LINE_4)
            cv2.imshow('MediaPipe Hands', image)
            if cv2.waitKey(5) & 0xFF == 27:  # 27 == Esc
                break
finally:
    # Always free the camera and close the preview window, even if the loop raised.
    cap.release()
    cv2.destroyAllWindows()

In [None]:
import os
import cv2
import time
import pickle
import numpy as np
from gtts import gTTS
import mediapipe as mp
from pygame import mixer
from pydub import AudioSegment

# Shorthand handles to the MediaPipe drawing utilities, hands solution and drawing styles.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
drawing_styles = mp.solutions.drawing_styles

def play_voice(mytext):
    """Speak ``mytext`` aloud using gTTS and pygame's music player.

    The text is synthesized to ``sign_lang.mp3``, converted to
    ``sign_lang.wav``, handed to ``mixer.music`` for playback, and both
    temporary files are then removed.

    Args:
        mytext: English text to speak. Empty text is ignored.

    Returns:
        None.
    """
    if not mytext:
        # Nothing to say; gTTS rejects empty text, so return early.
        return
    myobj = gTTS(text=mytext, lang='en', slow=False)
    myobj.save("sign_lang.mp3")
    sound = AudioSegment.from_mp3("sign_lang.mp3")
    sound.export("sign_lang.wav", format="wav")

    mixer.init()
    mixer.music.load('sign_lang.wav')
    mixer.music.play()
    # Use os.remove rather than shelling out to `rm`, which only exists on
    # POSIX systems. Removal stays best-effort: a failure is reported but
    # does not interrupt the caller.
    for tmp_audio in ("sign_lang.wav", "sign_lang.mp3"):
        try:
            os.remove(tmp_audio)
        except OSError as err:
            print(f"Could not remove {tmp_audio}: {err}")

# Load the trained classifier and its label encoder, closing each file after use.
# NOTE: pickle executes code on load - only open .sav files you created yourself.
with open('XGBoost.sav', 'rb') as model_file:
    mod = pickle.load(model_file)
with open('XEncoder1.sav', 'rb') as encoder_file:
    le = pickle.load(encoder_file)

txt = ''  # text recognized so far
cap = cv2.VideoCapture(0)
prev = time.time()
try:
    with mp_hands.Hands(min_detection_confidence=0.7, min_tracking_confidence=0.5) as hands:
        while cap.isOpened():
            curr = time.time()
            success, image = cap.read()
            if not success:
                print("Ignoring empty camera frame.")
                continue

            # Mirror the frame for a selfie view and convert BGR -> RGB for MediaPipe.
            image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
            image.flags.writeable = False
            results = hands.process(image)

            image.flags.writeable = True
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            # Classify at most once every 2 seconds so a held sign is not
            # appended (and spoken) on every frame.
            if (curr-prev) >= 2:
                prev = time.time()
                if results.multi_hand_landmarks:
                    for coords in results.multi_hand_landmarks:
                        # Plural style-helper names, matching the ones used
                        # in the other "Code Workshop" cell of this notebook.
                        mp_drawing.draw_landmarks(
                            image, coords, mp_hands.HAND_CONNECTIONS,
                            drawing_styles.get_default_hand_landmarks_style(),
                            drawing_styles.get_default_hand_connections_style())
                    # Build the feature row from the last detected hand
                    # (the same hand the original post-loop code used).
                    hand = results.multi_hand_landmarks[-1]
                    lst = []
                    for crd in hand.landmark:
                        lst.extend((crd.x, crd.y, crd.z))
                    lst = np.asarray(lst).reshape(1,-1)
                    tmp = mod.predict(lst)
                    # Take the decoded label directly; the original line was
                    # missing its closing parenthesis and did not parse.
                    txt += str(le.inverse_transform(tmp)[0])
                    # Speak the whole accumulated text after each new sign.
                    play_voice(str(txt))
            # Overlay the recognized text on the preview (blue in BGR order).
            cv2.putText(image, str(txt), (25, 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_4)
            cv2.imshow('MediaPipe Hands', image)
            if cv2.waitKey(5) & 0xFF == 27:  # 27 == Esc
                break
finally:
    # Always free the camera and close the preview window, even if the loop raised.
    cap.release()
    cv2.destroyAllWindows()

SyntaxError: invalid syntax (<ipython-input-1-b7300b0e6367>, line 62)

In [None]:
import joblib

# Save the in-memory `model` object (set by an earlier cell) with joblib.
model_path = 'model_char.sav'
joblib.dump(model, model_path)