# Reading JSON data and converting into data frame

In [43]:
import numpy as np 
import pandas as pd 
import json
import os

# Loading Dataset

In [44]:
main_path = 'data/'
wlas_df = pd.read_json(main_path + 'WLASL_v0.3.json')

In [45]:
wlas_df.head()

Unnamed: 0,gloss,instances
0,book,"[{'bbox': [385, 37, 885, 720], 'fps': 25, 'fra..."
1,drink,"[{'bbox': [551, 68, 1350, 1080], 'fps': 25, 'f..."
2,computer,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_..."
3,before,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_..."
4,chair,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_..."


# Checking available data

In [46]:
def get_videos_ids(json_list):   
    videos_list = []    
    for ins in json_list:
        video_id = ins['video_id']
        if os.path.exists(f'{main_path}videos/{video_id}.mp4'):
            videos_list.append(video_id)
    return videos_list

def get_json_features(json_list):   
    videos_ids = []
    videos_urls = []
    for ins in json_list:
        video_id = ins['video_id']
        video_url = ins['url']
        if os.path.exists(f'{main_path}videos/{video_id}.mp4'):
            videos_ids.append(video_id)
            videos_urls.append(video_url)
    return videos_ids, videos_urls

In [47]:
with open(main_path+'WLASL_v0.3.json', 'r') as data_file:
    json_data = data_file.read()

instance_json = json.loads(json_data)

get_videos_ids(instance_json[0]['instances'])[0]

'69241'

In [48]:
len(get_videos_ids(instance_json[0]['instances']))

6

In [49]:
wlas_df['videos_ids'] = wlas_df['instances'].apply(get_videos_ids)
wlas_df

Unnamed: 0,gloss,instances,videos_ids
0,book,"[{'bbox': [385, 37, 885, 720], 'fps': 25, 'fra...","[69241, 07069, 07068, 07070, 07099, 07074]"
1,drink,"[{'bbox': [551, 68, 1350, 1080], 'fps': 25, 'f...","[69302, 65539, 17710, 17733, 65540, 17734, 177..."
2,computer,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...","[12328, 12312, 12311, 12338, 12313, 12314, 123..."
3,before,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...","[05728, 05749, 05750, 05729, 05730, 65167, 057..."
4,chair,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...","[09848, 09869, 09849, 09850, 09851, 65328, 09854]"
...,...,...,...
1995,washington,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...","[62393, 62394, 62395, 62396, 62398]"
1996,waterfall,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...","[62488, 62489, 62490, 62492, 62493]"
1997,weigh,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...","[62782, 62783, 62785]"
1998,wheelchair,"[{'bbox': [415, 86, 1811, 1080], 'fps': 25, 'f...","[63044, 63046, 63047, 63050]"


In [55]:
features_df = pd.DataFrame(columns=['gloss', 'video_id', 'url'])
for row in wlas_df.iterrows():
#     print(row[1][1])
    ids, urls = get_json_features(row[1][1])
    word = [row[1][0]] * len(ids)
    df = pd.DataFrame(list(zip(word, ids, urls)), columns = features_df.columns)
    features_df = features_df.append(df, ignore_index=True)
features_df

  features_df = features_df.append(df, ignore_index=True)


Unnamed: 0,gloss,video_id,url
0,book,69241,http://aslbricks.org/New/ASL-Videos/book.mp4
1,book,07069,https://signstock.blob.core.windows.net/signsc...
2,book,07068,https://s3-us-west-1.amazonaws.com/files.start...
3,book,07070,https://media.asldeafined.com/vocabulary/14666...
4,book,07099,http://www.aslsearch.com/signs/videos/book.mp4
...,...,...,...
11975,wheelchair,63047,https://www.signingsavvy.com/signs/mp4/5/5233.mp4
11976,wheelchair,63050,http://www.aslsearch.com/signs/videos/wheelcha...
11977,whistle,63186,https://media.spreadthesign.com/video/mp4/13/9...
11978,whistle,63188,https://www.signingsavvy.com/signs/mp4/9/9961.mp4


In [56]:
features_df.index.name = 'index'
features_df.video_id = features_df.video_id.apply(lambda x: 'A' + str(x))

print(features_df)
features_df.to_csv('features_df.csv', index=False)

            gloss video_id                                                url
index                                                                        
0            book   A69241       http://aslbricks.org/New/ASL-Videos/book.mp4
1            book   A07069  https://signstock.blob.core.windows.net/signsc...
2            book   A07068  https://s3-us-west-1.amazonaws.com/files.start...
3            book   A07070  https://media.asldeafined.com/vocabulary/14666...
4            book   A07099     http://www.aslsearch.com/signs/videos/book.mp4
...           ...      ...                                                ...
11975  wheelchair   A63047  https://www.signingsavvy.com/signs/mp4/5/5233.mp4
11976  wheelchair   A63050  http://www.aslsearch.com/signs/videos/wheelcha...
11977     whistle   A63186  https://media.spreadthesign.com/video/mp4/13/9...
11978     whistle   A63188  https://www.signingsavvy.com/signs/mp4/9/9961.mp4
11979     whistle   A63190  http://www.aslsearch.com/signs/video

# Dataset Classes

In [52]:
wlas_df['samples_num'] = wlas_df['videos_ids'].apply(len)
wlas_df.head()

Unnamed: 0,gloss,instances,videos_ids,samples_num
0,book,"[{'bbox': [385, 37, 885, 720], 'fps': 25, 'fra...","[69241, 07069, 07068, 07070, 07099, 07074]",6
1,drink,"[{'bbox': [551, 68, 1350, 1080], 'fps': 25, 'f...","[69302, 65539, 17710, 17733, 65540, 17734, 177...",15
2,computer,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...","[12328, 12312, 12311, 12338, 12313, 12314, 123...",14
3,before,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...","[05728, 05749, 05750, 05729, 05730, 65167, 057...",16
4,chair,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...","[09848, 09869, 09849, 09850, 09851, 65328, 09854]",7


In [53]:
print("minimum number of samples for a word:", wlas_df['samples_num'].min())
print("maximum number of samples for a word:", wlas_df['samples_num'].max())

minimum number of samples for a word: 2
maximum number of samples for a word: 16


In [54]:
words_sample_counts = wlas_df[['gloss', 'samples_num']].groupby('samples_num').agg({"gloss":['count', ', '.join]})
words_sample_counts

Unnamed: 0_level_0,gloss,gloss
Unnamed: 0_level_1,count,join
samples_num,Unnamed: 1_level_2,Unnamed: 2_level_2
2,14,"gloves, careless, wash face, curtain, grey, la..."
3,76,"garage, parents, boots, excuse, furniture, rep..."
4,335,"hello, newspaper, asl, that, will, cards, hate..."
5,511,"clothes, table, movie, clock, pencil, behind, ..."
6,402,"book, birthday, need, have, knife, read, name,..."
7,307,"chair, dance, eat, forget, but, jacket, paint,..."
8,151,"all, blue, hearing, wrong, color, enjoy, time,..."
9,100,"fine, finish, now, can, hat, kiss, cow, meet, ..."
10,48,"year, black, hot, like, many, orange, fish, gr..."
11,26,"deaf, no, walk, mother, woman, dog, family, ap..."
