In [13]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on Drive can be accessed through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import re
import shutil

import pandas as pd

In [None]:
def get_file_id(url):
  """
  Extract the file ID from a Google Drive URL.

  Recognised URL shapes:
    - ".../file/d/<ID>/view?usp=sharing"  (share link)
    - ".../file/d/<ID>"                   (share link with nothing after the ID)
    - "...open?id=<ID>" or "...uc?id=<ID>" (ID passed as a query parameter)

  Args:
      url: A string containing the Google Drive URL. Leading and trailing
          whitespace (e.g. from a pasted value) is ignored.

  Returns:
      A string containing the file ID.

  Raises:
      ValueError: If no file ID can be found in the URL.
  """
  url = url.strip()
  # Path form first: the ID is everything after "/d/" up to the next "/",
  # "?" or "#", so a trailing "/view" is optional.
  match = re.search(r'/d/([^/?#]+)', url)
  if not match:
    # Fallback: query-string form, the ID is the value of the "id" parameter.
    match = re.search(r'[?&]id=([^&#]+)', url)
  if match:
    return match.group(1)
  raise ValueError('Invalid Google Drive URL.')

In [None]:
# Ask for the dataset root folder on the mounted Drive. Drive is mounted at
# /content/drive, so the example in the prompt starts from that mount point.
# Surrounding whitespace from a pasted path is removed.
dataset_drive_dir = input("Input your dataset directory for dataset, something like \"/content/drive/MyDrive/xxx/datasets\": ").strip()

# Fail early with a readable message instead of a bare chdir error
if not os.path.isdir(dataset_drive_dir):
  raise FileNotFoundError("Dataset directory not found: {!r}".format(dataset_drive_dir))

# All later cells use paths relative to this directory
os.chdir(dataset_drive_dir)

Metadata file link: https://drive.google.com/file/d/1XeQ49_RLD4xPfT5Qpr37wFFU38P4PX8P/view?usp=sharing

In [None]:
dataset_path = 'signsuisse'

# Check if the folder of this dataset exists
if not os.path.exists(dataset_path):
  # Create the folder
  os.makedirs(dataset_path)

# Check if the metadata file exists
signsuisse_metadata_path = os.path.join(dataset_path, 'metadata_train.csv')
if not os.path.exists(signsuisse_metadata_path):
  url = input("Metadata file Drive URL = ")
  file_id = get_file_id(url)

  # The process below should take about X minutes
  !gdown --id {file_id} -O metadata_train.csv
  # Move the file to the dataset folder
  !mv metadata_train.csv $dataset_path

In [22]:
# Locate the Signsuisse metadata file relative to the current working directory
file_name = "metadata_train.csv" # This file is from the Signsuisse dataset
dataset_path = "signsuisse"
full_path = os.getcwd()
file_path = os.path.join(full_path, dataset_path, file_name)

# Read the metadata table into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)

In [23]:
# Preview the first five rows of the metadata table
df.head(n=5)

Unnamed: 0,id,name,spokenLanguage,signedLanguage,category,definition,paraphrase,example,url,videoDuration,exampleVideoDuration
0,126464,WETTEN,de,dsgs,Verb,"zum Ausdruck bringen, dass man sich einer Sach...","erklären, dass man ganz sicher ist, dass etwas...","Der Mann wettet, dass er ab morgen aufhören ka...",https://signsuisse.sgb-fss.ch/lexikon/126464/w...,2.16,5.44
1,121083,TRENTAQUATTRO,it,lis-ch,Allgemein,E' il numero naturale dopo il 33 e prima del 35.,Valore numerico.,Il numero civico di casa mia è il trentaquattro.,https://signsuisse.sgb-fss.ch/it/lexikon/12108...,3.28,6.875
2,112853,TREPPE,de,dsgs,Haus/Gebäude,Eine Treppe ist ein aus Stufen gebildeter Auf-...,Stufe um Stufe für hinauf oder hinunter,"ich steige nicht gerne Treppen hinauf, weil ic...",https://signsuisse.sgb-fss.ch/lexikon/112853/t...,2.548,4.584
3,130573,PALÉO,fr,lsf-ch,",Veranstaltung/Medien","Le Paléo Festival Nyon, généralement appelé Pa...","Musique, Festival, Nyon.","Chaque année, en juillet, à Nyon le Paléo Fest...",https://signsuisse.sgb-fss.ch/fr/lexikon/13057...,2.92,11.28
4,112877,NACHTHEMD,de,dsgs,Allgemein,"Das Nachthemd ist ein weit geschnittenes, meis...","Kleidungsstück, das wie ein sehr langes Hemd a...",Meine Tante liebt das Seiden-Nachthemd.,https://signsuisse.sgb-fss.ch/lexikon/112877/n...,2.959,5.667


**spokenLanguage**: de (German), it (Italian), fr (French)  
**signedLanguage**: dsgs (Swiss-German Sign Language), lis-ch (Italian Sign Language of Switzerland), lsf-ch (French Sign Language of Switzerland)  

We need:  
**spokenLanguage = de (German); signedLanguage = dsgs (Swiss-German Sign Language)**

In [None]:
# Create a new dataframe
df_filtered = df[
    (df["spokenLanguage"] == "de") & (df["signedLanguage"] == "dsgs")
][["id", "name", "spokenLanguage", "signedLanguage"]]

# Lowercase all values in the name column
df_filtered["name"] = df_filtered["name"].str.lower()

df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8544 entries, 0 to 17218
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              8544 non-null   int64 
 1   name            8543 non-null   object
 2   spokenLanguage  8544 non-null   object
 3   signedLanguage  8544 non-null   object
dtypes: int64(1), object(3)
memory usage: 333.8+ KB


In [None]:
# Delete null values
df_filtered.dropna(inplace=True)
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8543 entries, 0 to 17218
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              8543 non-null   int64 
 1   name            8543 non-null   object
 2   spokenLanguage  8543 non-null   object
 3   signedLanguage  8543 non-null   object
dtypes: int64(1), object(3)
memory usage: 333.7+ KB


In [None]:
# Preview the first five rows of the filtered de/dsgs index
df_filtered.head(n=5)

Unnamed: 0,id,name,spokenLanguage,signedLanguage
0,126464,wetten,de,dsgs
2,112853,treppe,de,dsgs
4,112877,nachthemd,de,dsgs
6,127903,mombasa,de,dsgs
7,117233,picknick,de,dsgs


In [None]:
# Write the filtered index into the dataset folder, without the pandas row index
result_file_name = "index_German_SwissGermanSL.csv"
result_path = os.path.join(full_path, dataset_path, result_file_name)
df_filtered.to_csv(path_or_buf=result_path, index=False)

Download the OpenPose archive and unzip it to obtain the `.pose` files of the Signsuisse dataset.

Openpose Zip file link: https://drive.google.com/file/d/1KfRSJi5OUkywvyo-e3uHyhgKRIhDCkXu/view?usp=drive_link

In [None]:
dataset_path = "signsuisse"
os.chdir(dataset_path)

url = input("Zip file Drive URL = ")
file_id = get_file_id(url)
zip_file_name = "openpose.v1.0"

# The process below should take about 5 minutes
!gdown --id {file_id} -O {zip_file_name}.zip
!unzip {zip_file_name}.zip -d {zip_file_name}

After this step, the folder `datasets/signsuisse/openpose.v1.0/openpose` should be ready. The `.pose` files are in that folder.  

Next: keep only the "de-dsgs" `.pose` files before using them.

In [26]:
# Load the de/dsgs index (path is relative to the current working directory)
df = pd.read_csv("index_German_SwissGermanSL.csv")

# Get the "id" column as a list
id_list = df["id"].tolist()

# path to "openpose.v1.0/openpose" folder
openpose_folder_path = os.path.join("openpose.v1.0", "openpose")
destination_folder_path = "de_dsgs_poses"

# Create the destination folder if it is missing (no-op when it already exists)
os.makedirs(destination_folder_path, exist_ok=True)

# Copy the .pose file of every indexed id into the destination folder.
# Files are copied in-process with shutil instead of starting one `cp` shell
# per file, and a single summary is printed instead of one line per file.
copied_count = 0
missing_count = 0
for pose_id in id_list:  # not named `id`, which would shadow the builtin
  file_name = str(pose_id) + ".pose"
  pose_file_path = os.path.join(openpose_folder_path, file_name)
  if os.path.exists(pose_file_path):
    shutil.copy(pose_file_path, destination_folder_path)
    copied_count += 1
  else:
    # Ids without a matching .pose file are skipped, only counted
    missing_count += 1

print("Copied {} .pose files to {} ({} ids had no .pose file)".format(
    copied_count, destination_folder_path, missing_count))