# <center>Script for extracting the German sentences</center> 
## <center>from the EAF transcripts A and B of the annotated DGS Corpus</center> 

--- 

In [1]:
#imports 
from bs4 import BeautifulSoup 
import requests 
from urllib.parse import urljoin 
import urllib.request 
import pandas as pd 
import pickle 

In [2]:
#url of the DGS Corpus page that is scraped for the links to the transcripts
url_dgs_corpus = "https://www.sign-lang.uni-hamburg.de/meinedgs/ling/start-name_en.html"

#request the DGS Corpus page; requests has no default timeout, so set one
#to keep the cell from waiting forever on an unresponsive server
r = requests.get(url_dgs_corpus, timeout=60)

#raise on an HTTP error status (4xx/5xx) instead of handing an error page to the parser
r.raise_for_status()

#get the html content of the DGS Corpus page
html = r.text

In [3]:
#parse the html of the DGS Corpus page into a BeautifulSoup tree (python's built-in html parser)
content_soup = BeautifulSoup(markup=html, features='html.parser')

In [4]:
#locate the table with the class "transcripts" first ...
transcripts_table = content_soup.find('table', class_='transcripts')

#... then take all of its rows; they contain all types of files - ILEX, EAF, MP4...
rows_with_transcripts = transcripts_table.find_all('tr')

In [5]:
#empty container that will receive the href of every EAF file
list_eaf_files = list()

In [6]:
#collect the href of the EAF transcript from every row of the transcripts table
#(the first row is skipped - presumably it is the header row)

#the list is rebuilt from scratch here so that re-running this cell
#does not append every href a second time
list_eaf_files = []

#the loop variable is called "row" so that it does not overwrite the response object "r"
for row in rows_with_transcripts[1:]:
    cells_with_transcripts = row.find_all('td')

    #a row with fewer than six cells has no EAF cell to read
    if len(cells_with_transcripts) < 6:
        continue

    #cell with the EAF transcript file (sixth cell of the row)
    eaf_files = cells_with_transcripts[5]

    #search for the link only once; href=True ignores anchors that have no href
    eaf_link = eaf_files.find('a', href=True)

    #add the href of the EAF transcript file to the list (find returns None if there is no link)
    if eaf_link is not None:
        list_eaf_files.append(eaf_link['href'])
    
    

In [7]:
#empty container that will receive the absolute url of every EAF transcript
absolute_paths_eaf_transcripts = list()

In [8]:
#create an absolute url for each EAF transcript by resolving
#the href of each EAF transcript from list_eaf_files
#against the url of the DGS Corpus page

#the list is assigned in one step (instead of appending to the existing list)
#so that re-running this cell does not duplicate every url
absolute_paths_eaf_transcripts = [
    urljoin(url_dgs_corpus, single_eaf) for single_eaf in list_eaf_files
]

In [9]:
#this is what an element of the list looks like:
absolute_paths_eaf_transcripts[0]

'https://www.sign-lang.uni-hamburg.de/meinedgs/eaf/1413451-11105600-11163240.eaf'

<hr style="border:1.5px solid gray"></hr> 

#### Speakers A: 

  - *German sentences:* 
       - <b>TIER_ID="Deutsche_&#xDC;bersetzung_A"</b> 
  - *German gloss sentences:* 
       - <b>TIER_ID="Lexem_Geb&#xE4;rde_r_A"</b> 
       - <b>LINGUISTIC_TYPE_REF="L_tokens_right_left__finer_granularity"</b> 

<hr style="border: 0.5px solid gray"></hr>   

#### Speakers B: 
  - *German sentences:* 
       - <b>TIER_ID="Deutsche_&#xDC;bersetzung_B"</b> 
  - *German gloss sentences:*
       - <b>TIER_ID="Lexem_Geb&#xE4;rde_r_B"</b> 
       - <b>LINGUISTIC_TYPE_REF="L_tokens_right_left__finer_granularity"</b>  

<hr style="border:1.5px solid gray"></hr> 

### <center>Extract the German sentences from speakers A</center>

In [10]:
#three containers used by the extraction for speakers A:
#  transcript_content_a - all tags named "ANNOTATION_VALUE" of a transcript (they include German glosses,
#                         German sentences, English glosses, English sentences, etc.); from this content
#                         *only* the tags with German sentences from speakers A must be extracted
#  time_encodings_a     - the time encodings (tags named "TIME_SLOT") of a transcript
#  german_sentences_a   - the extracted German sentences
transcript_content_a, time_encodings_a, german_sentences_a = [], [], []

In [11]:
%%time 
#take each transcript from the list with all EAF transcripts and read its content to extract 
#in a data frame all german sentences *only* for speakers A 
for transcript in absolute_paths_eaf_transcripts:    
    with urllib.request.urlopen(transcript) as f:
        content = f.read().decode('utf-8') 
        transcript_content_a = BeautifulSoup(content, 'xml').find_all(name="ANNOTATION_VALUE") 
        time_encodings_a = BeautifulSoup(content, 'xml').find_all(name="TIME_SLOT") 
        for value in range(0, len(transcript_content_a)): 
            #if the value of the tags attribute TIER_ID is a german sentence from speaker A, extract it 
            if transcript_content_a[value].parent.parent.parent.attrs['TIER_ID'] == "Deutsche_Übersetzung_A": 
                #this is the time encoding for the sentence (both starting and ending) 
                time = transcript_content_a[value].parent.attrs 
                #this is the starting time of the sentence 
                start = time['TIME_SLOT_REF1'] 
                #this is the ending time of the sentencec 
                end = time['TIME_SLOT_REF2'] 
                #the sentence itself 
                sentence_a = transcript_content_a[value].text 
                #group the sentence + its start + its end 
                sentence_group_a = [sentence_a, start, end] 
                #add the german sentence to the list of german sentences 
                german_sentences_a.append(sentence_group_a)     

Wall time: 9min 38s


#### Save the list with German sentences A using pickle 

In [12]:
#persist the list with pickle; each element is: sentence A + start + end
#this way it can be loaded later without running the extraction again
#NOTE(review): "path" looks like a placeholder for the real output file - confirm before running

with open("path", mode="wb") as pickle_file:
    pickle.dump(german_sentences_a, pickle_file)

#### Create a data frame for the sentences A 

In [13]:
#keep only the first item (the sentence) of every element of german_sentences_a (no timestamps included)
data_a = [sentence_group[0] for sentence_group in german_sentences_a]

In [14]:
#build a data frame with the single column "German Sentence" from the list
df_a = pd.DataFrame({"German Sentence": data_a})

#show the data frame as the output of the cell
df_a

Unnamed: 0,German Sentence
0,Wie mein Leben aussieht?
1,"Na ja, ich bin als Gehörloser aufgewachsen."
2,"Ich habe eher das Gefühl, wenn ich mir vorstel..."
3,Da treffe ich lieber viele Gehörlose und mache...
4,Aber das ist ja klar.
...,...
31537,Nach der Pause.
31538,"Wenn die Schule vorbei war, wie sah es dann au..."
31539,"Aber es war auch möglich, dass du beispielswei..."
31540,Hast du dir das dann von einem Nachbarn geben ...


In [15]:
#save it as a file where each sentence is on a new line
#(no header row and no index column; the utf-8-sig encoding writes a BOM at the start of the file)
#NOTE(review): "path" looks like a placeholder for the real output file - confirm before running
#the literal needs no f-prefix because it has no placeholders
df_a.to_csv("path", encoding="utf-8-sig", index=False, header=False)

<hr style="border:1.5px solid gray"></hr> 
 
### <center>Extract the German sentences from speakers B</center>

In [16]:
#three containers used by the extraction for speakers B:
#  transcript_content_b - all tags named "ANNOTATION_VALUE" of a transcript (they include German glosses,
#                         German sentences, English glosses, English sentences, etc.); from this content
#                         *only* the tags with German sentences from speakers B must be extracted
#  time_encodings_b     - the time encodings (tags named "TIME_SLOT") of a transcript
#  german_sentences_b   - the extracted German sentences
transcript_content_b, time_encodings_b, german_sentences_b = [], [], []

In [17]:
%%time 
#take each transcript from the list with all EAF transcripts and read its content to extract 
#in a data frame all german sentences *only* for speakers B    
for transcript in absolute_paths_eaf_transcripts:    
    with urllib.request.urlopen(transcript) as f:
        content = f.read().decode('utf-8') 
        transcript_content_b = BeautifulSoup(content, 'xml').find_all(name="ANNOTATION_VALUE") 
        time_encodings_b = BeautifulSoup(content, 'xml').find_all(name="TIME_SLOT") 
        for value in range(0, len(transcript_content_b)): 
            #if the value of the tags attribute TIER_ID is a german sentence from speaker B, extract it  
            if transcript_content_b[value].parent.parent.parent.attrs['TIER_ID'] == "Deutsche_Übersetzung_B": 
                #this is the time encoding for the sentence (both starting and ending) 
                time = transcript_content_b[value].parent.attrs 
                #this is the starting time of the sentence 
                start = time['TIME_SLOT_REF1'] 
                #this is the ending time of the sentencec 
                end = time['TIME_SLOT_REF2'] 
                #the sentence itself 
                sentence_b = transcript_content_b[value].text 
                #group the sentence + its start + its end 
                sentence_group_b = [sentence_b, start, end] 
                #add the german sentence to the list of german sentences 
                german_sentences_b.append(sentence_group_b) 

Wall time: 9min 47s


#### Save the list with German sentences B using pickle 

In [18]:
#persist the list with pickle; each element is: sentence B + start + end
#it is used for extracting the gloss sentences later
#NOTE(review): "path" looks like a placeholder for the real output file - confirm before running

with open("path", mode="wb") as pickle_file:
    pickle.dump(german_sentences_b, pickle_file)

#### Create a data frame for the sentences B 

In [20]:
#keep only the first item (the sentence) of every element of german_sentences_b (no timestamps included)
data_b = [sentence_group[0] for sentence_group in german_sentences_b]

In [21]:
#build a data frame with the single column "German Sentence" from the list with sentences B
df_b = pd.DataFrame({"German Sentence": data_b})

#show the data frame as the output of the cell
df_b

Unnamed: 0,German Sentence
0,"Ich war traurig, als ich von Dianas tödlichem ..."
1,"Denn als sie noch am Leben war, hat sie der We..."
2,Sie war die beste und netteste Königin von Eng...
3,Darum war ich schockiert und traurig über den ...
4,Weil es sie nicht mehr geben würde.
...,...
32375,"Hausaufgaben? In Mathematik, Deutsch."
32376,Wenn ich zum Beispiel im Unterricht nicht fert...
32377,"Wenn man die Aufgaben vergaß, gab es eine Strafe."
32378,Wenn der fertig war?


In [22]:
#save it as a file where each sentence is on a new line
#(no header row and no index column; the utf-8-sig encoding writes a BOM at the start of the file)
#NOTE(review): "path" looks like a placeholder for the real output file - confirm before running
#the literal needs no f-prefix because it has no placeholders
df_b.to_csv("path", encoding="utf-8-sig", index=False, header=False)

<hr style="border:1.5px solid gray"></hr> 

#### Concatenating df_a and df_b gives a total of 63922 sentences  

In [23]:
#stack the sentences of speakers A on top of the sentences of speakers B
frames = [df_a, df_b]

#ignore_index=True numbers the rows of the result 0..n-1; without it the row labels of df_b
#would repeat the labels of df_a, so a lookup such as result.loc[0] would return two rows
result = pd.concat(frames, ignore_index=True)

In [24]:
#save the data frame with the concatenated sentences A and sentences B
#(no header row and no index column; the utf-8-sig encoding writes a BOM at the start of the file)
#NOTE(review): "path" looks like a placeholder for the real output file - confirm before running
#the literal needs no f-prefix because it has no placeholders
result.to_csv("path", encoding="utf-8-sig", index=False, header=False)
