# Information Collection

    


In [47]:
import pandas as pd
from tqdm import tqdm
from abc import ABC, abstractmethod
import os
import wave

class DataExtractorStrategy:
    def __init__(self, target):
        self.target = target
        
    @abstractmethod
    def extractData(self):
        pass

class YoutubeExtractor(DataExtractorStrategy):
    def __init__(self, url, start, end):
        super(YoutubeExtractor, self).__init__(url)
        self.start = start
        self.end = end

    def extractData(self, name):
        print('-ss ' + self.start + ' -t ' + self.end)
        if len(self.target) > 1:
            vid_name = "'" + OUTPUT_DIR + name + '_' + '%(id)s.%(ext)s' + "'"
            start_end = "'-ss " + self.start + " -t " + self.end + " -ar 8000'"
            print(start_end)
            
            command = "youtube-dl -x --audio-format wav --output " + vid_name + " --postprocessor-args " + start_end + " \"" + self.target + "\""
            os.system(command)
            print(command)
        else:
            print("Empty url found!")
            

class DataExtractor:
    
    def __init__(self, csvFilePath):
        self.filePath=csvFilePath
    
    def readData(self):
        print("Reading data: ", self.filePath)
        self.df = pd.read_csv(self.filePath)
    
    def prepareData(self):
        print("Preparing data")
        for index, row in tqdm(self.df.iterrows()):
            print(row)
            url = row["url"]
            strategy = row["strategy"]
            start = row["start"]
            end = row["end"]
            label = row["label"]
            
            self.getData(url, start, end, strategy, label)
            
    def getData(self, url, start, end ,strategy, label):
        if(strategy == 'youtube'):
            dataExtractor = YoutubeExtractor(url, start, end)
            dataExtractor.extractData(name=label)
        else:
            raise Exception("Strategy <", strategy, "> is not supported! (at url: " + url + ")")


In [48]:
OUTPUT_DIR = "audio_data/"
de = DataExtractor('../data.csv')
de.readData()
de.prepareData()

0it [00:00, ?it/s]

Reading data:  ../data.csv
Preparing data
url         https://www.youtube.com/watch?v=4APIf4O6Inc
strategy                                        youtube
start                                       00:00:08.00
end                                         00:01:07.00
label                                      angelamerkel
Name: 0, dtype: object
-ss 00:00:08.00 -t 00:01:07.00
'-ss 00:00:08.00 -t 00:01:07.00 -ar 8000'


1it [00:07,  7.89s/it]

youtube-dl -x --audio-format wav --output 'audio_data/angelamerkel_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:08.00 -t 00:01:07.00 -ar 8000' "https://www.youtube.com/watch?v=4APIf4O6Inc"
url         https://www.youtube.com/watch?v=v63SKddbG2w
strategy                                        youtube
start                                       00:11:00.00
end                                         00:11:14.00
label                                      angelamerkel
Name: 1, dtype: object
-ss 00:11:00.00 -t 00:11:14.00
'-ss 00:11:00.00 -t 00:11:14.00 -ar 8000'


2it [00:31, 12.70s/it]

youtube-dl -x --audio-format wav --output 'audio_data/angelamerkel_%(id)s.%(ext)s' --postprocessor-args '-ss 00:11:00.00 -t 00:11:14.00 -ar 8000' "https://www.youtube.com/watch?v=v63SKddbG2w"
url         https://www.youtube.com/watch?v=ZhIurnatBJw
strategy                                        youtube
start                                       00:02:02.00
end                                         00:02:40.00
label                                      angelamerkel
Name: 2, dtype: object
-ss 00:02:02.00 -t 00:02:40.00
'-ss 00:02:02.00 -t 00:02:40.00 -ar 8000'


3it [01:16, 22.36s/it]

youtube-dl -x --audio-format wav --output 'audio_data/angelamerkel_%(id)s.%(ext)s' --postprocessor-args '-ss 00:02:02.00 -t 00:02:40.00 -ar 8000' "https://www.youtube.com/watch?v=ZhIurnatBJw"
url         https://www.youtube.com/watch?v=ltIDG1xzSWc
strategy                                        youtube
start                                       00:00:15.00
end                                         00:00:35.00
label                                      angelamerkel
Name: 3, dtype: object
-ss 00:00:15.00 -t 00:00:35.00
'-ss 00:00:15.00 -t 00:00:35.00 -ar 8000'


4it [01:29, 19.57s/it]

youtube-dl -x --audio-format wav --output 'audio_data/angelamerkel_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:15.00 -t 00:00:35.00 -ar 8000' "https://www.youtube.com/watch?v=ltIDG1xzSWc"
url         https://www.youtube.com/watch?v=SD939Q6LxRg
strategy                                        youtube
start                                       00:00:12.00
end                                         00:00:25.00
label                                      angelamerkel
Name: 4, dtype: object
-ss 00:00:12.00 -t 00:00:25.00
'-ss 00:00:12.00 -t 00:00:25.00 -ar 8000'


5it [01:37, 15.94s/it]

youtube-dl -x --audio-format wav --output 'audio_data/angelamerkel_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:12.00 -t 00:00:25.00 -ar 8000' "https://www.youtube.com/watch?v=SD939Q6LxRg"
url         https://www.youtube.com/watch?v=Wf9bvwPlzgs
strategy                                        youtube
start                                       00:00:51.00
end                                         00:02:25.00
label                                      angelamerkel
Name: 5, dtype: object
-ss 00:00:51.00 -t 00:02:25.00
'-ss 00:00:51.00 -t 00:02:25.00 -ar 8000'


6it [02:02, 18.74s/it]

youtube-dl -x --audio-format wav --output 'audio_data/angelamerkel_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:51.00 -t 00:02:25.00 -ar 8000' "https://www.youtube.com/watch?v=Wf9bvwPlzgs"
url         https://www.youtube.com/watch?v=D6I0jH4uT8I
strategy                                        youtube
start                                       00:00:05.00
end                                         00:00:42.00
label                                       aliceweidel
Name: 6, dtype: object
-ss 00:00:05.00 -t 00:00:42.00
'-ss 00:00:05.00 -t 00:00:42.00 -ar 8000'


7it [02:12, 16.12s/it]

youtube-dl -x --audio-format wav --output 'audio_data/aliceweidel_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:05.00 -t 00:00:42.00 -ar 8000' "https://www.youtube.com/watch?v=D6I0jH4uT8I"
url         https://www.youtube.com/watch?v=So_3I-wQVpc
strategy                                        youtube
start                                       00:00:23.00
end                                         00:00:35.00
label                                       aliceweidel
Name: 7, dtype: object
-ss 00:00:23.00 -t 00:00:35.00
'-ss 00:00:23.00 -t 00:00:35.00 -ar 8000'


8it [02:22, 14.35s/it]

youtube-dl -x --audio-format wav --output 'audio_data/aliceweidel_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:23.00 -t 00:00:35.00 -ar 8000' "https://www.youtube.com/watch?v=So_3I-wQVpc"
url         https://www.youtube.com/watch?v=w-m5O_upKkk
strategy                                        youtube
start                                       00:00:16.00
end                                         00:00:24.00
label                                       aliceweidel
Name: 8, dtype: object
-ss 00:00:16.00 -t 00:00:24.00
'-ss 00:00:16.00 -t 00:00:24.00 -ar 8000'


9it [02:29, 11.97s/it]

youtube-dl -x --audio-format wav --output 'audio_data/aliceweidel_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:16.00 -t 00:00:24.00 -ar 8000' "https://www.youtube.com/watch?v=w-m5O_upKkk"
url         https://www.youtube.com/watch?v=AO8OShVa8u4
strategy                                        youtube
start                                       00:00:13.00
end                                         00:00:30.00
label                                       aliceweidel
Name: 9, dtype: object
-ss 00:00:13.00 -t 00:00:30.00
'-ss 00:00:13.00 -t 00:00:30.00 -ar 8000'


10it [02:33,  9.79s/it]

youtube-dl -x --audio-format wav --output 'audio_data/aliceweidel_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:13.00 -t 00:00:30.00 -ar 8000' "https://www.youtube.com/watch?v=AO8OShVa8u4"
url         https://www.youtube.com/watch?v=fNihW_QE168
strategy                                        youtube
start                                       00:00:52.00
end                                         00:01:15.00
label                                      karambadiaby
Name: 10, dtype: object
-ss 00:00:52.00 -t 00:01:15.00
'-ss 00:00:52.00 -t 00:01:15.00 -ar 8000'


11it [02:39,  8.53s/it]

youtube-dl -x --audio-format wav --output 'audio_data/karambadiaby_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:52.00 -t 00:01:15.00 -ar 8000' "https://www.youtube.com/watch?v=fNihW_QE168"
url         https://www.youtube.com/watch?v=Wj7-r2O-bzc
strategy                                        youtube
start                                       00:00:07.00
end                                         00:01:00.00
label                                      karambadiaby
Name: 11, dtype: object
-ss 00:00:07.00 -t 00:01:00.00
'-ss 00:00:07.00 -t 00:01:00.00 -ar 8000'


12it [02:45,  7.88s/it]

youtube-dl -x --audio-format wav --output 'audio_data/karambadiaby_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:07.00 -t 00:01:00.00 -ar 8000' "https://www.youtube.com/watch?v=Wj7-r2O-bzc"
url         https://www.youtube.com/watch?v=F6vjLGZJA4w
strategy                                        youtube
start                                       00:00:43.00
end                                         00:00:56.00
label                                      karambadiaby
Name: 12, dtype: object
-ss 00:00:43.00 -t 00:00:56.00
'-ss 00:00:43.00 -t 00:00:56.00 -ar 8000'


13it [02:51,  7.15s/it]

youtube-dl -x --audio-format wav --output 'audio_data/karambadiaby_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:43.00 -t 00:00:56.00 -ar 8000' "https://www.youtube.com/watch?v=F6vjLGZJA4w"
url         https://www.youtube.com/watch?v=ZtLEcdcd58U
strategy                                        youtube
start                                       00:00:05.00
end                                         00:00:27.00
label                                      karambadiaby
Name: 13, dtype: object
-ss 00:00:05.00 -t 00:00:27.00
'-ss 00:00:05.00 -t 00:00:27.00 -ar 8000'


14it [02:55,  6.39s/it]

youtube-dl -x --audio-format wav --output 'audio_data/karambadiaby_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:05.00 -t 00:00:27.00 -ar 8000' "https://www.youtube.com/watch?v=ZtLEcdcd58U"
url         https://www.youtube.com/watch?v=fg7dWahOv-w
strategy                                        youtube
start                                       00:00:01.00
end                                         00:00:50.00
label                                      habeckrobert
Name: 14, dtype: object
-ss 00:00:01.00 -t 00:00:50.00
'-ss 00:00:01.00 -t 00:00:50.00 -ar 8000'


15it [03:00,  5.99s/it]

youtube-dl -x --audio-format wav --output 'audio_data/habeckrobert_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:01.00 -t 00:00:50.00 -ar 8000' "https://www.youtube.com/watch?v=fg7dWahOv-w"
url         https://www.youtube.com/watch?v=HEA3n9acVMA
strategy                                        youtube
start                                       00:00:03.00
end                                         00:00:45.00
label                                      habeckrobert
Name: 15, dtype: object
-ss 00:00:03.00 -t 00:00:45.00
'-ss 00:00:03.00 -t 00:00:45.00 -ar 8000'


16it [03:20, 10.06s/it]

youtube-dl -x --audio-format wav --output 'audio_data/habeckrobert_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:03.00 -t 00:00:45.00 -ar 8000' "https://www.youtube.com/watch?v=HEA3n9acVMA"
url         https://www.youtube.com/watch?v=HEA3n9acVMA
strategy                                        youtube
start                                       00:00:03.00
end                                         00:00:45.00
label                                      habeckrobert
Name: 16, dtype: object
-ss 00:00:03.00 -t 00:00:45.00
'-ss 00:00:03.00 -t 00:00:45.00 -ar 8000'


17it [03:40, 12.93s/it]

youtube-dl -x --audio-format wav --output 'audio_data/habeckrobert_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:03.00 -t 00:00:45.00 -ar 8000' "https://www.youtube.com/watch?v=HEA3n9acVMA"
url         https://www.youtube.com/watch?v=1EauH_lWqEY
strategy                                        youtube
start                                       00:00:02.00
end                                         00:00:40.00
label                                      habeckrobert
Name: 17, dtype: object
-ss 00:00:02.00 -t 00:00:40.00
'-ss 00:00:02.00 -t 00:00:40.00 -ar 8000'


18it [03:59, 13.30s/it]

youtube-dl -x --audio-format wav --output 'audio_data/habeckrobert_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:02.00 -t 00:00:40.00 -ar 8000' "https://www.youtube.com/watch?v=1EauH_lWqEY"





# Download Test Audio

In [49]:
OUTPUT_DIR = "test_audio_data/"
tde = DataExtractor('../test_data.csv')
tde.readData()
tde.prepareData()

0it [00:00, ?it/s]

Reading data:  ../test_data.csv
Preparing data
url         https://www.youtube.com/watch?v=S3C5H-2SqYU
strategy                                        youtube
start                                       00:00:15.00
end                                         00:00:40.00
label                                      angelamerkel
Name: 0, dtype: object
-ss 00:00:15.00 -t 00:00:40.00
'-ss 00:00:15.00 -t 00:00:40.00 -ar 8000'


1it [00:06,  6.66s/it]

youtube-dl -x --audio-format wav --output 'test_audio_data/angelamerkel_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:15.00 -t 00:00:40.00 -ar 8000' "https://www.youtube.com/watch?v=S3C5H-2SqYU"
url         https://www.youtube.com/watch?v=u0KAGFJ76QM
strategy                                        youtube
start                                       00:00:01.00
end                                         00:00:30.00
label                                       aliceweidel
Name: 1, dtype: object
-ss 00:00:01.00 -t 00:00:30.00
'-ss 00:00:01.00 -t 00:00:30.00 -ar 8000'


2it [00:13,  6.76s/it]

youtube-dl -x --audio-format wav --output 'test_audio_data/aliceweidel_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:01.00 -t 00:00:30.00 -ar 8000' "https://www.youtube.com/watch?v=u0KAGFJ76QM"
url         https://www.youtube.com/watch?v=ZtLEcdcd58U
strategy                                        youtube
start                                       00:01:10.00
end                                         00:01:33.00
label                                      karambadiaby
Name: 2, dtype: object
-ss 00:01:10.00 -t 00:01:33.00
'-ss 00:01:10.00 -t 00:01:33.00 -ar 8000'


3it [00:18,  6.08s/it]

youtube-dl -x --audio-format wav --output 'test_audio_data/karambadiaby_%(id)s.%(ext)s' --postprocessor-args '-ss 00:01:10.00 -t 00:01:33.00 -ar 8000' "https://www.youtube.com/watch?v=ZtLEcdcd58U"
url         https://www.youtube.com/watch?v=jte-Ch6woAs
strategy                                        youtube
start                                       00:00:10.00
end                                         00:00:40.00
label                                      habeckrobert
Name: 3, dtype: object
-ss 00:00:10.00 -t 00:00:40.00
'-ss 00:00:10.00 -t 00:00:40.00 -ar 8000'


4it [00:25,  6.43s/it]

youtube-dl -x --audio-format wav --output 'test_audio_data/habeckrobert_%(id)s.%(ext)s' --postprocessor-args '-ss 00:00:10.00 -t 00:00:40.00 -ar 8000' "https://www.youtube.com/watch?v=jte-Ch6woAs"



