In [6]:
# Database
import pandas as pd
import sys
import os
import time
import numpy as np
import tqdm

# Logging
from v_log import VLogger
import logging

#S3 interaction
from io import StringIO 
import boto3
import subprocess
import re

In [20]:
def load_df_s3(folder_list_path, file_name, S3_BUCKET = 'musicemotions'):
    """
    folder_list_path = ["folder1", "folder2", "folder3"]
    file_name = "1.csv"
    """

    s3 = boto3.client("s3")
    
    # Convert list of folders in S3 to path
    path_S3 = os.path.join(*folder_list_path,file_name) 
    print(path_S3)
    csv_obj = s3.get_object(Bucket = S3_BUCKET,  Key = path_S3)
    body = csv_obj['Body']
    csv_string = body.read().decode('utf-8')
    df = pd.read_csv(StringIO(csv_string))
    return df

def get_audio_size(url,max_audio_size = 6):
    """
    Uses th youtube-dl -F argument to look for audio formats of that video
    It lists the audio formats in different lines
    Some of them terminate with MiB or KiB (size of audio file)
    Some do not end with that... so maybe the only size you get is the high quality video
    This size may be 4 or 5 times larger than the worst audio size, which is not reported in size, so we will trust
    this criteria and download it. Whichever length that the first audio surpases 6MB or the highest quality ones
    surpass 4 times that amount (24MB) they will not be eligible for download
    """
    direct_output = subprocess.check_output(f'youtube-dl -F {url}', shell=True) 
    words_output = str(direct_output).split("\\n")
    counter_audio_record = 0;
    for word in words_output:
        counter_audio_record += 1; #which record has info abut MiB or KiB (maybe the first one has not this data)
        if "MiB" in word:
            try:
                size_audio = re.findall(r"(\d+)(\.\d*)?MiB",word)
                size_audio_MiB = int(size_audio[0][0])
                break
            except:
                size_audio_MiB = 1000;
        if "KiB" in word:
            try:
                size_audio = re.findall(r"(\d+)(\.\d*)?KiB",word)
                size_audio_MiB = int(size_audio[0][0]) / 1000
                break
            except:
                size_audio_MiB = 1000;
    if size_audio_MiB < max_audio_size:
        return True, size_audio_MiB, words_output# the size is less than the maximum, hence, download it!
    # This correction is done for songs that on the first audio record has not a MiB or KiB
    # but when looking at the high quaity audio a register is there and maybe its 10MB so the song with the worst
    #audio will be small enough to be downloaded
    elif size_audio_MiB >= max_audio_size and counter_audio_record > 5:  # if it was the first listed audio size and 
        if size_audio_MiB < max_audio_size*4: # if unless listed in HIGH quality audio does not oduble the max size
            return True, size_audio_MiB, words_output #mark it as downloadable
        else: # if it surpasses the 24MB in the high quality audio, better not to download it just in case
            return False, size_audio_MiB, words_output
    else:
        return False, size_audio_MiB, words_output
    
    
def comando_youtube(track_id, url, path_audio = 'data/'):
    """
    Once the audio file size has been checked, we will download the worst audio to mp3 format
    path_output = data/
    url = youtube url
    """
    comando1 = f'youtube-dl -ci -f "worstaudio" -x --audio-format mp3 '
    path_output = path_audio +  track_id + ".mp3"
    comando2 = f" --output {path_output}"
    return comando1 + url + comando2

def file_to_S3(local_path, S3_path,  S3_BUCKET = 'musicemotions'):
    """
    local_path = os.path.join("..","webscrapping","log","WebScrap.log")
    S3_path = nonmatch-query/log.txt
    """
    if S3_path:
        s3 = boto3.resource('s3')
        resp = s3.Object(S3_BUCKET, S3_path).put(Body=open(local_path, 'rb'))
    else:
        s3 = boto3.resource('s3')
        resp = s3.Object(S3_BUCKET, S3_path).put(Body=open(local_path, 'rb'))
    return resp

# UPLOADING SONGS
def get_destination_folder_mp3(audio_file):
    return os.path.join("fs",audio_file[2],audio_file[3],audio_file[4], audio_file)

def upload_audio_minibatch():
    for audio_file in os.listdir("data"):
        audio_local_path = os.path.join("data", audio_file)
        audio_S3_path = get_destination_folder_mp3(audio_file)
        try:
            resp_audio = file_to_S3(audio_local_path, audio_S3_path,  S3_BUCKET = 'musicemotions')
        os.remove(os.path.join("data", audio_file))

In [3]:
batch_num = 0;

In [4]:
folder_match_results = "match-results"
file_name = f'{batch_num}.csv'
df = load_df_s3([folder_match_results], file_name)
df = df.sort_values("batch_id")
# Seleccionamos solo los registros que tengan batch_id > 0 (los -1 ya se han descargado)
df = df[df["batch_id"] >= 0]

match-results/0.csv


In [5]:
df

Unnamed: 0,track_id,url,batch_id
10232,TRNPEOL12903CBC2B8,https://www.youtube.com/watch?v=yxh95J0Gzx4,0
17721,TRXRACP12903CDB5E3,https://www.youtube.com/watch?v=wLDdF-8bSgw,0
4348,TRFTZYT12903CF81B1,https://www.youtube.com/watch?v=567c-LeINzs,0
15503,TRUQYPR128F92D067F,https://www.youtube.com/watch?v=UcoSZx8DjdM,0
6672,TRIWOBB128F4264F5F,https://www.youtube.com/watch?v=45IDW7D2td0,0
...,...,...,...
6845,TRJBGJU128F4273129,https://www.youtube.com/watch?v=3_jq8G_9uvo,1000
6843,TRJBGAG128F9322109,https://www.youtube.com/watch?v=8HVt9uw1Uq8,1000
6842,TRJBFQS128F1452CCC,https://www.youtube.com/watch?v=P-6rmwpXiG4,1000
6849,TRJBINI128F42442B1,https://www.youtube.com/watch?v=FZ6zvWrG4n4,1000


### Get audio size

In [6]:
df_sample = df.sample(20, random_state = 4).copy()

In [7]:
df_sample.shape

(20, 3)

In [8]:
a = list(); b=list(); c= list();
for ii, row in tqdm.tqdm_notebook(df_sample.iterrows()):
    track_id, url, batch_id = row
    
    # Get the audio sizes available and filter out those that are over the reasonable length
    resp, size_audio, request_response = get_audio_size(url)
    a.append(resp); b.append(size_audio); c.append(request_response)
    
    # Skip the song that exceeds file size max in MiB:
    if not resp:
        continue
        
    # Specify output path
    comando_descargar_audio = comando_youtube(track_id, url)
        
    # Download the audio file
    comando_output = subprocess.check_output(comando_descargar_audio, shell=True) 
    if "100%" in str(comando_output):
        print("Successfully downloaded: ", track_id)
        continue
    else:
        # create a log info event indicating that error
        print("Skip song: ", track_id)
        continue

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Successfully downloaded:  TRYZAGM12903CF1F6E
Successfully downloaded:  TRFSTLZ128F93138A3
Successfully downloaded:  TRBUAPG128F148B5B8
Successfully downloaded:  TRIUSBL128F932E5BC
Successfully downloaded:  TRYHESR128E079942C
Successfully downloaded:  TRZEBEX12903CE7549
Successfully downloaded:  TRQQXIO128E078166F
Successfully downloaded:  TRXVYED128F92F6EAD
Successfully downloaded:  TRXHZUE12903CEF936
Successfully downloaded:  TROTOKJ128F933268E
Successfully downloaded:  TRCVSRS128F428D1BE
Successfully downloaded:  TRLGIQE12903CD63B4
Successfully downloaded:  TRDDCPL128F4271A9F
Successfully downloaded:  TRVNEDO12903D06BBA
Successfully downloaded:  TRDJMVQ12903CB45AB
Successfully downloaded:  TRDLBIB128F4226B45
Successfully downloaded:  TRVNNZZ12903CB395F
Successfully downloaded:  TRUTIEL128F4268E76
Successfully downloaded:  TRFTIIQ12903CDD027
Successfully downloaded:  TRQHCEZ128F4231723



## Upload audio

In [9]:
os.listdir("data")

[]

In [15]:
ss = 'TRFSTLZ128F93138A3.mp3'
get_destination_folder_mp3(ss)

'fs/F/S/T/TRFSTLZ128F93138A3.mp3'

['TRFSTLZ128F93138A3.mp3', 'TRXVYED128F92F6EAD.mp3', 'TRXHZUE12903CEF936.mp3', 'TRYHESR128E079942C.mp3', 'TRLGIQE12903CD63B4.mp3', 'TRDDCPL128F4271A9F.mp3', 'TRVNNZZ12903CB395F.mp3', 'TRUTIEL128F4268E76.mp3', 'TRDJMVQ12903CB45AB.mp3', 'TRFTIIQ12903CDD027.mp3', 'TRIUSBL128F932E5BC.mp3', 'TRQQXIO128E078166F.mp3', 'TRZEBEX12903CE7549.mp3', 'TRDLBIB128F4226B45.mp3', 'TRCVSRS128F428D1BE.mp3', 'TRQHCEZ128F4231723.mp3', 'TRYZAGM12903CF1F6E.mp3', 'TRVNEDO12903D06BBA.mp3', 'TROTOKJ128F933268E.mp3', 'TRBUAPG128F148B5B8.mp3']
data/TRFSTLZ128F93138A3.mp3
fs/F/S/T/TRFSTLZ128F93138A3.mp3
-------------------------
['TRXVYED128F92F6EAD.mp3', 'TRXHZUE12903CEF936.mp3', 'TRYHESR128E079942C.mp3', 'TRLGIQE12903CD63B4.mp3', 'TRDDCPL128F4271A9F.mp3', 'TRVNNZZ12903CB395F.mp3', 'TRUTIEL128F4268E76.mp3', 'TRDJMVQ12903CB45AB.mp3', 'TRFTIIQ12903CDD027.mp3', 'TRIUSBL128F932E5BC.mp3', 'TRQQXIO128E078166F.mp3', 'TRZEBEX12903CE7549.mp3', 'TRDLBIB128F4226B45.mp3', 'TRCVSRS128F428D1BE.mp3', 'TRQHCEZ128F4231723.mp3', 'T

# Dev

In [14]:
resp, size_audio, request_response = get_audio_size("https://www.youtube.com/watch?v=dKnXG-fyrfA")

In [4]:
resp

NameError: name 'resp' is not defined

In [16]:
size_audio

21

In [17]:
request_response

["b'[youtube] dKnXG-fyrfA: Downloading webpage",
 '[info] Available formats for dKnXG-fyrfA:',
 'format code  extension  resolution note',
 '249          webm       audio only tiny   58k , opus @ 50k (48000Hz), 21.56MiB',
 '250          webm       audio only tiny   75k , opus @ 70k (48000Hz), 28.44MiB',
 '140          m4a        audio only tiny  134k , m4a_dash container, mp4a.40.2@128k (44100Hz), 61.51MiB',
 '251          webm       audio only tiny  136k , opus @160k (48000Hz), 56.38MiB',
 '160          mp4        256x144    144p  128k , avc1.4d400c, 30fps, video only, 38.37MiB',
 '278          webm       256x144    144p  143k , webm container, vp9, 30fps, video only, 46.19MiB',
 '242          webm       426x240    240p  228k , vp9, 30fps, video only, 77.65MiB',
 '133          mp4        426x240    240p  384k , avc1.4d4015, 30fps, video only, 83.94MiB',
 '243          webm       640x360    360p  413k , vp9, 30fps, video only, 135.57MiB',
 '134          mp4        640x360    360p  680k

In [59]:
direct_output = subprocess.check_output(f'youtube-dl -F {url}', shell=True) 
words_output = str(direct_output).split("\\n")
counter_audio_record = 0;

In [60]:
words_output

["b'[youtube] 9dgQSVAC7Mg: Downloading webpage",
 '[youtube] 9dgQSVAC7Mg: Downloading MPD manifest',
 '[info] Available formats for 9dgQSVAC7Mg:',
 'format code  extension  resolution note',
 '139          m4a        audio only DASH audio   48k , m4a_dash container, mp4a.40.5@ 48k (22050Hz)',
 '140          m4a        audio only DASH audio  128k , m4a_dash container, mp4a.40.2@128k (44100Hz)',
 '251          webm       audio only DASH audio  137k , webm_dash container, opus @160k (48000Hz)',
 '278          webm       192x144    DASH video   95k , webm_dash container, vp9, 25fps, video only',
 '160          mp4        192x144    DASH video  108k , mp4_dash container, avc1.4d400c, 25fps, video only',
 '242          webm       320x240    DASH video  220k , webm_dash container, vp9, 25fps, video only',
 '133          mp4        320x240    DASH video  247k , mp4_dash container, avc1.4d400d, 25fps, video only',
 '18           mp4        384x288    240p  552k , avc1.42001E, mp4a.40.2@ 96k (44

In [61]:
for word in words_output:
    counter_audio_record += 1; #which record has info abut MiB or KiB (maybe the first one has not this data)
    if "MiB" in word:
        try:
            size_audio = re.findall(r"(\d+)(\.\d*)?MiB",word)
            size_audio_MiB = int(size_audio[0][0])
            break
        except:
            size_audio_MiB = 1000;
    if "KiB" in word:
        try:
            size_audio = re.findall(r"(\d+)(\.\d*)?KiB",word)
            size_audio_MiB = int(size_audio[0][0]) / 1000
            break
        except:
            size_audio_MiB = 1000;

In [62]:
size_audio

[('20', '.24')]

In [64]:
words_output

["b'[youtube] 9dgQSVAC7Mg: Downloading webpage",
 '[youtube] 9dgQSVAC7Mg: Downloading MPD manifest',
 '[info] Available formats for 9dgQSVAC7Mg:',
 'format code  extension  resolution note',
 '139          m4a        audio only DASH audio   48k , m4a_dash container, mp4a.40.5@ 48k (22050Hz)',
 '140          m4a        audio only DASH audio  128k , m4a_dash container, mp4a.40.2@128k (44100Hz)',
 '251          webm       audio only DASH audio  137k , webm_dash container, opus @160k (48000Hz)',
 '278          webm       192x144    DASH video   95k , webm_dash container, vp9, 25fps, video only',
 '160          mp4        192x144    DASH video  108k , mp4_dash container, avc1.4d400c, 25fps, video only',
 '242          webm       320x240    DASH video  220k , webm_dash container, vp9, 25fps, video only',
 '133          mp4        320x240    DASH video  247k , mp4_dash container, avc1.4d400d, 25fps, video only',
 '18           mp4        384x288    240p  552k , avc1.42001E, mp4a.40.2@ 96k (44

In [69]:
max_audio_size = 6
if size_audio_MiB < max_audio_size:
    print(True, size_audio_MiB, words_output)# the size is less than the maximum, hence, download it!
# This correction is done for songs that on the first audio record has not a MiB or KiB
# but when looking at the high quaity audio a register is there and maybe its 10MB so the song with the worst
#audio will be small enough to be downloaded
elif size_audio_MiB >= max_audio_size and counter_audio_record > 5:  # if it was the first listed audio size and 
    if size_audio_MiB < max_audio_size*2: # if unless listed in HIGH quality audio does not oduble the max size
        print(True, size_audio_MiB, words_output) #mark it as downloadable
else:
    print(False, size_audio_MiB, words_output)

In [3]:
resp_audio

NameError: name 'resp_audio' is not defined