In [37]:
from py2neo import Graph
import pandas as pd
import sys
import numpy as np
import boto3
import psycopg2
import db_utils
import tqdm

# 1. Get the songs we want to download

## 1.1 Query the Neo4j Database

In [33]:
graph = Graph(bolt = True, host = "localhost", name = "Spotify", user = "neo4j", password = "qrks")

In [34]:
query = """
        MATCH (t:Track) WHERE t.yt_views > 2000000000 AND t.url is NOT NULL RETURN t.track_id, t.url
"""

In [35]:
cursor = graph.run(query)
df = pd.DataFrame.from_records(cursor, columns=cursor.keys())

In [36]:
df.head()

Unnamed: 0,t.track_id,t.url
0,1KqvRRQd3lfM0pAlkGPrgD,https://www.youtube.com/watch?v=JGwWNGJdvx8
1,0tgVpDi06FyKpA1z0VMD4v,https://www.youtube.com/watch?v=2Vv-BfVoq4g
2,34gCuhDGsG4bRPIf9bb02f,https://www.youtube.com/watch?v=lp-EO5I60KA
3,7fwXWKdDNI5IutOMc5OKYw,https://www.youtube.com/watch?v=wnJ6LuUFpMo
4,69bp2EbF7Q2rqc5N3ylezZ,https://www.youtube.com/watch?v=fRh_vgS2dFE


## 1.2 Query the Spotify Database (PostgreSQL)

In [20]:
conn = psycopg2.connect("dbname=spotify")
cursor= conn.cursor()

In [21]:
query = """
    SELECT T2.track_id, T2.href FROM (SELECT * from master_track where track_id = '7mWFF4gPADjTQjC97CgFVt') T1
        INNER JOIN track_url T2 ON T1.track_id = T2.track_id;
"""

In [22]:
df = db_utils.select_table(cursor, direct_query=query,
                                 sel_cols=["track_id", "href"])
df.drop_duplicates(inplace=True)
conn.close()

In [23]:
df.head()

Unnamed: 0,track_id,href
0,7mWFF4gPADjTQjC97CgFVt,https://www.youtube.com/watch?v=CsSgg_iZHz4


# 2. Send jobs to *jobs_download* from df

In [66]:
# Create SQS client
sqs = boto3.client('sqs')
URL_q_jobs = "https://sqs.eu-west-2.amazonaws.com/555381533193/jobs_download"
URL_q_status = "https://sqs.eu-west-2.amazonaws.com/555381533193/status"

In [71]:
def send_message_jobs(track_id, href):
    """Send message to SQS queue: get_batch.fifo
    Specifying at which batch_num and iteration it has arrived
    """
    response = sqs.send_message(
        QueueUrl=URL_q_jobs,
        DelaySeconds=0,
        MessageAttributes={},
        MessageBody=(f"{track_id}::{href}")    )


The df must have 2 columns: track_id and the url sent

In [129]:
# Send messages
df.columns = ["track_id", "url"]
for i, row in tqdm.tqdm_notebook(df.iterrows()):
    send_message_jobs(row.track_id, row.url)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




# 3. Get the status

In [126]:
def get_status():
    """
    Receive message from SQS queue: job_download
    """
    response = sqs.receive_message(
        QueueUrl=URL_q_status,
        MaxNumberOfMessages=3,
        MessageAttributeNames=[
            'All'
        ],
        VisibilityTimeout=30, # TODO!!! 
        WaitTimeSeconds=0
    )
    return response

def parse_job_message(response):
    """Parses the response as json to output the batch and iter numbers from message"""
    resp_list = response["Messages"][0]["Body"].split("::")
    return resp_list

In [102]:
counter = 0
set_messages = set()
while counter < 10:
    counter += 1
    resp_status = get_status()
    try:
        set_messages.add(parse_job_message(response))
    except KeyError:
        print("Finished!")
        break

In [104]:
counter

10

In [107]:
resp_status = get_status()
resp_status

{'ResponseMetadata': {'RequestId': 'e7d6ffca-0de6-5972-aea3-a8b317665e89',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e7d6ffca-0de6-5972-aea3-a8b317665e89',
   'date': 'Fri, 03 Jul 2020 11:36:32 GMT',
   'content-type': 'text/xml',
   'content-length': '240'},
  'RetryAttempts': 0}}

In [127]:
status_response = get_status()
vals = parse_job_message(status_response)

In [128]:
vals

['i-08c75636009ae4a27',
 '1',
 '1KqvRRQd3lfM0pAlkGPrgD',
 'https://www.youtube.com/watch?v=JGwWNGJdvx8',
 '2020-07-03 09:58:06']

In [57]:

values.append(vals)

In [58]:
df_status = pd.DataFrame(values, columns = ["instance_id", "status", "track_id", "yt_url", "date"])

In [59]:
df_status

Unnamed: 0,instance_id,status,track_id,yt_url,date
0,i-08c75636009ae4a27,1,1KqvRRQd3lfM0pAlkGPrgD,https://www.youtube.com/watch?v=JGwWNGJdvx8,2020-07-03 09:58:06


In [48]:
status_response["Messages"][0]["Body"].split()

'i-08c75636009ae4a27::1::1KqvRRQd3lfM0pAlkGPrgD::https://www.youtube.com/watch?v=JGwWNGJdvx8::2020-07-03 09:58:06'

In [43]:
st

{'Messages': [{'MessageId': '57a6cc2e-ad6a-488b-8afa-9cd91a030346',
   'ReceiptHandle': 'AQEByniGB3NJzvb/Fb7EnsjfgP1bdDAZgxYzrzagJNu+uGJYITnbPKw7aCb9MUoL4iuJOFhsB2ricIjszEMIbLSjZS/vHR4wcybd0XXso1EujD+r65/0k9g7d40hkNQlFiTaRFETKcZNTjQZqISa1gkAPFOfktDcMTt9PHpEKa/6Ar+QdMH27/ffKpcYoyoz9jlEiVohLAgpnuvijyHmPG/91bVzCq9/LyAqyVyVAoIMlJ6cwzgEwn6GVDzNmXVEnNOMOk2z2C4Y6y5g5ejNDXVMWw4sztlWaG7V1XyiLa8vk8JmngKMVyghjeaAxx9r5XYKL5ZIlT5Rhx/5HLwuBYP3fxbBdz4gEZYuHxcYDeU8vgzNI8kZ1xxsIlO8TKe4sY2M',
   'MD5OfBody': 'fc07e76eb16684d51bd68241c96daa19',
   'Body': 'i-08c75636009ae4a27::1::1KqvRRQd3lfM0pAlkGPrgD::https://www.youtube.com/watch?v=JGwWNGJdvx8::2020-07-03 09:58:06'}],
 'ResponseMetadata': {'RequestId': 'a37f39be-558b-57dc-9386-6ec751d9f659',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a37f39be-558b-57dc-9386-6ec751d9f659',
   'date': 'Fri, 03 Jul 2020 10:01:54 GMT',
   'content-type': 'text/xml',
   'content-length': '939'},
  'RetryAttempts': 0}}

In [None]:
all_messages=[]
rs=get_status(10)
while len(rs)>0:
    all_messages.extend(rs)
    rs=q.get_messages(10)

In [76]:
def get_job():
    """
    Receive message from SQS queue: job_download.fifo
    """
    response = sqs.receive_message(
        QueueUrl=URL_q_jobs,
        MaxNumberOfMessages=1,
        MessageAttributeNames=[
            'All'
        ],
        VisibilityTimeout=30, # TODO!!! 
        WaitTimeSeconds=0
    )
    return response

In [77]:
response = get_job()

In [78]:
response

{'Messages': [{'MessageId': '0c2caaf7-b04f-45db-b7b3-ebe75ef9e972',
   'ReceiptHandle': 'AQEBGp8+72PD/bA3aRQqIXlhCXkAkSkbdZEw86kvv7f9P/ajK9YToR8icN4tTX8wsaWus8Zcg0qag9YHzgv6FjumXaFp3MvOKX6UEit9hRQXD45UWgbPVMLO9iZ6MK4O3Perwa7xWGo2n4eDOvIm4h1wv+QAjyZmiyuJmZC1UZ92JTtyPWlF9t9D1fwASOLM36KgBW3nqx1uzDRkErOGrdOcLU8rdVai1FiO05r1KVEeBwf3ZAn4fdlfgg9wveKyT/AllgPidChcX7LHWttuwwzIkRdPJqM6Jz3o0YHitaU0iKlvO9ThmK59VXpYIZbM3dugJbdhE5lNIqskvVqNjgDh4FDWzzQEEQZqXn52aVyKnQP6Ay/9Quo7CZwfRVtHhaTgDXTkSz6ovZy29FLUKP8Bgg==',
   'MD5OfBody': '6cd5fe3ea6a491873bc1a17c1097f424',
   'Body': '1WniHvhq9zTkny0WvGXX8o::https://www.youtube.com/watch?v=6Mgqbai3fKo'}],
 'ResponseMetadata': {'RequestId': '967f77a6-7816-51f6-9f92-20df75f85ed7',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '967f77a6-7816-51f6-9f92-20df75f85ed7',
   'date': 'Fri, 03 Jul 2020 10:42:54 GMT',
   'content-type': 'text/xml',
   'content-length': '918'},
  'RetryAttempts': 0}}

# 4. List current S3 objects (downloaded audio)

In [120]:
import boto3
s3 = boto3.resource('s3')
allFiles = s3.Bucket("tfmdavid").objects.all()

In [124]:
downloaded_mp3 = list()
for file in allFiles:
    
    ff = file.key
    if ".mp3" in ff:
        downloaded_mp3.append(ff.split("/")[1].split(".")[0])

In [125]:
downloaded_mp3

['0ENSn4fwAbCGeFGVUbXEU3',
 '0azC730Exh71aQlOt9Zj3y',
 '0bs6Z1COIcTukOIql6CRGF',
 '0tgVpDi06FyKpA1z0VMD4v',
 '1KqvRRQd3lfM0pAlkGPrgD',
 '1QzFhzIOW7CyRJLpmq5CM0',
 '1WniHvhq9zTkny0WvGXX8o',
 '1brwdYwjltrJo7WHpIvbYt',
 '1q676iYDR3GeJtOHdyggIU',
 '1u8c2t2Cy7UBoG4ArRcF5g',
 '1zsG4eaZmkA1dvjDDsAGLK',
 '2QBk6HiFiv2exuwGaupOoE',
 '2ekn2ttSfGqwhhate0LSR0',
 '32OlwWuMpZ6b0aN2RZOeMS',
 '34gCuhDGsG4bRPIf9bb02f',
 '3CRDbSIZ4r5MsZ0YwxuEkn',
 '3MPvgWrFplhNi3QncC5eUS',
 '3V8UKqhEK5zBkBb6d6ub8i',
 '3pzjHKrQSvXGHQ98dx18HI',
 '4VrWlk8IQxevMvERoX08iC',
 '4X5V0XWXYbjlq4yBuPiYfA',
 '4tCtwWceOPWzenK2HAIJSb',
 '58ENoZ4jRR0tnHPagOwYqk',
 '5eWgDlp3k6Tb5RD8690s6I',
 '5jrdCoLpJSvHHorevXBATy',
 '5ygDXis42ncn6kYG14lEVG',
 '69bp2EbF7Q2rqc5N3ylezZ',
 '6F5c58TMEs1byxUstkzVeM',
 '6Za3190Sbw39BBC77WSS1C',
 '6epn3r7S14KUqlReYr77hA',
 '6habFhsOp2NvshLv26DqMb',
 '7BKLCZ1jbUBVqRi2FVlTVw',
 '7DFNE7NO0raLIUbgzY2rzm',
 '7fa9MBXhVfQ8P8Df9OEbD8',
 '7fwXWKdDNI5IutOMc5OKYw',
 '7n9Q6bXSjm74uCtajkddPt',
 '7wqSzGeodspE3V6RBD5W8L',
 