In [157]:
import os
import sys

import boto3
import numpy as np
import pandas as pd
import psycopg2
import tqdm
from py2neo import Graph
from tqdm.notebook import tqdm as notebook_tqdm

import db_utils

# 1. Get the songs we want to download

## 1.1 Query the Neo4j Database

In [158]:
# Connect to the local "Spotify" Neo4j database over Bolt.
# The password can be supplied through the NEO4J_PASSWORD environment variable so the
# secret does not have to live in the notebook; the literal is kept only as a
# backward-compatible fallback.
# NOTE(review): rotate this credential and drop the fallback once the variable is set.
graph = Graph(
    bolt=True,
    host="localhost",
    name="Spotify",
    user="neo4j",
    password=os.environ.get("NEO4J_PASSWORD", "qrks"),
)

In [159]:
# Cypher query: select the Track nodes whose `yt_views` is above one billion and
# whose `url` is not null, returning each track's id together with that url.
query = """
        MATCH (t:Track) WHERE t.yt_views > 1000000000 AND t.url is NOT NULL RETURN t.track_id, t.url
"""

In [160]:
# Execute the Cypher query and load its records into a DataFrame whose columns
# are named after the fields returned by the query.
cursor = graph.run(query)
result_fields = cursor.keys()
df = pd.DataFrame.from_records(cursor, columns=result_fields)

In [161]:
# Preview the first rows of the Neo4j query result.
df.head()

Unnamed: 0,t.track_id,t.url
0,2ksOAxtIxY8yElEWw8RhgK,https://www.youtube.com/watch?v=0VR3dfZf9Yg
1,5W83ErFkO3aKAIS1WMi6u0,https://www.youtube.com/watch?v=gFZfwWZV074
2,6SIrNxmmdbv1KUbFBu1PaN,https://www.youtube.com/watch?v=ycV6cnK3SIs
3,2XW4DbS6NddZxRPm5rMCeY,https://www.youtube.com/watch?v=xpVfcZ0ZcFM
4,116H0KvKr2Zl4RPuVBruDO,https://www.youtube.com/watch?v=OSUxrSe5GbI


In [162]:
# (rows, columns) of the Neo4j query result.
df.shape

(191, 2)

## 1.2 Query the Spotify Database (PostgreSQL)

In [195]:
# Open a connection to the PostgreSQL database named "spotify" and get a cursor on it.
# NOTE: this rebinds `cursor`, which held the Neo4j query result in section 1.1.
conn = psycopg2.connect("dbname=spotify")
cursor= conn.cursor()

In [196]:
# SQL query: restrict `master_track` to the single hard-coded track id, join it with
# `track_url` on `track_id`, and return the track id together with its `href`.
query = """
    SELECT T2.track_id, T2.href FROM (SELECT * from master_track where track_id = '6g1NlCpW7fgqDnWbCCDrHl') T1
        INNER JOIN track_url T2 ON T1.track_id = T2.track_id;
"""

In [197]:
# Run the SQL query through the project's `db_utils` helper, discard duplicated
# rows, then close the PostgreSQL connection.
query_result = db_utils.select_table(
    cursor,
    direct_query=query,
    sel_cols=["track_id", "href"],
)
df = query_result.drop_duplicates()
conn.close()

In [198]:
# Preview the first rows of the PostgreSQL query result.
df.head()

Unnamed: 0,track_id,href
0,6g1NlCpW7fgqDnWbCCDrHl,https://www.youtube.com/watch?v=ynGi25x2kMw


# 2. Get the status and check if the song has not been downloaded yet

## 2.1 Get status table

In [188]:
def query_results_to_df(query_results):
    """Convert rows fetched from the `status` table into a DataFrame.

    Args:
        query_results: Sequence of 5-item rows ordered as
            (instance_id, track_id, yt_url, date, stat).

    Returns:
        pandas.DataFrame with the columns ``instance_id``, ``track_id``,
        ``yt_url``, ``date`` and ``stat``. An empty input yields an empty
        DataFrame with those same columns. (Previously ``False`` was returned,
        so attribute access such as ``df_status.track_id`` raised
        AttributeError whenever the table was empty.)
    """
    cols = ["instance_id", "track_id", "yt_url", "date", "stat"]
    # pandas builds a zero-row frame from an empty sequence, so no special case is
    # needed and the return type stays consistent for every input.
    return pd.DataFrame(query_results, columns=cols)

def get_status():
    """Fetch every row of the `status` table from the remote PostgreSQL instance.

    The password is read from the STATUS_DB_PASSWORD environment variable when it
    is set; otherwise the value embedded below is used, as before.

    Returns:
        The result of `query_results_to_df` applied to the fetched rows.

    Raises:
        psycopg2.Error: propagated if connecting or querying fails. The
            connection is closed before the error propagates.
    """
    query_status = """ 
    SELECT * FROM status
    """
    ENDPOINT = "tracksurl.czjs6btlvfgd.eu-west-2.rds.amazonaws.com"
    PORT = "5432"
    USR = "david"
    DBNAME = "postgres"
    # NOTE(review): this is a credential embedded in source (split into pieces);
    # rotate it and drop the fallback once STATUS_DB_PASSWORD is set everywhere.
    PSSWD = ["qrks", "jfut", "iv", "uf", "1"]
    password = os.environ.get("STATUS_DB_PASSWORD") or ''.join(PSSWD)

    conn = psycopg2.connect(host=ENDPOINT, port=PORT, database=DBNAME, user=USR, password=password)
    try:
        cur = conn.cursor()
        cur.execute(query_status)
        query_results = cur.fetchall()
    finally:
        # Close explicitly so a failing query does not leak the connection
        # (psycopg2's `with conn:` only ends the transaction, it does not close).
        conn.close()
    return query_results_to_df(query_results)

In [189]:
# Load the current contents of the remote `status` table.
df_status = get_status()

In [190]:
# Unique track ids present in the status table; these are treated as already downloaded.
already_downloaded = set(df_status["track_id"])

### Remove from df the songs that have already been downloaded

In [200]:
# Keep only the candidate tracks whose id is not in `already_downloaded`.
is_downloaded = df["track_id"].isin(already_downloaded)
df = df[~is_downloaded]

In [201]:
# Tracks that remain after removing the already-downloaded ones.
df

Unnamed: 0,track_id,href
0,6g1NlCpW7fgqDnWbCCDrHl,https://www.youtube.com/watch?v=ynGi25x2kMw


## 2.2 Send jobs to *jobs_download* from df

In [202]:
# Create SQS client (no region or credentials are passed, so boto3 resolves them
# from its default configuration).
sqs = boto3.client('sqs')
# Queue that `send_message_jobs` posts the download jobs to.
URL_q_jobs = "https://sqs.eu-west-2.amazonaws.com/555381533193/jobs_download"
# URL of the "status" queue; not referenced by the cells visible in this notebook.
URL_q_status = "https://sqs.eu-west-2.amazonaws.com/555381533193/status"

In [203]:
def send_message_jobs(track_id, href):
    """Enqueue one download job on the SQS queue at `URL_q_jobs`.

    The message body has the form ``"<track_id>::<href>"``.

    Args:
        track_id: Identifier of the track to download.
        href: URL the track should be downloaded from.

    Returns:
        The response of `sqs.send_message`, so callers can inspect it if they
        want to. Earlier versions discarded it and returned None.
    """
    return sqs.send_message(
        QueueUrl=URL_q_jobs,
        DelaySeconds=0,
        MessageAttributes={},
        MessageBody=f"{track_id}::{href}",
    )


The df must have exactly two columns, in this order: the track id and the URL to send. They are renamed to `track_id` and `url` in the next cell.

In [204]:
# Send messages: one SQS job per remaining track.
# `send_message_jobs` is called with (track_id, url), so the two columns are first
# renamed positionally.
df.columns = ["track_id", "url"]
# `tqdm.tqdm_notebook` is deprecated in favour of `tqdm.notebook.tqdm`; passing
# `total` lets the bar show real progress, since an iterator has no length.
for job in notebook_tqdm(df.itertuples(index=False), total=len(df)):
    send_message_jobs(job.track_id, job.url)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




# 4. List current S3 objects (downloaded audio)

In [166]:
# NOTE(review): boto3 is already imported in the first cell; this re-import is redundant.
import boto3
# S3 resource handle and the collection of every object in the "tfmdavid" bucket.
s3 = boto3.resource('s3')
allFiles = s3.Bucket("tfmdavid").objects.all()

In [167]:
# Track ids of the audio files stored in the bucket: the file name of every key
# ending in ".mp3", with the extension removed.
MP3_SUFFIX = ".mp3"
downloaded_mp3 = []
for file in allFiles:
    key = file.key
    # `endswith` avoids matching ".mp3" in the middle of a key, and taking the last
    # path component also works for keys at the bucket root or nested more than one
    # level deep (the previous `split("/")[1]` raised IndexError for root-level keys).
    if key.endswith(MP3_SUFFIX):
        file_name = key.rsplit("/", 1)[-1]
        downloaded_mp3.append(file_name[:-len(MP3_SUFFIX)])

In [168]:
# Track ids extracted from the ".mp3" keys found in the bucket.
downloaded_mp3

['116H0KvKr2Zl4RPuVBruDO',
 '1m69ELEgE6k5ZWsap40ozt',
 '2XW4DbS6NddZxRPm5rMCeY',
 '2ksOAxtIxY8yElEWw8RhgK',
 '2qG5sZ7Si6sdK74qLxedYM',
 '34gCuhDGsG4bRPIf9bb02f',
 '39N9RPD9MRb5WmoLzNzPeA',
 '3KkXRkHbMCARz0aVfEt68P',
 '3a1lNhkSLSkpJE4MSHpDu9',
 '3hB5DgAiMAQ4DzYbsMq1IT',
 '3pzjHKrQSvXGHQ98dx18HI',
 '4R8BJggjosTswLxtkw8V7P',
 '4wFHfY9IILHLNwakPuFogD',
 '4zWO4gvuFtw6EJZC5FFGlr',
 '58q2HKrzhC3ozto2nDdN4z',
 '5W83ErFkO3aKAIS1WMi6u0',
 '69bp2EbF7Q2rqc5N3ylezZ',
 '6RUKPb4LETWmmr3iAEQktW',
 '6SIrNxmmdbv1KUbFBu1PaN',
 '6Za3190Sbw39BBC77WSS1C',
 '6nmz4imkDcmtwMjocAzFSx',
 '7fwXWKdDNI5IutOMc5OKYw']

# 5. Monitor Status

In [185]:
# Keep only the status rows logged inside an inclusive minute window.
# Timestamps are rendered as zero-padded "YYYY-MM-DD HH:MM" labels, so the string
# comparison performed by `between` follows chronological order.
# For a coarser filter, format with '%Y-%m' (month) or '%Y-%m-%d' (day) and compare
# the label with `==`, e.g. '2020-06' or '2020-07-03'.
minute_label = df_status['date'].dt.strftime('%Y-%m-%d %H:%M')
in_window = minute_label.between("2020-07-03 22:45", "2020-07-03 22:46")
df_hour = df_status[in_window]
                    
                    

In [187]:
# Status rows that fall inside the selected time window.
df_hour

Unnamed: 0,instance_id,track_id,yt_url,date,stat
1,i-0fca39308700d1813,34gCuhDGsG4bRPIf9bb02f,https://www.youtube.com/watch?v=lp-EO5I60KA,2020-07-03 22:45:10,1
2,i-0fca39308700d1813,7fwXWKdDNI5IutOMc5OKYw,https://www.youtube.com/watch?v=wnJ6LuUFpMo,2020-07-03 22:45:17,1
3,i-0fca39308700d1813,6nmz4imkDcmtwMjocAzFSx,https://www.youtube.com/watch?v=uxpDa-c-4Mc,2020-07-03 22:45:27,1
4,i-0fca39308700d1813,5W83ErFkO3aKAIS1WMi6u0,https://www.youtube.com/watch?v=gFZfwWZV074,2020-07-03 22:45:36,1
5,i-0fca39308700d1813,6SIrNxmmdbv1KUbFBu1PaN,https://www.youtube.com/watch?v=ycV6cnK3SIs,2020-07-03 22:45:44,1
6,i-0fca39308700d1813,116H0KvKr2Zl4RPuVBruDO,https://www.youtube.com/watch?v=OSUxrSe5GbI,2020-07-03 22:45:51,1
7,i-0fca39308700d1813,39N9RPD9MRb5WmoLzNzPeA,https://www.youtube.com/watch?v=_I_D_8Z4sJE,2020-07-03 22:46:00,1
8,i-0fca39308700d1813,58q2HKrzhC3ozto2nDdN4z,https://www.youtube.com/watch?v=xTlNMmZKwpA,2020-07-03 22:46:09,1
9,i-0fca39308700d1813,3hB5DgAiMAQ4DzYbsMq1IT,https://www.youtube.com/watch?v=oyEuk8j8imI,2020-07-03 22:46:22,1
10,i-0fca39308700d1813,2qG5sZ7Si6sdK74qLxedYM,https://www.youtube.com/watch?v=p7bfOZek9t4,2020-07-03 22:46:29,1


In [205]:
# Seconds elapsed between consecutive status entries of the same instance.
# The status rows come from `SELECT * FROM status`, which has no ORDER BY, so their
# order is not guaranteed. Sorting by date first makes every diff refer to the
# chronologically previous entry of that instance. The result is aligned back to
# `df_status` by index on assignment, so the row order of `df_status` is untouched.
seconds_between_entries = (
    df_status.sort_values('date')
    .groupby('instance_id')['date']
    .diff()
    / np.timedelta64(1, 's')
)
df_status['diff'] = seconds_between_entries

In [207]:
# Mean number of seconds between consecutive status entries (missing values are skipped).
df_status["diff"].mean()

9.203389830508474

In [208]:
# Population standard deviation (ddof=0, matching what `np.std` computes) of the
# seconds between consecutive status entries.
df_status["diff"].std(ddof=0)

1.8112610179515318