In [1]:
import getpass
import os
import sys

import boto3
import matplotlib
import numpy as np
import pandas as pd
import psycopg2
import tqdm
import tqdm.notebook
from py2neo import Graph

import db_utils

# 0. List all the songs downloaded in S3

In [3]:
# List the track id of every .mp3 object stored under audio/ in the S3 bucket.
# Keys are laid out as "audio/<track_id>.mp3" (the same layout get_size_check builds),
# so the id is recovered by stripping that exact prefix and suffix.
AUDIO_PREFIX = "audio/"
MP3_SUFFIX = ".mp3"

s3 = boto3.resource('s3')
# The trailing slash keeps sibling prefixes (e.g. "audio_old/") out of the listing
allFiles = s3.Bucket("tfmdavid").objects.filter(Prefix=AUDIO_PREFIX)

downloaded_mp3 = [
    obj.key[len(AUDIO_PREFIX):-len(MP3_SUFFIX)]
    for obj in allFiles
    # Suffix test: a key that merely contains ".mp3" somewhere is not an audio file
    if obj.key.endswith(MP3_SUFFIX)
]

In [4]:
# Convert to a set so that a track id listed more than once is only counted once
set_songs = set(downloaded_mp3)

In [5]:
# Number of distinct track ids
len(set_songs)

56334

In [8]:
# Total number of listed .mp3 entries; if it equals the distinct count, no track id is repeated
len(downloaded_mp3)

56334

## Check the audio file size

In [35]:
def get_size_check(track_id, max_size_mb=10, bucket="tfmdavid", s3_resource=None):
    """
    Check whether a track's original audio file in S3 is small enough to process.

    Parameters
    ----------
    track_id : str
        Track identifier; the object key is built as "audio/<track_id>.mp3".
    max_size_mb : int or float, optional
        Largest accepted size in megabytes (1 MB = 1,000,000 bytes). Defaults to 10.
    bucket : str, optional
        Name of the S3 bucket that holds the audio. Defaults to "tfmdavid".
    s3_resource : optional
        An existing boto3 S3 resource to reuse. When omitted a new one is created,
        so pass one in when checking many tracks in a loop.

    Returns
    -------
    bool
        True when the file is at most `max_size_mb` megabytes, False when it is larger.

    Any error raised by boto3 while reading the object propagates unchanged.
    """
    if s3_resource is None:
        s3_resource = boto3.resource('s3')
    key_mp3 = "audio/" + track_id + ".mp3"
    file_size = s3_resource.Object(bucket, key_mp3).content_length  # size in bytes
    size_megas = file_size / 1000000
    return size_megas <= max_size_mb

# 1. Send jobs to *jobs_specto*

In [6]:
# Create SQS client
sqs = boto3.client('sqs')
# Queue that receives one message per track to be processed (used by send_message_jobs)
URL_q_jobs = "https://sqs.eu-west-2.amazonaws.com/555381533193/jobs_specto"
# Status queue; not referenced by the cells shown here — presumably written by the workers, TODO confirm
URL_q_status = "https://sqs.eu-west-2.amazonaws.com/555381533193/status_specto"

In [7]:
def send_message_jobs(track_id):
    """Send one job message to the jobs SQS queue.

    The message body is the track id whose spectrograms should be extracted.
    Uses the module-level `sqs` client and `URL_q_jobs` queue URL.

    Parameters
    ----------
    track_id : str
        Identifier of the track to process.

    Returns
    -------
    The response returned by `sqs.send_message`, so callers can inspect it;
    callers that do not need it can ignore the return value.
    """
    return sqs.send_message(
        QueueUrl=URL_q_jobs,
        DelaySeconds=0,
        MessageAttributes={},
        MessageBody=(f"{track_id}"),
    )


In [11]:
# Queue one spectrogram-extraction job per downloaded track.
# tqdm.notebook.tqdm replaces the deprecated tqdm.tqdm_notebook alias.
for track_id in tqdm.notebook.tqdm(downloaded_mp3):
    send_message_jobs(track_id)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=56334.0), HTML(value='')))




# 2. List all songs that already have spectrograms in S3

In [14]:
# List the track id of every spectrogram image (.jpg) stored under the "spec" prefix of the S3 bucket
s3 = boto3.resource('s3')
allFiles = s3.Bucket("tfmdavid").objects.filter(Prefix='spec')

specto = list()
# tqdm.notebook.tqdm replaces the deprecated tqdm.tqdm_notebook alias
for file in tqdm.notebook.tqdm(allFiles):

    ff = file.key
    if ".jpg" in ff:
        # Keep the file name up to the first "__"; presumably keys look like
        # "<folder>/<track_id>__<window>.jpg", giving one entry per image — TODO confirm
        specto.append(ff.split("/")[1].split(".")[0].split("__")[0])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




# 3. Query status_specto results

In [37]:
def query_results_to_df(query_results):
    """Turn the rows fetched from status_specto into a DataFrame.

    Parameters
    ----------
    query_results : sequence of row tuples
        Rows as returned by the cursor; each row must have nine fields.

    Returns
    -------
    pandas.DataFrame with columns instance_id, stat, track_id, win, ini, fin,
    rows, cols and date, or False when `query_results` is empty.
    """
    column_names = [
        "instance_id", "stat", "track_id",
        "win", "ini", "fin",
        "rows", "cols", "date",
    ]
    if len(query_results) > 0:
        return pd.DataFrame(query_results, columns=column_names)
    # Callers get False (not an empty frame) when there is nothing to convert
    return False

In [38]:
# Load every row of the status_specto table into df_status.
query_status = """ 
SELECT * FROM status_specto
"""
ENDPOINT="tracksurl.czjs6btlvfgd.eu-west-2.rds.amazonaws.com"
PORT="5432"
USR="david"
REGION="eu-west-2"
DBNAME="postgres"
# SECURITY: the password must not live in the notebook. It is read from the standard
# PGPASSWORD environment variable, falling back to an interactive prompt.
# The value that used to be hard-coded here should be treated as leaked and rotated.
# PSSWD is now a plain string; ''.join(PSSWD) still returns it unchanged.
PSSWD = os.environ.get("PGPASSWORD") or getpass.getpass(f"Password for {USR}@{ENDPOINT}: ")

conn = psycopg2.connect(host=ENDPOINT, port=PORT, database=DBNAME, user=USR, password=PSSWD)
try:
    # The cursor context manager closes the cursor even if the query fails
    with conn.cursor() as cur:
        cur.execute(query_status)
        query_results = cur.fetchall()
finally:
    # Always release the connection, including when execute/fetchall raises
    conn.close()

# NOTE: SELECT * relies on the table's column order matching the names
# hard-coded in query_results_to_df.
df_status = query_results_to_df(query_results)

### Basic statistics

In [42]:
# Number of distinct track ids that appear in the status table
len(set(df_status.track_id))

56287

In [63]:
# Square of the unique-track count above (hard-coded); presumably the number of
# ordered track pairs for a later all-vs-all comparison — TODO confirm
56287 * 56287

3168226369

In [50]:
# Everything ended in the same state: only one distinct status value is expected
# (NOTE(review): this counts values, it does not check that the value means success)
len(set(df_status.stat))

1

In [51]:
# Count the distinct tracks handled by each instance, largest first
df_track_per_instance = (
    df_status
    .groupby("instance_id")["track_id"]
    .nunique()
    .to_frame()
)
df_track_per_instance.sort_values(by="track_id", ascending=False)

Unnamed: 0_level_0,track_id
instance_id,Unnamed: 1_level_1
i-0d4ae1de430e5f358,1740
i-0a99a59641905bf90,1718
i-02128f60a7d245ef0,1713
i-0163a2365e18dc4dc,1713
i-08bc790c85f973018,1704
...,...
i-082f7712582fdd9ff,132
i-006a14ef33eaa3de2,52
i-0c3a34d80a66aebf2,41
i-016142111f658b4fa,37


In [53]:
# Count the non-null "win" entries recorded for each track
# (the mean and spread of these counts are computed in the following cells)
win_counts = df_status.groupby("track_id")["win"].count()
df_win_per_track = win_counts.to_frame()

In [55]:
# Mean number of windows per track
np.mean(df_win_per_track["win"])

11.44335281681383

In [56]:
# Spread of the windows-per-track counts; np.std defaults to ddof=0
# (population standard deviation), unlike Series.std() which defaults to ddof=1
np.std(df_win_per_track["win"])

15.883272448566299

In [61]:
# Inspect the image dimensions.
# Keep only the rows whose "cols" value is 937 or 938.
df_filt = df_status.loc[df_status["cols"].isin({937, 938})]

# Number of rows for every (rows, cols) combination
df_img_dim = df_filt.groupby(["rows", "cols"])["track_id"].count().to_frame()

In [62]:
# Display the row count for each (rows, cols) image shape
df_img_dim

Unnamed: 0_level_0,Unnamed: 1_level_0,track_id
rows,cols,Unnamed: 2_level_1
256,937,615909
256,938,28138
