# Migrate Clean.db to S3

One part of the project involves working with S3: everything that is currently stored in the SQLite database is moved to an S3 bucket, since that is the most economical option for what we want to do. This notebook takes `clean.db` and creates a set of files that replicates the database in an S3 bucket as a file system:

In [1]:
# Standard library
import os
import re #for avoiding looking at titles with starting parenthesis
import sys
import time
from io import StringIO

# Third-party
import boto3
import numpy as np
import pandas as pd
import sqlalchemy as db  # Database
import tqdm
import tqdm.notebook  # loads the submodule so tqdm.notebook.tqdm can be used

In [2]:
# S3 key prefix ("folder") used by the upload, listing and download cells below.
folder_path = "nonmatch-query"

## 1. Database connection 

In [3]:
## 1.1 Open a connection to the CLEAN SQLite database

# Location of clean.db, relative to the current working directory
path_db_final = os.path.join("..", "data", "MSD", "clean.db")
# SQLAlchemy URL pointing at that SQLite file
path_sql_connection_db = f"sqlite:///{path_db_final}"

# Engine plus one open connection; query_db uses `connection` as its default
engine = db.create_engine(path_sql_connection_db)
connection = engine.connect()

In [4]:
def query_db(qq, con = connection, to_df = False):
    """Execute a SQL statement and return all rows of its result.

    Parameters
    ----------
    qq : str or SQLAlchemy executable
        Statement to run. A plain string is wrapped in ``db.text`` before
        execution; anything else is passed to ``con.execute`` unchanged.
    con : optional
        Object whose ``execute`` method runs the statement. Defaults to the
        notebook-level ``connection`` (bound when this function is defined).
    to_df : bool, optional
        When true, the fetched rows are handed to ``pd.DataFrame``; no column
        names are passed, so callers are expected to name the columns.

    Returns
    -------
    list or pandas.DataFrame
        The result of ``fetchall()``, optionally wrapped in a DataFrame.
    """
    # SQLAlchemy 2.x no longer accepts a raw string in execute(); text() is
    # accepted by both 1.x and 2.x. Inside text(), ":name" is read as a bind
    # parameter, so a literal colon in a query must be escaped as "\:".
    statement = db.text(qq) if isinstance(qq, str) else qq
    rows = con.execute(statement).fetchall()
    if to_df:
        return pd.DataFrame(rows)
    return rows

## 2. Create all the files from the batches

In [5]:
# Highest batch_id present in `match`, considering only ids below 1000.
# NOTE(review): the reason for the 1000 cut-off is not visible in this notebook - confirm.
batch_max = query_db("SELECT max(batch_id) FROM match where batch_id < 1000")[0][0]

In [6]:
# Resume points for batches where WebScrapping_nonmatch.py failed or stopped part-way:
# {batch_id: last uploaded iteration of that batch}, where an iteration is the index
# of the track in that batch's dataframe. Read by create_df_batch.
exceptions = {1: 9500, 14: 2700}

## 3. Functions

In [29]:
def save_df_to_S3(df, folder_path, name_file, S3_BUCKET = 'musicemotions'):
    """Upload a DataFrame to S3 as a CSV object.

    The frame is serialised in memory without its index and stored in
    ``S3_BUCKET`` under the key ``<folder_path>/<name_file>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to upload.
    folder_path : str
        Key prefix ("folder") inside the bucket.
    name_file : str
        Name of the object inside that folder.
    S3_BUCKET : str, optional
        Destination bucket.

    Returns
    -------
    The response returned by ``put_object``.
    """
    client = boto3.client("s3")

    # Object key: folder and file name joined by a single slash
    destination_key = "/".join((folder_path, name_file))

    # Serialise the frame into an in-memory text buffer
    with StringIO() as buffer:
        df.to_csv(buffer, index = False)
        payload = buffer.getvalue()

    return client.put_object(Bucket = S3_BUCKET, Key = destination_key, Body = payload)

def ls_S3(folder_path, S3_BUCKET = 'musicemotions', maxkeys = 1000):
    """List the file names stored under a key prefix of an S3 bucket.

    Parameters
    ----------
    folder_path : str
        Key prefix to list. It is sent to S3 as ``Prefix`` exactly as given.
    S3_BUCKET : str, optional
        Bucket to list.
    maxkeys : int, optional
        Upper bound on the number of keys requested from S3 in total. Values
        above 1000 are fetched over several requests, since a single
        ``list_objects_v2`` call returns at most 1000 keys.

    Returns
    -------
    list of str
        The last ``/``-separated component of every key found, skipping keys
        whose last component is empty (keys ending in ``/``). Empty list when
        nothing matches the prefix.
    """
    s3 = boto3.client("s3")

    files_inside_folder = []
    request = {"Bucket": S3_BUCKET, "Prefix": folder_path}
    remaining = maxkeys

    while remaining > 0:
        response = s3.list_objects_v2(MaxKeys=min(remaining, 1000), **request)

        # "Contents" is absent from the response when no key matches
        contents = response.get("Contents", [])
        for entry in contents:
            file_name = entry["Key"].split("/")[-1]
            if file_name:
                files_inside_folder.append(file_name)

        remaining -= len(contents)
        if not contents or not response.get("IsTruncated"):
            break
        # More keys are available: continue where this page stopped
        request["ContinuationToken"] = response["NextContinuationToken"]

    return files_inside_folder

def create_df_batch(batch_num):
    """Build the DataFrame of ``nonmatch`` rows for one batch.

    Reads the distinct rows of table ``nonmatch`` whose ``batch_id`` equals
    ``batch_num``, names the columns ``track_id``, ``query`` and ``batch_id``,
    sorts by ``track_id`` and labels the rows 1..n. If ``batch_num`` is a key
    of the notebook-level ``exceptions`` dict, the first
    ``exceptions[batch_num]`` rows are dropped so that the batch restarts
    after the position where it previously got stuck.

    Parameters
    ----------
    batch_num : int
        Batch identifier, interpolated into the SQL text.

    Returns
    -------
    pandas.DataFrame
        The rows of the batch; an empty frame with the three columns when the
        batch has no rows.
    """
    column_names = ["track_id", "query", "batch_id"]
    df = query_db(f"SELECT DISTINCT * FROM nonmatch where batch_id = {batch_num} ", to_df=True)

    # An empty result is returned without any columns, so assigning the three
    # names below would raise a length-mismatch ValueError.
    if df.empty:
        return pd.DataFrame(columns=column_names)

    df.columns = column_names
    df = df.sort_values(["track_id"])
    df.index = np.arange(1, df.shape[0] + 1, 1)

    # Restart from the position where the batch got stuck
    if batch_num in exceptions:
        df = df.iloc[exceptions[batch_num]:]
    return df

### 2.1 Non-match

In [30]:
# Upload one CSV per batch id (0..batch_max inclusive), named "<batch id>.csv".
# tqdm.notebook.tqdm replaces tqdm.tqdm_notebook, which tqdm reports as deprecated.
for b_num in tqdm.notebook.tqdm(range(0, batch_max + 1)):
    df_b_num = create_df_batch(b_num)
    res = save_df_to_S3(df_b_num, folder_path, f'{b_num}.csv')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=36.0), HTML(value='')))




## 3. S3 - connection

In [11]:
# Show the file names currently stored under folder_path in the bucket
np.array(ls_S3(folder_path))

array(['0.csv', '1.csv', '10.csv', '11.csv', '12.csv', '13.csv', '14.csv',
       '15.csv', '16.csv', '17.csv', '18.csv', '19.csv', '2.csv',
       '20.csv', '21.csv', '22.csv', '23.csv', '24.csv', '25.csv',
       '26.csv', '27.csv', '28.csv', '29.csv', '3.csv', '30.csv',
       '31.csv', '32.csv', '33.csv', '34.csv', '35.csv', '4.csv', '5.csv',
       '6.csv', '7.csv', '8.csv', '9.csv'], dtype='<U6')

## 4. Read a DataFrame back from S3

In [24]:
# Name of the file to read back from S3 in the cells below
name_file = '0.csv'

In [25]:
def load_df_s3(folder_path, file_name, S3_BUCKET = 'musicemotions'):
    """Download a CSV object from S3 and parse it into a DataFrame.

    Parameters
    ----------
    folder_path : str
        Key prefix ("folder") inside the bucket.
    file_name : str
        Name of the object inside that folder.
    S3_BUCKET : str, optional
        Bucket to read from.

    Returns
    -------
    pandas.DataFrame
        Result of ``pd.read_csv`` on the object's UTF-8 decoded content.
    """
    client = boto3.client("s3")

    # Object key: folder and file name joined by a single slash
    object_key = "/".join((folder_path, file_name))
    s3_object = client.get_object(Bucket = S3_BUCKET, Key = object_key)

    # Read the whole body, decode it, and let pandas parse the text
    csv_text = s3_object['Body'].read().decode('utf-8')
    return pd.read_csv(StringIO(csv_text))

In [26]:
# Load folder_path/name_file from S3 into a DataFrame
dfx = load_df_s3(folder_path, name_file)

In [27]:
# Display the loaded frame
dfx

Unnamed: 0,track_id,query,batch_id
0,TRAAAZU128F4226F7A,stop laughing moose,0
1,TRAADHC128F4227F6B,let them come to berlin the weathermen,0
2,TRAAFFR128F42719EA,bokstavelig talt (skit) tungtvann,0
3,TRAAFJW128F428A424,stickin in my eye nofx,0
4,TRAAFXF128F4267A2A,brother mine eric schwartz,0
...,...,...,...
20469,TRZZXHU12903CF4706,whoa we the kings,0
20470,TRZZXVN128F93285B4,abschied asp,0
20471,TRZZYLO12903CAC06C,i ve never seen the righteous forsaken dallas ...,0
20472,TRZZZTZ128F92C5A5F,for your eyes only (performed by dea li) holly...,0


In [None]:
 # NOTE(review): this cell has no execution count and looks like a leftover draft.
 # Its statements repeat the body of save_df_to_S3 but are indented without an
 # enclosing def, and s3, S3_BUCKET and df_nonmatch are not defined at notebook
 # level in the cells visible here. Consider calling save_df_to_S3(...) instead,
 # or deleting this cell.
 #Set the destination path
    path_S3 = folder_path + "/" + name_file
    
    # Buffer dataframe to upload
    csv_buffer = StringIO()
    df_nonmatch.to_csv(csv_buffer, index = False)

    resp = s3.put_object(Bucket = S3_BUCKET, Key = path_S3, Body = csv_buffer.getvalue())