# Migrate Clean.db to S3

One of the parts of the projects contains working on S3, this means moving all that is being done in SQLite database to an S3 bucket, since it is the most economic version for what we want to do. What this script will do is take the clean.db and create a file system that can replicate this datbase in a S3 bucket as a file system:

In [1]:
# Database
import sqlalchemy as db
import pandas as pd
import sys
import os
import time
import re #for avoiding looking at titles with starting parenthesis
import numpy as np
import tqdm
from io import StringIO 
import boto3

## 1. Database connection 

In [3]:
## 1.1 Connect to the database CLEAN

#Paths
path_db_final = os.path.join("..","data","MSD","clean.db")
path_sql_connection_db =  'sqlite:///' + path_db_final

#Connect
engine = db.create_engine(path_sql_connection_db)
connection = engine.connect()

In [4]:
def query_db(qq, con = connection, to_df = False):
    res = con.execute(qq)
    if to_df:
        return pd.DataFrame(res.fetchall())
    else:
        return res.fetchall()

## 2. Create all the files from the batches

In [5]:
batch_max = query_db(f"SELECT max(batch_id) FROM match where batch_id < 1000")[0][0]

In [6]:
exceptions = {1: 9500, 14: 2700} # index where the WebScrapping_nonmatch.py failed or stopped, so we note down
# which are the last uploaded iterations of that batch (each iteration is the index in the dataframe of the track).

## 3. Functions

In [79]:
def save_df_to_S3(df, folder_path, name_file, S3_BUCKET = 'musicemotions'):
    #Connect to S3
    s3 = boto3.client("s3")
    
    #Set the destination path
    path_S3 = folder_path + "/" + name_file
    
    # Buffer dataframe to upload
    csv_buffer = StringIO()
    df.to_csv(csv_buffer, index = False)

    resp = s3.put_object(Bucket = S3_BUCKET, Key = path_S3, Body = csv_buffer.getvalue())
    return resp

def ls_S3(folder_path, S3_BUCKET = 'musicemotions', maxkeys = 1000):
    #Connect to S3
    s3 = boto3.client("s3")
    
    # S3 list objects
    response = s3.list_objects_v2(
                Bucket=S3_BUCKET,
                Prefix =folder_path,
                MaxKeys=maxkeys )

    files_inside_folder = list()
    for contents_folder in response["Contents"]:
        
        # Get the contents of the folder
        file_names = contents_folder["Key"].split("/")[-1]
        
        #If the name of the file is not empty:
        if len(file_names):
            files_inside_folder.append(file_names)
    return files_inside_folder

def create_df_batch_nonmatch(batch_num):
    # Take all songs and clean the titles and create a set to compare with yt titles
    df = query_db(f"SELECT DISTINCT * FROM nonmatch where batch_id = {batch_num} ", to_df=True)

    df.columns = ["track_id", "query","batch_id"]
    df  = df.sort_values(["track_id"])
    df.index = np.arange(1, df.shape[0] + 1, 1)

    #Restart from the position that was stucked
    if batch_num in exceptions:
        idx_start =  exceptions[batch_num]
        df = df.iloc[idx_start:]
    return df


def create_df_batch_match(batch_num):
    # Take all songs and clean the titles and create a set to compare with yt titles
    batch_num_nonmatch = batch_num + 1000
    df = query_db(f"SELECT DISTINCT * FROM match where batch_id in ({batch_num}, {batch_num_nonmatch})", to_df=True)

    df.columns = ["track_id", "url","batch_id"]
    df  = df.sort_values(["track_id"])
    df.index = np.arange(1, df.shape[0] + 1, 1)
    return df

def create_df_batch_nonmatch_nonmatch(batch_num):
    # Take all songs and clean the titles and create a set to compare with yt titles
    batch_num_nonmatch = batch_num + 1000
    df = query_db(f"SELECT DISTINCT * FROM nonmatch where batch_id = {batch_num_nonmatch} ", to_df=True)

    df.columns = ["track_id", "query","batch_id"]
    df  = df.sort_values(["track_id"])
    df.index = np.arange(1, df.shape[0] + 1, 1)
    return df

# 1. Migration of Non-Match: queries

In [None]:
folder_path = "nonmatch-query"

## Non-match

In [30]:
for b_num in tqdm.tqdm_notebook(range(0,batch_max + 1)):
    df_b_num = create_df_batch_nonmatch(b_num)
    res = save_df_to_S3(df_b_num, folder_path, f'{b_num}.csv')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=36.0), HTML(value='')))




## S3 - connection

In [11]:
np.array(ls_S3(folder_path))

array(['0.csv', '1.csv', '10.csv', '11.csv', '12.csv', '13.csv', '14.csv',
       '15.csv', '16.csv', '17.csv', '18.csv', '19.csv', '2.csv',
       '20.csv', '21.csv', '22.csv', '23.csv', '24.csv', '25.csv',
       '26.csv', '27.csv', '28.csv', '29.csv', '3.csv', '30.csv',
       '31.csv', '32.csv', '33.csv', '34.csv', '35.csv', '4.csv', '5.csv',
       '6.csv', '7.csv', '8.csv', '9.csv'], dtype='<U6')

# 2. Migration of Match: results

In [68]:
folder_path = "match-results"

## Match

In [69]:
for b_num in tqdm.tqdm_notebook(range(0,batch_max + 1)):
    df_b_num = create_df_batch_match(b_num)
    res = save_df_to_S3(df_b_num, folder_path, f'{b_num}.csv')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=36.0), HTML(value='')))




# 3. Migration of NonMatched: results

In [None]:
folder_path = "nonmatch-results"

In [None]:
for b_num in tqdm.tqdm_notebook(range(0,batch_max + 1)):
    df_b_num = create_df_batch_nonmatch_nonmatch(b_num)
    res = save_df_to_S3(df_b_num, folder_path, f'{b_num}.csv')

# 4. Read df

In [74]:
name_file = '35.csv'

In [75]:
def load_df_s3(folder_path, file_name, S3_BUCKET = 'musicemotions'):
    s3 = boto3.client("s3")
    path_S3 = folder_path + "/" + file_name  
    csv_obj = s3.get_object(Bucket = S3_BUCKET,  Key = path_S3)
    body = csv_obj['Body']
    csv_string = body.read().decode('utf-8')
    df = pd.read_csv(StringIO(csv_string))
    return df

In [76]:
dfx = load_df_s3(folder_path, name_file)

In [77]:
dfx

Unnamed: 0,track_id,url,batch_id
0,TRAAXRN128F4214AC9,https://www.youtube.com/watch?v=QelLZOMnUqY,35
1,TRABOED12903D0DB16,https://www.youtube.com/watch?v=Nv2v45D7nyM,35
2,TRADCBW12903CEFFAA,https://www.youtube.com/watch?v=IiBXhTLbwRo,35
3,TRADDAQ12903CBBEA4,https://www.youtube.com/watch?v=lN8uDYyNgxY,35
4,TRAEHMU128F92FAE6E,https://www.youtube.com/watch?v=6SSz97zGiyA,35
...,...,...,...
826,TRZXBEH128F4239266,https://www.youtube.com/watch?v=K_GYy7Zfiig,35
827,TRZXNZF128F931EDF3,https://www.youtube.com/watch?v=q5DnGxNQExE,35
828,TRZZANS128F426808A,https://www.youtube.com/watch?v=L0zjCMMbnjo,35
829,TRZZIGO128F1499EE9,https://www.youtube.com/watch?v=jVtYa-MnOIY,35


# 5. Load any file to S3

In [31]:
!pwd

D:\Google Drive\25. SaturdaysAI\0_Project\project\aws


In [35]:
path_log

'..\\webscrapping\\log\\WebScrap.log'

In [None]:
 #Set the destination path
    path_S3 = folder_path + "/" + name_file
    
    # Buffer dataframe to upload
    csv_buffer = StringIO()
    df_nonmatch.to_csv(csv_buffer, index = False)

    resp = s3.put_object(Bucket = S3_BUCKET, Key = path_S3, Body = csv_buffer.getvalue())

In [39]:
def file_to_S3(local_path, S3_path,  S3_BUCKET = 'musicemotions'):
    """
    local_path = os.path.join("..","webscrapping","log","WebScrap.log")
    S3_path = nonmatch-query/log.txt
    """
    if S3_path:
        s3 = boto3.resource('s3')
        resp = s3.Object(S3_BUCKET, S3_path).put(Body=open(local_path, 'rb'))
    else:
        s3 = boto3.resource('s3')
        resp = s3.Object(S3_BUCKET, S3_path).put(Body=open(local_path, 'rb'))
    return resp

In [None]:
def S3_to_obj(folder_path, file_name, S3_BUCKET = 'musicemotions'):
    s3 = boto3.client("s3")
    path_S3 = folder_path + "/" + file_name  
    csv_obj = s3.get_object(Bucket = S3_BUCKET,  Key = path_S3)
    body = csv_obj['Body']
    csv_string = body.read().decode('utf-8')
    df = pd.read_csv(StringIO(csv_string))
    return df

In [47]:
content = list()
with open(os.path.join("..","webscrapping","log","WebScrap.log"),'r') as f:
    for line in f.readlines():
        content.append(line.strip())

In [48]:
all_content = "\\n".join(content)

In [50]:
s3 = boto3.resource('s3')
resp = s3.Object(S3_BUCKET, "prova.txt").put(Body=all_content)

# Tests

In [51]:
match_dict = {"TR1": "yjkalskdalk", "TR2": "akasdjkaslk", "TR3": "asldkasjldkasld"}

In [54]:
df_match = pd.DataFrame(match_dict.items())

Unnamed: 0,0,1
0,TR1,yjkalskdalk
1,TR2,akasdjkaslk
2,TR3,asldkasjldkasld
