In [108]:
# Web scraping: Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Database
import sqlalchemy as db
import pandas as pd
import sys
import os
import time
import re #for avoiding looking at titles with starting parenthesis
import numpy as np
import tqdm

# Logging
from v_log import VLogger
import logging

In [110]:
# Arguments
# Batch to process: used to select rows of `nonmatch` by batch_id, and stored
# (as batch_num + 1000) with the rows inserted at the end of the notebook.
batch_num = 0

# Fixed batch_size
# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
batch_size = 10

In [111]:
# Start logging
#log = VLogger(f'Batch {batch_num}', uri_log=f"log/WebScrap_nonmatch.log", file_log_level = logging.INFO)

# 1. Load the clean database

In [112]:
## 1.1 Connect to the database CLEAN

# Location of the cleaned SQLite database, relative to this notebook
path_db_final = os.path.join("..", "data", "MSD", "clean.db")
path_sql_connection_db = f"sqlite:///{path_db_final}"

# Open the engine and keep a single connection; the query helpers use it as their default
engine = db.create_engine(path_sql_connection_db)
connection = engine.connect()

#Log
#log.info("1 Connect to database...")

In [115]:
def query_db(qq, con = connection, to_df = False):
    """Run a query and return every row it produces.

    Parameters
    ----------
    qq : query handed unchanged to ``con.execute``.
    con : connection to run the query on; defaults to the notebook-level ``connection``.
    to_df : when true, wrap the fetched rows in a ``pandas.DataFrame``.

    Returns
    -------
    The value of ``fetchall()``, or a DataFrame built from it when ``to_df`` is set.
    """
    rows = con.execute(qq).fetchall()
    return pd.DataFrame(rows) if to_df else rows

In [116]:
def query_insert(qq, con = connection):
    """Execute a statement and report failure as ``False`` instead of raising.

    Parameters
    ----------
    qq : statement handed unchanged to ``con.execute``.
    con : connection to run the statement on; defaults to the notebook-level ``connection``.

    Returns
    -------
    Whatever ``con.execute`` returns, or ``False`` if it raised an exception.
    """
    try:
        return con.execute(qq)
    except Exception:
        # Only ordinary errors are turned into False; a bare `except:` would also
        # swallow KeyboardInterrupt/SystemExit and make the notebook impossible to stop.
        return False

In [163]:
#Log
#log.info("2 Select all songs...")

# Load the unmatched songs of the current batch.
# NOTE: this cell uses `fun_clean_title` and `prog`, which are defined in the
# "Functions" cell further down; that cell has to be run before this one.
df = query_db(f"SELECT * FROM nonmatch where batch_id = {batch_num} ", to_df=True)
df.columns = ["track_id", "query","batch_id"]

# Cleaned query text: keep the usable words, then collapse every run of
# non-alphanumeric characters into a single space
df["query_clean_list"] = df["query"].apply(
    lambda raw: re.sub('[^0-9a-zA-Z]+', ' ', fun_clean_title(raw, prog, list_return=True))
)
# The same words as a set, used for the comparison with YouTube titles
df["query_clean_set"] = df["query_clean_list"].apply(lambda text: set(text.split(" ")))


#Log
#log.info("2 Select all songs... (Completed)")

In [173]:
df.head()

Unnamed: 0,track_id,query,batch_id,query_clean_set,query_clean_list
0,TRMUOZE12903CDF721,a picture of you the bristols,0,"{the, you, bristols, picture, of, a}",a picture of you the bristols
1,TRAFPWS12903CDF737,i ll be gone the bristols,0,"{i, the, bristols, ll, gone, be}",i ll be gone the bristols
2,TRVIMSB12903CDF720,so fine the bristols,0,"{bristols, so, fine, the}",so fine the bristols
3,TRDXGYA12903CDF729,who does she think she is the bristols,0,"{she, is, the, bristols, who, does, think}",who does she think she is the bristols
4,TRYESJS12903CDF730,old man mose the bristols,0,"{the, bristols, old, mose, man}",old man mose the bristols


# 2. Web scraping

Query the non-matched titles

### Functions

In [200]:
# A word is only kept when it begins with an ASCII letter
prog = re.compile("^[A-Za-z]")

def fun_clean_title(titart, prog, list_return = False):
    """
    Reduce a title to its usable, lower-cased words.

    The title is split on single spaces. A word is kept only if `prog`
    matches at its start, it contains no backslash and it contains no
    round bracket.

    titart: title string to clean
    prog: compiled pattern a word has to match to be kept
    list_return: if true, return the kept words joined by single spaces
                 (original order, duplicates kept); otherwise return them as a set
    """
    kept = []
    for token in titart.split(" "):
        if prog.match(token) is None:  # e.g. the word opens with a bracket
            continue
        if '\\' in token:  # leftovers of non-decoded characters such as \x19
            continue
        if "(" in token or ")" in token:
            continue
        kept.append(token.lower())

    if list_return:
        return " ".join(kept)
    return set(kept)

def query_yt_song(qq_song, query_set):
    """
    Search YouTube for a song and return the link of the first result whose
    title is accepted by `compare_song_vs_title`.

    Uses the notebook-level `browser` (a Selenium WebDriver), which must be
    open when this function is called.

    qq_song: text to search for
    query_set: set of query words handed to `compare_song_vs_title`

    Returns the href of the first accepted result, or "" when none is accepted.
    """
    from urllib.parse import quote_plus  # local import keeps this cell self-contained

    # URL-encode the query so spaces and reserved characters (&, #, +, ...) reach YouTube intact
    browser.get(f"https://www.youtube.com/results?search_query={quote_plus(str(qq_song))}")

    # `find_elements_by_id` no longer exists in Selenium 4; the By-based locator
    # works on both Selenium 3 and 4
    vid_title_elems = browser.find_elements(By.ID, 'video-title')

    for vte in vid_title_elems:
        yt_title = vte.get_attribute("title")
        yt_href = vte.get_attribute('href')

        # Entries without a link (e.g. playlists) cannot be stored as a match,
        # and an entry without a title gives nothing to compare against
        if not yt_href or not yt_title:
            continue

        if compare_song_vs_title(yt_title, query_set):
            return yt_href
    return ""

def compare_song_vs_title(yt_tit, query_set):
    """
    Decide whether a YouTube title matches a set of query words.

    The title is cleaned with `fun_clean_title`, every run of non-alphanumeric
    characters is replaced by a space, and the result is split into a set of
    words. Query words of length 1 or less are ignored.

    The title matches when at most one of the remaining query words is missing
    from it, or at most two when more than four query words remain.
    Note that a query with zero or one remaining word therefore matches any title.

    yt_tit: title of the YouTube video (None is treated as an empty title)
    query_set: iterable of query words

    Returns True or False (never None).
    """
    # Title -> cleaned string -> set of words
    yt_clean = fun_clean_title(yt_tit or "", prog, list_return=True)
    yt_words = set(re.sub('[^0-9a-zA-Z]+', ' ', yt_clean).split(" "))

    # Drop single letters and empty strings from the query
    query_words = {word for word in query_set if len(word) > 1}

    n_common = len(query_words & yt_words)

    # Longer queries tolerate one more missing word
    allowed_missing = 2 if len(query_words) > 4 else 1
    return n_common >= len(query_words) - allowed_missing

### Action

In [201]:
# Reproducible sample of at most 100 songs. Capping at len(df) matters because
# DataFrame.sample raises ValueError when asked for more rows than the frame holds.
dt_test = df.sample(n=min(100, len(df)), random_state=42)

In [202]:
dict_match = dict()     # track_id -> YouTube href of the accepted video
dict_nonmatch = dict()  # track_id -> cleaned query that produced no match
browser = webdriver.Chrome()  # notebook-level on purpose: query_yt_song reads this name

try:
    for _, row in tqdm.tqdm(dt_test.iterrows(), total=len(dt_test)):
        # Read the columns by name: positional unpacking silently swaps the
        # cleaned string and the word set whenever the column order differs.
        track_id = row["track_id"]
        query_clean_list = row["query_clean_list"]
        query_clean_set = row["query_clean_set"]

        # Query yt song and compare the titles with the query set
        result = query_yt_song(query_clean_list, query_clean_set)

        if result:  # there is a href
            dict_match[track_id] = result
        else:  # otherwise only remember the query that was tried
            dict_nonmatch[track_id] = query_clean_list
finally:
    # Always release the browser, even if a lookup raised; quit() ends the
    # whole driver session rather than just closing the current window.
    browser.quit()


0it [00:00, ?it/s][A
1it [00:01,  1.61s/it][A
2it [00:02,  1.46s/it][A
3it [00:03,  1.38s/it][A
4it [00:04,  1.23s/it][A
5it [00:05,  1.19s/it][A
6it [00:06,  1.10s/it][A
7it [00:07,  1.05s/it][A
8it [00:08,  1.04s/it][A
9it [00:09,  1.00it/s][A
10it [00:10,  1.02s/it][A
11it [00:11,  1.03it/s][A
12it [00:12,  1.04s/it][A
13it [00:13,  1.00it/s][A
14it [00:14,  1.03it/s][A
15it [00:15,  1.03it/s][A
16it [00:16,  1.01s/it][A
17it [00:17,  1.03it/s][A
18it [00:18,  1.08s/it][A
19it [00:19,  1.04s/it][A
20it [00:20,  1.01it/s][A
21it [00:21,  1.02it/s][A
22it [00:22,  1.04it/s][A
23it [00:23,  1.05it/s][A
24it [00:24,  1.04it/s][A
25it [00:25,  1.04it/s][A
26it [00:26,  1.00it/s][A
27it [00:27,  1.03it/s][A
28it [00:28,  1.05it/s][A
29it [00:29,  1.05it/s][A
30it [00:30,  1.01s/it][A
31it [00:31,  1.02it/s][A
32it [00:32,  1.04s/it][A
33it [00:33,  1.00s/it][A
34it [00:34,  1.05it/s][A
35it [00:35,  1.03it/s][A
36it [00:36,  1.02s/it][A
37it [00:37,  

In [204]:
dict_nonmatch

{'TRMUVPJ128F14AE671': 'got to b tru great advenuture album steven curtis chapman',
 'TRVODSD128F424424F': 'bye bye baby big maceo',
 'TRUNQIN128F423D5D7': 'outra vez charlie byrd',
 'TRZEEAF12903CCFD79': 'beauty of the sea the gabe dixon band',
 'TRFGCTI128E079594C': 'debussy etudes vi pour les huits doigts pierre laurent aimard',
 'TRLNJKL128F42396B9': 'why do they never play les savy fav on the radio jetplane landing',
 'TRDGNAI128F92E79F3': 'i m a gamblin woman memphis minnie',
 'TRWSREF128F92EFF67': 'polka medley tiller s folly',
 'TRPZXIS128F4275CB1': 'why do i feel wilks',
 'TRAGWKV128F92F0F35': 'love smashed on a rock martyn bates'}

### Upload to database

In [211]:
def insert_dmatch(dmatch, batch_num):
    """
    Insert the matched tracks into the `match` table with one multi-row INSERT.

    dmatch: dict mapping track_id -> url
    batch_num: batch number; stored as batch_num + 1000

    Returns a tuple (result, sql):
      - result: whatever `query_insert` returned, or False if calling it raised
      - sql: the statement text that was built
    When `dmatch` is empty nothing is executed and (True, "") is returned,
    because an INSERT without rows is not valid SQL and having nothing to
    insert is not an error.
    """
    if not dmatch:
        return True, ""

    rows = []
    for track_id, url in dmatch.items():
        # Double single quotes so a quote inside the URL cannot end the SQL literal
        safe_url = str(url).replace("'", "''")
        rows.append(f"('{track_id}', '{safe_url}', '{batch_num + 1000}')")
    query_insert_string = "insert into match values " + ",".join(rows) + ";"

    # Insert
    try:
        res = query_insert(query_insert_string)
    except Exception:
        return False, query_insert_string
    return res, query_insert_string

def insert_dnonmatch(dnonmatch, batch_num):
    """
    Insert the unmatched tracks into the `nonmatch` table with one multi-row INSERT.

    dnonmatch: dict mapping track_id -> query text that was searched
    batch_num: batch number; stored as batch_num + 1000

    Single quotes in the query text are replaced by spaces before it is
    embedded in the statement.

    Returns a tuple (result, sql):
      - result: whatever `query_insert` returned, or False if calling it raised
      - sql: the statement text that was built
    When `dnonmatch` is empty nothing is executed and (True, "") is returned,
    because an INSERT without rows is not valid SQL and having nothing to
    insert is not an error.
    """
    if not dnonmatch:
        return True, ""

    rows = []
    for track_id, yt_query in dnonmatch.items():
        yt_query = yt_query.replace("'"," ")
        rows.append(f"('{track_id}', '{yt_query}', '{batch_num + 1000}')")
    query_insert_string = "insert into nonmatch values " + ",".join(rows) + ";"

    # Insert
    try:
        res = query_insert(query_insert_string)
    except Exception:
        return False, query_insert_string
    return res, query_insert_string

Uploading....

In [212]:
# `log` is only created by the VLogger line in the "Start logging" cell, which
# is commented out. Fall back to the standard library logger so this cell does
# not stop with a NameError on a fresh kernel.
if "log" not in globals():
    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger(f"Batch {batch_num}")

#Log
log.info("6 Inserting into match and nonmatch tables...")

# Run the queries to insert info
resmatch, qqmatch = insert_dmatch(dict_match, batch_num)
resnonmatch, qqnonmatch = insert_dnonmatch(dict_nonmatch, batch_num)

# Report errors in the logging (the failed statement is included for inspection)
if resmatch is False:
    log.info(f"6.1 EROR INSERTING query in match: {qqmatch}...")

if resnonmatch is False:
    log.info(f"6.1 EROR INSERTING query in nonmatch: {qqnonmatch}...")

# Success is only reported when both inserts went through
if resmatch and resnonmatch:
    log.info(f"7 Succesfully scrapped Batch {batch_num}")

Done
