In [115]:
# Web Scraping: Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Database
import sqlalchemy as db
import pandas as pd
import sys
import os
import time
import re #for avoiding looking at titles with starting parenthesis
import numpy as np
import tqdm

# 1. Load the clean database

In [106]:
## 1.1 Connect to the database CLEAN

# Location of the cleaned SQLite file, relative to this notebook
path_db_final = os.path.join("..", "data", "MSD", "clean.db")

# SQLAlchemy connection URL built from that path
path_sql_connection_db = f"sqlite:///{path_db_final}"

# Open the engine and a connection that the query helpers below reuse
engine = db.create_engine(path_sql_connection_db)
connection = engine.connect()

In [107]:
def query_db(qq, con=None, to_df=False):
    """
    Run a query and return every resulting row.

    Parameters
    ----------
    qq : query passed unchanged to ``con.execute``.
    con : object exposing ``execute(qq)``; when omitted, the notebook-level
        ``connection`` is looked up at call time. (It used to be captured as a
        default argument when the function was defined, so re-running the
        connection cell left this helper bound to the old connection.)
    to_df : when True, wrap the rows in a ``pandas.DataFrame`` (columns are
        positional; callers assign their own labels).

    Returns
    -------
    The result of ``fetchall()``, or a DataFrame built from it.
    """
    if con is None:
        # Late binding: always use the connection currently defined in the notebook
        con = connection

    rows = con.execute(qq).fetchall()
    return pd.DataFrame(rows) if to_df else rows

In [108]:
# Labels applied positionally to the columns returned by the query
# (assumed to follow the youtube_url table's column order -- TODO confirm)
youtube_url_columns = ["track_id","title","artist_id","yt_url","duration","year","artist_name","artist_iid"]

# Pull the whole youtube_url table into a DataFrame and label its columns
df = query_db("SELECT * FROM youtube_url", to_df=True)
df.columns = youtube_url_columns

## 1.2 Create useful fields

In [109]:
# Build the combined "title artist" search key, lowercased
title_and_artist = df["title"] + " " + df["artist_name"]
df["tit_art"] = title_and_artist.str.lower()

# Lowercase the individual text columns as well so later comparisons are case-insensitive
for text_column in ("title", "artist_name"):
    df[text_column] = df[text_column].str.lower()

In [110]:
# Preview the first rows of df, including the new tit_art column
df.head()

Unnamed: 0,track_id,title,artist_id,yt_url,duration,year,artist_name,artist_iid,tit_art
0,TRMMMKD128F425225D,tanssi vaan,ARMVN3U1187FB3A1EB,,156.55138,1995,karkkiautomaatti,21822,tanssi vaan karkkiautomaatti
1,TRMLAVN128F4252261,yeah yeah jenni,ARMVN3U1187FB3A1EB,,92.52526,1995,karkkiautomaatti,21822,yeah yeah jenni karkkiautomaatti
2,TRWUBYW128F4252258,äl-oo-vee,ARMVN3U1187FB3A1EB,,76.77342,1995,karkkiautomaatti,21822,äl-oo-vee karkkiautomaatti
3,TRGBNVG128F425224D,jää beibi jää,ARMVN3U1187FB3A1EB,,107.59791,1995,karkkiautomaatti,21822,jää beibi jää karkkiautomaatti
4,TRBZRME128F425225E,takaisin en tuu,ARMVN3U1187FB3A1EB,,80.61342,1995,karkkiautomaatti,21822,takaisin en tuu karkkiautomaatti


# 2. Web Scraping

For every `artist_iid` we build a list of (`tit_art`, `track_id`) pairs. For each artist we can then iterate over all of their songs in the MSD and check, for each YouTube search result, whether every word of a song's `tit_art` appears among the words of the YouTube video title.

### Functions

In [277]:
def fun_clean_title(titart, prog):
    """
    Reduce an MSD "title artist" string to the set of words usable for matching.

    The string is split on single spaces. A word is kept only when ``prog``
    matches it (the callers pass a pattern requiring a leading letter, which
    drops words opening with ``(``, ``[`` or other symbols) and it contains no
    backslash (left-over escaped characters such as ``\\x19``).
    """
    return {
        word
        for word in titart.split(" ")
        if prog.match(word) is not None and '\\' not in word
    }

In [273]:
def get_MSD_title_match_yt_title(artist_name, dict_URL_artist, tuple_titart_trackid, prog):
    """
    Match an artist's MSD songs against that artist's YouTube search results.

    A YouTube video matches an MSD song when every cleaned word of the song's
    "title artist" string (see ``fun_clean_title``) appears among the
    lowercased, space-separated words of the video title. Each video is
    assigned to at most one song and each song to at most one video.

    Parameters
    ----------
    artist_name : key of the artist in ``dict_URL_artist``.
    dict_URL_artist : mapping artist name -> list of ``(yt_title, yt_href)``.
    tuple_titart_trackid : list of ``[tit_art, track_id]`` pairs for the artist.
        It is NOT modified; a new list is returned instead.
    prog : compiled regex forwarded to ``fun_clean_title``.

    Returns
    -------
    (list_matchings, non_matched)
        ``list_matchings`` holds ``(track_id, yt_href)`` tuples;
        ``non_matched`` holds the original pairs left without a video, in
        their original order.
    """
    # tit_art is built in this notebook as "<title> <artist>", so a self-titled
    # song reads "<artist> <artist>". Its words are just the artist's words and
    # would match almost every search result, so it is never matched.
    self_titled = f"{artist_name} {artist_name}"

    # Clean every MSD title once instead of once per YouTube video.
    # Each item is (original_pair, track_id, word_set or None when unmatchable).
    pending = []
    for entry in tuple_titart_trackid:
        msd_titart, msd_tid = entry
        if not isinstance(msd_titart, str) or msd_titart in (artist_name, self_titled):
            words = None
        else:
            # An empty word set is a subset of anything and would match every
            # video, so treat it as unmatchable.
            words = fun_clean_title(msd_titart, prog) or None
        pending.append((entry, msd_tid, words))

    list_matchings = []
    for yts_tit, yts_href in dict_URL_artist.get(artist_name, []):
        # Playlists come without an href; some elements come without a title
        if yts_href is None or not yts_tit:
            continue

        words_yt_tit = {word.lower() for word in yts_tit.split(" ")}

        for ii, (_, msd_tid, words) in enumerate(pending):
            if words is not None and words <= words_yt_tit:
                list_matchings.append((msd_tid, yts_href))
                # Safe to delete while enumerating because we leave the loop at once
                del pending[ii]
                break

    return list_matchings, [entry for entry, _, _ in pending]

In [336]:
def _collect_youtube_results(browser, artist_name, wait_seconds):
    """Search YouTube for ``artist_name`` and return a list of ``(title, href)`` tuples."""
    from urllib.parse import quote_plus

    # URL-encode the query so names containing '&', '#', '/' or spaces
    # cannot corrupt the search URL
    browser.get(f"https://www.youtube.com/results?search_query={quote_plus(str(artist_name))}")

    # The result list is rendered by JavaScript: wait until at least one
    # 'video-title' element is present instead of reading the page immediately
    try:
        vid_title_elems = WebDriverWait(browser, wait_seconds).until(
            EC.presence_of_all_elements_located((By.ID, 'video-title'))
        )
    except TimeoutException:
        return []

    results = []
    for vte in vid_title_elems:
        try:
            results.append((vte.get_attribute("title"), vte.get_attribute('href')))
        except Exception:
            # Best effort: the element may have gone stale while the page updates
            continue
    return results


def WebScrapperYoutube(df_WS, wait_seconds=10):
    """
    Scrape YouTube search results for every artist in ``df_WS`` and match them
    against the artist's MSD songs.

    Parameters
    ----------
    df_WS : DataFrame whose rows unpack, in this order, into
        ``artist_iid, artist_name, tuple_titart_trackid``.
    wait_seconds : maximum time to wait for the search results of one artist
        to appear before treating the search as empty.

    Returns
    -------
    (dict_matching, dict_non_matching)
        Both are keyed by artist name. ``dict_matching`` maps to the list of
        ``(track_id, yt_href)`` matches (an empty list when matching failed);
        ``dict_non_matching`` maps to the songs left without a match.

    The Chrome session is always shut down, even if scraping raises.
    """
    import warnings
    # Import the submodule explicitly: ``import tqdm`` alone does not guarantee
    # that ``tqdm.notebook`` has been loaded
    from tqdm.notebook import tqdm as notebook_tqdm

    # OUTPUT: matching and non-matching songs per artist
    dict_matching = dict()
    dict_non_matching = dict()

    # Regular expression used to drop MSD words that do not start with a letter
    prog = re.compile("^[A-Za-z]")

    browser = webdriver.Chrome()
    try:
        # total= lets the progress bar show real progress (iterrows() has no length)
        for _, row in notebook_tqdm(df_WS.iterrows(), total=len(df_WS)):
            artist_iid, artist_name, tuple_titart_trackid = row

            # Default when matching fails: no matches (same type as on success)
            dict_matching[artist_name] = []

            # YouTube results for this artist, in the shape the matcher expects
            dict_URL_artist = {
                artist_name: _collect_youtube_results(browser, artist_name, wait_seconds)
            }

            try:
                match_LIST, notmatch_LIT = get_MSD_title_match_yt_title(
                    artist_name, dict_URL_artist, tuple_titart_trackid, prog
                )
            except Exception as exc:
                # Keep going with the remaining artists, but do not hide the failure
                warnings.warn(f"Title matching failed for {artist_name!r}: {exc!r}")
                dict_non_matching[artist_name] = tuple_titart_trackid
            else:
                dict_matching[artist_name] = match_LIST
                dict_non_matching[artist_name] = notmatch_LIT
    finally:
        # quit() ends the whole driver session; close() only closes the window
        browser.quit()

    return dict_matching, dict_non_matching

### Collect each artist's songs into a single row (as a list)

In [338]:
# One row per artist: every [tit_art, track_id] pair of that artist gathered into a single list
def _pairs_as_lists(artist_songs):
    """Turn the tit_art / track_id rows of one artist into a list of 2-item lists."""
    return artist_songs.values.tolist()


df_list_tit_art = (
    df.groupby(['artist_iid', 'artist_name'])[['tit_art', "track_id"]]
    .apply(_pairs_as_lists)
    .reset_index(name="tuple_titart_trackid")
    .sort_values(['artist_iid'])
    .set_index("artist_iid")
)

### Launch the artist queries to YouTube in batches (Selenium)

In [344]:
# Batch configuration: which batch to run and how many artists it holds
batch_size = 10
batch_num = 0

# All artist_iid values, in the order of df_list_tit_art, and how many there are
l_queries = df_list_tit_art.index.values
num_queries = len(df_list_tit_art)

# Positions delimiting the current batch inside l_queries
batch_start = batch_num * batch_size
batch_stop = batch_start + batch_size

# artist_iid values belonging to this batch
idx_artist_iid_batch = l_queries[batch_start:batch_stop]

# Rows for this batch, with artist_iid restored as a regular column
df_WebScrapping = df_list_tit_art.loc[idx_artist_iid_batch, :].reset_index()

In [346]:
# Scrape YouTube for the artists of this batch; returns the per-artist matches and non-matches
dmatch, dnonmatch = WebScrapperYoutube(df_WebScrapping)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [347]:
# Inspect the matches: artist name -> list of (track_id, YouTube URL)
dmatch

{'the bristols': [('TRNBXWJ12903CDF72B',
   'https://www.youtube.com/watch?v=aUHK6yrFboQ'),
  ('TRYEKZM12903CDF71D', 'https://www.youtube.com/watch?v=1MJGPxw1bK4'),
  ('TRQAUBZ12903CDF733', 'https://www.youtube.com/watch?v=DXgYTCduBpw')],
 'stephen varcoe': [],
 'carroll thompson': [('TRDBNUI128F933DE6E',
   'https://www.youtube.com/watch?v=MeimeGWNCAo'),
  ('TRXVMSV128F933DF63', 'https://www.youtube.com/watch?v=umq4a1jJ3Kw'),
  ('TRBSYZF12903CAE305', 'https://www.youtube.com/watch?v=dXyikEd2SC0'),
  ('TRRVVRV12903CD6C7C', 'https://www.youtube.com/watch?v=EVtfun47zIg')],
 'gorodisch': [],
 '1.000 mexicans': [('TRDGDXS128F92E88F9',
   'https://www.youtube.com/watch?v=8cBDbbl7Fso'),
  ('TRSUKEK128F92E88FD', 'https://www.youtube.com/watch?v=NNBoZQi_P6Y'),
  ('TRORXHO128F92E88F5', 'https://www.youtube.com/watch?v=z50CiuPqr8A')],
 'the meatmen': [],
 'miles davis': [],
 'deepack': [('TRNPEOL12903CBC2B8',
   'https://www.youtube.com/watch?v=yxh95J0Gzx4')],
 'basslovers united': [('TRWJBQX129

In [348]:
# Inspect the songs left without a match: artist name -> list of [tit_art, track_id]
dnonmatch

{'the bristols': [['a picture of you the bristols', 'TRMUOZE12903CDF721'],
  ["i'll be gone the bristols", 'TRAFPWS12903CDF737'],
  ["you're a moody guy the bristols", 'TREYJSR12903CDF71B'],
  ['so fine the bristols', 'TRVIMSB12903CDF720'],
  ['who does she think she is the bristols', 'TRDXGYA12903CDF729'],
  ['old man mose the bristols', 'TRYESJS12903CDF730']],
 'stephen varcoe': [['silent night (mohr; trans. & arr. willcocks) (1985 digital remaster) stephen varcoe',
   'TRMZLJF128F4269EAC'],
  ['mass no. 6 in g major_ hob.xxii:6_ "missa sancti nicolai"_ "nikolaimesse": credo: credo in unum deum stephen varcoe',
   'TRWMSSB12903CF70A8'],
  ['weihnachtslieder op. 8 (1985 digital remaster): iii.     the three kings (trans. h. n. bate: arr. ivor atkins) stephen varcoe',
   'TRHHCRG128F4269EB8'],
  ['silent night stephen varcoe', 'TRFGNPS128F423BD2C'],
  ['mass no. 6 in g major_ hob.xxii:6_ "missa sancti nicolai"_ "nikolaimesse": credo: et resurrexit - stephen varcoe',
   'TRQVLUX12903CF7

In [287]:
# Trial run on a reproducible random sample of 50 artists.
#
# This cell used to be a copy of the WebScrapperYoutube body. It unpacked three
# values from each row of df_list_tit_art, but artist_iid is that frame's
# index, so rows only carry (artist_name, tuple_titart_trackid) and the
# unpacking failed. reset_index() restores artist_iid as the first column, and
# the scraping itself is delegated to the function so there is one implementation.
df_WebScrapping_sample = df_list_tit_art.sample(50, random_state=6).reset_index()

# OUTPUT: Dict for the matching songs and the non-matching
dict_matching, dict_non_matching = WebScrapperYoutube(df_WebScrapping_sample)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


