In [2]:
# Web Scraping: Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

In [79]:
# Standard library
import functools
import os
import sys
import time
from urllib.parse import quote_plus

# Database
import pandas as pd
import sqlalchemy as db


In [32]:
def timeit(method):
    """Decorator that prints how long each call to ``method`` takes.

    After the wrapped call returns, the elapsed time is printed in
    milliseconds using the format ``'%2.2f ms'``. The wrapped function's
    return value is passed through unchanged. Nothing is printed when the
    call raises.
    """
    @functools.wraps(method)  # keep the wrapped function's __name__ / __doc__
    def timed(*args, **kw):
        # perf_counter is monotonic, so the interval cannot go negative
        # if the system clock is adjusted during the call.
        ts = time.perf_counter()
        result = method(*args, **kw)
        te = time.perf_counter()
        print('%2.2f ms' % ((te - ts) * 1000))
        return result
    return timed

In [37]:
@timeit
def query_db(qq, con=None, to_df=False):
    """Run a SQL statement and return all of its rows.

    Parameters
    ----------
    qq : str
        SQL statement, passed as-is to ``con.execute``.
    con : connection object, optional
        Anything exposing ``execute``. When omitted, the notebook-level
        ``connection`` is looked up at call time, so this cell can be
        defined before the connection is opened and it always sees the
        current connection rather than the one that existed when the
        function was defined.
    to_df : bool, default False
        When true, wrap the fetched rows in a ``pandas.DataFrame``
        (callers in this notebook assign the column names afterwards).

    Returns
    -------
    list or pandas.DataFrame
        The result of ``fetchall()``, optionally wrapped in a DataFrame.
    """
    if con is None:
        con = connection  # resolved lazily; raises NameError if never opened
    res = con.execute(qq)
    rows = res.fetchall()
    if to_df:
        return pd.DataFrame(rows)
    return rows

def read_titles(fichero):
    """Load a ``<SEP>``-delimited, header-less titles file into a DataFrame.

    Parameters
    ----------
    fichero : path or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per input line, with columns ``aId``, ``NestId``,
        ``artist`` and ``title``.
    """
    # A multi-character separator is only handled by the python parser.
    # Asking for it explicitly avoids the ParserWarning pandas emits when
    # it has to fall back from the C engine on its own.
    df_titles = pd.read_csv(
        fichero,
        delimiter='<SEP>',
        header=None,
        names=["aId", "NestId", "artist", "title"],
        encoding='utf-8',
        engine='python',
    )
    return df_titles

# 1. Create the list of artists and songs to look for

## 1.1 Connect to the database

In [9]:
# Paths
# SQLite file location, expressed relative to the working directory.
path_db = os.path.join("..", "data", "MSD", "youtubeURL.db")
path_sql_connection_db = "sqlite:///{}".format(path_db)

# Connect
# `engine` and `connection` stay at notebook level; query_db uses `connection`.
engine = db.create_engine(path_sql_connection_db)
connection = engine.connect()

In [123]:
# Release the database connection opened in the cell above.
# NOTE(review): in top-to-bottom order this runs before the query cells that
# follow, which reach `connection` through query_db -- it looks like this cell
# is meant to be executed last (its execution count is the highest in view).
connection.close()

In [40]:
#query_db("SELECT COUNT(DISTINCT(artist_id)) FROM YOUTUBE_URL")

In [39]:
#query_db("SELECT * from YOUTUBE_URL where artist_id = 'ARZNFNV1187FB3DA62'",to_df = True)

## 1.2 Get the set of artists

In [44]:
# Fetch every distinct artist_id in YOUTUBE_URL as a one-column DataFrame.
get_artists = query_db("SELECT DISTINCT(artist_id) from YOUTUBE_URL",to_df = True)
# query_db builds the frame from raw rows, so the column is named here.
get_artists.columns = ["artist_id"]

279.41 ms


In [47]:
# Keep the ids from the first rows returned by head() as a plain Python list.
sample_artists = get_artists["artist_id"].head().tolist()

In [48]:
# Display the sampled artist ids.
sample_artists

['AR002UA1187B9A637D',
 'AR003FB1187B994355',
 'AR006821187FB5192B',
 'AR009211187B989185',
 'AR009SZ1187B9A73F4']

## 1.3 For one artist get the song names

In [121]:
# Song query by id
# The artist id is pinned to a literal (the recorded run shows the same value
# as sample_artists[0]); the earlier assignment from sample_artists was a dead
# store that this literal immediately overwrote, so it has been dropped.
art_q = "AR002UA1187B9A637D"
query_songs = f"SELECT DISTINCT track_id, title, artist_id, artist_name from YOUTUBE_URL where artist_id = '{art_q}'"
df_songs_art = query_db(query_songs, to_df=True)
# query_db returns an unlabeled frame; name the columns in SELECT order.
df_songs_art.columns = ["track_id", "title", "artist_id", "artist_name"]

# Join the title and the artist name into a single lowercase string
df_songs_art["tit&art"] = (
    df_songs_art["title"] + " " + df_songs_art["artist_name"]
).str.lower()

1.02 ms


In [122]:
# Display the songs found for the selected artist.
df_songs_art

Unnamed: 0,track_id,title,artist_id,artist_name,tit&art
0,TRMUOZE12903CDF721,A Picture Of You,AR002UA1187B9A637D,The Bristols,a picture of you the bristols
1,TRQAUBZ12903CDF733,The Beating Of My Heart,AR002UA1187B9A637D,The Bristols,the beating of my heart the bristols
2,TRAFPWS12903CDF737,I'll Be Gone,AR002UA1187B9A637D,The Bristols,i'll be gone the bristols
3,TRNBXWJ12903CDF72B,Little Baby,AR002UA1187B9A637D,The Bristols,little baby the bristols
4,TREYJSR12903CDF71B,You're a Moody Guy,AR002UA1187B9A637D,The Bristols,you're a moody guy the bristols
5,TRVIMSB12903CDF720,So Fine,AR002UA1187B9A637D,The Bristols,so fine the bristols
6,TRDXGYA12903CDF729,Who Does She Think She Is,AR002UA1187B9A637D,The Bristols,who does she think she is the bristols
7,TRYESJS12903CDF730,Old Man Mose,AR002UA1187B9A637D,The Bristols,old man mose the bristols
8,TRYEKZM12903CDF71D,Romeo And Juliet,AR002UA1187B9A637D,The Bristols,romeo and juliet the bristols


In [101]:
# Map each track id to the list of space-separated words of its title.
dict_songs_art = {}
for track_id, title in zip(df_songs_art["track_id"], df_songs_art["title"]):
    # IMPORTANT!! Here we set that the words in the title should match the TITLE of the MSD
    dict_songs_art[track_id] = title.split(" ")

In [102]:
# Display the track_id -> word-list mapping.
# NOTE(review): the recorded output below contains lowercase tokens that include
# the artist name, which the loop in the preceding cell (splitting `title`)
# would not produce -- the output looks stale; re-run to refresh it.
dict_songs_art

{'TRMUOZE12903CDF721': ['a', 'picture', 'of', 'you', 'the', 'bristols'],
 'TRQAUBZ12903CDF733': ['the',
  'beating',
  'of',
  'my',
  'heart',
  'the',
  'bristols'],
 'TRAFPWS12903CDF737': ["i'll", 'be', 'gone', 'the', 'bristols'],
 'TRNBXWJ12903CDF72B': ['little', 'baby', 'the', 'bristols'],
 'TREYJSR12903CDF71B': ["you're", 'a', 'moody', 'guy', 'the', 'bristols'],
 'TRVIMSB12903CDF720': ['so', 'fine', 'the', 'bristols'],
 'TRDXGYA12903CDF729': ['who',
  'does',
  'she',
  'think',
  'she',
  'is',
  'the',
  'bristols'],
 'TRYESJS12903CDF730': ['old', 'man', 'mose', 'the', 'bristols'],
 'TRYEKZM12903CDF71D': ['romeo', 'and', 'juliet', 'the', 'bristols']}

# 2. Web Scraping

In [96]:
# Start a Chrome session driven by Selenium; `browser` is reused by the cells below.
# NOTE(review): no browser.quit() is visible in this section -- confirm the
# session is closed elsewhere.
browser = webdriver.Chrome()

In [5]:
# Open the YouTube results page for a search term.
query = "Rihanna"
# URL-encode the term so queries containing spaces, '&', '#', apostrophes, etc.
# (e.g. "title + artist" strings) produce a valid search URL.
browser.get(f"https://www.youtube.com/results?search_query={quote_plus(query)}")

In [7]:
# ids = browser.find_elements_by_xpath('//*[@href]')
# for ii in ids:
#     print(ii.get_attribute('href'))

In [8]:
# ids = browser.find_elements_by_xpath('//*[@href]')
# for ii in ids:
#     print(ii.get_attribute('title'))
# (/*[@id="video-title"]/yt-formatted-string)

In [9]:
# Collect the visible text of every element whose id is 'video-title'.
# find_elements(By.ID, ...) replaces find_elements_by_id, which was removed
# in Selenium 4; this form works on both Selenium 3 and 4.
ids = browser.find_elements(By.ID, 'video-title')
titles = [x.text for x in ids]
# print out all the titles.
print('titles:')
print(titles, '\n')

titles:
['Mix: Rihanna', 'Rihanna - Only Girl (In The World) (Official Music Video)', 'Rihanna - Diamonds', 'Rihanna - Diamonds', 'Rihanna - Love On The Brain', 'Rihanna - We Found Love ft. Calvin Harris', 'Rihanna - Man Down', 'Rihanna Megamix - The Adventures of BadGalRiRi (40+ Hits in 1 Megamix!)', 'Rihanna - Umbrella (Orange Version) (Official Music Video) ft. JAY-Z', 'DJ Khaled ft. Rihanna, Bryson Tiller - Wild Thoughts (Official Video)', "Rihanna's Best Songs", "Rihanna - What's My Name? (Official Music Video) ft. Drake", 'Rihanna - Only Girl (In The World) (Official Music Video)', 'Rihanna - Where Have You Been', 'Calvin Harris - This Is What You Came For (Official Video) ft. Rihanna', "Rihanna - Don't Stop The Music", 'Rihanna - Rude Boy (Official Music Video)', 'Rihanna - Needed Me', 'Eminem - Love The Way You Lie ft. Rihanna', 'Rihanna - Only Girl (In The World) (Official Music Video)', "Rihanna - What's My Name? (Official Music Video) ft. Drake", 'Rihanna Love On the Brain |

In [None]:
# Same lookup as the previous titles cell, kept as a separate unexecuted cell.
# find_elements(By.ID, ...) replaces find_elements_by_id, which was removed
# in Selenium 4; this form works on both Selenium 3 and 4.
ids = browser.find_elements(By.ID, 'video-title')
titles = [x.text for x in ids]
# print out all the titles.
print('titles:')
print(titles, '\n')

In [None]:
# Print the `title` attribute of every element whose id is 'video-title'.
# find_elements(By.ID, ...) replaces find_elements_by_id, which was removed
# in Selenium 4; this form works on both Selenium 3 and 4.
ids = browser.find_elements(By.ID, 'video-title')
for ii in ids:
    print(ii.get_attribute('title'))

In [10]:
# Print the `href` attribute of every element whose id is 'video-title'.
# Elements without an href print None (see the recorded output below).
# find_elements(By.ID, ...) replaces find_elements_by_id, which was removed
# in Selenium 4; this form works on both Selenium 3 and 4.
ids = browser.find_elements(By.ID, 'video-title')
for ii in ids:
    print(ii.get_attribute('href'))

None
None
None
https://www.youtube.com/watch?v=lWA2pjMjpBs
https://www.youtube.com/watch?v=0RyInjfgNc4
https://www.youtube.com/watch?v=tg00YEETFzg
https://www.youtube.com/watch?v=sEhy-RXkNo0
https://www.youtube.com/watch?v=JUuw1O-SRts
https://www.youtube.com/watch?v=CvBfHwUxHIk
https://www.youtube.com/watch?v=fyaI4-5849w
None
None
None
https://www.youtube.com/watch?v=HBxt_v0WF6Y
https://www.youtube.com/watch?v=kOkQ4T5WO9E
https://www.youtube.com/watch?v=yd8jh9QYfEs
https://www.youtube.com/watch?v=e82VE8UtW8A
https://www.youtube.com/watch?v=wfN4PVaOU5Q
https://www.youtube.com/watch?v=uelHwf8o7_U
https://www.youtube.com/watch?v=pa14VNsdSYM
https://www.youtube.com/watch?v=U0CGsw6h60k
https://www.youtube.com/watch?v=yXvyJDqqQec
https://www.youtube.com/watch?v=o3mP3mJDL2k


In [2]:
# Collect the visible text of every element carrying the CSS class 'video-title'.
# find_elements(By.CLASS_NAME, ...) replaces find_elements_by_class_name, which
# was removed in Selenium 4; this form works on both Selenium 3 and 4.
ids = browser.find_elements(By.CLASS_NAME, 'video-title')
titles = [x.text for x in ids]
# print out all the titles.
print('titles:')
print(titles, '\n')

ModuleNotFoundError: No module named 'numpy'