In [210]:
import pandas as pd
import sys
import numpy as np
import os
import requests
import datetime
import base64
import json
import re
from collections import Counter, defaultdict
import psycopg2
import tqdm
from bs4 import BeautifulSoup
from importlib import reload

In [197]:
def get_title(content):
    title = content.find("h3", class_ = "yt-lockup-title")
    return title.find("a").get("title")

def get_metadata(content):
    metadata = content.find("div", class_ = "yt-lockup-meta")
    visualizations = metadata.find_all("li")
    number_views = "-1"
    for vv in visualizations:
        if "visual" in vv.text:
            number_views = vv.text 
    return number_views

def get_artist(content):
    return content.find("div", class_ = "yt-lockup-byline").text

def get_descr(content):
    description = content.find("div", class_ = "yt-lockup-description")
    return description.text

def get_href(content):
    return content.find("a").get("href")

# Clean string
def clean_string(st):
    st = st.lower()
    st = st.replace(","," ")
    st = st.replace(";", " ")
    st = st.replace("&", " ")
    st = st.replace("[", " ")
    st = st.replace("]", " ")
    st = st.replace("-", " ")
    return st

def split_string(st):
    return [particle for particle in st.split(" ") if len(particle)>0]

def match_title(queried, qtrack, qartist, a, b, c):
    """
    a = title.find("a").get("title") (text that appears on the title)
    b = artist_yt.text (text that appears on the artist section)
    c = description.text (text that appears on the description)
    queried = track_name + " " + artist_name
    qtrack = track_name
    qartist = artist_name
    """
    
    # Clearning
    a = clean_string(a)
    b = clean_string(b)
    c = clean_string(c)
    queried = clean_string(queried)
    qtrack = clean_string(track_name)
    qartist = clean_string(qartist)
    
    # Splitting
    a = set(split_string(a))
    b = set(split_string(b))
    c = set(split_string(c))
    queried = set(split_string(queried))
    qtrack = set(split_string(qtrack))
    qartist = set(split_string(qartist))
    
    # Matching
    match = False
    lquery = len(queried) #length of total query
    ltrack = len(qtrack) #length of elements of qtrack
    lartist = len(qartist) # length of elements of qartis
    
    # Check match with title
    if len(a & queried) == lquery:
        match = True
    elif len(b & queried) == lartist: # if the artist name coincides
        # if the title contains the track name or at least almost all words except 1
        if len(a & qtrack) >= (ltrack -1): 
            match = True
    elif len(c & queried) == lquery:
        match = True
    elif len(a & queried) == (lquery - 1): # allow one word not to be present
        match = True
        
    return match

In [240]:
# Read the queries dataframe
df = pd.read_csv("../data/01_queries_yt/queries.csv", delimiter = ";")
df.set_index("index", inplace=True)

df = df.loc[range(1,10000)]
df = df.sample(30)

In [243]:
df.head()

Unnamed: 0_level_0,artist_id,artist_name,popularity,is_main,track_id,track_name,streams
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6553,1Hsdzj7Dlq2I7tHP7501T4,Niall Horan,84,True,764Q4RKFyJCisAofG4mQiA,you and me,52702
3164,7c0XG5cIJTrrAgEC3ULPiq,Ty Dolla $ign,89,True,4ccv9Mfcsc3MpaSi9h9tH6,whatever you on,14994
2161,6vWDO969PvNqNYHIOW5v0m,Beyoncé,91,True,7I2amOVnwKIZj0tXz6Lz3E,flawless remix,6651537
4639,0EmeFodog0BfCgMzAIvKQp,Shakira,86,True,3N18UD3tL8TPKsiUWN9YRR,comme moi,167028
5692,3p7PcrEHaaKLJnPUGOtRlT,Henrique & Juliano,85,True,3056x1XkmICnQmji68rrft,desejando eu (ao vivo),3275316


In [244]:
results = dict()
failed = defaultdict(dict)

In [252]:
for i, row in tqdm.tqdm_notebook(df.iterrows()):
    artist_name = row.artist_name
    artist_id = row.artist_id
    track_name = row.track_name
    track_id = row.track_id
    
    # Query
    queried = track_name + " " + artist_name
    qtrack = track_name
    qartist = artist_name
    qq = f'https://www.youtube.com/results?search_query={queried}'
    
    # Requests
    r = requests.get(qq)
    r.encoding = 'utf-8'

    # Read the html text with Beautiful soup
    soup = BeautifulSoup(r.text, 'html.parser')
    
    # All titles entries
    all_content = soup.find_all("div", class_="yt-lockup-content")
    if not len(all_content):
        failed[artist_id][track_id] = "CONTENT" + " " + queried
    for content in all_content:
        
        # TITLE
        try:
            yt_title = get_title(content)
        except:
            yt_title = ""
        # ARTIST
        try:
            yt_artist = get_artist(content)
        except:
            yt_artist = ""
        
        # DESCRIPTION
        try:
            yt_desc = get_descr(content)
        except:
            yt_desc = ""
        
        
        # METADATA
        try:
            yt_meta = get_metadata(content)
        except:
            yt_meta = ""
            
        # GET HREF: avoid playlist (string list= in the href url)
        track_url = get_href(content)
        if "list=" in track_url:
            failed[artist_id][track_id] = "PLAYLIST"
            continue
        
        # Check match
        match = match_title(queried, qtrack, qartist, yt_title, yt_artist, yt_desc)

        # If we have a match and it's not a playlist
        if match and len(track_url):
            
            # Create an entry for this artist
            if not artist_id in results:
                results[artist_id] = dict()
                
            # Save the song info
            results[artist_id][track_id] = dict()
            results[artist_id][track_id]["name"] = track_name
            results[artist_id][track_id]["href"] = "https://www.youtube.com" + track_url
            results[artist_id][track_id]["meta"] = yt_meta
            
            # Break the loop for the tiltes and go to another song
            break
        else:
            failed[artist_id][track_id] = queried
            continue

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [257]:
results

{'0EmeFodog0BfCgMzAIvKQp': {'3N18UD3tL8TPKsiUWN9YRR': {'name': 'comme moi',
   'href': 'https://www.youtube.com/watch?v=YVaahuLdrng',
   'meta': '25.969.125 visualizaciones'}},
 '2FXC3k01G6Gw61bmprjgqS': {'7dS5EaCoMnN7DzlpT6aRn2': {'name': 'take me to church',
   'href': 'https://www.youtube.com/watch?v=PVjiKRfKpPI',
   'meta': '367.683.755 visualizaciones'}},
 '6TIYQ3jFPwQSRmorSezPxX': {'1bWEngw5tAF7vWByrf5Oy5': {'name': 'bloody valentine',
   'href': 'https://www.youtube.com/watch?v=wSdT-SArM2Q',
   'meta': '22.818.029 visualizaciones'}},
 '04gDigrS5kc9YWfZHwBETP': {'4tnnfGGLBYWwotE4Q8H5jf': {'name': 'sugar - remix',
   'href': 'https://www.youtube.com/watch?v=0q-80dzp6PU',
   'meta': '3.959.549 visualizaciones'}},
 '1wZtkThiXbVNtj6hee6dz9': {'1dNTcbUtbU3wRxKBCno6bd': {'name': 'te siento',
   'href': 'https://www.youtube.com/watch?v=SKWxOsbt9gU',
   'meta': '231.636.561 visualizaciones'}},
 '1dfeR4HaWDbWqFHLkxsg1d': {'2fuCquhmrzHpu5xcA1ci9x': {'name': 'under pressure - remastered',
 

In [230]:
queried

'marijuana Hef'

In [254]:
queried = "me gusta Anuel Aa"
qq = f'https://www.youtube.com/results?search_query={queried}'

# Requests
r = requests.get(qq)
r.encoding = 'utf-8'

# Read the html text with Beautiful soup
soup = BeautifulSoup(r.text, 'html.parser')

# All titles entries
all_content = soup.find_all("div", class_="yt-lockup-content")

In [256]:
soup

<!DOCTYPE html>
<html dir="ltr" gl="ES" lang="es-ES" style="font-size: 10px;font-family: Roboto, Arial, sans-serif;"><head><meta content="IE=edge" http-equiv="X-UA-Compatible"/><meta content="AhbmRDASY7NuOZD9cFMgQihZ+mQpCwa8WTGdTx82vSar9ddBQbziBfZXZg+ScofvEZDdHQNCEwz4yM7HjBS9RgkAAABneyJvcmlnaW4iOiJodHRwczovL3lvdXR1YmUuY29tOjQ0MyIsImZlYXR1cmUiOiJXZWJDb21wb25lbnRzVjAiLCJleHBpcnkiOjE2MDM0ODY4NTYsImlzU3ViZG9tYWluIjp0cnVlfQ==" data-expires="2020-10-23" data-feature="Web Components V0" http-equiv="origin-trial"/><meta content="Av2+1qfUp3MwEfAFcCccykS1qFmvLiCrMZ//pHQKnRZWG9dldVo8HYuJmGj2wZ7nDg+xE4RQMQ+Ku1zKM3PvYAIAAABmeyJvcmlnaW4iOiJodHRwczovL2dvb2dsZS5jb206NDQzIiwiZmVhdHVyZSI6IldlYkNvbXBvbmVudHNWMCIsImV4cGlyeSI6MTYwMzgzNjc3MiwiaXNTdWJkb21haW4iOnRydWV9" data-expires="2020-10-27" data-feature="Web Components V0" http-equiv="origin-trial"/><meta content="AixUK+8UEShlt6+JX1wy9eg+XL+eV5PYSEDPH3C90JNVbIkE1Rg1FyVUfu2bZ/y6Pm1xbPLzuwHYHjv4uKPNnA4AAABqeyJvcmlnaW4iOiJodHRwczovL2dvb2dsZXByb2QuY29tOjQ0My

In [207]:
len(set(df.artist_id))

99

In [208]:
len(fail.keys())

NameError: name 'fail' is not defined