In [1]:
import asyncio
import datetime
import json
import re
import time
import warnings
from typing import Callable, Coroutine, List
from urllib.parse import urljoin
from urllib.request import urlopen

import aiohttp
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree

In [2]:
# Browser-like User-Agent string and the header mapping that carries it.
user_agent = (
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/90.0.4430.93 Safari/537.36"
)
headers = dict([('User-Agent', user_agent)])

In [3]:
# First get all page links
def get_page_links(base_url="https://www.songkick.com/festivals/countries/world"):
    """Return the URL of every paginated listing page under ``base_url``.

    The first page is downloaded and the text of the last ``rel="nofollow"``
    link inside the last ``div#event-listings`` element is read as the number
    of the final page.

    Args:
        base_url: Listing URL without the ``?page=`` query string.

    Returns:
        A list of ``"{base_url}?page={i}"`` strings for ``i`` in
        ``1..last_page``. When no numeric pager link can be found a warning
        is emitted and only page 1 is returned.
    """
    # The context manager closes the HTTP response once parsing is finished.
    with urlopen(f"{base_url}?page=1") as response:
        soup = BeautifulSoup(response, "html.parser")

    last_page = None
    listings = soup.find_all('div', id="event-listings")
    if listings:
        pager_links = listings[-1].find_all('a', rel="nofollow")
        if pager_links:
            label = pager_links[-1].text.strip()
            if label.isdigit():
                last_page = int(label)

    if last_page is None:
        # Without a readable pager, fall back to the single page fetched above
        # rather than failing with an opaque IndexError/ValueError.
        warnings.warn(
            f"No numeric pagination link found on {base_url}?page=1; "
            "assuming a single page."
        )
        last_page = 1

    return [f"{base_url}?page={i}" for i in range(1, last_page + 1)]
# Async function to get all festival links
async def get_festival_links(session, url, hrefs=None):
    """Fetch one listing page and collect the links found in its event listings.

    Args:
        session: Open ``aiohttp.ClientSession`` used to issue the GET request.
        url: URL of the listing page to download.
        hrefs: Optional list to append the links to. A new list is created
            for each call when omitted, so concurrent calls never share state.

    Returns:
        The list the links were appended to. Each link is resolved against
        ``https://www.songkick.com/``.
    """
    if hrefs is None:
        hrefs = []
    async with session.get(url) as response:
        html = await response.text()
    soup = BeautifulSoup(html, "html.parser")
    for listing in soup.find_all('div', id="event-listings"):
        for anchor in listing.find_all('a', href=True):
            href = anchor['href']
            # Skip links that point at another listing page.
            # NOTE(review): pager links on the country listing may use a
            # different prefix than this one - confirm against the live markup.
            if href.startswith('/festivals?page='):
                continue
            # urljoin avoids the "//" produced by naive concatenation when
            # the href is already root-relative.
            hrefs.append(urljoin("https://www.songkick.com/", href))
    return hrefs
# Main function to get all festival links
async def main():
    """Collect the unique links from every listing page.

    Returns:
        A list of distinct URLs in first-seen order, gathered from all pages
        returned by ``get_page_links()``.
    """
    async with aiohttp.ClientSession() as session:
        urls = get_page_links()
        tasks = [get_festival_links(session, url) for url in urls]
        per_page = await asyncio.gather(*tasks)
    # Flatten every page's result (not just the first) and de-duplicate while
    # keeping a deterministic order.
    return list(dict.fromkeys(link for page in per_page for link in page))

# Run the link-collection coroutine. A bare top-level `await` is not valid in
# a plain Python script; this cell relies on the notebook kernel supporting it.
hrefs = await main()
# Display how many links were collected.
len(hrefs)

1634

In [4]:
# Async function to get all festival data
async def get_festival_data(session, url):
    """Download one festival page and extract its line-up rows.

    Args:
        session: Open ``aiohttp.ClientSession`` used to issue the GET request.
        url: URL of the festival page.

    Returns:
        A list of ``[artiste, name, date, venue, location]`` lists, one per
        ``<li>`` in the festival line-up, or ``None`` when the page lacks the
        header or line-up elements. ``venue`` and ``location`` are both
        ``np.nan`` when the venue details cannot be found.
    """
    async with session.get(url) as response:
        html = await response.text()
    soup = BeautifulSoup(html, "html.parser")
    block = soup.find('div', class_="col-8 primary")
    try:
        header = soup.find('div', class_="date-and-name")
        name = header.find('span').text
        date = header.find('p').text
        try:
            venue = block.find('div', class_="venue-info-details").find('a').text
            location = block.find('p', class_="venue-hcard").find('span').text.strip()
        except AttributeError:
            # A missing element makes ``find`` return None, so the chained
            # attribute access fails; treat the whole venue as unknown.
            venue = np.nan
            location = np.nan
        lineup = (
            block.find('div', class_="festival-details")
            .find('ul', class_="festival")
            .find_all('li')
        )
    except AttributeError:
        # Not a parsable festival page: signal "no data" to the caller.
        # Only AttributeError is caught so that interrupts and genuine
        # programming errors are no longer silently swallowed.
        return None
    return [[item.text, name, date, venue, location] for item in lineup]
# Main function to get all festival data
async def get_details():
    """Scrape every URL in the module-level ``hrefs`` concurrently.

    Returns:
        The non-``None`` results of ``get_festival_data``, in the same order
        as ``hrefs``.
    """
    async with aiohttp.ClientSession() as session:
        pending = [get_festival_data(session, link) for link in hrefs]
        results = await asyncio.gather(*pending)
        # Drop the pages for which no data was returned.
        return [rows for rows in results if rows is not None]
details = await get_details()
df = pd.DataFrame(columns=['artiste', 'festival', 'date', 'venue', 'location'])
for detail in details:
    df = pd.concat([df, pd.DataFrame(detail, columns=['artiste', 'festival', 'date', 'venue', 'location'])], axis=0)
df = df.reset_index(drop=True)
df['date_start'] = df.date.apply(lambda x: x.split(' – ')[0].strip())
df['date_start'] = pd.to_datetime(df['date_start'])
df['date_end'] = df.date.apply(lambda x: x.split(' – ')[1].strip() if ' – ' in x else x)
df['date_end'] = pd.to_datetime(df['date_end'])
df.drop('date', axis=1, inplace=True)
df['address'] = df.location.apply(lambda x: str(x).split('\n')[0])
df['city'] = df.location.apply(lambda x: str(x).split('\n')[-1].split(',')[0])
df['country'] = df.location.apply(lambda x: str(x).split('\n')[-1].split(',')[-1])
df.drop('location', axis=1, inplace=True)
df

Unnamed: 0,artiste,festival,venue,date_start,date_end,address,city,country
0,Arctic Monkeys,The Falls Music and Arts Festival,,2022-12-31,2023-01-02,,,
1,Telenova,The Falls Music and Arts Festival,,2022-12-31,2023-01-02,,,
2,Lordi,HAMMERFEST XIV,O2 Academy Birmingham,2023-02-11,2023-02-11,"16-18 Horsefair, Bristol Street",Birmingham,UK
3,Discharge,HAMMERFEST XIV,O2 Academy Birmingham,2023-02-11,2023-02-11,"16-18 Horsefair, Bristol Street",Birmingham,UK
4,Primordial,HAMMERFEST XIV,O2 Academy Birmingham,2023-02-11,2023-02-11,"16-18 Horsefair, Bristol Street",Birmingham,UK
...,...,...,...,...,...,...,...,...
8355,Jackson & the Janks,Brooklyn Folk Festival,St. Ann & the Holy Trinity Church,2022-10-21,2022-10-23,157 Montague St.,Brooklyn,US
8356,Clinton Davis,Brooklyn Folk Festival,St. Ann & the Holy Trinity Church,2022-10-21,2022-10-23,157 Montague St.,Brooklyn,US
8357,Dan + Claudia Zanes,Brooklyn Folk Festival,St. Ann & the Holy Trinity Church,2022-10-21,2022-10-23,157 Montague St.,Brooklyn,US
8358,The Lovestruck Balladeers,Brooklyn Folk Festival,St. Ann & the Holy Trinity Church,2022-10-21,2022-10-23,157 Montague St.,Brooklyn,US


In [None]:
# Show every line-up row that belongs to the festival named 'Ouest Park'.
df.loc[df['festival'] == 'Ouest Park']