In [1]:
import pandas as pd
import numpy as np
import requests
import time
import os
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

### Web Scraping
Here I used Selenium's WebDriver to open phantasytour.com and scrape all 109 Dead & Company shows for their setlists, dates, and tour names.

In [2]:
# URL of the setlist listing page that the webdriver loads in a later cell.
setlists_url = "https://www.phantasytour.com/bands/dead/setlists"

In [3]:
def get_shows(driver):
    """Parse the setlist table on the currently loaded page into per-show dicts.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session whose current page contains the
        ``<tbody data-bind='foreach: setlists'>`` table.

    Returns
    -------
    list of dict
        One single-key dict per show, in page order. The key is the first
        text line of the show's block and the value is the list of the
        remaining lines of that block. Blocks that contain only whitespace
        are skipped.
    """
    # The locator strategy is given as the plain string "xpath" (the value of
    # selenium's By.XPATH). find_element(by, value) is available in old and
    # new Selenium releases, whereas the find_element_by_* helpers were
    # removed in Selenium 4.3+.
    showstable = driver.find_element("xpath", "//tbody[@data-bind='foreach: setlists']")

    # Each show's text is followed by one of these two link captions. Splitting
    # without capture groups keeps the captions (and None placeholders) out of
    # the result, so only the show text itself is left to process.
    chunks = re.split(
        "Submit a setlist correction|Submit a setlist for this show",
        showstable.text,
    )

    shows_clean = []
    for chunk in chunks:
        # Nothing useful between (or after) captions: avoid emitting {'': []}.
        if not chunk.strip():
            continue

        lines = chunk.split('\n')
        # Text that follows a caption starts with the newline that ended the
        # caption's line; drop that leading empty entry.
        if lines[0] == '':
            lines = lines[1:]

        shows_clean.append({lines[0]: lines[1:]})

    return shows_clean
    
    
def get_next_page(pg_num, pre_wait=5, post_wait=20):
    """Navigate the browser to page ``pg_num`` of the setlist table.

    Uses the module-level ``driver`` object rather than taking the browser
    session as an argument, so ``driver`` must exist before this is called.

    Parameters
    ----------
    pg_num
        Page number interpolated into the pagination link's ``#/page/<n>`` href.
    pre_wait : int or float, optional
        Seconds to sleep before looking up the link. Defaults to 5, the
        previously hard-coded value.
    post_wait : int or float, optional
        Seconds to sleep after activating the link. Defaults to 20, the
        previously hard-coded value.
        NOTE(review): presumably this gives the table time to re-render
        before it is scraped - confirm.
    """
    print(f"Next Page: counter - {pg_num}")
    time.sleep(pre_wait)

    next_page = f"//a[@href = '#/page/{pg_num}']"

    # find_element("xpath", ...) works on old and new Selenium releases; the
    # find_element_by_xpath helper was removed in Selenium 4.3+.
    next_page_button = driver.find_element("xpath", next_page)
    # The link is activated by sending RETURN to it rather than by clicking.
    next_page_button.send_keys(Keys.RETURN)

    time.sleep(post_wait)

In [4]:
# Path to the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to point at a local copy; otherwise the original machine-specific
# location is used.
chromedriver = os.environ.get(
    "CHROMEDRIVER_PATH", "/Users/brian_newborn/Downloads/chromedriver"
)
os.environ["webdriver.chrome.driver"] = chromedriver

# NOTE(review): the executable path is passed positionally; newer Selenium
# releases expect a Service object instead - confirm against the installed version.
driver = webdriver.Chrome(chromedriver)

# Re-states the listing URL so this cell does not depend on the earlier assignment.
setlists_url = "https://www.phantasytour.com/bands/dead/setlists"
driver.get(setlists_url)

In [5]:
# Visit pagination links 1 through 4 in order. After each navigation, parse the
# table currently on screen and store that page's shows as one list entry.
pages_of_shows = []
for page_number in range(1, 5):
    get_next_page(page_number)
    shows_on_page = get_shows(driver)
    pages_of_shows.append(shows_on_page)
    
    


Next Page: counter - 1
Next Page: counter - 2
Next Page: counter - 3
Next Page: counter - 4


In [6]:
# Sanity check: one list of shows was stored per scraped page.
len(pages_of_shows)

4

### Data Cleaning
The data here was pretty messy: some of the early shows have a notes section between the title and the sets, while others have notes after the sets. The data moves from dictionaries to pandas DataFrames and back to lists a few times to keep the cleaning flexible.

In [7]:
# Inspect the fourth scraped page (index 3) to see the raw structure of each show dict.
pages_of_shows[3]

[{'11/27/2015 • MGM Grand Garden Arena • Las Vegas, NV': ['Tour: Dead & Co. Fall Tour 2015',
   'Notes: Dead and Company featuring Bob Weir, Bill Kreutzmann, Mickey Hart, John Mayer, Oteil Burbridge and Jeff Chimenti',
   "Set 1: Shakedown Street, Uncle John's Band, Althea, Jack Straw, Row Jimmy, Black-Throated Wind, Casey Jones",
   'Set 2: Dark Star, Playing in the Band, Let It Grow, St. Stephen, Drums, Space, Wharf Rat, Sugar Magnolia',
   'Encore: Ripple',
   '']},
 {'11/25/2015 • 1st Bank Center • Broomfield, CO': ['Tour: Dead & Co. Fall Tour 2015',
   'Notes: Dead and Company featuring Bob Weir, Bill Kreutzmann, Mickey Hart, John Mayer, Oteil Burbridge and Jeff Chimenti',
   'Set 1: Hell in a Bucket, Brown-Eyed Women, Feel Like a Stranger, Peggy-O, Little Red Rooster, Bird Song, The Music Never Stopped',
   "Set 2: Truckin, He's Gone, Eyes of the World, Terrapin Station, Drums, Space, Stella Blue, China Cat Sunflower, I Know You Rider",
   'Encore: Not Fade Away',
   '']},
 {'11/

In [8]:
# Flatten the per-page lists into one list of show dicts, preserving page
# order and the order of shows within each page.
full_setlists = [show for page in pages_of_shows for show in page]

In [9]:
# Display the flattened list: one single-key dict per show (title -> scraped lines).
full_setlists

[{'07/07/2018 • Dodger Stadium • Los Angeles, CA': ['Tour: Summer Tour 2018',
   'Set 1: Playing in the Band, Bertha, Jack Straw, Big Railroad Blues, Peggy-O, Ramble On Rose, Duck Joke, Cumberland Blues, Deal',
   'Set 2: Sugar Magnolia> Scarlet Begonias> Fire on the Mountain> Althea> Eyes of the World> Drums> Space> Stella Blue, Sunshine Daydream',
   'Encore: Brokedown Palace, Not Fade Away, One More Saturday Night',
   '']},
 {'07/06/2018 • Mattress Firm Amphitheater • Chula Vista, CA': ['Tour: Summer Tour 2018',
   'Set 1: Hell in a Bucket, Big Boss Man, Tennessee Jed, Friend of the Devil, Bird Song> Loose Lucy> Bird Song',
   'Set 2: Mr. Charlie, Truckin> New Speedway Boogie> Estimated Prophet> Shakedown Street> Drums> Space> I Need a Miracle> Dear Prudence> U.S. Blues',
   "Encore: Knockin on Heaven's Door",
   '']},
 {'07/03/2018 • Shoreline Amphitheatre • Mountain View, CA': ['Tour: Summer Tour 2018',
   'Set 1: Iko Iko, Alabama Getaway, New Minglewood Blues, Brown-Eyed Women, 

In [10]:
# Build one single-column frame per show (column label = show title, rows =
# the lines scraped for that show) and join them side by side in ONE concat
# call. Growing `df` inside the loop re-copies the accumulated frame on every
# iteration; a single concat gives the same result without that cost. Shows
# with fewer lines are padded with NaN.
show_frames = [pd.DataFrame.from_dict(show) for show in full_setlists]
# pd.concat raises on an empty list, so keep the empty-frame result for no shows.
df = pd.concat(show_frames, axis=1) if show_frames else pd.DataFrame()

In [11]:
# Transposed view: one row per show, one column per scraped line position.
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
"07/07/2018 • Dodger Stadium • Los Angeles, CA",Tour: Summer Tour 2018,"Set 1: Playing in the Band, Bertha, Jack Straw...",Set 2: Sugar Magnolia> Scarlet Begonias> Fire ...,"Encore: Brokedown Palace, Not Fade Away, One M...",,,,,,,,
"07/06/2018 • Mattress Firm Amphitheater • Chula Vista, CA",Tour: Summer Tour 2018,"Set 1: Hell in a Bucket, Big Boss Man, Tenness...","Set 2: Mr. Charlie, Truckin> New Speedway Boog...",Encore: Knockin on Heaven's Door,,,,,,,,
"07/03/2018 • Shoreline Amphitheatre • Mountain View, CA",Tour: Summer Tour 2018,"Set 1: Iko Iko, Alabama Getaway, New Minglewoo...",Set 2: China Cat Sunflower> Althea> Viola Lee ...,Encore: Ripple,2nd verse,,,,,,,
"07/02/2018 • Shoreline Amphitheatre • Mountain View, CA",Tour: Summer Tour 2018,"Set 1: Feel Like a Stranger, Dancin in the Str...",Set 2: Lost Sailor> Saint of Circumstance> Hel...,Encore: Touch of Grey,1st verse,,,,,,,
"06/30/2018 • Autzen Stadium • Eugene, OR",Tour: Summer Tour 2018,"Set 1: Deal, Me and My Uncle, Here Comes Sunsh...",Set 2: Dark Star> El Paso> Dark Star> St. Step...,Encore: Brokedown Palace,,,,,,,,
"06/29/2018 • Gorge Amphitheatre • George, WA",Tour: Summer Tour 2018,"Set 1: Mississippi Half-Step Uptown Toodeloo, ...","Set 2: Playing in the Band> Eyes of the World,...",Encore: U.S. Blues,Live debut by Dead & Company,,,,,,,
"06/23/2018 • Alpine Valley Music Theatre • East Troy, WI",Tour: Summer Tour 2018,"Set 1: The Music Never Stopped> Easy Answers, ...","Set 2: The Weight, Shakedown Street, Althea, C...",Encore: One More Saturday Night,Sit in by Justin Vernon,,,,,,,
"06/22/2018 • Alpine Valley Music Theatre • East Troy, WI",Tour: Summer Tour 2018,"Set 1: Hell in a Bucket, Brown-Eyed Women, Gre...",Set 2: Viola Lee Blues> Estimated Prophet> Unc...,Encore: Ripple,,,,,,,,
"06/20/2018 • Blossom Music Center • Cuyahoga Falls, OH",Tour: Summer Tour 2018,"Set 1: Feel Like a Stranger, They Love Each Ot...",Set 2: Lost Sailor> Saint of Circumstance> He'...,Encore: Knockin on Heaven's Door,,,,,,,,
"06/19/2018 • Darien Lake P.A.C. • Darien Center, NY",Tour: Summer Tour 2018,"Set 1: Cold Rain and Snow, Tennessee Jed, Dire...","Set 2: Iko Iko, Dark Star> Truckin> Smokestack...",Encore: Werewolves of London,,,,,,,,


After creating "df", I soon realized that the setlist entries had different data structures, so I would have to clean them pretty heavily. The 2015 shows, for example, have a "Notes" section early on that would have to be removed.

In [12]:
# Print the scraped lines of one 2015 show to see where its "Notes" entry sits.
target_show = '12/27/2015 • Bill Graham Civic Auditorium • San Francisco, CA'
for show_title, show_row in df.T.iterrows():
    if show_title != target_show:
        continue
    print(show_row)

0                      Tour: Dead & Co. NYE Run 2015-16
1     Notes: Dead and Company featuring Bob Weir, Bi...
2     Set 1: Truckin> Cold Rain and Snow, Brown-Eyed...
3     Set 2: Samson and Delilah, Deal> He's Gone> Es...
4                           Encore: Ripple, Casey Jones
5                                                      
6                                                   NaN
7                                                   NaN
8                                                   NaN
9                                                   NaN
10                                                  NaN
11                                                  NaN
Name: 12/27/2015 • Bill Graham Civic Auditorium • San Francisco, CA, dtype: object


I just want the Tour, Set 1, Set 2 and Encore sections, so let's iterate through each show and each item within it, and keep only the items that start with one of those four phrases.

In [13]:
# Only lines that begin with one of these section labels are kept. None of the
# labels contains ":", so testing the start of the whole line is equivalent to
# testing the text before the first colon.
wanted_prefixes = ("Tour", "Set 1", "Set 2", "Encore")

df_clean_list = list()

for idx, row in df.T.iterrows():
    # Drop the NaN padding added when shows of different lengths were combined.
    row = row.dropna()

    # Each cleaned record starts with the show title (the row label), followed
    # by the kept lines in their original order.
    clean_data = [idx]

    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for r_idx, item in row.items():
        if item.startswith(wanted_prefixes):
            clean_data.append(item)

    df_clean_list.append(clean_data)

    

In [14]:
# Display the cleaned records: [title, tour line, set 1 line, set 2 line, encore line] per show.
df_clean_list

[['07/07/2018 • Dodger Stadium • Los Angeles, CA',
  'Tour: Summer Tour 2018',
  'Set 1: Playing in the Band, Bertha, Jack Straw, Big Railroad Blues, Peggy-O, Ramble On Rose, Duck Joke, Cumberland Blues, Deal',
  'Set 2: Sugar Magnolia> Scarlet Begonias> Fire on the Mountain> Althea> Eyes of the World> Drums> Space> Stella Blue, Sunshine Daydream',
  'Encore: Brokedown Palace, Not Fade Away, One More Saturday Night'],
 ['07/06/2018 • Mattress Firm Amphitheater • Chula Vista, CA',
  'Tour: Summer Tour 2018',
  'Set 1: Hell in a Bucket, Big Boss Man, Tennessee Jed, Friend of the Devil, Bird Song> Loose Lucy> Bird Song',
  'Set 2: Mr. Charlie, Truckin> New Speedway Boogie> Estimated Prophet> Shakedown Street> Drums> Space> I Need a Miracle> Dear Prudence> U.S. Blues',
  "Encore: Knockin on Heaven's Door"],
 ['07/03/2018 • Shoreline Amphitheatre • Mountain View, CA',
  'Tour: Summer Tour 2018',
  'Set 1: Iko Iko, Alabama Getaway, New Minglewood Blues, Brown-Eyed Women, Cassidy, Ship of Foo

In [15]:
# Turn every cleaned record into a single-column frame and join them side by
# side in one concat call, instead of re-copying a growing frame on each loop
# iteration. The result has one column per show.
clean_frames = [pd.DataFrame(show) for show in df_clean_list]
# pd.concat raises on an empty list, so keep the empty-frame result for no shows.
df_clean = pd.concat(clean_frames, axis=1) if clean_frames else pd.DataFrame()

`df_clean` is now good enough to start with. The next step is to unfold the Set 1 and Set 2 cells, but at least each column is now aligned with the others.

In [16]:
# Raise the display limits so the long setlist strings are shown in full.
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
# None means "do not truncate cell text". The older sentinel -1 has been
# deprecated since pandas 1.0 and is rejected by newer releases, so it is only
# used as a fallback for pandas versions that do not accept None.
try:
    pd.options.display.max_colwidth = None
except ValueError:
    pd.options.display.max_colwidth = -1

In [17]:
# Replacement text for the "Set 1" and tour cells of one show; written into
# df_clean by position in a later cell.
sf_set1 = (
    "Set 1: Playing in the Band, Deal, Scarlet Begonias> Fire on the Mountain, "
    "Not Fade Away, Touch of Grey"
)
sf_tour = (
    "Band Together Benefit: Benefit show to raise money for those affected "
    "by the North Bay wildfires"
)

In [18]:
# Flip to one row per show and replace the old column labels with a fresh
# 0..n-1 index. Not idempotent: re-running this cell flips the frame back.
df_clean = df_clean.transpose().reset_index(drop=True)

In [19]:
# Display the cleaned frame: columns 0-4 hold title, tour, set 1, set 2 and encore.
df_clean

Unnamed: 0,0,1,2,3,4
0,"07/07/2018 • Dodger Stadium • Los Angeles, CA",Tour: Summer Tour 2018,"Set 1: Playing in the Band, Bertha, Jack Straw, Big Railroad Blues, Peggy-O, Ramble On Rose, Duck Joke, Cumberland Blues, Deal","Set 2: Sugar Magnolia> Scarlet Begonias> Fire on the Mountain> Althea> Eyes of the World> Drums> Space> Stella Blue, Sunshine Daydream","Encore: Brokedown Palace, Not Fade Away, One More Saturday Night"
1,"07/06/2018 • Mattress Firm Amphitheater • Chula Vista, CA",Tour: Summer Tour 2018,"Set 1: Hell in a Bucket, Big Boss Man, Tennessee Jed, Friend of the Devil, Bird Song> Loose Lucy> Bird Song","Set 2: Mr. Charlie, Truckin> New Speedway Boogie> Estimated Prophet> Shakedown Street> Drums> Space> I Need a Miracle> Dear Prudence> U.S. Blues",Encore: Knockin on Heaven's Door
2,"07/03/2018 • Shoreline Amphitheatre • Mountain View, CA",Tour: Summer Tour 2018,"Set 1: Iko Iko, Alabama Getaway, New Minglewood Blues, Brown-Eyed Women, Cassidy, Ship of Fools, Let It Grow","Set 2: China Cat Sunflower> Althea> Viola Lee Blues> I Know You Rider> Drums> Space> The Other One1, Days Between> Casey Jones",Encore: Ripple
3,"07/02/2018 • Shoreline Amphitheatre • Mountain View, CA",Tour: Summer Tour 2018,"Set 1: Feel Like a Stranger, Dancin in the Streets, It Hurts Me Too, Sitting On Top of the World, Corrina, They Love Each Other, Throwing Stones",Set 2: Lost Sailor> Saint of Circumstance> Help on the Way> Slipknot!> Franklin's Tower> Drums> Space> The Other One1> Wharf Rat> Not Fade Away,Encore: Touch of Grey
4,"06/30/2018 • Autzen Stadium • Eugene, OR",Tour: Summer Tour 2018,"Set 1: Deal, Me and My Uncle, Here Comes Sunshine, Black-Throated Wind, Cold Rain and Snow, Peggy-O, One More Saturday Night",Set 2: Dark Star> El Paso> Dark Star> St. Stephen> The Eleven> Drums> Space> Morning Dew> Turn On Your Lovelight,Encore: Brokedown Palace
5,"06/29/2018 • Gorge Amphitheatre • George, WA",Tour: Summer Tour 2018,"Set 1: Mississippi Half-Step Uptown Toodeloo, Bertha, Tennessee Jed, Mr. Charlie1, Loser, Crazy Fingers> Cumberland Blues","Set 2: Playing in the Band> Eyes of the World, Comes a Time> Terrapin Station> Drums> Space> New Speedway Boogie> Black Peter> Playin' Reprise",Encore: U.S. Blues
6,"06/23/2018 • Alpine Valley Music Theatre • East Troy, WI",Tour: Summer Tour 2018,"Set 1: The Music Never Stopped> Easy Answers, Alabama Getaway, Big River, Jack Straw, Black Muddy River1, Friend of the Devil1, Bird Song1","Set 2: The Weight, Shakedown Street, Althea, China Cat Sunflower> I Know You Rider> Drums> Space> All Along the Watchtower> Standing on the Moon> Sugar Magnolia",Encore: One More Saturday Night
7,"06/22/2018 • Alpine Valley Music Theatre • East Troy, WI",Tour: Summer Tour 2018,"Set 1: Hell in a Bucket, Brown-Eyed Women, Greatest Story Ever Told, Deep Elem Blues, Even So, Ramble On Rose, Cassidy, Touch of Grey",Set 2: Viola Lee Blues> Estimated Prophet> Uncle John's Band> China Doll> Viola Lee Blues> Drums> Space> The Wheel> A Hard Rain's A-Gonna Fall> Viola Lee Blues> Goin Down the Road Feelin Bad,Encore: Ripple
8,"06/20/2018 • Blossom Music Center • Cuyahoga Falls, OH",Tour: Summer Tour 2018,"Set 1: Feel Like a Stranger, They Love Each Other, It's All Over Now, Row Jimmy, West L.A. Fadeaway, Loose Lucy, Ship of Fools, Passenger","Set 2: Lost Sailor> Saint of Circumstance> He's Gone> Scarlet Begonias> Fire on the Mountain> Drums> Space> I Need a Miracle> Dear Prudence, Throwing Stones",Encore: Knockin on Heaven's Door
9,"06/19/2018 • Darien Lake P.A.C. • Darien Center, NY",Tour: Summer Tour 2018,"Set 1: Cold Rain and Snow, Tennessee Jed, Dire Wolf, Queen Jane Approximately, If I Had the World to Give, Here Comes Sunshine, Little Red Rooster, Let It Grow","Set 2: Iko Iko, Dark Star> Truckin> Smokestack Lightning> Dark Star> Deal> Drums> Space> Wharf Rat> Casey Jones",Encore: Werewolves of London


In [20]:
# Overwrite the tour (column 1) and set 1 (column 2) cells of one row by position.
# NOTE(review): presumably row 40 is the Band Together Benefit show whose lines
# did not match the section labels - confirm the position if the scrape changes.
row_to_patch = 40
df_clean.iloc[row_to_patch, 1] = sf_tour
df_clean.iloc[row_to_patch, 2] = sf_set1

In [21]:
# Display the frame again after the manual overwrite of row 40.
df_clean

Unnamed: 0,0,1,2,3,4
0,"07/07/2018 • Dodger Stadium • Los Angeles, CA",Tour: Summer Tour 2018,"Set 1: Playing in the Band, Bertha, Jack Straw, Big Railroad Blues, Peggy-O, Ramble On Rose, Duck Joke, Cumberland Blues, Deal","Set 2: Sugar Magnolia> Scarlet Begonias> Fire on the Mountain> Althea> Eyes of the World> Drums> Space> Stella Blue, Sunshine Daydream","Encore: Brokedown Palace, Not Fade Away, One More Saturday Night"
1,"07/06/2018 • Mattress Firm Amphitheater • Chula Vista, CA",Tour: Summer Tour 2018,"Set 1: Hell in a Bucket, Big Boss Man, Tennessee Jed, Friend of the Devil, Bird Song> Loose Lucy> Bird Song","Set 2: Mr. Charlie, Truckin> New Speedway Boogie> Estimated Prophet> Shakedown Street> Drums> Space> I Need a Miracle> Dear Prudence> U.S. Blues",Encore: Knockin on Heaven's Door
2,"07/03/2018 • Shoreline Amphitheatre • Mountain View, CA",Tour: Summer Tour 2018,"Set 1: Iko Iko, Alabama Getaway, New Minglewood Blues, Brown-Eyed Women, Cassidy, Ship of Fools, Let It Grow","Set 2: China Cat Sunflower> Althea> Viola Lee Blues> I Know You Rider> Drums> Space> The Other One1, Days Between> Casey Jones",Encore: Ripple
3,"07/02/2018 • Shoreline Amphitheatre • Mountain View, CA",Tour: Summer Tour 2018,"Set 1: Feel Like a Stranger, Dancin in the Streets, It Hurts Me Too, Sitting On Top of the World, Corrina, They Love Each Other, Throwing Stones",Set 2: Lost Sailor> Saint of Circumstance> Help on the Way> Slipknot!> Franklin's Tower> Drums> Space> The Other One1> Wharf Rat> Not Fade Away,Encore: Touch of Grey
4,"06/30/2018 • Autzen Stadium • Eugene, OR",Tour: Summer Tour 2018,"Set 1: Deal, Me and My Uncle, Here Comes Sunshine, Black-Throated Wind, Cold Rain and Snow, Peggy-O, One More Saturday Night",Set 2: Dark Star> El Paso> Dark Star> St. Stephen> The Eleven> Drums> Space> Morning Dew> Turn On Your Lovelight,Encore: Brokedown Palace
5,"06/29/2018 • Gorge Amphitheatre • George, WA",Tour: Summer Tour 2018,"Set 1: Mississippi Half-Step Uptown Toodeloo, Bertha, Tennessee Jed, Mr. Charlie1, Loser, Crazy Fingers> Cumberland Blues","Set 2: Playing in the Band> Eyes of the World, Comes a Time> Terrapin Station> Drums> Space> New Speedway Boogie> Black Peter> Playin' Reprise",Encore: U.S. Blues
6,"06/23/2018 • Alpine Valley Music Theatre • East Troy, WI",Tour: Summer Tour 2018,"Set 1: The Music Never Stopped> Easy Answers, Alabama Getaway, Big River, Jack Straw, Black Muddy River1, Friend of the Devil1, Bird Song1","Set 2: The Weight, Shakedown Street, Althea, China Cat Sunflower> I Know You Rider> Drums> Space> All Along the Watchtower> Standing on the Moon> Sugar Magnolia",Encore: One More Saturday Night
7,"06/22/2018 • Alpine Valley Music Theatre • East Troy, WI",Tour: Summer Tour 2018,"Set 1: Hell in a Bucket, Brown-Eyed Women, Greatest Story Ever Told, Deep Elem Blues, Even So, Ramble On Rose, Cassidy, Touch of Grey",Set 2: Viola Lee Blues> Estimated Prophet> Uncle John's Band> China Doll> Viola Lee Blues> Drums> Space> The Wheel> A Hard Rain's A-Gonna Fall> Viola Lee Blues> Goin Down the Road Feelin Bad,Encore: Ripple
8,"06/20/2018 • Blossom Music Center • Cuyahoga Falls, OH",Tour: Summer Tour 2018,"Set 1: Feel Like a Stranger, They Love Each Other, It's All Over Now, Row Jimmy, West L.A. Fadeaway, Loose Lucy, Ship of Fools, Passenger","Set 2: Lost Sailor> Saint of Circumstance> He's Gone> Scarlet Begonias> Fire on the Mountain> Drums> Space> I Need a Miracle> Dear Prudence, Throwing Stones",Encore: Knockin on Heaven's Door
9,"06/19/2018 • Darien Lake P.A.C. • Darien Center, NY",Tour: Summer Tour 2018,"Set 1: Cold Rain and Snow, Tennessee Jed, Dire Wolf, Queen Jane Approximately, If I Had the World to Give, Here Comes Sunshine, Little Red Rooster, Let It Grow","Set 2: Iko Iko, Dark Star> Truckin> Smokestack Lightning> Dark Star> Deal> Drums> Space> Wharf Rat> Casey Jones",Encore: Werewolves of London


In [22]:
# Back to one column per show before saving. Not idempotent: re-running this
# cell transposes the frame again.
df_clean = df_clean.transpose()

In [23]:
# Expect (5, 109): five fields (title, tour, set 1, set 2, encore) by 109 shows.
df_clean.shape

(5, 109)

In [24]:
# Persist the cleaned frame as a pickle named "folded_df" (relative path).
df_clean.to_pickle("folded_df")