In [1]:
#Importing packages I will use
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Fetch the Wikipedia article that holds the tour table and parse it with BeautifulSoup.
url = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_concert_tours_by_women'

# Wikimedia asks automated clients to identify themselves with a descriptive
# User-Agent; requests' generic default may be refused.
request_headers = {'User-Agent': 'highest-grossing-tours-notebook/1.0 (educational scraping example)'}

# The timeout stops the cell from hanging indefinitely, and raise_for_status() fails
# loudly on an HTTP error instead of parsing an error page as if it were the article.
page = requests.get(url, headers=request_headers, timeout=30)
page.raise_for_status()

# Name the built-in parser explicitly so the parse result does not depend on which
# optional parsers (lxml, html5lib) happen to be installed.
soup = BeautifulSoup(page.text, 'html.parser')

# Preview only the beginning of the prettified HTML; the full page is very large.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of highest-grossing concert tours by women - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vect

In [3]:
# The tour ranking is the second <table> on the page, so pick position 1 (zero-based)
# from the list of every table found in the parsed document.
all_tables = soup.find_all('table')
table = all_tables[1]

In [4]:
# Header cells are marked up as <th>; collect every one found inside the chosen table.
headers = table.find_all(name='th')

In [5]:
# Reduce each header cell to its text with the surrounding whitespace removed, then
# rebind `headers` to the resulting list of plain strings.
stripped_titles = []
for cell in headers:
    stripped_titles.append(cell.text.strip())
headers = stripped_titles
print(headers)

['Rank', 'Peak', 'All-timepeak', 'Actual\xa0gross', 'Adjusted\xa0gross (in 2024 dollars)', 'Artist', 'Tour title', 'Year(s)', 'Shows', 'Averagegross', 'Ref.', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20']


In [6]:
# Keep only the titles of columns that actually carry data. Wikipedia marks the 'Rank'
# title and the row-number cells ('1'-'20') as <th> as well, but the rows are scraped
# from <td> cells only, so those entries have no matching data.
# Filtering by content rather than slicing with [1:11] keeps the cell safe to re-run
# and avoids hard-coding how many columns and rows the table has.
headers = [title for title in headers if title != 'Rank' and not title.isdigit()]
print(headers)

['Peak', 'All-timepeak', 'Actual\xa0gross', 'Adjusted\xa0gross (in 2024 dollars)', 'Artist', 'Tour title', 'Year(s)', 'Shows', 'Averagegross', 'Ref.']


In [7]:
# Start from an empty DataFrame whose columns are the cleaned header titles.
df = pd.DataFrame(data=None, columns=headers)
df

Unnamed: 0,Peak,All-timepeak,Actual gross,Adjusted gross (in 2024 dollars),Artist,Tour title,Year(s),Shows,Averagegross,Ref.


In [8]:
# Every table row is a <tr>; gather them all (header row included) and display them.
column_data = table.find_all(name='tr')
column_data

[<tr>
 <th class="unsortable">Rank
 </th>
 <th><abbr title="Female rank">Peak</abbr>
 </th>
 <th><a href="/wiki/List_of_highest-grossing_concert_tours" title="List of highest-grossing concert tours">All-time<br/>peak</a>
 </th>
 <th>Actual gross
 </th>
 <th><a href="/wiki/Inflation" title="Inflation">Adjusted gross</a> <br/><span style="font-size:85%;">(in 2024 dollars)</span>
 </th>
 <th width="15%">Artist
 </th>
 <th width="25%">Tour title
 </th>
 <th>Year(s)
 </th>
 <th>Shows
 </th>
 <th>Average<br/>gross
 </th>
 <th class="unsortable"><abbr title="Reference">Ref.</abbr>
 </th></tr>,
 <tr>
 <th>1
 </th>
 <td>1<sup class="reference" id="cite_ref-Guinness_Eras_2-0"><a href="#cite_note-Guinness_Eras-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup>
 </td>
 <td>1<sup class="reference" id="cite_ref-Guinness_Eras_2-1"><a href="#cite_note-Guinness_Eras-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup>
 </td>
 <td>$2,077,

In [9]:
# Build the table body: skip the header row, then take the stripped text of every
# <td> cell in each remaining row.
rows = []
for row in column_data[1:]:
    cells = [cell.text.strip() for cell in row.find_all('td')]
    # A row with a different number of cells would misalign the columns, so stop with
    # a clear message rather than build a skewed table.
    if len(cells) != len(headers):
        raise ValueError(
            f"Expected {len(headers)} cells per row but found {len(cells)}: {cells}"
        )
    rows.append(cells)

# Create the frame in one step instead of appending with df.loc[len(df)] inside the
# loop: re-running this cell no longer adds the same rows a second time, and the frame
# is not re-grown on every iteration.
df = pd.DataFrame(rows, columns=headers)

In [10]:
# Citation markers such as "[2]" trail the Peak values. Split on the first "[" only:
# the text before it stays in 'Peak' and the leftover goes to 'Drop 1', a throwaway
# column that is dropped in a later cell.
# n=1 guarantees at most two parts and reindex pads to exactly two columns, so the
# assignment cannot fail when a value carries several markers or when none has any.
peak_parts = df["Peak"].str.split("[", n=1, expand=True).reindex(columns=range(2))
df[['Peak', 'Drop 1']] = peak_parts
df

Unnamed: 0,Peak,All-timepeak,Actual gross,Adjusted gross (in 2024 dollars),Artist,Tour title,Year(s),Shows,Averagegross,Ref.,Drop 1
0,1.0,1[2],"$2,077,618,725","$2,077,618,725",Taylor Swift,The Eras Tour[2][a],2023–2024,149,"$13,943,750",[1],2]
1,2.0,8[3],"$584,700,000","$584,700,000",Pink,Summer Carnival,2023–2024,97,"$6,027,835",[3],3]
2,1.0,7[5],"$579,800,000","$579,800,000",Beyoncé,Renaissance World Tour ‡[6][a],2023,56,"$10,353,571",[5],4]
3,1.0,2[8],"$411,000,000","$602,379,246",Madonna,Sticky & Sweet Tour ‡[7][a],2008–2009,85,"$4,835,294",[9],7]
4,2.0,10[10],"$397,300,000","$488,622,634",Pink,Beautiful Trauma World Tour,2018–2019,156,"$2,546,795",[10],10]
5,2.0,,"$345,675,146","$432,849,782",Taylor Swift,Reputation Stadium Tour,2018,53,"$6,522,173",[11],7]
6,,,"$341,096,360","$341,096,360",G.E.M.,I Am Gloria World Tour †,2023–2025,80,"$4,263,705",[12],
7,,,"$313,300,000","$313,300,000",Karol G,Mañana Será Bonito Tour,2023–2024,62,"$5,053,226",[13],
8,2.0,10[14],"$305,158,363","$417,950,626",Madonna,The MDNA Tour,2012,88,"$3,467,709",[14],7]
9,2.0,,"$280,000,000","$410,380,022",Celine Dion,Taking Chances World Tour,2008–2009,131,"$2,137,405",[16],15]


In [11]:
def _split_into(series, separator, parts):
    """Split each string in `series` on `separator` into exactly `parts` columns.

    Splitting stops after `parts - 1` separators, so the last column keeps any
    remaining text. Columns that no value needed are added as missing values, which
    keeps the result the same width whether or not the separator occurs in the data.
    """
    pieces = series.str.split(separator, n=parts - 1, expand=True)
    return pieces.reindex(columns=range(parts))


# Same clean-up as for Peak: cut the citation markers off the all-time peak.
df[['All-timepeak', 'Drop 2']] = _split_into(df["All-timepeak"], "[", 2)
# Tour titles can carry two bracketed markers (for example "[2][a]") and a dagger
# symbol, so they are split on "[" first and then on each symbol in turn.
df[["Tour title", "Drop 3", "Drop 4"]] = _split_into(df["Tour title"], "[", 3)
df[["Tour title", "Drop 5"]] = _split_into(df["Tour title"], "‡", 2)
df[["Tour title", "Drop 6"]] = _split_into(df["Tour title"], "†", 2)
# The symbols are preceded by a space in the source text; remove what is left of it.
df["Tour title"] = df["Tour title"].str.strip()
df

Unnamed: 0,Peak,All-timepeak,Actual gross,Adjusted gross (in 2024 dollars),Artist,Tour title,Year(s),Shows,Averagegross,Ref.,Drop 1,Drop 2,Drop 3,Drop 4,Drop 5,Drop 6
0,1.0,1.0,"$2,077,618,725","$2,077,618,725",Taylor Swift,The Eras Tour,2023–2024,149,"$13,943,750",[1],2],2],2],a],,
1,2.0,8.0,"$584,700,000","$584,700,000",Pink,Summer Carnival,2023–2024,97,"$6,027,835",[3],3],3],,,,
2,1.0,7.0,"$579,800,000","$579,800,000",Beyoncé,Renaissance World Tour,2023,56,"$10,353,571",[5],4],5],6],a],,
3,1.0,2.0,"$411,000,000","$602,379,246",Madonna,Sticky & Sweet Tour,2008–2009,85,"$4,835,294",[9],7],8],7],a],,
4,2.0,10.0,"$397,300,000","$488,622,634",Pink,Beautiful Trauma World Tour,2018–2019,156,"$2,546,795",[10],10],10],,,,
5,2.0,,"$345,675,146","$432,849,782",Taylor Swift,Reputation Stadium Tour,2018,53,"$6,522,173",[11],7],,,,,
6,,,"$341,096,360","$341,096,360",G.E.M.,I Am Gloria World Tour,2023–2025,80,"$4,263,705",[12],,,,,,
7,,,"$313,300,000","$313,300,000",Karol G,Mañana Será Bonito Tour,2023–2024,62,"$5,053,226",[13],,,,,,
8,2.0,10.0,"$305,158,363","$417,950,626",Madonna,The MDNA Tour,2012,88,"$3,467,709",[14],7],14],,,,
9,2.0,,"$280,000,000","$410,380,022",Celine Dion,Taking Chances World Tour,2008–2009,131,"$2,137,405",[16],15],,,,,


In [12]:
# Split the year range on the en dash into a start year and an end year. Single-year
# tours have no dash, so their 'End Year' is left missing.
# n=1 limits the split to two parts and reindex pads to exactly two columns, so the
# assignment still works if a value has more than one dash or if no value has any.
year_parts = df["Year(s)"].str.split("–", n=1, expand=True).reindex(columns=range(2))
df[["Start Year", "End Year"]] = year_parts
df

Unnamed: 0,Peak,All-timepeak,Actual gross,Adjusted gross (in 2024 dollars),Artist,Tour title,Year(s),Shows,Averagegross,Ref.,Drop 1,Drop 2,Drop 3,Drop 4,Drop 5,Drop 6,Start Year,End Year
0,1.0,1.0,"$2,077,618,725","$2,077,618,725",Taylor Swift,The Eras Tour,2023–2024,149,"$13,943,750",[1],2],2],2],a],,,2023,2024.0
1,2.0,8.0,"$584,700,000","$584,700,000",Pink,Summer Carnival,2023–2024,97,"$6,027,835",[3],3],3],,,,,2023,2024.0
2,1.0,7.0,"$579,800,000","$579,800,000",Beyoncé,Renaissance World Tour,2023,56,"$10,353,571",[5],4],5],6],a],,,2023,
3,1.0,2.0,"$411,000,000","$602,379,246",Madonna,Sticky & Sweet Tour,2008–2009,85,"$4,835,294",[9],7],8],7],a],,,2008,2009.0
4,2.0,10.0,"$397,300,000","$488,622,634",Pink,Beautiful Trauma World Tour,2018–2019,156,"$2,546,795",[10],10],10],,,,,2018,2019.0
5,2.0,,"$345,675,146","$432,849,782",Taylor Swift,Reputation Stadium Tour,2018,53,"$6,522,173",[11],7],,,,,,2018,
6,,,"$341,096,360","$341,096,360",G.E.M.,I Am Gloria World Tour,2023–2025,80,"$4,263,705",[12],,,,,,,2023,2025.0
7,,,"$313,300,000","$313,300,000",Karol G,Mañana Será Bonito Tour,2023–2024,62,"$5,053,226",[13],,,,,,,2023,2024.0
8,2.0,10.0,"$305,158,363","$417,950,626",Madonna,The MDNA Tour,2012,88,"$3,467,709",[14],7],14],,,,,2012,
9,2.0,,"$280,000,000","$410,380,022",Celine Dion,Taking Chances World Tour,2008–2009,131,"$2,137,405",[16],15],,,,,,2008,2009.0


In [13]:
# Remove the throwaway 'Drop' columns left over from the splits, together with the
# reference column and the original year-range column.
unwanted_columns = ['Drop 1', 'Drop 2', 'Drop 3', 'Drop 4', 'Drop 5', 'Drop 6', 'Ref.', 'Year(s)']
df = df.drop(unwanted_columns, axis='columns')
df

Unnamed: 0,Peak,All-timepeak,Actual gross,Adjusted gross (in 2024 dollars),Artist,Tour title,Shows,Averagegross,Start Year,End Year
0,1.0,1.0,"$2,077,618,725","$2,077,618,725",Taylor Swift,The Eras Tour,149,"$13,943,750",2023,2024.0
1,2.0,8.0,"$584,700,000","$584,700,000",Pink,Summer Carnival,97,"$6,027,835",2023,2024.0
2,1.0,7.0,"$579,800,000","$579,800,000",Beyoncé,Renaissance World Tour,56,"$10,353,571",2023,
3,1.0,2.0,"$411,000,000","$602,379,246",Madonna,Sticky & Sweet Tour,85,"$4,835,294",2008,2009.0
4,2.0,10.0,"$397,300,000","$488,622,634",Pink,Beautiful Trauma World Tour,156,"$2,546,795",2018,2019.0
5,2.0,,"$345,675,146","$432,849,782",Taylor Swift,Reputation Stadium Tour,53,"$6,522,173",2018,
6,,,"$341,096,360","$341,096,360",G.E.M.,I Am Gloria World Tour,80,"$4,263,705",2023,2025.0
7,,,"$313,300,000","$313,300,000",Karol G,Mañana Será Bonito Tour,62,"$5,053,226",2023,2024.0
8,2.0,10.0,"$305,158,363","$417,950,626",Madonna,The MDNA Tour,88,"$3,467,709",2012,
9,2.0,,"$280,000,000","$410,380,022",Celine Dion,Taking Chances World Tour,131,"$2,137,405",2008,2009.0


In [14]:
# Replace the index with a counter named 'Rank' that starts at 1 instead of 0.
rank_index = pd.RangeIndex(start=1, stop=len(df) + 1, name='Rank')
df = df.set_axis(rank_index, axis='index')
df

Unnamed: 0_level_0,Peak,All-timepeak,Actual gross,Adjusted gross (in 2024 dollars),Artist,Tour title,Shows,Averagegross,Start Year,End Year
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1.0,1.0,"$2,077,618,725","$2,077,618,725",Taylor Swift,The Eras Tour,149,"$13,943,750",2023,2024.0
2,2.0,8.0,"$584,700,000","$584,700,000",Pink,Summer Carnival,97,"$6,027,835",2023,2024.0
3,1.0,7.0,"$579,800,000","$579,800,000",Beyoncé,Renaissance World Tour,56,"$10,353,571",2023,
4,1.0,2.0,"$411,000,000","$602,379,246",Madonna,Sticky & Sweet Tour,85,"$4,835,294",2008,2009.0
5,2.0,10.0,"$397,300,000","$488,622,634",Pink,Beautiful Trauma World Tour,156,"$2,546,795",2018,2019.0
6,2.0,,"$345,675,146","$432,849,782",Taylor Swift,Reputation Stadium Tour,53,"$6,522,173",2018,
7,,,"$341,096,360","$341,096,360",G.E.M.,I Am Gloria World Tour,80,"$4,263,705",2023,2025.0
8,,,"$313,300,000","$313,300,000",Karol G,Mañana Será Bonito Tour,62,"$5,053,226",2023,2024.0
9,2.0,10.0,"$305,158,363","$417,950,626",Madonna,The MDNA Tour,88,"$3,467,709",2012,
10,2.0,,"$280,000,000","$410,380,022",Celine Dion,Taking Chances World Tour,131,"$2,137,405",2008,2009.0


In [15]:
# Save the cleaned table as a CSV, writing the index as the first column.
# The target folder is created first so the write does not fail on a machine where
# it does not exist yet.
output_path = Path(r"C:\Cleaning Data\Highest Grossing Female Tours.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=True)