## Exercise 1: Scrape NASDAQ Top Gainers
Steps:
1. **Initial Scrape:** Scrape the NASDAQ Top Gainers Table (https://www.nasdaq.com/market-activity/stocks/screener?exchange=nasdaq&status=top-gainers).
1. **Initial Scrape2:** If you get a timeout from NASDAQ try Yahoo Finance (https://finance.yahoo.com/markets/stocks/gainers/?guccounter=1&guce_referrer=aHR0cHM6Ly93d3cuZ29vZ2xlLmNvbS8&guce_referrer_sig=AQAAACvz6Ex45XoUQkTNdDAujGj-X1mDenZIQcqrx6vnpefvlJ9NoDdFaU1W6EO9SzM8m0aA1t7qTMhWSZq2zdbdGfRyC47dQXdu8ZG8IISgSgz6DXTsJe0Jrp3hGEKnAxOCDSjeey7roNKAj5L0UJ68arDOoeeI13BkNR2xMSggz88c)
2. **Data Cleanup:** Keep only the 'Symbol', 'Company', and 'Price' columns. With Yahoo data, Symbol and Company name is in the same column. 
3. **Analysis:** Find the company with the highest stock price.. Hint: With Yahoo you can use the start and count arguments to see all companies. 

In [1]:
import requests as re
from bs4 import BeautifulSoup
import pandas as pd
attrs = {
    "start": 0,
    "count": 100
}
url = "https://www.nasdaq.com/market-activity/stocks/screener?exchange=nasdaq&status=top-gainers"
results = re.get(url)
src = results.content
print(src)
document = BeautifulSoup(src, "lxml")
tables = document.find_all("table")    # I verify that this has len(1)
print(tables)
table = tables[0]
data = {"Symbol": [], "Company": [], "Price": []}
rows = table.find_all("tr")
for row in rows[1:]:
    values = [c.get_text() for c in row.find_all("td")]
    symbol_and_name = values[0].split()
    symbol = symbol_and_name[0]
    company_name = " ".join(symbol_and_name[1:])
    price_chg_pctchg = values[1].split()
    price = price_chg_pctchg[0]
    data["Symbol"].append(symbol)
    data["Company"].append(company_name)
    data["Price"].append(float(price))

df = pd.DataFrame(data)
sorted_df = df.sort_values(by="Price", ascending=False)
top_company = sorted_df.iloc[0]
print(f"Of {len(df)} companies, {top_company.Company} has the most expensive share price")

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [None]:
from requests_html import HTMLSession

session = HTMLSession()

url = "https://finance.yahoo.com/markets/stocks/gainers/?start=0&count=100"
response = session.get(url)
tables = response.html.find('table')
table = tables[0]
rows = table.find('tr')
data = {"Symbol": [], "Company": [], "Price": []}
for row in rows[1:]:
    values = [c.text for c in row.find("td")]
    symbol_and_name = values[0].split()
    symbol = symbol_and_name[0]
    company_name = " ".join(symbol_and_name[1:])
    price_chg_pctchg = values[1].split()
    price = price_chg_pctchg[0]
    data["Symbol"].append(symbol)
    data["Company"].append(company_name)
    data["Price"].append(float(price))

df = pd.DataFrame(data)
sorted_df = df.sort_values(by="Price", ascending=False)
top_company = sorted_df.iloc[0]
print(f"Of {len(df)} companies, {top_company.Company} has the most expensive share price")

## Exercise 2: Scrape Top 250 Movies by Gross income
Steps:
1. **Initial Scrape:** Scrape BoxOfficeMojo's list of top 250 movies (https://www.boxofficemojo.com/chart/top_lifetime_gross/).
2. **Data Cleanup:** Keep only relevant columns such as 'Rank', 'Title', "Lifetime gross", and 'Year'.
3. **Analysis:** Find the best decade in terms of "Lifetime gross". 

In [6]:
import requests as re
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.boxofficemojo.com/chart/top_lifetime_gross/"
data = {"Title": [], "Gross": [], "Year": []}

for i in range(5):
    attrs = {"offset": 200*i}
    results = re.get(url, params=attrs)
    src = results.content
    document = BeautifulSoup(src, "lxml")
    tables = document.find_all("table")
    table = tables[0]
    rows = table.find_all('tr')
    for row in rows[1:]:
        elements = [e.get_text() for e in row.find_all("td")]
        data["Title"].append(elements[1])
        income = elements[2]
        income = float(income.replace(",", "").replace("$", ""))
        data["Gross"].append(income)
        data["Year"].append(int(elements[3]))

df = pd.DataFrame(data, index=pd.Index(range(1, len(data["Title"])+1), name="Rank"))
decades = [1970 + 10*i for i in range(6)]
average_gross = {}
max_decade = 0
for decade in decades:
    decade_df = df[(df["Year"] > decade)*(df["Year"] < decade + 10)]
    average_gross[decade] = decade_df["Gross"].mean()
    if average_gross[decade] > max_decade:
        max_decade = decade

print(f"The {decade}'s had the highest average Gross income")
df["Gross"] /= 1000000000 
df

The 2020's had the highest average Gross income


Unnamed: 0_level_0,Title,Gross,Year
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Star Wars: Episode VII - The Force Awakens,0.936662,2015
2,Avengers: Endgame,0.858373,2019
3,Spider-Man: No Way Home,0.814867,2021
4,Avatar,0.785222,2009
5,Top Gun: Maverick,0.718733,2022
...,...,...,...
996,The Naked Gun 2½: The Smell of Fear,0.086930,1991
997,The Campaign,0.086908,2012
998,The War of the Roses,0.086889,1989
999,The Amityville Horror,0.086432,1979


## Exercise 3: Scrape Wikipedia's List of Best-selling Music Artists
Steps:
1. **Initial Scrape:** Scrape Wikipedia's table of best-selling music artists (https://en.wikipedia.org/wiki/List_of_best-selling_music_artists).
2. **Data Cleanup:** Retain only 'Artist', 'Country/Market', and 'Certified Sales'.
3. **Analysis:** Find the artist with the highest certified sales.

In [3]:
import requests as re
from bs4 import BeautifulSoup
import pandas as pd
url = "https://en.wikipedia.org/wiki/List_of_best-selling_music_artists"
headers = {
    "User-Agent": "PythonRequests"
}
results = re.get(url, headers=headers)
src = results.content
document = BeautifulSoup(src, "lxml")
tables = document.find_all("table")
table = tables[0]
rows = table.find_all('tr')
data = {"Name": [], "Country": [], "Certified sales": []}
for row in rows[1:]:
    values = row.get_text().split("\n")
    name = values[1]
    country = values[3]
    sales = float(values[12].strip(" million"))
    data["Name"].append(name)
    data["Country"].append(country)
    data["Certified sales"].append(sales)

df = pd.DataFrame(data, index=pd.Index(range(1, len(data["Name"])+1), name="Claimed rank"))
df_sorted = df.sort_values(by="Certified sales", ascending=False)
print(f"{df_sorted.iloc[0].Name} has the highest certified sales")

    

Rihanna has the highest certified sales


## Exercise 4: Scrape CoinMarketCap's Top 10 Cryptocurrencies
Steps:
1. **Initial Scrape:** Scrape CoinMarketCap's table of top cryptocurrencies (https://coinmarketcap.com/).
2. **Data Cleanup:** Retain only 'Name', 'Symbol', and 'Market Cap'.
3. **Analysis:** Identify the cryptocurrency with the highest market cap.


In [7]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By



driver = webdriver.Chrome()
url = "https://coinmarketcap.com/"
driver.get(url)
time.sleep(3)
height = driver.execute_script("return window.innerHeight;")
for _ in range(12):
    driver.execute_script(f"window.scrollBy(0, {height});")
    time.sleep(0.5)

html_content = driver.page_source
document = BeautifulSoup(html_content, "lxml")
tables = document.find_all("table")
table = tables[0]
rows = table.find_all('tr')
data = {"Company name": [], "Symbol": [], "Market cap": []}
for row in rows[1:]:
    elements = row.find_all("td")
    try:
        name_and_symbol = elements[2]
        name, symbol = [e.get_text() for e in name_and_symbol.find_all("p")]
        market_cap_text = elements[7].get_text()
        market_cap = float(market_cap_text.split("$")[2].replace(",",""))
        data["Symbol"].append(symbol)
        data["Company name"].append(name)
        data["Market cap"].append(float(market_cap))
    except:
        print(f"{elements[2].get_text()} differ from structure, skipping row")

drop_anchors = driver.find_elements(By.ID, "drop-anchor")
wait = WebDriverWait(driver, 1)
read_more_button = wait.until(
    EC.element_to_be_clickable((By.ID, "drop-anchor"))
)
    # Click the "Next page" button
read_more_button.click()
time.sleep(5)
driver.close()
df = pd.DataFrame(data, index=pd.Index(range(1, len(data["Company name"])+1), name="Rank"))
print(f"{df.sort_values(by="Market cap", ascending=False).iloc[0]["Company name"]} has the higest market cap")

Shiba InuSHIB differ from structure, skipping row
Bitcoin has the higest market cap


In [8]:
df

Unnamed: 0_level_0,Company name,Symbol,Market cap
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Bitcoin,BTC,2.175277e+12
2,Ethereum,ETH,4.695311e+11
3,Tether,USDT,1.735413e+11
4,XRP,XRP,1.631307e+11
5,BNB,BNB,1.307725e+11
...,...,...,...
95,dogwifhat,WIF,7.237138e+08
96,Conflux,CFX,7.115079e+08
97,Aethir,ATH,7.091325e+08
98,Tezos,XTZ,6.979545e+08


In [29]:
from selenium import webdriver
from io import StringIO
import time
import pandas as pd


pages = 2
url = "https://coinmarketcap.com/"
dfs = []
for i in range(pages):
    driver = webdriver.Chrome()
    page = i+1
    driver.get(url+f"?page={page}")
    time.sleep(3)
    height = driver.execute_script("return window.innerHeight;")
    for _ in range(12):
        driver.execute_script(f"window.scrollBy(0, {height});")
        time.sleep(0.5)

    html_content = driver.page_source
    driver.close()
    df = pd.read_html(StringIO(html_content))[0]
    dfs.append(df.copy())

DF = pd.concat(dfs)
DF

Unnamed: 0.1,Unnamed: 0,#,Name,Price,1h %,24h %,7d %,Market Cap,Volume(24h),Circulating Supply,...,Unnamed: 94,Unnamed: 95,Unnamed: 96,Unnamed: 97,Unnamed: 98,Unnamed: 99,Unnamed: 100,Unnamed: 101,Unnamed: 102,Unnamed: 103
0,,1.0,BitcoinBTCBuy,"$109,271.11",0.18%,2.17%,6.50%,"$2.18T$2,177,372,668,427","$70,137,335,960641.67K",19.92M BTC,...,,,,,,,,,,
1,,2.0,EthereumETHBuy,"$3,924.51",0.81%,2.29%,13.47%,"$473.7B$473,702,926,865","$61,270,935,37215.61M",120.7M ETH,...,,,,,,,,,,
2,,3.0,TetherUSDTBuy,$1.00,0.02%,0.04%,0.00%,"$173.53B$173,529,849,321","$185,647,514,523185.51B",173.44B USDT,...,,,,,,,,,,
3,,4.0,XRPXRPBuy,$2.75,0.33%,3.81%,9.56%,"$164.49B$164,490,408,229","$8,732,340,1183.17B",59.77B XRP,...,,,,,,,,,,
4,,5.0,BNBBNBBuy,$939.33,0.13%,5.58%,5.47%,"$130.74B$130,742,139,577","$4,584,360,2774.87M",139.18M BNB,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,,196.0,ZORAZORABuy,$0.06025,1.79%,0.42%,11.46%,"$206.95M$206,953,346","$62,496,4671.03B",3.43B ZORA,...,,,,,,,,,,
96,,197.0,Cheems (cheems.pet)CHEEMSBuy,$0.051096,0.27%,4.38%,12.88%,"$205.5M$205,502,889","$3,714,9623.38T",187.49T CHEEMS,...,,,,,,,,,,
97,,198.0,ZilliqaZILBuy,$0.01024,0.20%,3.30%,15.62%,"$200.08M$200,082,620","$17,404,9911.69B",19.53B ZIL,...,,,,,,,,,,
98,,199.0,Peanut the SquirrelPNUTBuy,$0.1989,0.15%,4.34%,19.73%,"$198.93M$198,933,918","$70,636,948354.67M",999.85M PNUT,...,,,,,,,,,,


Solution by Johannes Grande:

In [30]:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

"""
Exercise 4: Scrape CoinMarketCap's Top 10 Cryptocurrencies
Scrape 'Name', 'Symbol', and 'Market Cap' data
Identify the cryptocurrency with the highest market cap. d """

# Setup our WebDriver
driver = webdriver.Chrome() 

# Open CoinMarketCap
driver.get("https://coinmarketcap.com/")

# Explicit wait until the elements are visible
wait = WebDriverWait(driver, 10) 

def getCurrencies(amount):
    names = []
    symbols = []
    marketCaps = []

    for i in range(1, amount + 1): 

        try:
            # find the xpath for each element we want to scrape
            name_xpath = f'//*[@id="__next"]/div[2]/div[1]/div[2]/div/div[1]/div[4]/table/tbody/tr[{i}]/td[3]/div/a/div/div/div/p'
            symbol_xpath = f'//*[@id="__next"]/div[2]/div[1]/div[2]/div/div[1]/div[4]/table/tbody/tr[{i}]/td[3]/div/a/div/div/div/div/p'
            marketCap_xpath = f'/html/body/div[1]/div[2]/div[1]/div[2]/div/div[1]/div[4]/table/tbody/tr[{i}]/td[8]/p/span[2]'

            # Wait for each element to be present and get the text
            name = wait.until(EC.presence_of_element_located((By.XPATH, name_xpath))).text
            symbol = wait.until(EC.presence_of_element_located((By.XPATH, symbol_xpath))).text
            marketCap = wait.until(EC.presence_of_element_located((By.XPATH, marketCap_xpath))).text

            # Append the data to the lists
            names.append(name)
            symbols.append(symbol)
            marketCaps.append(marketCap)

        except Exception as e:
            print(f"Error fetching data for row {i}: {e}")

    return names, symbols, marketCaps 


def createDf(namesList, SymbolsList, mcLists):
    
    mcLists_cleaned = pd.to_numeric([cap.replace('$', '').replace(',', '') for cap in mcLists], errors='coerce')

    df = pd.DataFrame({
        'Name': namesList,
        'Symbol': SymbolsList,
        'Market Cap': mcLists_cleaned
    })

    return df

def exportData(df, LargestMcap):
    with open('cryptoData.txt', 'w') as f:
        f.write("Top 10 Cryptocurrencies:\n")
        f.write(df.to_string(index = False))
        f.write("\n\nCryptocurrency with the largest market cap:\n")
        f.write(LargestMcap.to_string())


# Fetch data for the top 10 cryptocurrencies
names, symbols, market_caps = getCurrencies(10)

# Create the DataFrame
df = createDf(names, symbols, market_caps)

# Find the cryptocurrency with the largest market cap
largestMcapIdx = df['Market Cap'].idxmax()
largestMcap = df.loc[largestMcapIdx]

#close the driver
driver.quit()

# Export our data to a txt file
exportData(df, largestMcap)


Error fetching data for row 1: Message: 
Stacktrace:
	GetHandleVerifier [0x0x7ff60d731eb5+80197]
	GetHandleVerifier [0x0x7ff60d731f10+80288]
	(No symbol) [0x0x7ff60d4b02fa]
	(No symbol) [0x0x7ff60d507cd7]
	(No symbol) [0x0x7ff60d507f9c]
	(No symbol) [0x0x7ff60d55ba87]
	(No symbol) [0x0x7ff60d5303bf]
	(No symbol) [0x0x7ff60d5587fb]
	(No symbol) [0x0x7ff60d530153]
	(No symbol) [0x0x7ff60d4f8b02]
	(No symbol) [0x0x7ff60d4f98d3]
	GetHandleVerifier [0x0x7ff60d9ee83d+2949837]
	GetHandleVerifier [0x0x7ff60d9e8c6a+2926330]
	GetHandleVerifier [0x0x7ff60da086c7+3055959]
	GetHandleVerifier [0x0x7ff60d74cfee+191102]
	GetHandleVerifier [0x0x7ff60d7550af+224063]
	GetHandleVerifier [0x0x7ff60d73af64+117236]
	GetHandleVerifier [0x0x7ff60d73b119+117673]
	GetHandleVerifier [0x0x7ff60d7210a8+11064]
	BaseThreadInitThunk [0x0x7ffc5f4b259d+29]
	RtlUserThreadStart [0x0x7ffc6056af78+40]

Error fetching data for row 2: Message: 
Stacktrace:
	GetHandleVerifier [0x0x7ff60d731eb5+80197]
	GetHandleVerifier [0x0x7f

ValueError: attempt to get argmax of an empty sequence