## Exercise 1: Scrape NASDAQ Top Gainers
Steps:
1. **Initial Scrape:** Scrape the NASDAQ Top Gainers Table (https://www.nasdaq.com/market-activity/stocks/screener?exchange=nasdaq&status=top-gainers).
1. **Fallback Scrape:** If the request to NASDAQ times out, scrape Yahoo Finance instead (https://finance.yahoo.com/markets/stocks/gainers/?guccounter=1&guce_referrer=aHR0cHM6Ly93d3cuZ29vZ2xlLmNvbS8&guce_referrer_sig=AQAAACvz6Ex45XoUQkTNdDAujGj-X1mDenZIQcqrx6vnpefvlJ9NoDdFaU1W6EO9SzM8m0aA1t7qTMhWSZq2zdbdGfRyC47dQXdu8ZG8IISgSgz6DXTsJe0Jrp3hGEKnAxOCDSjeey7roNKAj5L0UJ68arDOoeeI13BkNR2xMSggz88c).
2. **Data Cleanup:** Keep only the 'Symbol', 'Company', and 'Price' columns. In the Yahoo data, the symbol and the company name are in the same column.
3. **Analysis:** Find the company with the highest stock price. Hint: With Yahoo you can use the `start` and `count` query arguments to see all companies.

In [None]:
import requests as re
from bs4 import BeautifulSoup
import pandas as pd
# Scrape Yahoo Finance's "top gainers" table with requests + BeautifulSoup and
# report the company with the highest share price.
attrs = {
    "start": 0,
    "count": 100,
}
url = "https://finance.yahoo.com/markets/stocks/gainers/"
# Send the paging options explicitly as query parameters and bound the wait so
# a stalled connection cannot hang the notebook.
results = re.get(url, params=attrs, timeout=30)
# Fail with a clear HTTP error instead of trying to parse an error page.
results.raise_for_status()
src = results.content
document = BeautifulSoup(src, "lxml")
tables = document.find_all("table")    # the page was checked to contain exactly one table
table = tables[0]
data = {"Symbol": [], "Company": [], "Price": []}
rows = table.find_all("tr")
for row in rows[1:]:    # rows[0] is skipped as the header row
    values = [c.get_text() for c in row.find_all("td")]
    # The first cell holds the ticker symbol followed by the company name.
    symbol, *name_parts = values[0].split()
    company_name = " ".join(name_parts)
    # The first whitespace-separated token of the second cell is the price.
    price = values[1].split()[0]
    data["Symbol"].append(symbol)
    data["Company"].append(company_name)
    # Drop thousands separators so a price such as "1,234.56" still parses.
    data["Price"].append(float(price.replace(",", "")))

df = pd.DataFrame(data)
sorted_df = df.sort_values(by="Price", ascending=False)
top_company = sorted_df.iloc[0]
print(f"Of {len(df)} companies, {top_company.Company} has the most expensive share price")

In [None]:
from requests_html import HTMLSession

# Same scrape as the requests/BeautifulSoup version, using requests_html's
# HTMLSession to fetch and query the page.
session = HTMLSession()

url = "https://finance.yahoo.com/markets/stocks/gainers/?start=0&count=100"
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# with a clear HTTP error instead of parsing an error page.
response = session.get(url, timeout=30)
response.raise_for_status()
tables = response.html.find('table')
table = tables[0]
rows = table.find('tr')
data = {"Symbol": [], "Company": [], "Price": []}
for row in rows[1:]:    # rows[0] is skipped as the header row
    values = [c.text for c in row.find("td")]
    # The first cell holds the ticker symbol followed by the company name.
    symbol, *name_parts = values[0].split()
    company_name = " ".join(name_parts)
    # The first whitespace-separated token of the second cell is the price.
    price = values[1].split()[0]
    data["Symbol"].append(symbol)
    data["Company"].append(company_name)
    # Drop thousands separators so a price such as "1,234.56" still parses.
    data["Price"].append(float(price.replace(",", "")))

df = pd.DataFrame(data)
sorted_df = df.sort_values(by="Price", ascending=False)
top_company = sorted_df.iloc[0]
print(f"Of {len(df)} companies, {top_company.Company} has the most expensive share price")

## Exercise 2: Scrape Top 250 Movies by Gross income
Steps:
1. **Initial Scrape:** Scrape BoxOfficeMojo's list of top 250 movies (https://www.boxofficemojo.com/chart/top_lifetime_gross/).
2. **Data Cleanup:** Keep only relevant columns such as 'Rank', 'Title', "Lifetime gross", and 'Year'.
3. **Analysis:** Find the best decade in terms of "Lifetime gross". 

In [8]:
import requests as re
from bs4 import BeautifulSoup
import pandas as pd

# Scrape BoxOfficeMojo's lifetime-gross chart and find the decade (1970s to
# 2020s) whose films have the highest average lifetime gross.
url = "https://www.boxofficemojo.com/chart/top_lifetime_gross/"
data = {"Title": [], "Gross": [], "Year": []}

# The chart is paged through the "offset" query parameter; request offsets
# 0, 200, ..., 800.
for i in range(5):
    attrs = {"offset": 200*i}
    results = re.get(url, params=attrs, timeout=30)
    # Fail with a clear HTTP error instead of parsing an error page.
    results.raise_for_status()
    src = results.content
    document = BeautifulSoup(src, "lxml")
    tables = document.find_all("table")
    table = tables[0]
    rows = table.find_all('tr')
    for row in rows[1:]:    # rows[0] is skipped as the header row
        elements = [e.get_text() for e in row.find_all("td")]
        data["Title"].append(elements[1])
        income = elements[2]
        # "$936,662,225" -> 936662225.0
        income = float(income.replace(",", "").replace("$", ""))
        data["Gross"].append(income)
        data["Year"].append(int(elements[3]))

# Rows arrive in chart order, so the 1-based position is the rank.
df = pd.DataFrame(data, index=pd.Index(range(1, len(data["Title"])+1), name="Rank"))
decades = [1970 + 10*i for i in range(6)]
average_gross = {}
for decade in decades:
    # Inclusive on both ends, so the first year of each decade (1970, 1980,
    # ...) is counted in its own decade.
    in_decade = df["Year"].between(decade, decade + 9)
    average_gross[decade] = df.loc[in_decade, "Gross"].mean()

# A decade without any film has a NaN mean; leave it out of the comparison.
populated = {decade: gross for decade, gross in average_gross.items() if pd.notna(gross)}
if populated:
    # Compare the averages themselves and remember the winning decade.
    max_decade = max(populated, key=populated.get)
    print(f"The {max_decade}'s had the highest average Gross income")
else:
    max_decade = None
    print("No films were found in the examined decades")
df

The 2020's had the highest average Gross income


Unnamed: 0_level_0,Title,Gross,Year
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Star Wars: Episode VII - The Force Awakens,936662225.0,2015
2,Avengers: Endgame,858373000.0,2019
3,Spider-Man: No Way Home,814866759.0,2021
4,Avatar,785221649.0,2009
5,Top Gun: Maverick,718732821.0,2022
...,...,...,...
996,Aliens,85160248.0,1986
997,Open Season,85105259.0,2006
998,Green Book,85080171.0,2018
999,The Expendables 2,85028192.0,2012


## Exercise 3: Scrape Wikipedia's List of Best-selling Music Artists
Steps:
1. **Initial Scrape:** Scrape Wikipedia's table of best-selling music artists (https://en.wikipedia.org/wiki/List_of_best-selling_music_artists).
2. **Data Cleanup:** Retain only 'Artist', 'Country/Market', and 'Certified Sales'.
3. **Analysis:** Find the artist with the highest certified sales.

In [None]:
# Scrape the first table of Wikipedia's list of best-selling music artists and
# report the artist with the highest certified sales.
url = "https://en.wikipedia.org/wiki/List_of_best-selling_music_artists"
# Bound the wait and fail with a clear HTTP error instead of parsing an error page.
results = re.get(url, timeout=30)
results.raise_for_status()
src = results.content
document = BeautifulSoup(src, "lxml")
tables = document.find_all("table")
table = tables[0]
rows = table.find_all('tr')
data = {"Name": [], "Country": [], "Certified sales": []}
for row in rows[1:]:    # rows[0] is skipped as the header row
    # The row text is split on newlines and fields are picked by position.
    # NOTE(review): the positions 1, 3 and 12 depend on the current page
    # layout - re-check them if the table changes.
    values = row.get_text().split("\n")
    name = values[1]
    country = values[3]
    # Keep only the number in front of the word "million". str.strip(" million")
    # would strip a *set of characters* from both ends rather than the suffix,
    # and would leave text such as a trailing footnote marker in place.
    sales_text = values[12].partition("million")[0].strip()
    sales = float(sales_text.replace(",", ""))
    data["Name"].append(name)
    data["Country"].append(country)
    data["Certified sales"].append(sales)

# Rows are numbered from 1 in table order and labelled "Claimed rank".
df = pd.DataFrame(data, index=pd.Index(range(1, len(data["Name"])+1), name="Claimed rank"))
df_sorted = df.sort_values(by="Certified sales", ascending=False)
print(f"{df_sorted.iloc[0].Name} has the highest certified sales")

    

## Exercise 4: Scrape CoinMarketCap's Top 10 Cryptocurrencies
Steps:
1. **Initial Scrape:** Scrape CoinMarketCap's table of top cryptocurrencies (https://coinmarketcap.com/).
2. **Data Cleanup:** Retain only 'Name', 'Symbol', and 'Market Cap'.
3. **Analysis:** Identify the cryptocurrency with the highest market cap.


In [4]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By



# Load CoinMarketCap in Chrome, scroll so more table rows get rendered, then
# parse the first table of the page source with BeautifulSoup.
driver = webdriver.Chrome()
try:
    url = "https://coinmarketcap.com/"
    driver.get(url)
    time.sleep(3)
    # Scroll down one viewport height at a time, pausing between steps.
    height = driver.execute_script("return window.innerHeight;")
    for _ in range(12):
        driver.execute_script(f"window.scrollBy(0, {height});")
        time.sleep(0.5)

    html_content = driver.page_source
    document = BeautifulSoup(html_content, "lxml")
    tables = document.find_all("table")
    table = tables[0]
    rows = table.find_all('tr')
    data = {"Company name": [], "Symbol": [], "Market cap": []}
    for row in rows[1:]:    # rows[0] is skipped as the header row
        elements = row.find_all("td")
        # The third cell holds two <p> tags: the name and the ticker symbol.
        name_and_symbol = elements[2]
        name, symbol = [e.get_text() for e in name_and_symbol.find_all("p")]
        # The cell text looks like "$1.27T$1,274,424,747,258"; the part after
        # the second "$" is the full figure.
        market_cap_text = elements[7].get_text()
        market_cap = float(market_cap_text.split("$")[2].replace(",", ""))
        data["Symbol"].append(symbol)
        data["Company name"].append(name)
        data["Market cap"].append(market_cap)

    # NOTE(review): this click happens after page_source was captured, so it
    # does not influence the scraped data; the element with id "drop-anchor"
    # is presumably the "Next page" control - confirm.
    wait = WebDriverWait(driver, 1)
    read_more_button = wait.until(
        EC.element_to_be_clickable((By.ID, "drop-anchor"))
    )
    # Click the "Next page" button
    read_more_button.click()
    time.sleep(5)
finally:
    # Always end the browser session, even when scraping or the click fails.
    driver.quit()

df = pd.DataFrame(data, index=pd.Index(range(1, len(data["Company name"])+1), name="Rank"))
# Pick the top row first: reusing double quotes inside an f-string expression
# only parses on Python 3.12+.
top_coin = df.sort_values(by="Market cap", ascending=False).iloc[0]
print(f"{top_coin['Company name']} has the higest market cap")

Bitcoin has the higest market cap


In [11]:
from selenium import webdriver
from io import StringIO
import time
import pandas as pd


# Load the first `pages` pages of CoinMarketCap in Chrome and let pandas parse
# the first HTML table of each page.
pages = 2
url = "https://coinmarketcap.com/"
dfs = []
# One browser is reused for all pages and is always shut down afterwards.
driver = webdriver.Chrome()
try:
    for page in range(1, pages + 1):
        driver.get(url+f"?page={page}")
        time.sleep(3)
        # Scroll down one viewport height at a time, pausing between steps.
        height = driver.execute_script("return window.innerHeight;")
        for _ in range(12):
            driver.execute_script(f"window.scrollBy(0, {height});")
            time.sleep(0.5)

        html_content = driver.page_source
        # read_html builds a new DataFrame on every call, so no copy is needed.
        df = pd.read_html(StringIO(html_content))[0]
        dfs.append(df)
finally:
    driver.quit()

# Row labels restart at 0 for every page because the per-page indexes are kept.
DF = pd.concat(dfs)
DF

Unnamed: 0.1,Unnamed: 0,#,Name,Price,1h %,24h %,7d %,Market Cap,Volume(24h),Circulating Supply,Last 7 Days
0,,1,BitcoinBTC,"$64,492.58",0.10%,1.61%,1.03%,"$1.27T$1,274,424,747,258","$20,406,014,729316,409 BTC","19,760,796 BTC",
1,,2,EthereumETH,"$2,633.02",0.22%,1.20%,1.13%,"$316.93B$316,931,044,788","$13,675,254,2005,193,756 ETH","120,367,967 ETH",
2,,3,TetherUSDT,$1.00,0.02%,0.00%,0.01%,"$119.52B$119,524,829,801","$47,297,901,17647,292,998,880 USDT","119,518,802,451 USDT",
3,,4,BNBBNB,$579.60,0.45%,3.00%,2.72%,"$84.58B$84,582,976,679","$1,855,597,6393,201,503 BNB","145,932,862 BNB",
4,,5,SolanaSOL,$156.68,0.60%,0.04%,6.61%,"$73.48B$73,481,670,705","$2,051,753,97213,094,783 SOL","468,977,532 SOL",
...,...,...,...,...,...,...,...,...,...,...,...
95,,196,Rocket PoolRPL,$11.46,0.92%,0.78%,2.41%,"$237.81M$237,812,095","$6,875,433601,451 RPL","20,753,857 RPL",
96,,197,Mask NetworkMASK,$2.36,0.75%,0.22%,2.89%,"$235.94M$235,939,232","$25,095,19610,642,716 MASK","100,000,000 MASK",
97,,198,PolymeshPOLYX,$0.2629,0.62%,1.77%,2.37%,"$234.89M$234,886,461","$10,419,77239,637,473 POLYX","893,522,997 POLYX",
98,,199,OriginTrailTRAC,$0.5678,0.21%,3.97%,0.22%,"$234.2M$234,202,730","$1,731,3033,039,242 TRAC","412,463,783 TRAC",


Solution by Johannes Grande:

In [8]:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

"""
Exercise 4: Scrape CoinMarketCap's Top 10 Cryptocurrencies
Scrape 'Name', 'Symbol', and 'Market Cap' data
Identify the cryptocurrency with the highest market cap. """

# Start a Chrome session controlled through Selenium
driver = webdriver.Chrome()

# Shared explicit wait bound to this driver: each lookup made through it
# gives up after 10 seconds
wait = WebDriverWait(driver, timeout=10)

# Navigate to the CoinMarketCap front page
driver.get("https://coinmarketcap.com/")

def getCurrencies(amount):
    """Collect name, symbol and market cap text for table rows 1..amount.

    Each value is located by a position-based XPath and read through the
    module-level ``wait``. If any of the three lookups for a row fails, the
    error is printed and that row is left out of all three lists, so the
    lists always stay aligned.

    Returns:
        A ``(names, symbols, marketCaps)`` tuple of lists of strings.
    """
    names, symbols, marketCaps = [], [], []

    def text_at(xpath):
        # Block until the element is present in the DOM, then read its text.
        return wait.until(EC.presence_of_element_located((By.XPATH, xpath))).text

    for i in range(1, amount + 1):
        # XPaths of the three cells of interest in table row i
        name_xpath = f'//*[@id="__next"]/div[2]/div[1]/div[2]/div/div[1]/div[4]/table/tbody/tr[{i}]/td[3]/div/a/div/div/div/p'
        symbol_xpath = f'//*[@id="__next"]/div[2]/div[1]/div[2]/div/div[1]/div[4]/table/tbody/tr[{i}]/td[3]/div/a/div/div/div/div/p'
        marketCap_xpath = f'/html/body/div[1]/div[2]/div[1]/div[2]/div/div[1]/div[4]/table/tbody/tr[{i}]/td[8]/p/span[2]'

        try:
            # Read all three values before storing any of them.
            row_values = (
                text_at(name_xpath),
                text_at(symbol_xpath),
                text_at(marketCap_xpath),
            )
        except Exception as e:
            print(f"Error fetching data for row {i}: {e}")
            continue

        name, symbol, marketCap = row_values
        names.append(name)
        symbols.append(symbol)
        marketCaps.append(marketCap)

    return names, symbols, marketCaps


def createDf(namesList, SymbolsList, mcLists):
    """Build a DataFrame with 'Name', 'Symbol' and 'Market Cap' columns.

    The market cap strings have every '$' and ',' removed and are then
    converted to numbers; text that still is not numeric becomes NaN.
    """
    # One translation table that deletes both formatting characters
    drop_formatting = str.maketrans('', '', '$,')
    plain_caps = []
    for cap in mcLists:
        plain_caps.append(cap.translate(drop_formatting))
    numeric_caps = pd.to_numeric(plain_caps, errors='coerce')

    columns = {
        'Name': namesList,
        'Symbol': SymbolsList,
        'Market Cap': numeric_caps,
    }
    return pd.DataFrame(columns)

def exportData(df, LargestMcap):
    """Write the scraped table and the top entry to ``cryptoData.txt``.

    Args:
        df: DataFrame to dump (rendered with ``to_string(index=False)``).
        LargestMcap: the entry with the largest market cap; anything with a
            ``to_string()`` method, e.g. one row of ``df``.

    The file is created in the current working directory and overwritten if
    it already exists.
    """
    # Render everything first so a failure while formatting cannot leave a
    # truncated or half-written file behind.
    report = "".join([
        "Top 10 Cryptocurrencies:\n",
        df.to_string(index=False),
        "\n\nCryptocurrency with the largest market cap:\n",
        LargestMcap.to_string(),
    ])
    # Explicit UTF-8 so non-ASCII names do not depend on the platform's
    # default encoding.
    with open('cryptoData.txt', 'w', encoding='utf-8') as f:
        f.write(report)


# Fetch data for the top 10 cryptocurrencies
try:
    names, symbols, market_caps = getCurrencies(10)
finally:
    # The scraped values are plain strings, so the browser is no longer needed;
    # close it here so it is released even when scraping fails.
    driver.quit()

# Create the DataFrame
df = createDf(names, symbols, market_caps)

# Without a single numeric market cap there is nothing to rank; stop with a
# clear message instead of an opaque idxmax error.
if not df['Market Cap'].notna().any():
    raise ValueError("No market cap values could be scraped; nothing to export")

# Find the cryptocurrency with the largest market cap
largestMcapIdx = df['Market Cap'].idxmax()
largestMcap = df.loc[largestMcapIdx]

# Export our data to a txt file
exportData(df, largestMcap)
