## Trials

Trial 1

In [3]:
from selenium import webdriver
import time

# Specify the URL
url = "https://www.cryptocompare.com/news/list/latest/"

# Initialize the webdriver
driver = webdriver.Chrome()  # You need to have chromedriver installed and in your PATH

# Document height measured after the previous scroll (0 = nothing measured yet)
scroll_pos = 0

try:
    # Open the URL inside the try block so that the finally clause still closes
    # the browser when navigation itself fails
    driver.get(url)

    while True:
        # Scroll down to the bottom of the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait for a short time to let the page load
        time.sleep(2)

        # Measure the document height again after the wait
        new_scroll_pos = driver.execute_script("return document.body.scrollHeight")

        # An unchanged height means the last scroll added nothing: stop scrolling
        if new_scroll_pos == scroll_pos:
            break

        scroll_pos = new_scroll_pos
except KeyboardInterrupt:
    # If the user interrupts the program (e.g., with Ctrl+C), stop scrolling
    pass
finally:
    # Close the webdriver
    driver.quit()


Trial 2

In [4]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time

# Listing page to scrape
url = "https://u.today/bitcoin-news"

# Start a Chrome session (chromedriver has to be installed and on PATH)
driver = webdriver.Chrome()

try:
    driver.get(url)

    # Carried over from the first trial; this cell never reads it
    scroll_pos = 0

    # Jump to the bottom of the page five times, pausing two seconds after each jump
    for _ in range(5):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

    # Snapshot the rendered HTML and hand it to BeautifulSoup
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser")

    # Print the stripped text of every element matching div.news__item-title
    titles = soup.select("div.news__item-title")
    for title in titles:
        print(title.get_text().strip())

except KeyboardInterrupt:
    # A manual interrupt (Ctrl+C / kernel interrupt) simply ends the run early
    pass
finally:
    # Shut the browser down on every exit path
    driver.quit()


Top 7 Bitcoin Whale Withdraws $627 Million in BTC From Major Exchange
Key Reason Behind Dogecoin's Massive Price Surge
Bitcoin (BTC) Scores Historic Monthly Close
Bitcoin (BTC) Hashrate New High: 'Three Times More Money,' CryptoQuant CEO Says
Bitcoin (BTC) Price Could Top $150,000, Yusko Predicts
Shiba Inu (SHIB) Could See Major Price Surge: Trader
Bitcoin About to Have Its Most Impactful Halving, Bitwise CEO Says
Massive USDC Inflows Spur Bullish Sentiment
Bitcoin (BTC) Price Might Be on Track to Hit $75,000, but There's Key Resistance
Bitcoin Reacts to SEC v. Coinbase Ruling With Price Drop
BlackRock's $9.5 Trillion May Flow Into Digital Assets, Bitcoin Expert Willy Woo Believes
Hong Kong Primed for Bitcoin ETFs, Expert Calls It Game-Changer
Silk Road Founder Ross Ulbricht Turns 40 with 11 Years Spent Behind Bars
Solv Introduces SolvBTC, First Yield-Bearing Token for Bitcoiners
Bitcoin to Reach $100,000 Sooner Than Expected, Predicts Analyst
BlackRock Praises Bitcoin as Portfolio Div

Trial 3

In [6]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time

# Listing page to scrape
url = "https://u.today/bitcoin-news"

# Start a Chrome session (chromedriver has to be installed and on PATH)
driver = webdriver.Chrome()

try:
    driver.get(url)

    # Carried over from the first trial; this cell never reads it
    scroll_pos = 0

    # Jump to the bottom of the page five times, pausing two seconds after each jump
    for _ in range(5):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

    # Snapshot the rendered HTML and hand it to BeautifulSoup
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser")

    # Collect the <div class="humble"> and <div class="news__item-title"> elements,
    # then reduce each one to its stripped text
    date_nodes = soup.find_all('div', class_='humble')
    title_nodes = soup.find_all('div', class_='news__item-title')
    dates = [node.get_text().strip() for node in date_nodes]
    titles = [node.get_text().strip() for node in title_nodes]

    # Pair the two lists column-wise; the DataFrame constructor requires them to
    # have the same length
    df = pd.DataFrame({'Date': dates, 'News': titles})

    # Show the result
    print(df)

except KeyboardInterrupt:
    # A manual interrupt (Ctrl+C / kernel interrupt) simply ends the run early
    pass
finally:
    # Shut the browser down on every exit path
    driver.quit()


                 Date                                               News
1    2024/04/01 09:29  Top 7 Bitcoin Whale Withdraws $627 Million in ...
2    2024/04/01 07:36   Key Reason Behind Dogecoin's Massive Price Surge
3    2024/04/01 05:24        Bitcoin (BTC) Scores Historic Monthly Close
4    2024/03/31 14:10  Bitcoin (BTC) Hashrate New High: 'Three Times ...
..                ...                                                ...
205  2024/02/15 08:21  Bitcoin Bull Michael Saylor Breaks Silence on ...
206  2024/02/15 08:10      Dogecoin Founder Reveals His Bitcoin Holdings
207  2024/02/15 06:05      Bitcoin (BTC) Hits New All-Time High in Japan
208  2024/02/14 16:45  Binance CEO Comments on Bitcoin’s Historic Mar...
209  2024/02/14 16:06    Bitcoin Leading Ransomware Market, Gensler Says

[210 rows x 2 columns]


In [7]:
# Write df to crypto_news.csv, leaving the row index out of the file
df.to_csv(path_or_buf='crypto_news.csv', index=False)


## Scraping Policy Data

In [4]:
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup

# URL prefix of the policy category; the page number is appended directly to it
base_url = "https://www.theblock.co/category/policy/"

# First and last page numbers to request
start_page, end_page = 1, 271

# Collects one {'headline': ..., 'date': ...} dictionary per scraped item
data_list = list()

# Function to scrape data from a single page
def scrape_page(page_number):
    """Fetch one listing page and append its headline/date pairs to ``data_list``.

    Args:
        page_number: Value appended to ``base_url`` to build the request URL.

    Returns:
        None. Each extracted item is appended to the module-level ``data_list``
        as a ``{'headline': ..., 'date': ...}`` dictionary.

    A network error, a timeout or a non-200 response is reported with ``print``
    and the page is skipped, so a single bad page does not abort the whole run.
    """
    url = f"{base_url}{page_number}"
    try:
        # Without a timeout requests can wait indefinitely on a stalled connection
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch page {page_number}: {exc}")
        return

    if response.status_code != 200:
        print(f"Failed to fetch page {page_number}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    # Find all headline and date elements
    headlines = soup.select('div.collection__feed div.headline span')
    dates = soup.select('div.collection__feed div.meta div.pubDate')
    # zip() stops at the shorter list, so unmatched trailing elements are dropped
    for headline, date in zip(headlines, dates):
        headline_text = headline.get_text(strip=True)
        date_text = date.get_text(strip=True)
        data_list.append({'headline': headline_text, 'date': date_text})

# Visit every page from start_page to end_page (inclusive), one second apart
pages_to_visit = range(start_page, end_page + 1)
for page_number in pages_to_visit:
    print(f"Scraping data from page {page_number}...")
    scrape_page(page_number)
    time.sleep(1)  # pause between requests

# Turn the accumulated records into a table and write it to disk without the index
output_path = "scraped_data.csv"
df = pd.DataFrame(data_list)
df.to_csv(output_path, index=False)

print("Data saved to scraped_data.csv")


Scraping data from page 1...
Scraping data from page 2...
Scraping data from page 3...
Scraping data from page 4...
Scraping data from page 5...
Scraping data from page 6...
Scraping data from page 7...
Scraping data from page 8...
Scraping data from page 9...
Scraping data from page 10...
Scraping data from page 11...
Scraping data from page 12...
Scraping data from page 13...
Scraping data from page 14...
Scraping data from page 15...
Scraping data from page 16...
Scraping data from page 17...
Scraping data from page 18...
Scraping data from page 19...
Scraping data from page 20...
Scraping data from page 21...
Scraping data from page 22...
Scraping data from page 23...
Scraping data from page 24...
Scraping data from page 25...
Scraping data from page 26...
Scraping data from page 27...
Scraping data from page 28...
Scraping data from page 29...
Scraping data from page 30...
Scraping data from page 31...
Scraping data from page 32...
Scraping data from page 33...
Scraping data from 

In [16]:
# Assuming df is your existing DataFrame with a "date" column of strings that may
# end in " EDT" or " EST". The suffix is dropped without applying any UTC offset,
# so the resulting timestamps carry no timezone information.
# regex=False makes the literal-substring match explicit instead of relying on
# the pandas-version-dependent default.
df['date'] = df['date'].str.replace(' EDT', '', regex=False)  # Remove 'EDT'
df['date'] = df['date'].str.replace(' EST', '', regex=False)  # Remove 'EST'
df['date'] = df['date'].str.strip()  # Remove leading and trailing whitespace
df['date'] = pd.to_datetime(df['date'], errors='coerce')  # Unparseable values become NaT

# Print the first few rows of the DataFrame to verify the changes
print(df.head())


                                            headline                date
0  Hong Kong spot bitcoin ETFs could go live as s... 2024-04-16 02:54:00
1  Lawmakers demand information on CFTC chair's r... 2024-04-15 16:46:00
2  Nebraska man charged for mining $1 million in ... 2024-04-15 14:56:00
3  Potential movement on stablecoin legislation f... 2024-04-15 13:09:00
4  UK to legislate on cryptoasset regulatory fram... 2024-04-15 12:18:00


In [18]:
# Write df to theblock_policy_data.csv, leaving the row index out of the file
df.to_csv(path_or_buf="theblock_policy_data.csv", index=False)

## Scraping News Data

Trial 1

In [5]:

import time
import pandas as pd
import requests
from bs4 import BeautifulSoup

# URL prefix of the "latest" listing; the value of the start parameter is appended
base_url = "https://www.theblock.co/latest?start="

# First and last page numbers to request
start_page, end_page = 0, 4

# Collects one {'headline': ..., 'date': ...} dictionary per scraped item
data_list = list()

# Function to scrape data from a single page
def scrape_page(page_number):
    """Fetch one "latest" listing page and append its items to ``data_list``.

    Args:
        page_number: Page number; the URL is ``base_url`` followed by this value
            and a literal ``0`` (page 3 requests ``start=30``).

    Returns:
        None. Each extracted item is appended to the module-level ``data_list``
        as a ``{'headline': ..., 'date': ...}`` dictionary.

    A network error, a timeout or a non-200 response is reported with ``print``
    and the page is skipped, so a single bad page does not abort the whole run.
    """
    url = f"{base_url}{page_number}0"
    try:
        # Without a timeout requests can wait indefinitely on a stalled connection
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch page {page_number}: {exc}")
        return

    if response.status_code != 200:
        print(f"Failed to fetch page {page_number}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    # Find all headline and date elements
    headlines = soup.select('div.collection__feed div.headline span')
    dates = soup.select('div.collection__feed div.meta div.pubDate')
    # zip() stops at the shorter list, so unmatched trailing elements are dropped
    for headline, date in zip(headlines, dates):
        headline_text = headline.get_text(strip=True)
        date_text = date.get_text(strip=True)
        data_list.append({'headline': headline_text, 'date': date_text})

# Visit every page from start_page to end_page (inclusive), one second apart
pages_to_visit = range(start_page, end_page + 1)
for page_number in pages_to_visit:
    print(f"Scraping data from page {page_number}...")
    scrape_page(page_number)
    time.sleep(1)  # pause between requests

# Turn the accumulated records into a table.
# The date clean-up (dropping ' EDT' / ' EST' and calling pd.to_datetime) is not
# applied in this trial, so the date column is written exactly as scraped.
df = pd.DataFrame(data_list)

# Write the table to disk without the index column
output_path = "news_data.csv"
df.to_csv(output_path, index=False)

print("Data saved to news_data.csv")


Scraping data from page 0...
Scraping data from page 1...
Scraping data from page 2...
Scraping data from page 3...
Scraping data from page 4...
Data saved to news_data.csv


Checking site accessibility with `requests`

In [84]:
import requests

# Probe the first listing page and report whether it answers with HTTP 200
url = "https://www.theblock.co/latest?start=0"
response = requests.get(url)

if response.status_code != 200:
    # Any other status is reported together with its code
    print(f"Failed to access the website. Status code: {response.status_code}")
else:
    print("The website is accessible and scrapable.")


Failed to access the website. Status code: 429


First attempt with rotating proxies

In [9]:
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Define the base URL and the range of pages to scrape
base_url = "https://www.theblock.co/latest?start="
start_page = 0
end_page = 24  # Adjusted to scrape 25 pages at a time

# List of proxies to rotate through.
# requests selects the proxy by the scheme of the requested URL. base_url is
# https, so every mapping needs an 'https' entry; a mapping that only has an
# 'http' key is not used for https URLs.
# NOTE(review): the proxies must accept HTTPS tunnelling for this to work - verify.
proxies = [
    {'http': 'http://102.130.125.86:80', 'https': 'http://102.130.125.86:80'},
    {'http': 'http://85.198.13.205:80', 'https': 'http://85.198.13.205:80'},
    {'http': 'http://185.110.189.166:80', 'https': 'http://185.110.189.166:80'},
    {'http': 'http://103.168.254.62:8080', 'https': 'http://103.168.254.62:8080'},
    # Add more proxies as needed
]

# Initialize an empty list to store dictionaries
data_list = []

# Function to scrape data from a single page with a specified proxy
def scrape_page(page_number, proxy):
    """Fetch one "latest" listing page through ``proxy`` and store its items.

    Args:
        page_number: Page number; the URL is ``base_url`` followed by this value
            and a literal ``0`` (page 3 requests ``start=30``).
        proxy: Mapping passed to ``requests.get`` as its ``proxies`` argument.

    Returns:
        None. Each extracted item is appended to the module-level ``data_list``
        as a ``{'headline': ..., 'date': ...}`` dictionary.

    Exceptions raised by ``requests`` (connection, proxy or timeout errors) are
    not caught here; the calling loop reports them and stops.
    """
    url = f"{base_url}{page_number}0"
    # Without a timeout requests can wait indefinitely on an unresponsive proxy
    response = requests.get(url, proxies=proxy, timeout=30)  # Use specified proxy
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all headline and date elements
        headlines = soup.select('div.collection__feed div.headline span')
        dates = soup.select('div.collection__feed div.meta div.pubDate')
        # zip() stops at the shorter list, so unmatched trailing elements are dropped
        for headline, date in zip(headlines, dates):
            headline_text = headline.get_text(strip=True)
            date_text = date.get_text(strip=True)
            data_list.append({'headline': headline_text, 'date': date_text})
    else:
        print(f"Failed to fetch page {page_number}")

# Visit pages start_page..end_page (inclusive). The n-th request uses the n-th
# proxy, wrapping around when the proxy list is exhausted.
for request_count, page_number in enumerate(range(start_page, end_page + 1)):
    print(f"Scraping data from page {page_number}...")
    proxy = proxies[request_count % len(proxies)]
    try:
        scrape_page(page_number, proxy)
    except Exception as e:
        # The first failure ends the whole run
        print(f"Error occurred while scraping page {page_number}: {e}")
        break
    time.sleep(1)  # pause between requests

# Turn the accumulated records into a table; the date column is left as scraped
df = pd.DataFrame(data_list)

# Write the table to disk without the index column
output_path = "news_data.csv"
df.to_csv(output_path, index=False)

print("Data saved to news_data.csv")


Scraping data from page 0...
Scraping data from page 1...
Scraping data from page 2...
Scraping data from page 3...
Scraping data from page 4...
Scraping data from page 5...
Scraping data from page 6...
Scraping data from page 7...
Scraping data from page 8...
Scraping data from page 9...
Scraping data from page 10...
Scraping data from page 11...
Scraping data from page 12...
Scraping data from page 13...
Scraping data from page 14...
Scraping data from page 15...
Scraping data from page 16...
Scraping data from page 17...
Scraping data from page 18...
Scraping data from page 19...
Scraping data from page 20...
Scraping data from page 21...
Scraping data from page 22...
Scraping data from page 23...
Scraping data from page 24...
Data saved to news_data.csv


Batching: first attempt


In [21]:
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Define the base URL and the total number of pages to scrape
base_url = "https://www.theblock.co/latest?start="
total_pages = 100
pages_per_batch = 25

# List of proxies to rotate through.
# requests selects the proxy by the scheme of the requested URL. base_url is
# https, so every mapping needs an 'https' entry; a mapping that only has an
# 'http' key is not used for https URLs.
# NOTE(review): the proxies must accept HTTPS tunnelling for this to work - verify.
proxies = [
    {'http': 'http://102.130.125.86:80', 'https': 'http://102.130.125.86:80'},
    {'http': 'http://85.198.13.205:80', 'https': 'http://85.198.13.205:80'},
    {'http': 'http://185.110.189.166:80', 'https': 'http://185.110.189.166:80'},
    {'http': 'http://103.168.254.62:8080', 'https': 'http://103.168.254.62:8080'},
    # Add more proxies as needed
]

# Initialize an empty list to store dictionaries
data_list = []

# Function to scrape data from a single page with a specified proxy
def scrape_page(page_number, proxy):
    """Fetch one "latest" listing page through ``proxy`` and store its items.

    Args:
        page_number: Page number; the URL is ``base_url`` followed by this value
            and a literal ``0`` (page 3 requests ``start=30``).
        proxy: Mapping passed to ``requests.get`` as its ``proxies`` argument.

    Returns:
        None. Each extracted item is appended to the module-level ``data_list``
        as a ``{'headline': ..., 'date': ...}`` dictionary.

    Exceptions raised by ``requests`` (connection, proxy or timeout errors) are
    not caught here; the calling loop reports them and abandons the batch.
    """
    url = f"{base_url}{page_number}0"
    # Without a timeout requests can wait indefinitely on an unresponsive proxy
    response = requests.get(url, proxies=proxy, timeout=30)  # Use specified proxy
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all headline and date elements
        headlines = soup.select('div.collection__feed div.headline span')
        dates = soup.select('div.collection__feed div.meta div.pubDate')
        # zip() stops at the shorter list, so unmatched trailing elements are dropped
        for headline, date in zip(headlines, dates):
            headline_text = headline.get_text(strip=True)
            date_text = date.get_text(strip=True)
            data_list.append({'headline': headline_text, 'date': date_text})
    else:
        print(f"Failed to fetch page {page_number}")


# Split the run into total_pages // pages_per_batch batches; each batch covers
# pages_per_batch consecutive pages and uses a single proxy.
# Batch 0 begins at page 1, so page 0 is never requested by this cell.
batch_count = total_pages // pages_per_batch
for batch_number in range(batch_count):
    end_page = (batch_number + 1) * pages_per_batch
    start_page = end_page - pages_per_batch + 1
    print(f"Scraping data from pages {start_page}-{end_page}...")
    # One proxy per batch, wrapping around the proxy list
    proxy = proxies[batch_number % len(proxies)]
    for page_number in range(start_page, end_page + 1):
        try:
            scrape_page(page_number, proxy)
        except Exception as e:
            print(f"Error occurred while scraping page {page_number}: {e}")
            # Leaves only this batch's page loop; the next batch still runs
            break
        time.sleep(1)  # pause between requests

# Turn the accumulated records into a table; the date column is left as scraped
df = pd.DataFrame(data_list)

# Write the table to disk without the index column
output_path = "news_data.csv"
df.to_csv(output_path, index=False)

print("Data saved to news_data.csv")


Scraping data from pages 1-25...
Scraping data from pages 26-50...
Scraping data from pages 51-75...
Scraping data from pages 76-100...
Data saved to news_data.csv


## Final Scraper with Proxy and Batching

In [61]:
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Define the base URL and the total number of pages to scrape
base_url = "https://www.theblock.co/latest?start="
total_pages = 1300
pages_per_batch = 250

# List of proxies to rotate through.
# requests selects the proxy by the scheme of the requested URL. base_url is
# https, so every mapping needs an 'https' entry; a mapping that only has an
# 'http' key is not used for https URLs.
# NOTE(review): the proxies must accept HTTPS tunnelling for this to work - verify.
proxies = [
    {'http': 'http://102.130.125.86:80', 'https': 'http://102.130.125.86:80'},
    {'http': 'http://85.198.13.205:80', 'https': 'http://85.198.13.205:80'},
    {'http': 'http://185.110.189.166:80', 'https': 'http://185.110.189.166:80'},
    {'http': 'http://103.168.254.62:8080', 'https': 'http://103.168.254.62:8080'},
    # Add more proxies as needed
]

# Initialize an empty list to store dictionaries
data_list = []

# Function to scrape data from a single page with a specified proxy
def scrape_page(page_number, proxy):
    """Fetch one "latest" listing page through ``proxy`` and store its items.

    Args:
        page_number: Page number; the URL is ``base_url`` followed by this value
            and a literal ``0`` (page 3 requests ``start=30``).
        proxy: Mapping passed to ``requests.get`` as its ``proxies`` argument.

    Returns:
        None. Each extracted item is appended to the module-level ``data_list``
        as a ``{'headline': ..., 'date': ...}`` dictionary.

    Exceptions raised by ``requests`` (connection, proxy or timeout errors) are
    not caught here; the calling loop reports them and abandons the batch.
    """
    url = f"{base_url}{page_number}0"
    # Without a timeout requests can wait indefinitely on an unresponsive proxy
    response = requests.get(url, proxies=proxy, timeout=30)  # Use specified proxy
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all headline and date elements
        headlines = soup.select('div.collection__feed div.headline span')
        dates = soup.select('div.collection__feed div.meta div.pubDate')
        # zip() stops at the shorter list, so unmatched trailing elements are dropped
        for headline, date in zip(headlines, dates):
            headline_text = headline.get_text(strip=True)
            date_text = date.get_text(strip=True)
            data_list.append({'headline': headline_text, 'date': date_text})
    else:
        print(f"Failed to fetch page {page_number}")


# Batches are numbered from 1 up to (but excluding) total_pages // pages_per_batch.
# The -150 offset shifts the first batch so that it starts at page 101 when
# pages_per_batch is 250.
batch_count = total_pages // pages_per_batch
for batch_number in range(1, batch_count):
    start_page = batch_number * pages_per_batch - 150 + 1
    end_page = start_page + (pages_per_batch - 1)
    print(f"Scraping data from pages {start_page}-{end_page}...")
    # One proxy per batch, wrapping around the proxy list
    proxy = proxies[batch_number % len(proxies)]
    for page_number in range(start_page, end_page + 1):
        try:
            scrape_page(page_number, proxy)
            # Printed once scrape_page has returned for this page
            print(f"Now scraping {page_number}")
        except Exception as e:
            print(f"Error occurred while scraping page {page_number}: {e}")
            # Leaves only this batch's page loop; the next batch still runs
            break
        time.sleep(15)  # pause between requests

# Turn the accumulated records into a table; the date column is left as scraped
df = pd.DataFrame(data_list)

# Write the table to disk without the index column
output_path = "news_data2.csv"
df.to_csv(output_path, index=False)

print("Data saved to news_data2.csv")


Scraping data from pages 101-350...
Now scraping 101
Now scraping 102
Now scraping 103
Now scraping 104
Now scraping 105
Now scraping 106
Now scraping 107
Now scraping 108
Now scraping 109
Now scraping 110
Now scraping 111
Now scraping 112
Now scraping 113
Now scraping 114
Now scraping 115
Now scraping 116
Now scraping 117
Now scraping 118
Now scraping 119
Now scraping 120
Now scraping 121
Now scraping 122
Now scraping 123
Now scraping 124
Now scraping 125
Now scraping 126
Now scraping 127
Now scraping 128
Now scraping 129
Now scraping 130
Now scraping 131
Now scraping 132
Now scraping 133
Now scraping 134
Now scraping 135
Now scraping 136
Now scraping 137
Now scraping 138
Now scraping 139
Now scraping 140
Now scraping 141
Now scraping 142
Now scraping 143
Now scraping 144
Now scraping 145
Now scraping 146
Now scraping 147
Now scraping 148
Now scraping 149
Now scraping 150
Now scraping 151
Now scraping 152
Now scraping 153
Now scraping 154
Now scraping 155
Now scraping 156
Now scraping

In [80]:
# Dry run of the batch arithmetic: print the page range of each batch without
# scraping anything.
# total_pages must be at least pages_per_batch, otherwise total_pages // pages_per_batch
# is 0 and the loop body never runs; 1300 // 250 gives the five batches shown in
# the recorded output.
total_pages = 1300
pages_per_batch = 250
# Loop through pages and scrape data
for batch_number in range(total_pages // pages_per_batch):
    # The +379 offset makes the first batch start at page 380
    start_page1 = batch_number * pages_per_batch + 1 + 379
    end_page2 = start_page1 + pages_per_batch - 1
    print(f"Scraping data from pages {start_page1}-{end_page2}...")
    

Scraping data from pages 380-629...
Scraping data from pages 630-879...
Scraping data from pages 880-1129...
Scraping data from pages 1130-1379...
Scraping data from pages 1380-1629...


In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

# URL prefix of the "latest" listing, plus the batching parameters
base_url = "https://www.theblock.co/latest?start="
total_pages, pages_per_batch = 1300, 250

# Proxy addresses to rotate through; each one is handed to Chrome via its
# --proxy-server argument
proxies = [
    'http://102.130.125.86:80',
    'http://85.198.13.205:80',
    'http://185.110.189.166:80',
    'http://103.168.254.62:8080',
    # Add more proxies as needed
]

# Collects one {'headline': ..., 'date': ...} dictionary per scraped item
data_list = list()

# Function to scrape data from a single page with a specified proxy
def scrape_page(page_number, proxy):
    """Load one "latest" listing page in Chrome through ``proxy`` and store its items.

    A new browser is started for every call and is always shut down again, even
    when loading the page raises.

    Args:
        page_number: Page number; the URL is ``base_url`` followed by this value
            and a literal ``0`` (page 3 requests ``start=30``).
        proxy: Proxy address passed to Chrome as ``--proxy-server``.

    Returns:
        None. Each extracted item is appended to the module-level ``data_list``
        as a ``{'headline': ..., 'date': ...}`` dictionary.
    """
    # Configure Selenium WebDriver with proxy
    chrome_options = Options()
    chrome_options.add_argument(f'--proxy-server={proxy}')
    service = Service('path_to_chromedriver')
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        # Load page
        url = f"{base_url}{page_number}0"
        driver.get(url)
        time.sleep(2)  # Wait for page to load (adjust as needed)
        page_source = driver.page_source
    finally:
        # Close the WebDriver on every exit path so a failed page load does not
        # leave a Chrome process behind
        driver.quit()

    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(page_source, 'html.parser')
    headlines = soup.select('div.collection__feed div.headline span')
    dates = soup.select('div.collection__feed div.meta div.pubDate')

    # zip() stops at the shorter list, so unmatched trailing elements are dropped
    for headline, date in zip(headlines, dates):
        headline_text = headline.get_text(strip=True)
        date_text = date.get_text(strip=True)
        data_list.append({'headline': headline_text, 'date': date_text})

# Batches are numbered from 1 up to (but excluding) total_pages // pages_per_batch.
# The -150 offset shifts the first batch so that it starts at page 101 when
# pages_per_batch is 250.
batch_count = total_pages // pages_per_batch
for batch_number in range(1, batch_count):
    start_page = batch_number * pages_per_batch - 150 + 1
    end_page = start_page + (pages_per_batch - 1)
    print(f"Scraping data from pages {start_page}-{end_page}...")
    # One proxy per batch, wrapping around the proxy list
    proxy = proxies[batch_number % len(proxies)]
    for page_number in range(start_page, end_page + 1):
        try:
            scrape_page(page_number, proxy)
            # Printed once scrape_page has returned for this page
            print(f"Now scraping {page_number}")
        except Exception as e:
            print(f"Error occurred while scraping page {page_number}: {e}")
            # Leaves only this batch's page loop; the next batch still runs
            break
        time.sleep(1)  # pause between requests

# Turn the accumulated records into a table
df = pd.DataFrame(data_list)

# Write the table to disk without the index column
output_path = "news_data2.csv"
df.to_csv(output_path, index=False)

print("Data saved to news_data2.csv")
