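Scrape the main Google search results for the labeled coins, restricted to each listed subreddit (`site:reddit.com/<subreddit>`) and to the coin's start/scam date range, and insert them into the `reddit_metadata` index.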

In [1]:
from CryptoFraudDetection.utils.enums import ScraperNotebookMode, LoggerMode
from CryptoFraudDetection.utils.logger import Logger
from CryptoFraudDetection.scraper.google_results import GoogleResultsScraper
from CryptoFraudDetection.elasticsearch.data_insertion import insert_dict
from CryptoFraudDetection.utils.exceptions import DetectedBotException

import pandas as pd

# Notebook-wide logger: DEBUG level, with "../logs" passed as the log directory.
logger_ = Logger(name="scrape_test", level=LoggerMode.DEBUG, log_dir="../logs")

In [2]:
# Run mode for this notebook: the scraping cell only runs its loop when MODE is WRITE.
MODE = ScraperNotebookMode.WRITE

In [3]:
# Read the subreddits to search from data/raw/subreddit_list.txt (one per line).
# Blank lines are skipped so they cannot produce an empty "site:reddit.com/" filter.
with open("../data/raw/subreddit_list.txt", "r", encoding="utf-8") as f:
    subreddits = [line.strip() for line in f if line.strip()]

subreddits

['r/CryptoCurrency', 'r/CryptoMoonShots', 'r/CryptoMarkets']

In [4]:
# Read the labeled coins from data/raw/labeled_coins.csv.
# No date parsing is requested here; the scraping cell concatenates the
# "Start Date" / "Scam Date" values directly into the search query.
coins = pd.read_csv("../data/raw/labeled_coins.csv")
coins

Unnamed: 0,Coin Name,Alternative Name,Short Name,Classification,Start Date,Scam Date
0,FTX Token,,FTT,Scam,2019-08-01,2022-11-07
1,Terra Luna Classic,Terra Luna,LUNC,Scam,2019-07-27,
2,Squid-Game-Coin,,SQUID,Scam,2024-10-15,
3,BeerCoin,,BEER,Scam,2024-05-27,
4,BitForex,,BF,Scam,2019-08-31,
5,Bitcoin,,BTC,Non-Scam,2010-07-14,
6,Ethereum,,ETH,Non-Scam,2015-08-08,
7,Cosmos,,ATOM,Non-Scam,2019-03-15,
8,Avalanche,,AVAX,Non-Scam,2020-09-23,


In [5]:
# Download a plain-text list of free proxies from ProxyScrape and keep the
# first (only) column as an array of proxy addresses.
proxy_list_api = "https://api.proxyscrape.com/v4/free-proxy-list/get?request=display_proxies&proxy_format=protocolipport&format=text"
proxy_list = pd.read_csv(proxy_list_api, header=None).iloc[:, 0].values

# One shared iterator, so each call to get_next_proxy() hands out a proxy
# that has not been returned before.
proxy_gen = iter(proxy_list)


def get_next_proxy():
    """Return the next not-yet-used proxy address from ``proxy_list``.

    Raises:
        RuntimeError: if every proxy in ``proxy_list`` has already been
            handed out (instead of leaking a bare ``StopIteration``).
    """
    proxy = next(proxy_gen, None)
    if proxy is None:
        raise RuntimeError(
            f"Proxy list exhausted after {len(proxy_list)} proxies; re-run this cell to fetch a fresh list."
        )
    return proxy

In [6]:
def build_query(coin, subreddit):
    """Build the Google search query for one coin inside one subreddit.

    Args:
        coin: Row of ``coins`` providing "Coin Name", "Alternative Name",
            "Short Name", "Scam Date" and "Start Date".
        subreddit: Subreddit path appended to ``site:reddit.com/``.

    Returns:
        The query terms separated by single spaces. Missing (NaN) optional
        values are left out entirely, so the query has no doubled or
        trailing spaces.
    """
    parts = [str(coin["Coin Name"])]
    if not pd.isna(coin["Alternative Name"]):
        parts.append(str(coin["Alternative Name"]))
    parts.append(str(coin["Short Name"]))
    parts.append(f"site:reddit.com/{subreddit}")
    if not pd.isna(coin["Scam Date"]):
        parts.append(f"before:{coin['Scam Date']}")
    if not pd.isna(coin["Start Date"]):
        parts.append(f"after:{coin['Start Date']}")
    return " ".join(parts)


proxy_address = get_next_proxy()
logger_.debug(f"Using proxy {proxy_address}")

# Passed to get_main_results() as n_sites.
n_sites = 30
# Test limit: only the first `max_coins` coins are scraped.
# Set to None to scrape every coin in `coins`.
max_coins = 1

if MODE == ScraperNotebookMode.WRITE:
    coins_to_scrape = coins if max_coins is None else coins.head(max_coins)
    for _, coin in coins_to_scrape.iterrows():
        for subreddit in subreddits:
            try:
                logger_.info(f"Scraping {coin['Coin Name']} in {subreddit}")
                query = build_query(coin, subreddit)
                logger_.debug(f"Query: {query}")
                # NOTE(review): proxy_address is never handed to the scraper in this
                # cell, so rotating it below appears to have no effect on requests.
                # TODO confirm how GoogleResultsScraper is meant to receive a proxy.
                scraper = GoogleResultsScraper(logger=logger_)
                results = scraper.get_main_results(
                    query, n_sites=n_sites, headless=False
                )
                insert_dict(logger=logger_, index="reddit_metadata", data_dict=results)
            except DetectedBotException:
                # The current coin/subreddit pair is skipped, not retried.
                logger_.warning("Detected bot, changing proxy")
                proxy_address = get_next_proxy()
                logger_.info(f"Using proxy {proxy_address}")

2024-11-11 07:58:10,616 - scrape_test - DEBUG - Using proxy http://165.22.77.86:80
2024-11-11 07:58:10,618 - scrape_test - INFO - Scraping FTX Token in r/CryptoCurrency
2024-11-11 07:58:10,619 - scrape_test - DEBUG - Query: FTX Token FTT site:reddit.com/r/CryptoCurrency before:2022-11-07 after:2019-08-01
2024-11-11 07:58:12,811 - scrape_test - INFO - Accepted Google's cookies.
2024-11-11 07:58:13,906 - scrape_test - INFO - Search query submitted successfully.
2024-11-11 07:58:14,951 - scrape_test - DEBUG - Missing elements in result box. Skipping
2024-11-11 07:58:15,024 - scrape_test - DEBUG - Missing elements in result box. Skipping
2024-11-11 07:58:15,560 - scrape_test - INFO - Scraping FTX Token in r/CryptoMoonShots
2024-11-11 07:58:15,561 - scrape_test - DEBUG - Query: FTX Token FTT site:reddit.com/r/CryptoMoonShots before:2022-11-07 after:2019-08-01
2024-11-11 07:58:17,736 - scrape_test - INFO - Accepted Google's cookies.
2024-11-11 07:58:18,826 - scrape_test - INFO - Search query