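Scrape the main Google search results for Reddit posts about each labeled coin (restricted to the listed subreddits and to the coin's start/scam date range) and store the results in the "reddit_metadata" index.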

In [1]:
import time

from CryptoFraudDetection.utils.logger import Logger
from CryptoFraudDetection.utils.exceptions import ProxyNotWorking, DetectedBotException
from CryptoFraudDetection.utils.enums import ScraperNotebookMode, LoggerMode
from CryptoFraudDetection.elasticsearch.data_insertion import insert_dict
from CryptoFraudDetection.scraper.google_results import GoogleResultsScraper
from CryptoFraudDetection.scraper.utils import get_driver

import pandas as pd
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Notebook-wide logger; DEBUG level so the generated search queries are logged too.
logger_ = Logger(
    name="scrape_reddit_metadata",
    level=LoggerMode.DEBUG,
    log_dir="../logs",
)

In [2]:
# Run mode of this notebook: the scraping loop further down only executes
# when MODE is ScraperNotebookMode.WRITE.
MODE = ScraperNotebookMode.WRITE

In [3]:
# Read the subreddits to search (one per line) from data/raw/subreddit_list.txt.
# Blank lines are skipped so that no search query is built with an empty
# subreddit name.
with open("../data/raw/subreddit_list.txt", "r", encoding="utf-8") as f:
    subreddits = [line.strip() for line in f if line.strip()]

subreddits

['r/CryptoCurrency', 'r/CryptoMoonShots', 'r/CryptoMarkets']

In [4]:
# Read the labeled coins from data/raw/labeled_coins.csv.
# The scraping loop below uses the "Coin Name", "Short Name", "Start Date"
# and "Scam Date" columns of this table.
coins = pd.read_csv("../data/raw/labeled_coins.csv")
coins

Unnamed: 0,Coin Name,Alternative Name,Short Name,Classification,Start Date,Scam Date
0,FTX Token,,FTT,Scam,2019-08-01,2022-11-07
1,Terra Luna Classic,Terra Classic,LUNC,Scam,2019-07-27,2022-05-08
2,BeerCoin,,BEER,Scam,2024-05-27,2024-06-24
3,BitForex,,BF,Scam,2019-08-31,2024-03-05
4,Bitcoin,,BTC,Non-Scam,2010-07-14,
5,Ethereum,,ETH,Non-Scam,2015-08-08,
6,Cosmos,,ATOM,Non-Scam,2019-03-15,
7,Avalanche,,AVAX,Non-Scam,2020-09-23,


In [None]:
def get_next_proxy(
    link="https://api.proxyscrape.com/v4/free-proxy-list/get?request=display_proxies&proxy_format=protocolipport&format=csv&timeout=2000&country=korea",
):
    """Pick one proxy at random from a CSV proxy list.

    Despite the name, proxies are not rotated in order: the list is read
    again on every call and a single row is sampled at random, so the same
    proxy can be returned twice.

    Args:
        link: Anything ``pandas.read_csv`` accepts (URL, path or file-like
            object) that yields the proxy list as CSV. Callers in this
            notebook read the ``protocol``, ``ip`` and ``port`` fields of
            the returned row.

    Returns:
        pandas.Series: One randomly chosen row of the proxy list.

    Raises:
        ValueError: If the proxy list contains no rows.
    """
    proxy_list = pd.read_csv(link)
    if proxy_list.empty:
        # sample(1) on an empty frame fails with an obscure sampling error;
        # raise the same exception type with a message that names the cause.
        raise ValueError(f"No proxies available from {link!r}")
    return proxy_list.sample(1).iloc[0]

In [6]:
# Disabled manual check (kept for reference, not executed): fetch a random
# proxy, start a non-headless driver through it via get_driver, then quit
# the driver again.
# proxy_info = get_next_proxy()
# logger_.debug(f"Using proxy {proxy_info.protocol}://{proxy_info.ip}:{proxy_info.port}")

# driver = get_driver(
#    proxy_protocol=proxy_info.protocol,
#    proxy_address=f"{proxy_info.ip}:{proxy_info.port}",
#    headless=False,
# )

# driver.quit()

In [None]:
# Scrape the Google results for every (coin, subreddit) pair through a
# randomly chosen proxy and insert them into the "reddit_metadata" index.
# A failed attempt switches to another proxy and is retried.

# Number of result sites requested per query.
N_SITES = 30
# Upper bound on attempts per (coin, subreddit) pair, so that a persistent
# non-proxy error (e.g. a failing insert) cannot keep the loop running forever.
MAX_ATTEMPTS = 25
# Pause in seconds after every attempt, successful or not.
PAUSE_SECONDS = 5


def build_query(coin, subreddit):
    """Build the Google search query for one coin within one subreddit.

    Args:
        coin: Row of the labeled coins table; the "Coin Name", "Short Name",
            "Start Date" and "Scam Date" fields are read.
        subreddit: Subreddit path as listed in the subreddit file,
            e.g. "r/CryptoCurrency".

    Returns:
        str: Query of the form
        "<name> <short name> site:reddit.com/<subreddit> [before:<scam date>] [after:<start date>]",
        where a date filter is left out when the corresponding date is missing.
    """
    # NOTE: "Alternative Name" is currently not part of the query.
    parts = [
        str(coin["Coin Name"]),
        str(coin["Short Name"]),
        f"site:reddit.com/{subreddit}",
    ]
    if not pd.isna(coin["Scam Date"]):
        parts.append(f"before:{coin['Scam Date']}")
    if not pd.isna(coin["Start Date"]):
        parts.append(f"after:{coin['Start Date']}")
    return " ".join(parts)


proxy_info = get_next_proxy()
logger_.info(f"Using proxy {proxy_info.protocol}://{proxy_info.ip}:{proxy_info.port}")

if MODE == ScraperNotebookMode.WRITE:
    for _, coin in coins.iterrows():
        for subreddit in subreddits:
            query = build_query(coin, subreddit)
            for attempt in range(1, MAX_ATTEMPTS + 1):
                try:
                    logger_.info(f"Scraping {coin['Coin Name']} in {subreddit}")
                    logger_.debug(f"Query: {query}")
                    scraper = GoogleResultsScraper(logger=logger_)
                    results = scraper.get_main_results(
                        query,
                        n_sites=N_SITES,
                        headless=False,
                        proxy_protocol=proxy_info.protocol,
                        proxy_address=f"{proxy_info.ip}:{proxy_info.port}",
                    )
                    insert_dict(
                        logger=logger_, index="reddit_metadata", data_dict=results
                    )
                    # Success: leave the retry loop (the finally clause still
                    # pauses before the next query).
                    break
                except Exception as e:
                    # Keep the best-effort retry, but record what went wrong
                    # instead of silently discarding the exception.
                    logger_.warning(
                        f"Attempt {attempt}/{MAX_ATTEMPTS} failed "
                        f"({type(e).__name__}: {e}): "
                        "Detected bot, proxy not working or other error"
                    )
                    try:
                        proxy_info = get_next_proxy()
                    except Exception as proxy_error:
                        # Fetching the proxy list can fail as well; do not
                        # abort the whole run, retry with the previous proxy.
                        logger_.warning(
                            f"Could not fetch a new proxy "
                            f"({type(proxy_error).__name__}: {proxy_error}); "
                            "keeping the previous proxy"
                        )
                    else:
                        logger_.info(
                            f"Using proxy {proxy_info.protocol}://{proxy_info.ip}:{proxy_info.port}"
                        )
                finally:
                    time.sleep(PAUSE_SECONDS)
            else:
                # Only reached when no attempt succeeded (loop ended without break).
                logger_.warning(
                    f"Giving up on {coin['Coin Name']} in {subreddit} "
                    f"after {MAX_ATTEMPTS} attempts"
                )

2024-11-14 11:40:41,491 - scrape_reddit_metadata - INFO - Using proxy socks4://158.180.46.199:1004
2024-11-14 11:40:41,492 - scrape_reddit_metadata - INFO - Scraping FTX Token in r/CryptoCurrency
2024-11-14 11:40:41,492 - scrape_reddit_metadata - DEBUG - Query: FTX Token FTT site:reddit.com/r/CryptoCurrency before:2022-11-07 after:2019-08-01
2024-11-14 11:40:43,601 - scrape_reddit_metadata - INFO - Using proxy http://64.92.82.58:8080
2024-11-14 11:40:48,607 - scrape_reddit_metadata - INFO - Scraping FTX Token in r/CryptoCurrency
2024-11-14 11:40:48,609 - scrape_reddit_metadata - DEBUG - Query: FTX Token FTT site:reddit.com/r/CryptoCurrency before:2022-11-07 after:2019-08-01
2024-11-14 11:41:51,552 - scrape_reddit_metadata - INFO - Using proxy http://135.181.154.225:80
2024-11-14 11:41:56,559 - scrape_reddit_metadata - INFO - Scraping FTX Token in r/CryptoCurrency
2024-11-14 11:41:56,563 - scrape_reddit_metadata - DEBUG - Query: FTX Token FTT site:reddit.com/r/CryptoCurrency before:2022