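Scrape Reddit post metadata via Google search: for every coin in `data/raw/coins.json` and each of its subreddits, run a date-restricted `site:reddit.com/r/<subreddit>` query and store the results in the Elasticsearch index `reddit_metadata_100`.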

In [1]:
import time
import json

from CryptoFraudDetection.utils.logger import Logger
from CryptoFraudDetection.utils.exceptions import ProxyNotWorking, DetectedBotException
from CryptoFraudDetection.utils.enums import ScraperNotebookMode, LoggerMode
from CryptoFraudDetection.elasticsearch.data_insertion import insert_dict
from CryptoFraudDetection.scraper.google_results import GoogleResultsScraper
from CryptoFraudDetection.scraper.utils import get_driver

import pandas as pd
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Notebook-wide logger at DEBUG level writing to ../logs; the same instance is
# handed to the scraper and to the Elasticsearch insert helper further down.
logger_ = Logger(name="scrape_reddit_metadata", level=LoggerMode.DEBUG, log_dir="../logs")

In [2]:
# Run mode of this notebook: the scraping/insertion loop only executes when
# MODE equals ScraperNotebookMode.WRITE.
MODE = ScraperNotebookMode.WRITE

In [3]:
# Load the coin definitions from data/raw/coins.json and order them by
# `max_market_cap_e9`, ascending (smallest first).
# The encoding is given explicitly so decoding the JSON file does not depend on
# the platform's default locale encoding.
with open('../data/raw/coins.json', encoding='utf-8') as f:
    coins = json.load(f)
coins = sorted(coins, key=lambda coin: coin['max_market_cap_e9'])
coins[0]

{'name': 'Safe Moon',
 'symbol': 'SAFEMOON',
 'fraud': True,
 'test': True,
 'max_market_cap_e9': 0.0,
 'start_date': '2022-01-17',
 'subreddits': ['CryptoCurrency',
  'CryptoMoonShots',
  'CryptoMarkets',
  'Crypto',
  'SafeMoon'],
 'end_date': '2023-10-31'}

In [4]:
def get_next_proxy(
    link="https://api.proxyscrape.com/v4/free-proxy-list/get?request=display_proxies&proxy_format=protocolipport&format=csv&timeout=2000",
):
    """Return one randomly chosen row from a CSV proxy listing.

    Args:
        link: Anything `pandas.read_csv` can read (URL, path or file-like
            object). Defaults to the ProxyScrape free proxy list in CSV format.

    Returns:
        A `pandas.Series` holding the selected row. Callers in this notebook
        read its `protocol`, `ip` and `port` fields, so the CSV is presumably
        expected to provide those columns.
    """
    # Download/parse the listing, draw a single random row and unwrap it
    # from the one-row DataFrame into a Series.
    return pd.read_csv(link).sample(n=1).iloc[0]

In [5]:
proxy_info = get_next_proxy()
logger_.info(f"Using proxy {proxy_info.protocol}://{proxy_info.ip}:{proxy_info.port}")

N_SITES = 100
# Upper bound on attempts per (coin, subreddit) pair. Without it, an error that
# is unrelated to the proxy (e.g. a failing insert) would be retried forever.
MAX_ATTEMPTS = 25


def _build_query(coin, subreddit):
    """Build the Google query for one coin restricted to one subreddit.

    The query consists of the coin name, a `site:reddit.com/r/<subreddit>`
    filter and, when the coin defines them, `before:<end_date>` and
    `after:<start_date>` filters. Parts are joined with single spaces, so no
    whitespace clean-up is needed when a date is absent.
    """
    parts = [coin["name"], f"site:reddit.com/r/{subreddit}"]
    if coin.get("end_date"):
        parts.append(f"before:{coin['end_date']}")
    if coin.get("start_date"):
        parts.append(f"after:{coin['start_date']}")
    return " ".join(parts)


if MODE == ScraperNotebookMode.WRITE:
    for coin in coins:
        for subreddit in coin["subreddits"]:
            query = _build_query(coin, subreddit)
            for attempt in range(1, MAX_ATTEMPTS + 1):
                try:
                    logger_.info(f"Scraping {coin['name']} in {subreddit}")
                    logger_.debug(f"Query: {query}")
                    scraper = GoogleResultsScraper(logger=logger_)
                    results = scraper.get_main_results(
                        query,
                        n_sites=N_SITES,
                        headless=True,
                        proxy_protocol=proxy_info.protocol,
                        proxy_address=f"{proxy_info.ip}:{proxy_info.port}",
                    )
                    insert_dict(
                        logger=logger_, index="reddit_metadata_100", data_dict=results
                    )
                    # Short pause between successful queries before moving on.
                    time.sleep(5)
                    break
                except Exception as e:
                    # Any failure is treated as a reason to rotate the proxy, but
                    # the actual exception is logged so the cause stays visible.
                    logger_.warning(
                        "Detected bot, proxy not working or other error "
                        f"(attempt {attempt}/{MAX_ATTEMPTS}): {type(e).__name__}: {e}"
                    )
                    proxy_info = get_next_proxy()
                    logger_.info(
                        f"Using proxy {proxy_info.protocol}://{proxy_info.ip}:{proxy_info.port}"
                    )
            else:
                # Loop finished without `break`: every attempt failed.
                logger_.warning(
                    f"Giving up on {coin['name']} in {subreddit} after {MAX_ATTEMPTS} failed attempts"
                )

2024-11-21 08:27:16,795 - scrape_reddit_metadata - INFO - Using proxy http://67.43.227.230:20587
2024-11-21 08:27:16,796 - scrape_reddit_metadata - INFO - Scraping Safe Moon in CryptoCurrency
2024-11-21 08:27:16,797 - scrape_reddit_metadata - DEBUG - Query: Safe Moon site:reddit.com/r/CryptoCurrency before:2023-10-31 after:2022-01-17
2024-11-21 08:27:30,103 - scrape_reddit_metadata - INFO - Search query submitted successfully.
2024-11-21 08:27:32,133 - scrape_reddit_metadata - DEBUG - Missing elements in result box. Skipping
2024-11-21 08:27:32,219 - scrape_reddit_metadata - DEBUG - Missing elements in result box. Skipping
2024-11-21 08:27:33,757 - scrape_reddit_metadata - INFO - Navigated to the next page of results.
2024-11-21 08:27:36,400 - scrape_reddit_metadata - INFO - Navigated to the next page of results.
2024-11-21 08:27:38,661 - scrape_reddit_metadata - INFO - Navigated to the next page of results.
2024-11-21 08:27:40,864 - scrape_reddit_metadata - INFO - Navigated to the nex