---
jupyter: python3
---

Scraping of Twitter, Google and Comparitech in this notebook.

Setting up the notebook:

In [1]:
import os
import hashlib
import pandas as pd
from datetime import datetime

from pprint import pprint
from CryptoFraudDetection.scraper.twitter import TwitterScraper
from CryptoFraudDetection.scraper.twitter import scrape_in_blocks
from CryptoFraudDetection.scraper.google_results import GoogleResultsScraper
from CryptoFraudDetection.scraper.comparitech import ComparitechScraper
from CryptoFraudDetection.utils.logger import Logger
from CryptoFraudDetection.elasticsearch.data_insertion import (
    insert_dict,
    insert_dataframe,
)
from CryptoFraudDetection.elasticsearch.data_retrieval import search_data

logger_ = Logger("logger")

Setting up the scrapers:

In [2]:
twitter_scraper = TwitterScraper(
    logger=logger_,
    username=os.getenv("TWITTER_USERNAME"),
    password=os.getenv("TWITTER_PASSWORD"),
    cookies_file_path="../data/raw/cookies/x.json",
)

google_results_scraper = GoogleResultsScraper(
    logger=logger_,
)

comparitech_scraper = ComparitechScraper(
    logger=logger_,
)

Scraping and saving Twitter data:

In [10]:
dict_test = twitter_scraper.scrape_with_cookies(tweet_count=35, headless=False)

2024-11-07 18:01:12,688 - logger - INFO - Cookies successfully loaded from ../data/raw/cookies/x.json
2024-11-07 18:01:32,918 - logger - DEBUG - Page title after loading cookies and navigating to Explore: Log in to X / X
2024-11-07 18:01:33,031 - logger - DEBUG - Close button clicked successfully.
2024-11-07 18:01:33,793 - logger - INFO - Searched for: Bitcoin
2024-11-07 18:01:43,824 - logger - DEBUG - Clicked on 'Latest'.
2024-11-07 18:01:44,389 - logger - DEBUG - Impressions element not found; defaulting to 0.
2024-11-07 18:01:44,399 - logger - DEBUG - Failed to extract aria-label from tweet.
2024-11-07 18:01:44,400 - logger - INFO - Scraped tweet 1/35
2024-11-07 18:01:44,454 - logger - DEBUG - Impressions element not found; defaulting to 0.
2024-11-07 18:01:44,461 - logger - DEBUG - Failed to extract aria-label from tweet.
2024-11-07 18:01:44,462 - logger - INFO - Scraped tweet 2/35
2024-11-07 18:01:44,642 - logger - DEBUG - Impressions element not found; defaulting to 0.
2024-11-07

In [11]:
dict_test

{'Username': ['@rovercrc',
  '@naiivememe',
  '@TheBTCTherapist',
  '@Ashcryptoreal',
  '@isabellasg3',
  '@LucasGeez',
  '@nnamanis995',
  '@TechieuwsMail',
  '@cyphergenesisAI',
  '@Lyonpups',
  '@BitcoinHODLER76',
  '@alexandre_etf',
  '@saratoshigems',
  '@Futuretrashy',
  '@mawshaikh24434',
  '@Brrrbon_',
  '@CryptoJoseJM',
  '@MOHON1SHANTA1',
  '@davidgokhshtein',
  '@ripcache',
  '@ripcache',
  '@DonnieBitcoin',
  '@Bitcoin503ES',
  '@NorqueNoq',
  '@ExcelTeknik',
  '@doom_nick187',
  '@AugustoBackes',
  '@nnamanis995',
  '@snowcoals',
  '@SexMoneyMigo',
  '@Ikuikasi',
  '@A_Cap1tal',
  '@Dennis5De',
  '@DonnieBitcoin',
  '@2HotBella',
  '@Headbagner',
  '@YLNZTEKADAM',
  '@BTC_Bella69420',
  '@adatreasury',
  '@JustynaEpolskie',
  '@DonnieBitcoin',
  '@BitcoinBugsy',
  '@NSC_Daily',
  '@shab_tamuh',
  '@allseeingrune',
  '@Tomk4at',
  '@BitcoinBull21M',
  '@Regginald167928',
  '@chidambara09',
  '@Sathi4291243582',
  '@rnoonrivers',
  '@rnoonrivers',
  '@rnoonrivers',
  '@rnoon

In [None]:

start_date = datetime.strptime('2018-12-01', '%Y-%m-%d')
end_date = datetime.strptime('2024-11-01', '%Y-%m-%d')

In [None]:
scrape_in_blocks(
    scraper=twitter_scraper,
    search_query="Bitcoin",
    start_date=start_date,
    end_date=end_date,
    block_count=26,
    total_tweet_count=4200,
    db_index="x",
    logger=logger_,
    headless=False,
)

2024-11-07 18:36:31,409 - logger - DEBUG - Scraping block 1/28: 82 tweets from 2018-03-01 00:00:00 to 2018-05-27 00:51:25.714286.
2024-11-07 18:36:39,132 - logger - INFO - Cookies successfully loaded from ../data/raw/cookies/x.json
2024-11-07 18:36:58,805 - logger - DEBUG - Page title after loading cookies and navigating to Explore: Log in to X / X
2024-11-07 18:36:58,922 - logger - DEBUG - Close button clicked successfully.
2024-11-07 18:36:59,853 - logger - INFO - Searched for: "Bitcoin" since:2018-03-01 until:2018-05-27
2024-11-07 18:37:08,481 - logger - INFO - Clicked on 'Latest'.
2024-11-07 18:37:09,247 - logger - DEBUG - Impressions element not found; defaulting to 0.
2024-11-07 18:37:09,257 - logger - DEBUG - Failed to extract aria-label from tweet.
2024-11-07 18:37:09,259 - logger - INFO - Scraped tweet 1/82
2024-11-07 18:37:09,317 - logger - DEBUG - Impressions element not found; defaulting to 0.
2024-11-07 18:37:09,325 - logger - INFO - Scraped tweet 2/82
2024-11-07 18:37:09,

KeyboardInterrupt: 

Scraping and saving Comparitech data:

In [None]:
comparitech_data = ComparitechScraper(logger_).get_data()
comparitech_data = pd.DataFrame(comparitech_data)
comparitech_data["id"] = comparitech_data["Company Name"].apply(
    lambda x: hashlib.md5(x.encode()).hexdigest()
)

In [None]:
insert_dataframe(logger=logger_, index="comparitech_scam_list", df=comparitech_data)