In [1]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import time
from urllib.parse import quote

## Test API 

In [None]:
# Tone is a FILTER in the query string, not a separate param
params = {
    "query": "trump crypto tone:>5 sourcelang:English",  # Only positive articles
    "mode": "artlist",
    "format": "json",
    "timespan": "24hours",
    "maxrecords": 5
}

# Tone filtering options:
"query": "bitcoin tone:>5"      # Positive (tone > 5)
"query": "bitcoin tone:<-5"     # Negative (tone < -5)
"query": "bitcoin tone:>10"     # Very positive
"query": "bitcoin tone:<-10"    # Very negative

# All params explained:

# IN QUERY STRING (part of "query" param):
# - sourcelang:English          # Language filter
# - sourcecountry:US            # Country (US, GB, DE, etc.)
# - domainis:coindesk.com       # Exact domain
# - domain:cnn.com              # Domain + subdomains
# - tone:>5 or tone:<-5         # Sentiment filter (-100 to +100)
# - theme:ECON_CRYPTOCURRENCY   # GDELT theme
# - near5:"word1 word2"         # Words within 5 words of each other

# SEPARATE PARAMS:
params = {
    "mode": "artlist",           # artlist, timelinevol, timelinevolraw, imagecollagelist
    "format": "json",            # json, csv, html
    "timespan": "3months",       # 15min, 1hour, 24hours, 7days, 1month, 3months
    "maxrecords": 250,           # 1-250 (only for artlist mode)
    "sort": "DateDesc",          # DateDesc, DateAsc, ToneDesc, ToneAsc (only artlist)
    "startdatetime": "20250101000000",  # YYYYMMDDHHMMSS format
    "enddatetime": "20250115000000"     # YYYYMMDDHHMMSS format
}

# Note: Tone scores in results are -100 to +100 scale
# Articles already include tone scores in response - you don't "add" them

In [None]:
base_url = "https://api.gdeltproject.org/api/v2/doc/doc"
params = {
    "query": "trump crypto sourcelang:English",  # No manual encoding!
    "mode": "artlist",
    "format": "json",
    "timespan": "1280hours",
    "maxrecords": 20
}
response = requests.get(base_url, params=params)
data_dict = response.json()



JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [39]:
data_dict

{'articles': [{'url': 'https://www.dailymail.co.uk/news/article-15190969/trump-insider-trader-china-tariff.html',
   'url_mobile': 'https://www.dailymail.co.uk/news/article-15190969/amp/trump-insider-trader-china-tariff.html',
   'title': 'One minute before Trump China tariff shock - an intriguing market bet',
   'seendate': '20251014T160000Z',
   'socialimage': 'https://i.dailymail.co.uk/1s/2025/10/14/16/102978005-0-image-m-13_1760454464928.jpg',
   'domain': 'dailymail.co.uk',
   'language': 'English',
   'sourcecountry': 'United Kingdom'},
  {'url': 'https://www.forbes.com/sites/digital-assets/2025/10/11/trump-surprise-triggers-huge-crypto-flash-crash-as-traders-brace-for-bitcoin-ethereum-xrp-bnb-and-solana-price-wipeout/',
   'url_mobile': '',
   'title': '  Trump Surprise  Triggers Huge Crypto  Flash Crash  As Traders Brace For Bitcoin , Ethereum , XRP , BNB And Solana Price  Wipeout  ',
   'seendate': '20251011T093000Z',
   'socialimage': 'https://imageio.forbes.com/specials-imag

In [38]:
for article in data_dict['articles']:
    print(article['title'])
    print(article.get('tone')) 
    print(article.get('date'))

One minute before Trump China tariff shock - an intriguing market bet
None
None
  Trump Surprise  Triggers Huge Crypto  Flash Crash  As Traders Brace For Bitcoin , Ethereum , XRP , BNB And Solana Price  Wipeout  
None
None
Exclusive | Possible pardon for ex - Binance CEO sparking fierce White House debate
None
None
Social Security tax break : Will Fed Chair Jerome Powell speech today trigger a fresh crypto market crash ? Here what to track
None
None
Binance Blames Market Conditions , Not Platform Failures , for Crypto Crash Volatility
None
None
Crypto IPO Momentum Hit by Trump China Tariff Threats
None
None
Why is Barron Trump being accused of  insider trading ? Crypto short around China tariff decision sparks row
None
None
Donald Trump joins elite club of bitcoin investors with strategic crypto moves via Trump Media : Report
None
None
Crypto Whale shorting Bitcoin : Crypto whale linked to Trump slams insider rumors then opens $340 million Bitcoin short
None
None
crypto market crash : 

In [34]:
data_dict

{'articles': [{'url': 'https://www.dailymail.co.uk/news/article-15190969/trump-insider-trader-china-tariff.html',
   'url_mobile': 'https://www.dailymail.co.uk/news/article-15190969/amp/trump-insider-trader-china-tariff.html',
   'title': 'One minute before Trump China tariff shock - an intriguing market bet',
   'seendate': '20251014T160000Z',
   'socialimage': 'https://i.dailymail.co.uk/1s/2025/10/14/16/102978005-0-image-m-13_1760454464928.jpg',
   'domain': 'dailymail.co.uk',
   'language': 'English',
   'sourcecountry': 'United Kingdom'},
  {'url': 'https://economictimes.indiatimes.com/news/international/us/will-fed-chair-jerome-powells-speech-today-trigger-a-fresh-crypto-market-crash-heres-what-to-track/articleshow/124552775.cms',
   'url_mobile': 'https://m.economictimes.com/news/international/us/will-fed-chair-jerome-powells-speech-today-trigger-a-fresh-crypto-market-crash-heres-what-to-track/amp_articleshow/124552775.cms',
   'title': 'Social Security tax break : Will Fed Chair

In [30]:
s[:10]

'{"articles'

In [4]:

class GDELTBitcoinScraper:
    """
    Scraper for fetching Bitcoin-related news from GDELT DOC 2.0 API
    """
    
    def __init__(self):
        self.base_url = "https://api.gdeltproject.org/api/v2/doc/doc"
        
    def build_query_url(self, 
                       keywords,
                       mode="artlist",
                       max_records=250,
                       timespan="3months",
                       source_lang="English",
                       format_type="json",
                       domains=None,
                       tone=None):
        """
        Build GDELT API query URL
        
        Parameters:
        - keywords: str or list, search terms
        - mode: str, output mode (artlist, timelinevol, timelinevolraw, etc.)
        - max_records: int, number of records (1-250 for artlist)
        - timespan: str, time period (e.g., "24hours", "3months", "1week")
        - source_lang: str, language filter
        - format_type: str, output format (json, csv, html)
        - domains: str or list, specific domains to search
        - tone: str, tone filter (e.g., ">5" for positive, "<-5" for negative)
        """
        
        # Handle keywords - quote terms with spaces or special characters
        def needs_quotes(term):
            """Check if a term needs quotes (has spaces or special chars)"""
            special_chars = ['-', '_', '/', '\\', ':', ';', ',', '.', '!', '?']
            return ' ' in term or any(char in term for char in special_chars)
        
        if isinstance(keywords, list):
            query = " ".join([f'"{kw}"' if needs_quotes(kw) else kw for kw in keywords])
        else:
            query = f'"{keywords}"' if needs_quotes(keywords) else keywords
            
        # Build query parameters
        params = [f"query={quote(query)}"]
        
        if source_lang:
            params.append(f"sourcelang:{source_lang}")
            
        if domains:
            if isinstance(domains, list):
                for domain in domains:
                    params.append(f"domainis:{domain}")
            else:
                params.append(f"domainis:{domains}")
                
        if tone:
            params.append(f"tone:{tone}")
        
        # Join query parameters
        query_string = " ".join(params)
        
        # Build full URL
        url = f"{self.base_url}?query={quote(query_string)}"
        url += f"&mode={mode}"
        url += f"&format={format_type}"
        url += f"&timespan={timespan}"
        
        if mode == "artlist":
            url += f"&maxrecords={max_records}"
            
        return url
    
    def fetch_articles(self, 
                      keywords=["bitcoin", "cryptocurrency"],
                      max_records=250,
                      timespan="3months",
                      domains=None):
        """
        Fetch article list from GDELT
        
        Returns: DataFrame with columns: title, url, domain, language, 
                 seendate, sourcecountry
        """
        
        url = self.build_query_url(
            keywords=keywords,
            mode="artlist",
            max_records=max_records,
            timespan=timespan,
            domains=domains
        )
        
        print(f"Fetching from: {url}")
        
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            data = response.json()
            
            if "articles" in data and data["articles"]:
                df = pd.DataFrame(data["articles"])
                
                # Convert seendate to datetime
                if "seendate" in df.columns:
                    df["seendate"] = pd.to_datetime(df["seendate"], format="%Y%m%dT%H%M%SZ")
                
                print(f"Fetched {len(df)} articles")
                return df
            else:
                print("No articles found")
                return pd.DataFrame()
                
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data: {e}")
            return pd.DataFrame()
    
    def fetch_timeline(self, 
                      keywords=["bitcoin"],
                      timespan="1month",
                      mode="timelinevolraw"):
        """
        Fetch timeline data showing article volume over time
        
        Parameters:
        - mode: "timelinevol" (percentage) or "timelinevolraw" (raw counts)
        
        Returns: DataFrame with datetime and volume columns
        """
        
        url = self.build_query_url(
            keywords=keywords,
            mode=mode,
            timespan=timespan
        )
        
        print(f"Fetching timeline from: {url}")
        
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            data = response.json()
            
            if "timeline" in data and data["timeline"]:
                timeline = data["timeline"][0]
                
                # Parse data points
                dates = []
                values = []
                
                for item in timeline.get("data", []):
                    dates.append(datetime.strptime(item["date"], "%Y-%m-%d %H:%M:%S"))
                    values.append(item["value"])
                
                df = pd.DataFrame({
                    "datetime": dates,
                    "article_count": values
                })
                
                print(f"Fetched {len(df)} timeline data points")
                return df
            else:
                print("No timeline data found")
                return pd.DataFrame()
                
        except requests.exceptions.RequestException as e:
            print(f"Error fetching timeline: {e}")
            return pd.DataFrame()
    
    def fetch_batch_articles(self, 
                            keywords=["bitcoin"],
                            days_back=90,
                            batch_size=7):
        """
        Fetch articles in batches by date ranges to get more historical data
        
        Parameters:
        - days_back: int, how many days back to fetch
        - batch_size: int, days per batch (GDELT has 250 article limit per query)
        
        Returns: Combined DataFrame of all articles
        """
        
        all_articles = []
        end_date = datetime.now()
        
        for i in range(0, days_back, batch_size):
            batch_end = end_date - timedelta(days=i)
            batch_start = end_date - timedelta(days=i + batch_size)
            
            # Format dates for GDELT
            timespan = f"{batch_size}days"
            
            print(f"\nFetching batch: {batch_start.date()} to {batch_end.date()}")
            
            df = self.fetch_articles(
                keywords=keywords,
                timespan=timespan,
                max_records=250
            )
            
            if not df.empty:
                all_articles.append(df)
            
            # Be polite to the API
            time.sleep(2)
        
        if all_articles:
            combined_df = pd.concat(all_articles, ignore_index=True)
            # Remove duplicates based on URL
            combined_df = combined_df.drop_duplicates(subset=["url"])
            print(f"\nTotal unique articles: {len(combined_df)}")
            return combined_df
        else:
            return pd.DataFrame()

# # Example usage
# if __name__ == "__main__":
#     scraper = GDELTBitcoinScraper()
    
#     # Example 1: Fetch recent Bitcoin articles
#     print("=" * 50)
#     print("Fetching Bitcoin articles from last 24 hours")
#     print("=" * 50)
#     articles = scraper.fetch_articles(
#         keywords=["bitcoin"],
#         timespan="24hours",
#         max_records=100
#     )
    
#     if not articles.empty:
#         print("\nFirst 5 articles:")
#         print(articles[["title", "seendate", "domain"]].head())
    
#     # Example 2: Fetch timeline data
#     print("\n" + "=" * 50)
#     print("Fetching Bitcoin timeline for last 30 days")
#     print("=" * 50)
#     timeline = scraper.fetch_timeline(
#         keywords=["bitcoin"],
#         timespan="1month",
#         mode="timelinevolraw"
#     )
    
#     if not timeline.empty:
#         print("\nTimeline summary:")
#         print(timeline.describe())
    
#     # Example 3: Fetch from specific crypto news sources
#     print("\n" + "=" * 50)
#     print("Fetching from crypto-specific sources")
#     print("=" * 50)
#     crypto_articles = scraper.fetch_articles(
#         keywords=["bitcoin", "cryptocurrency"],
#         domains=["coindesk.com", "cointelegraph.com", "decrypt.co"],
#         timespan="7days",
#         max_records=250
#     )
    
#     if not crypto_articles.empty:
#         print(f"\nFound {len(crypto_articles)} articles from crypto sources")
#         print("\nDomain distribution:")
#         print(crypto_articles["domain"].value_counts())
    
#     # Example 4: Search for Bitcoin + specific events
#     print("\n" + "=" * 50)
#     print("Searching for Bitcoin regulation news")
#     print("=" * 50)
#     regulation_articles = scraper.fetch_articles(
#         keywords=["bitcoin", "regulation"],
#         timespan="30days",
#         max_records=100
#     )
    
#     if not regulation_articles.empty:
#         print(f"\nFound {len(regulation_articles)} regulation-related articles")

In [5]:


scraper = GDELTBitcoinScraper()

# Example 1: Fetch recent Bitcoin articles
print("=" * 50)
print("Fetching Bitcoin articles from last 24 hours")
print("=" * 50)
articles = scraper.fetch_articles(
    keywords=["bitcoin"],
    timespan="24hours",
    max_records=100
)

if not articles.empty:
    print("\nFirst 5 articles:")
    print(articles[["title", "seendate", "domain"]].head())

# Example 2: Fetch timeline data
print("\n" + "=" * 50)
print("Fetching Bitcoin timeline for last 30 days")
print("=" * 50)
timeline = scraper.fetch_timeline(
    keywords=["bitcoin"],
    timespan="1month",
    mode="timelinevolraw"
)

if not timeline.empty:
    print("\nTimeline summary:")
    print(timeline.describe())

# Example 3: Fetch from specific crypto news sources
print("\n" + "=" * 50)
print("Fetching from crypto-specific sources")
print("=" * 50)
crypto_articles = scraper.fetch_articles(
    keywords=["bitcoin", "cryptocurrency"],
    domains=["coindesk.com", "cointelegraph.com", "decrypt.co"],
    timespan="7days",
    max_records=250
)

if not crypto_articles.empty:
    print(f"\nFound {len(crypto_articles)} articles from crypto sources")
    print("\nDomain distribution:")
    print(crypto_articles["domain"].value_counts())

# Example 4: Search for Bitcoin + specific events
print("\n" + "=" * 50)
print("Searching for Bitcoin regulation news")
print("=" * 50)
regulation_articles = scraper.fetch_articles(
    keywords=["bitcoin", "regulation"],
    timespan="30days",
    max_records=100
)

if not regulation_articles.empty:
    print(f"\nFound {len(regulation_articles)} regulation-related articles")

Fetching Bitcoin articles from last 24 hours
Fetching from: https://api.gdeltproject.org/api/v2/doc/doc?query=query%3Dbitcoin%20sourcelang%3AEnglish&mode=artlist&format=json&timespan=24hours&maxrecords=100
Error fetching data: Expecting value: line 1 column 1 (char 0)

Fetching Bitcoin timeline for last 30 days
Fetching timeline from: https://api.gdeltproject.org/api/v2/doc/doc?query=query%3Dbitcoin%20sourcelang%3AEnglish&mode=timelinevolraw&format=json&timespan=1month
Error fetching timeline: Expecting value: line 1 column 1 (char 0)

Fetching from crypto-specific sources
Fetching from: https://api.gdeltproject.org/api/v2/doc/doc?query=query%3Dbitcoin%2520cryptocurrency%20sourcelang%3AEnglish%20domainis%3Acoindesk.com%20domainis%3Acointelegraph.com%20domainis%3Adecrypt.co&mode=artlist&format=json&timespan=7days&maxrecords=250
Error fetching data: Expecting value: line 1 column 1 (char 0)

Searching for Bitcoin regulation news
Fetching from: https://api.gdeltproject.org/api/v2/doc/doc?

## Test BIG QUERY

In [None]:
from google.cloud import bigquery
import pandas as pd

# Initialize client
client = bigquery.Client(project='your-project-id')

In [None]:

query = """
SELECT 
  DATE,
  SourceCommonName,
  DocumentIdentifier,
  V2Themes,
  V2Tone,
  AllNames
FROM `gdelt-bq.gdeltv2.gkg`
WHERE 
  DATE BETWEEN 20230101 AND 20231231
  AND (
    LOWER(SourceCommonName) = 'nytimes.com'
    OR DocumentIdentifier LIKE '%nytimes.com%'
  )
LIMIT 1000
"""