In [None]:
from dotenv import load_dotenv
import os
# python-dotenv: pull variables from a local .env file (if one exists) into the environment.
load_dotenv()
# os.getenv returns None when EODHD_API_KEY is unset; the request cell below
# interpolates this value straight into the URL.
eodhd_api_key = os.getenv('EODHD_API_KEY')

In [None]:
import requests

# Stop with a clear message rather than sending "api_token=None" to the API.
if not eodhd_api_key:
    raise RuntimeError(
        "EODHD_API_KEY is not set; define it in the environment or in a .env file."
    )

# News query for "venture capital" (t, offset, limit and fmt are passed as written).
url = f'https://eodhd.com/api/news?t=venture%20capital&offset=1000&limit=1000&api_token={eodhd_api_key}&fmt=json'

# timeout: keep the cell from hanging indefinitely on a stalled connection.
# raise_for_status: surface HTTP errors instead of binding an error payload to `data`.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()

In [None]:
# Echo the parsed response for a visual check.
# NOTE(review): this renders every record; a slice such as data[:5] would keep the output short.
data

In [None]:
# Number of top-level entries in the parsed response.
len(data)

In [None]:
from collections import Counter
import json

# Assuming 'data' is your loaded JSON data
# If you need to load from file instead, uncomment below:
# with open('your_file.json', 'r') as f:
#     data = json.load(f)

def analyze_top_items(data, key_name, top_n=10):
    """Return the `top_n` most frequent values stored under `key_name`.

    Every record in `data` that contains `key_name` with a list value
    contributes the elements of that list; all other records are skipped.

    Returns:
        A list of (value, count) pairs, most frequent first.
    """
    tally = Counter()
    for record in data:
        if key_name not in record:
            continue
        values = record[key_name]
        # Only list values are counted; strings or None under the key are ignored.
        if isinstance(values, list):
            tally.update(values)
    return tally.most_common(top_n)

# Get the most common tags (up to 100)
top_tags = analyze_top_items(data, 'tags', top_n=100)
# The heading reports how many tags were actually returned; the previous
# hard-coded "Top 10" did not match top_n=100.
print(f"Top {len(top_tags)} Tags:")
for tag, count in top_tags:
    print(f"{tag}: {count}")

print("\n" + "-"*50 + "\n")

# Get top 10 symbols (analyze_top_items defaults to top_n=10)
top_symbols = analyze_top_items(data, 'symbols')
print("Top 10 Symbols:")
for symbol, count in top_symbols:
    print(f"{symbol}: {count}")

In [None]:
from datetime import datetime
import numpy as np

def analyze_dates(data):
    """Print the date range and gap statistics for a list of records.

    Args:
        data: iterable of mappings, each with a 'date' string that
            datetime.fromisoformat can parse.

    Prints the earliest and latest date plus the mean and standard deviation
    (np.std with its default settings) of the gaps, in hours, between
    consecutive dates. With no records it prints a short notice instead of
    failing. Returns None.

    Note: a later cell in this notebook defines another analyze_dates that
    returns (mean, std, diffs) instead of printing; whichever ran last is the
    one in effect.
    """
    dates = sorted(datetime.fromisoformat(item['date']) for item in data)

    # Without this guard, dates[0] below raises IndexError on empty input.
    if not dates:
        print("No dated entries to analyze.")
        return

    # Gaps in hours between each date and the next one.
    diffs = [
        (later - earlier).total_seconds() / 3600
        for earlier, later in zip(dates, dates[1:])
    ]

    # A single record has no gaps; report 0 rather than the nan np.mean([]) gives.
    mean_diff = np.mean(diffs) if diffs else 0
    std_diff = np.std(diffs) if diffs else 0

    print(f"Earliest date: {dates[0]}")
    print(f"Latest date: {dates[-1]}")
    print(f"Mean time between entries (hours): {mean_diff}")
    print(f"Std deviation (hours): {std_diff}")

In [None]:
# Run the date analysis on `data` (with the definition in the cell above, this prints the summary).
analyze_dates(data)

In [None]:
from datetime import datetime
import numpy as np

def analyze_dates(data):
    """Compute gap statistics, in hours, between consecutive record dates.

    Each record's 'date' is parsed with datetime.fromisoformat and the dates
    are put in ascending order before the gaps are measured.

    Returns:
        (mean_gap, std_gap, gaps) where gaps is the list of hour differences.
        With fewer than two records the result is (0, 0, []).

    Note: this reuses the name of the printing analyze_dates defined in an
    earlier cell and replaces it once this cell has run.
    """
    timeline = sorted(datetime.fromisoformat(record['date']) for record in data)
    gaps_hours = [
        (later - earlier).total_seconds() / 3600
        for earlier, later in zip(timeline, timeline[1:])
    ]
    if not gaps_hours:
        # No gaps to summarise: fall back to plain zeros.
        return 0, 0, gaps_hours
    return np.mean(gaps_hours), np.std(gaps_hours), gaps_hours

# Gap statistics for each of the four datasets.
# NOTE(review): in the cells visible here, data2/data3/data4 are only assigned in the
# final "Load" cell, so that cell must have run before this one — confirm the intended order.
(mean1, std1, diffs1), (mean2, std2, diffs2), (mean3, std3, diffs3), (mean4, std4, diffs4) = [
    analyze_dates(dataset) for dataset in (data, data2, data3, data4)
]

# Pool every gap so the overall figures describe all datasets together.
all_diffs = [*diffs1, *diffs2, *diffs3, *diffs4]
if all_diffs:
    overall_mean = np.mean(all_diffs)
    overall_std = np.std(all_diffs)
else:
    overall_mean = overall_std = 0

print(f"Mean1: {mean1}, Std1: {std1}")
print(f"Mean2: {mean2}, Std2: {std2}")
print(f"Mean3: {mean3}, Std3: {std3}")
print(f"Mean4: {mean4}, Std4: {std4}")
print(f"Overall mean (hours): {overall_mean}")
print(f"Overall std (hours): {overall_std}")

In [None]:
import numpy as np
from scipy import stats

def mean_confidence_interval(data, confidence=0.95):
    """Return (mean, lower, upper) for a two-sided t-based interval on the mean.

    Args:
        data: sequence of numbers.
        confidence: coverage level of the interval (0.95 by default).

    The half-width is the standard error (scipy.stats.sem) multiplied by the
    Student-t quantile at (1 + confidence) / 2 with len(data) - 1 degrees of
    freedom.
    """
    samples = np.array(data)
    center = np.mean(samples)
    dof = len(samples) - 1
    t_crit = stats.t.ppf((1 + confidence) / 2., dof)
    half_width = stats.sem(samples) * t_crit
    return center, center - half_width, center + half_width

# Example usage:
# Interval for the mean of all_diffs (the pooled gaps built in the comparison cell), default 95% level.
mean, lower, upper = mean_confidence_interval(all_diffs)
print(f"Mean: {mean:.2f} hours")
print(f"95% confidence interval: [{lower:.2f}, {upper:.2f}] hours")

In [None]:
import numpy as np
from scipy import stats

def std_confidence_interval(data, confidence=0.95):
    """Return (s, lower, upper) for a chi-square interval on the standard deviation.

    Args:
        data: sequence of numbers.
        confidence: coverage level of the interval (0.95 by default).

    s is the sample standard deviation (ddof=1). The bounds are
    sqrt((n - 1) * s**2 / q), with q the upper and lower chi-square quantiles
    at n - 1 degrees of freedom.
    NOTE(review): this interval presumes roughly normal data — confirm that
    is acceptable for the values passed in.
    """
    samples = np.array(data)
    dof = len(samples) - 1
    s = np.std(samples, ddof=1)
    tail = (1 - confidence) / 2
    q_low = stats.chi2.ppf(tail, dof)
    q_high = stats.chi2.ppf(1 - tail, dof)
    scaled_var = dof * s**2
    # The larger quantile yields the lower bound, and vice versa.
    return s, np.sqrt(scaled_var / q_high), np.sqrt(scaled_var / q_low)

# Example usage:
# Interval for the standard deviation of all_diffs, default 95% level.
# Note: this rebinds `lower` and `upper`, which the mean-interval cell also assigns.
std, lower, upper = std_confidence_interval(all_diffs)
print(f"Std: {std:.2f} hours")
print(f"95% confidence interval for std: [{lower:.2f}, {upper:.2f}] hours")

In [None]:
def compare_article_overlaps(data, data2, data3, data4, limit=50):
    """Find links shared between the leading records of four datasets.

    Only the first `limit` records of each dataset are considered, and each
    record must provide a 'link' entry.

    Returns:
        A dict of link sets: one per pair of datasets plus 'all_four'. In the
        key names, tech, business, ai and earnings stand for data, data2,
        data3 and data4 respectively.
    """
    tech, business, ai, earnings = (
        {entry['link'] for entry in dataset[:limit]}
        for dataset in (data, data2, data3, data4)
    )

    return {
        'tech_business': tech & business,
        'tech_ai': tech & ai,
        'tech_earnings': tech & earnings,
        'business_ai': business & ai,
        'business_earnings': business & earnings,
        'ai_earnings': ai & earnings,
        'all_four': tech & business & ai & earnings,
    }

# Example usage:
# Compare the first 50 records (the default limit) of each dataset, then show
# how many shared links each comparison found.
overlaps = compare_article_overlaps(data, data2, data3, data4)
print({k: len(v) for k, v in overlaps.items()})

In [None]:
import os
import json

def save_data(variable, filename):
    """Write `variable` as JSON to example_data/<filename>.

    The 'example_data' folder is created when missing, and an existing file
    with the same name is overwritten.
    """
    folder = 'example_data'
    os.makedirs(folder, exist_ok=True)
    with open(os.path.join(folder, filename), 'w') as out_file:
        json.dump(variable, out_file)

def load_data(filename):
    """Load and return the JSON content of example_data/<filename>.

    Args:
        filename: name of a file inside the 'example_data' folder.

    Returns:
        The parsed JSON value (a list of dicts for files written by save_data).

    Raises:
        FileNotFoundError: if the file does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    filepath = os.path.join('example_data', filename)
    # Decode explicitly as UTF-8: without it open() uses the platform locale
    # encoding, which can garble non-ASCII text. Files written by save_data are
    # plain ASCII (json.dump escapes non-ASCII by default), so they read the same.
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

In [None]:
# Save the current `data` list to example_data/venture_capital.json
save_data(data, 'venture_capital.json')

In [None]:
# Load four previously saved datasets from example_data/.
# NOTE(review): this rebinds `data`, replacing whatever the API cell fetched, and the
# file names here differ from the 'venture_capital.json' written by the Save cell —
# confirm these files exist and are the intended inputs.
# NOTE(review): the cells above that use data2/data3/data4 need this cell to have run first.
data = load_data('data.json')
data2 = load_data('data2.json')
data3 = load_data('data3.json')
data4 = load_data('data4.json')