In [2]:
import json
import numbers

import joblib
import pandas as pd
from tqdm import tqdm

# Load the GDELT event table and the daily stock prices
gdelt_data = pd.read_csv('gdelt_data_test/gdelt_data_test.csv', low_memory=False)
stock_data = pd.read_csv('stock_data_test/stock_data_test.csv')

# Convert date columns to datetime format and sort both frames chronologically
gdelt_data['SQLDATE'] = pd.to_datetime(gdelt_data['SQLDATE'], format='%Y-%m-%d')
gdelt_data = gdelt_data.sort_values(by='SQLDATE')
stock_data['Date'] = pd.to_datetime(stock_data['Date'], format='%Y-%m-%d')
# The labelling loop further down compares each row's Close with the Close of
# the row positioned just before it, so the rows must be in date order for the
# up/down label to be meaningful. A stable sort keeps the file order for any
# rows that share a date.
stock_data = stock_data.sort_values(by='Date', kind='stable').reset_index(drop=True)

# Load lookup tables (JSON objects used as dicts via .get() when mapping
# EventCode and Actor1Name/Actor2Name values)
with open('cameo_embeddings.json', 'r') as f:
    event_lookup = json.load(f)
with open('actor_embeddings_cleaned.json', 'r') as f:
    actor_lookup = json.load(f)

# Function to convert all values to int, float, or string (for dates)
def convert_values(data):
    """Recursively normalise ``data`` into plain, JSON-friendly Python values.

    Conversion rules, applied in this order:

    * ``dict``  -> new dict with every value converted (keys are kept as-is).
    * ``list``  -> new list with every item converted.
    * ``int`` / ``float`` (including subclasses such as ``bool``) -> unchanged.
    * ``pd.Timestamp`` -> ``'YYYY-MM-DD'`` string.
    * ``str``   -> ``int`` if it parses as one, else ``float`` if it parses as
      one, else the original string.
    * other integral / real number objects (e.g. NumPy scalars such as
      ``np.int64`` or ``np.float32``) -> built-in ``int`` / ``float``.
    * anything else (``None``, tuples, ...) -> returned unchanged.

    Caveats of the string rule: every numeric-looking string is converted, so
    ``'010'`` becomes ``10``, and strings such as ``'nan'`` or ``'inf'`` become
    non-finite floats because ``float()`` accepts them.

    Args:
        data: Arbitrarily nested structure of dicts, lists and scalar values.

    Returns:
        A converted copy of ``data`` (containers are rebuilt, not mutated).
    """
    if isinstance(data, dict):
        return {k: convert_values(v) for k, v in data.items()}
    if isinstance(data, list):
        return [convert_values(item) for item in data]
    if isinstance(data, (int, float)):
        return data
    if isinstance(data, pd.Timestamp):
        return data.strftime('%Y-%m-%d')
    if isinstance(data, str):
        # Try the narrowest numeric type first; fall back to the raw string.
        for cast in (int, float):
            try:
                return cast(data)
            except ValueError:
                continue
        return data
    # NumPy integer/floating scalars are not int/float subclasses (np.float64
    # excepted) but are registered with the numbers ABCs. Left unconverted they
    # make json.dump fail, so turn them into built-in numbers here.
    if isinstance(data, numbers.Integral):
        return int(data)
    if isinstance(data, numbers.Real):
        return float(data)
    return data

# Initialize lists to store combined data and labels
combined_data = []
labels = []
# Stock rows that were skipped because a ValueError was raised while
# processing them (e.g. missing data), kept so that drops are not invisible.
skipped_entries = []


def _map_with_lookup(series, lookup):
    """Replace each value (or each item of a list value) by its ``lookup`` entry.

    Values that have no entry in ``lookup`` are left unchanged.
    """
    return series.apply(
        lambda x: [lookup.get(item, item) for item in x] if isinstance(x, list) else lookup.get(x, x)
    )


# Process the data for all date entries. Starts at 1 because each label
# compares a row's Close with the Close of the previous row.
last_processed_date = None
for i in tqdm(range(1, len(stock_data)), desc="Processing stock data"):
    try:
        current_date = stock_data.iloc[i]['Date']
        previous_close = stock_data.iloc[i - 1]['Close']
        current_close = stock_data.iloc[i]['Close']
        # 1 when the close rose versus the previous row, otherwise 0
        label = 1 if current_close > previous_close else 0

        # Filter relevant entries from gdelt_data for the current date
        relevant_entries = gdelt_data[gdelt_data['SQLDATE'] == current_date].copy()

        # Map actor names and event codes through their lookup tables
        for col, lookup in (
            ('Actor1Name', actor_lookup),
            ('Actor2Name', actor_lookup),
            ('EventCode', event_lookup),
        ):
            values = relevant_entries[col]
            if col == 'EventCode':
                # Check for missing codes BEFORE the string cast: astype(str)
                # turns NaN into the string 'nan', which isnull() cannot detect.
                if values.isnull().any():
                    raise ValueError(f"Missing data found in column {col} for date {current_date}")
                values = values.astype(str)

            relevant_entries[col] = _map_with_lookup(values, lookup)

            # Check for missing values after the mapping
            if relevant_entries[col].isnull().any():
                raise ValueError(f"Missing data found in column {col} for date {current_date}")

        # Convert relevant entries to a list of dictionaries for JSON serialization
        relevant_entries_dict = relevant_entries.to_dict(orient='records')

        # Convert all values to int, float, or string (for dates)
        relevant_entries_dict = convert_values(relevant_entries_dict)

        # Append to the lists, converting the date to a string
        combined_data.append({
            'date': convert_values(current_date),
            'label': label,
            'data': relevant_entries_dict
        })
        labels.append(label)

        # Update the last processed date
        last_processed_date = current_date

    except ValueError as e:
        # Best effort: skip this row without per-row output, but remember it
        skipped_entries.append({'index': i, 'reason': str(e)})
        continue
    except Exception as e:
        # Print other exceptions
        print(f"Error processing index {i}: {e}")
        continue

print("Data processing complete.")
if skipped_entries:
    print(f"Skipped {len(skipped_entries)} stock row(s) after a ValueError; see skipped_entries for details.")

# Save to compressed binary file using joblib with gzip
print("Saving combined data to compressed binary file...")
joblib.dump(combined_data, 'combined_data_test.pkl.gz', compress=('gzip', 3))
print("Combined data saved to 'combined_data_test.pkl.gz'.")

Processing stock data: 100%|██████████| 9/9 [00:11<00:00,  1.28s/it]


Data processing complete.
Saving combined data to compressed binary file...
Combined data saved to 'combined_data_test.pkl.gz'.


In [16]:
import joblib
import json
from tqdm import tqdm

# Load the compressed binary file
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load archives that come from a trusted source.
combined_data_2015 = joblib.load('combined_data_test.pkl.gz')

# Select the entries to export. The slice [:] copies the whole list, so ALL
# entries are written; the variable name is kept unchanged for compatibility.
first_5_entries = combined_data_2015[:]

# Save the entries to a JSON file with a progress bar. The array is written
# one entry at a time so that tqdm can report progress per entry.
with open('first_entries_2015.json', 'w') as json_file:
    json_file.write('[\n')  # Start the JSON array
    for i, entry in enumerate(tqdm(first_5_entries, desc="Saving entries to JSON")):
        json.dump(entry, json_file, indent=4)
        if i < len(first_5_entries) - 1:
            json_file.write(',\n')  # Add a comma after each entry except the last one
        else:
            json_file.write('\n')  # No comma after the last entry
    json_file.write(']\n')  # End the JSON array

# Report the number of entries actually written rather than a fixed "5"
print(f"{len(first_5_entries)} entries saved to 'first_entries_2015.json'.")

Saving entries to JSON: 100%|██████████| 9/9 [01:19<00:00,  8.87s/it]

First 5 entries saved to 'first_entries_2015.json'.





In [19]:
import json

# Read the exported JSON array back into memory
with open('first_entries_2015.json', 'r') as file:
    data = json.load(file)

# Drill down to the first record inside the first entry's 'data' list
first_entry = data[0]
first_data_entry = first_entry['data'][0]

# Pull out the three mapped columns in one pass
actor1_name, actor2_name, event_code = (
    first_data_entry[key] for key in ('Actor1Name', 'Actor2Name', 'EventCode')
)

# Report how many elements each column holds
print(f"Number of elements in Actor1Name: {len(actor1_name)}")
print(f"Number of elements in Actor2Name: {len(actor2_name)}")
print(f"Number of elements in EventCode: {len(event_code)}")

Number of elements in Actor1Name: 300
Number of elements in Actor2Name: 300
Number of elements in EventCode: 512


: 

In [1]:
import pandas as pd
import os

# Read the cleaned GDELT events and the stock prices
gdelt_data = pd.read_csv('gdelt_data_cleaned.csv', low_memory=False)
stock_data = pd.read_csv('stock_data.csv')

# Parse the date columns into datetimes
gdelt_data['SQLDATE'] = pd.to_datetime(gdelt_data['SQLDATE'], format='%Y-%m-%d')
stock_data['Date'] = pd.to_datetime(stock_data['Date'], format='%Y-%m-%d')

# Make sure both output folders exist before anything is written
for out_dir in ('gdelt_data_by_year', 'stock_data_by_year'):
    os.makedirs(out_dir, exist_ok=True)

# One CSV per calendar year for each table: GDELT first, then stock.
# Each row is (frame, date column to group on, output path template).
yearly_exports = (
    (gdelt_data, 'SQLDATE', 'gdelt_data_by_year/gdelt_data_{year}.csv'),
    (stock_data, 'Date', 'stock_data_by_year/stock_data_{year}.csv'),
)
for frame, date_column, filename_template in yearly_exports:
    for year, data in frame.groupby(frame[date_column].dt.year):
        filename = filename_template.format(year=year)
        data.to_csv(filename, index=False)
        print(f"Saved {filename}")

Saved gdelt_data_by_year/gdelt_data_2015.csv
Saved gdelt_data_by_year/gdelt_data_2016.csv
Saved gdelt_data_by_year/gdelt_data_2017.csv
Saved gdelt_data_by_year/gdelt_data_2018.csv
Saved gdelt_data_by_year/gdelt_data_2019.csv
Saved gdelt_data_by_year/gdelt_data_2020.csv
Saved gdelt_data_by_year/gdelt_data_2021.csv
Saved gdelt_data_by_year/gdelt_data_2022.csv
Saved gdelt_data_by_year/gdelt_data_2023.csv
Saved gdelt_data_by_year/gdelt_data_2024.csv
Saved stock_data_by_year/stock_data_2015.csv
Saved stock_data_by_year/stock_data_2016.csv
Saved stock_data_by_year/stock_data_2017.csv
Saved stock_data_by_year/stock_data_2018.csv
Saved stock_data_by_year/stock_data_2019.csv
Saved stock_data_by_year/stock_data_2020.csv
Saved stock_data_by_year/stock_data_2021.csv
Saved stock_data_by_year/stock_data_2022.csv
Saved stock_data_by_year/stock_data_2023.csv
Saved stock_data_by_year/stock_data_2024.csv


In [1]:
import pandas as pd
import os

# Read the cleaned GDELT events and the stock prices
gdelt_data = pd.read_csv('gdelt_data_cleaned.csv', low_memory=False)
stock_data = pd.read_csv('stock_data.csv')

# Parse the date columns into datetimes
gdelt_data['SQLDATE'] = pd.to_datetime(gdelt_data['SQLDATE'], format='%Y-%m-%d')
stock_data['Date'] = pd.to_datetime(stock_data['Date'], format='%Y-%m-%d')

# The ten earliest distinct dates present in the stock data
first_10_dates = stock_data['Date'].sort_values().unique()[:10]

# Keep only the rows of each table that fall on one of those dates
stock_on_test_dates = stock_data['Date'].isin(first_10_dates)
stock_data_test = stock_data[stock_on_test_dates]
gdelt_on_test_dates = gdelt_data['SQLDATE'].isin(first_10_dates)
gdelt_data_test = gdelt_data[gdelt_on_test_dates]

# Make sure the output folders for the test subsets exist
for out_dir in ('gdelt_data_test', 'stock_data_test'):
    os.makedirs(out_dir, exist_ok=True)

# Write the subsets out: stock first, then GDELT
stock_data_test_filename = 'stock_data_test/stock_data_test.csv'
gdelt_data_test_filename = 'gdelt_data_test/gdelt_data_test.csv'
for subset, destination in (
    (stock_data_test, stock_data_test_filename),
    (gdelt_data_test, gdelt_data_test_filename),
):
    subset.to_csv(destination, index=False)
    print(f"Saved {destination}")

Saved stock_data_test/stock_data_test.csv
Saved gdelt_data_test/gdelt_data_test.csv
