In [None]:
# Unused
import pandas as pd
import json
import joblib
from tqdm import tqdm

# Read the test-sized GDELT event table and the stock price table.
gdelt_data = pd.read_csv('gdelt_data_test/gdelt_data_test.csv', low_memory=False)
stock_data = pd.read_csv('stock_data_test/stock_data_test.csv')

# Parse the date columns into real timestamps; events are kept in chronological order.
gdelt_data = (
    gdelt_data
    .assign(SQLDATE=pd.to_datetime(gdelt_data['SQLDATE'], format='%Y-%m-%d'))
    .sort_values(by='SQLDATE')
)
stock_data = stock_data.assign(Date=pd.to_datetime(stock_data['Date'], format='%Y-%m-%d'))

# Lookup tables loaded from JSON: one keyed by event code, one keyed by actor name.
with open('cameo_embeddings.json', 'r') as f:
    event_lookup = json.load(f)
with open('actor_embeddings_cleaned.json', 'r') as f:
    actor_lookup = json.load(f)

def convert_values(data):
    """Recursively normalise a value into a JSON-friendly form.

    - dicts and lists are walked recursively (keys are left untouched);
    - ``pd.Timestamp`` becomes a ``'YYYY-MM-DD'`` string;
    - strings that parse as an int (tried first) or a float are converted to
      that number, any other string is returned unchanged;
    - everything else (numbers, None, ...) is returned as-is.
    """
    if isinstance(data, dict):
        return {key: convert_values(value) for key, value in data.items()}
    if isinstance(data, list):
        return [convert_values(element) for element in data]
    if isinstance(data, pd.Timestamp):
        return data.strftime('%Y-%m-%d')
    if not isinstance(data, str):
        # ints, floats and any other non-string object pass through untouched
        return data

    # Numeric-looking strings: prefer int, fall back to float, else keep the text.
    for caster in (int, float):
        try:
            return caster(data)
        except ValueError:
            continue
    return data

# initialize lists
combined_data = []   # one {'date', 'label', 'data'} record per processed trading day
labels = []          # labels, in the same order as combined_data
skipped_dates = []   # (row index, reason) for every day dropped because of a ValueError


def _apply_lookup(series, lookup):
    """Map each value (or each item of a list value) through `lookup`; unknown keys are kept as-is."""
    return series.apply(
        lambda x: [lookup.get(item, item) for item in x] if isinstance(x, list) else lookup.get(x, x)
    )


# process all data
# Row 0 is only used as the "previous close" of row 1, hence the range starting at 1.
last_processed_date = None
for i in tqdm(range(1, len(stock_data)), desc="Processing stock data"):
    try:
        current_row = stock_data.iloc[i]
        current_date = current_row['Date']
        previous_close = stock_data.iloc[i - 1]['Close']
        current_close = current_row['Close']
        # 1 when the close went up relative to the previous row, 0 otherwise
        label = 1 if current_close > previous_close else 0

        # get entries for date
        relevant_entries = gdelt_data[gdelt_data['SQLDATE'] == current_date].copy()

        for col, lookup in (
            ('Actor1Name', actor_lookup),
            ('Actor2Name', actor_lookup),
            ('EventCode', event_lookup),
        ):
            values = relevant_entries[col]
            if col == 'EventCode':
                # event codes are looked up by their string form
                values = values.astype(str)
            relevant_entries[col] = _apply_lookup(values, lookup)

            # A null left after the lookup means the day cannot be used.
            if relevant_entries[col].isnull().any():
                raise ValueError(f"Missing data found in column {col} for date {current_date}")

        # Convert relevant entries to a list of dictionaries for JSON serialization and fast processing
        relevant_entries_dict = convert_values(relevant_entries.to_dict(orient='records'))

        # Append to the lists, converting the date to a string using convert function
        combined_data.append({
            'date': convert_values(current_date),
            'label': label,
            'data': relevant_entries_dict
        })
        labels.append(label)

        # Update the last processed date
        last_processed_date = current_date

    except ValueError as e:
        # The day is still skipped, but the reason is recorded instead of
        # being discarded silently so dropped dates can be audited afterwards.
        skipped_dates.append((i, str(e)))
        continue
    except Exception as e:
        # Print other exceptions, print statments taken from other places for proper formatting
        print(f"Error processing index {i}: {e}")
        continue

print("Data processing complete.")
if skipped_dates:
    print(f"Skipped {len(skipped_dates)} date(s) because of a ValueError (see skipped_dates for details).")

# save out
print("Saving combined data to compressed binary file...")
joblib.dump(combined_data, 'combined_data_test.pkl.gz', compress=('gzip', 3))
print("Combined data saved to 'combined_data_test.pkl.gz'.")

In [None]:
import pandas as pd
import json
import h5py
from tqdm import tqdm

# Load the test-sized event and price tables from CSV.
gdelt_data = pd.read_csv('gdelt_data_test/gdelt_data_test.csv', low_memory=False)
stock_data = pd.read_csv('stock_data_test/stock_data_test.csv')

# Turn the date columns into timestamps and order the events chronologically.
gdelt_data = (
    gdelt_data
    .assign(SQLDATE=pd.to_datetime(gdelt_data['SQLDATE'], format='%Y-%m-%d'))
    .sort_values(by='SQLDATE')
)
stock_data = stock_data.assign(Date=pd.to_datetime(stock_data['Date'], format='%Y-%m-%d'))

# JSON lookup tables: event code -> value and actor name -> value.
with open('cameo_embeddings.json', 'r') as f:
    event_lookup = json.load(f)
with open('actor_embeddings_cleaned.json', 'r') as f:
    actor_lookup = json.load(f)

def convert_values(data):
    """Return `data` with nested values coerced to simple, storable types.

    Dicts and lists are processed recursively. A ``pd.Timestamp`` is rendered
    as ``'YYYY-MM-DD'``. A string is turned into an int if it parses as one,
    otherwise into a float if it parses as one, otherwise left unchanged.
    Any other object is returned untouched.
    """
    if isinstance(data, dict):
        return {name: convert_values(inner) for name, inner in data.items()}
    if isinstance(data, list):
        return [convert_values(inner) for inner in data]
    if isinstance(data, pd.Timestamp):
        return data.strftime('%Y-%m-%d')
    if isinstance(data, str):
        # Try the stricter numeric type first so '3' stays an int, not 3.0.
        for parse in (int, float):
            try:
                return parse(data)
            except ValueError:
                pass
    # Non-numeric strings and every non-string value fall through unchanged.
    return data

# Initialize lists
combined_data = []   # not populated in this cell: records go straight into the HDF5 file
labels = []          # not populated in this cell
skipped_dates = []   # (row index, reason) for every day dropped because of a ValueError


def _apply_lookup(series, lookup):
    """Map each value (or each item of a list value) through `lookup`; unknown keys are kept as-is."""
    return series.apply(
        lambda x: [lookup.get(item, item) for item in x] if isinstance(x, list) else lookup.get(x, x)
    )


# Create HDF5 file
# Layout written below: /<YYYY-MM-DD> (attr 'label') / entry_<j> / <column name> dataset
with h5py.File('combined_data_test.h5', 'w') as h5f:
    # process all date data
    # Row 0 is only used as the "previous close" of row 1, hence the range starting at 1.
    last_processed_date = None
    for i in tqdm(range(1, len(stock_data)), desc="Processing stock data"):
        try:
            current_row = stock_data.iloc[i]
            current_date = current_row['Date']
            previous_close = stock_data.iloc[i - 1]['Close']
            current_close = current_row['Close']
            # 1 when the close went up relative to the previous row, 0 otherwise
            label = 1 if current_close > previous_close else 0

            # relevant entries filtered per date
            relevant_entries = gdelt_data[gdelt_data['SQLDATE'] == current_date].copy()
            for col, lookup in (
                ('Actor1Name', actor_lookup),
                ('Actor2Name', actor_lookup),
                ('EventCode', event_lookup),
            ):
                values = relevant_entries[col]
                if col == 'EventCode':
                    # event codes are looked up by their string form
                    values = values.astype(str)
                relevant_entries[col] = _apply_lookup(values, lookup)

                # A null left after the lookup means the day cannot be used.
                if relevant_entries[col].isnull().any():
                    raise ValueError(f"Missing data found in column {col} for date {current_date}")

            # Convert relevant entries to a list of dictionaries, then normalise the values
            relevant_entries_dict = convert_values(relevant_entries.to_dict(orient='records'))

            # create structured dataset: one group per date, one sub-group per event row
            date_group = h5f.create_group(str(current_date.date()))
            date_group.attrs['label'] = label
            for j, entry in enumerate(relevant_entries_dict):
                entry_group = date_group.create_group(f'entry_{j}')
                for key, value in entry.items():
                    entry_group.create_dataset(key, data=value)

            # Update the last processed date
            last_processed_date = current_date

        except ValueError as e:
            # The day is still skipped, but the reason is recorded instead of
            # being discarded silently (this also surfaces ValueErrors raised
            # by h5py itself, e.g. when a group name already exists).
            skipped_dates.append((i, str(e)))
            continue
        except Exception as e:
            # print errors, proper formatting
            print(f"Error processing index {i}: {e}")
            continue

print("Data processing complete.")
if skipped_dates:
    print(f"Skipped {len(skipped_dates)} date(s) because of a ValueError (see skipped_dates for details).")

In [None]:
# Test-only cell: used for test cases and for debugging the memory explosion problem in the stock analysis file

import h5py
import json
import numpy as np

def convert_to_serializable(obj):
    """Recursively convert `obj` into something ``json.dumps`` accepts.

    - dicts and lists are walked recursively;
    - ``bytes`` (what h5py returns for string datasets) are decoded as UTF-8
      instead of being rendered as the ``"b'...'"`` repr;
    - NumPy scalars (integer, floating, bool) become native Python scalars;
    - NumPy arrays become (nested) lists whose elements are converted too,
      so arrays of bytes end up as lists of strings;
    - int / float / str / bool are returned unchanged;
    - anything else (timestamps, timedeltas, ...) falls back to ``str(obj)``.

    The function only needs NumPy, so the cell no longer relies on ``pd``
    having been imported by an earlier cell.
    """
    if isinstance(obj, dict):
        return {k: convert_to_serializable(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    # Check bytes before the NumPy scalars: np.bytes_ is both.
    if isinstance(obj, bytes):
        return obj.decode('utf-8', errors='replace')
    if isinstance(obj, (np.integer, np.floating, np.bool_)):
        return obj.item()
    if isinstance(obj, np.ndarray):
        # tolist() yields native scalars, but may still contain bytes -> recurse
        return convert_to_serializable(obj.tolist())
    if isinstance(obj, (int, float, str, bool)):
        return obj
    # Fallback for every other type (pandas Timestamp/Timedelta included).
    return str(obj)

# Read the first three date groups of the HDF5 file into plain dictionaries.
with h5py.File('combined_data_group_1.h5', 'r') as h5f:
    date_groups = list(h5f.keys())
    first_three_entries = []

    for date_group_name in date_groups[:3]:
        date_group = h5f[date_group_name]
        label = date_group.attrs['label']
        # One dict per entry sub-group; every dataset is read in full with [()].
        rows = [
            {
                key: convert_to_serializable(dataset[()])
                for key, dataset in entry_group.items()
            }
            for entry_group in date_group.values()
        ]
        first_three_entries.append({
            'date': date_group_name,
            'label': label,
            'data': rows
        })

# Serialise after a final conversion pass (the label attribute is still raw here).
json_data = json.dumps(convert_to_serializable(first_three_entries), indent=4)
with open('first_three_entries.json', 'w') as json_file:
    json_file.write(json_data)

In [None]:
# Rebuild a small HDF5 file (data.h5) from the JSON sample, then print its structure for testing and debugging purposes

import json
import h5py
import numpy as np

# Load the JSON sample that will be mirrored into HDF5.
with open('first_entries_2015.json', 'r') as f:
    json_data = json.load(f)

# Layout: /<date> (attr 'label') / data / entry_<i>, where list values become
# datasets and every other value becomes an attribute of the entry group.
with h5py.File('data.h5', 'w') as h5f:
    for entry in json_data:
        date_group = h5f.create_group(entry['date'])
        date_group.attrs['label'] = entry['label']
        data_group = date_group.create_group('data')

        for i, data_entry in enumerate(entry['data']):
            entry_group = data_group.create_group(f'entry_{i}')
            for key, value in data_entry.items():
                if not isinstance(value, list):
                    entry_group.attrs[key] = value
                    continue
                entry_group.create_dataset(key, data=np.array(value))


def print_structure(name, obj):
    """Print an HDF5 object's path followed by its attributes, one per line."""
    print(name)
    for key, val in obj.attrs.items():
        print(f"  {key}: {val}")


# Walk every group and dataset of the file just written.
with h5py.File('data.h5', 'r') as h5f:
    h5f.visititems(print_structure)

In [None]:
# Dumps the HDF5 file to a JSON file so that I can read it and assess if the processing was done correctly (or correctly enough to be passable)

import h5py
import json
import numpy as np

def h5_to_dict(h5file):
    """Convert an open HDF5 file into nested plain-Python dictionaries.

    Groups become dicts, datasets become native Python values (lists for
    arrays, scalars for scalar datasets) and each group's attributes are
    merged into that group's dict under their own names.

    Dataset values are normalised defensively: a scalar string dataset is
    read by h5py as ``bytes``, which has no ``.tolist()``, so bytes are
    decoded as UTF-8 (also inside arrays) instead of raising AttributeError.
    Datasets stored directly at the top level of the file are converted too.

    Args:
        h5file: an open ``h5py.File`` (or any ``h5py.Group``).

    Returns:
        dict mapping each top-level name to its converted content.
    """
    def to_plain(value):
        # bytes first: np.bytes_ is also a NumPy scalar but must be decoded
        if isinstance(value, bytes):
            return value.decode('utf-8', errors='replace')
        if isinstance(value, (np.ndarray, np.generic)):
            return to_plain(value.tolist())
        if isinstance(value, list):
            return [to_plain(element) for element in value]
        return value

    def recursively_convert_group(group):
        result = {}
        for key, item in group.items():
            if isinstance(item, h5py.Group):
                result[key] = recursively_convert_group(item)
            elif isinstance(item, h5py.Dataset):
                result[key] = to_plain(item[()])

        # Attributes are written after the members, so on a name clash the
        # attribute value wins (same precedence as before).
        for key, val in group.attrs.items():
            result[key] = val.tolist() if isinstance(val, np.ndarray) else val

        return result

    data_dict = {}
    for key, item in h5file.items():
        if isinstance(item, h5py.Dataset):
            data_dict[key] = to_plain(item[()])
        else:
            data_dict[key] = recursively_convert_group(item)

    return data_dict

def convert_to_serializable(obj):
    """Return a JSON-serialisable copy of `obj`.

    NumPy integer/floating scalars become native Python numbers, NumPy arrays
    become lists, lists and tuples become lists with converted elements, and
    dict values are converted recursively. Any other object is returned as-is.
    """
    if isinstance(obj, dict):
        return {name: convert_to_serializable(value) for name, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [convert_to_serializable(element) for element in obj]
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, (np.integer, np.floating)):
        return obj.item()
    return obj


# Pull the whole HDF5 file into nested dictionaries.
with h5py.File('data.h5', 'r') as h5f:
    data_dict = h5_to_dict(h5f)

# Reshape {date: {'label': ..., 'data': {entry_i: {...}}}} into a list of
# {'date', 'label', 'data': [...]} records.
json_data = []
for date, date_group in data_dict.items():
    # pop() removes the label from the group dict before the entries are read
    label = convert_to_serializable(date_group.pop('label'))
    entry = {
        'date': date,
        'label': label,
        'data': [
            convert_to_serializable(entry_group)
            for entry_group in date_group['data'].values()
        ]
    }
    json_data.append(entry)

with open('data.json', 'w') as f:
    json.dump(json_data, f, indent=4)

In [None]:
# finds labels for date entries to help me check if the data was processed correctly after entry 1

import json

# Parse the sample file once up front. The parsed `json_data` is not used by
# find_labels below, which re-reads the same file as raw text to get line numbers.
with open('first_three_entries.json', 'r') as f:
    json_data = json.load(f)

def find_labels(file_path):
    """Return the 1-based numbers of the lines in `file_path` that contain '"label":'."""
    with open(file_path, 'r') as handle:
        return [
            line_number
            for line_number, text in enumerate(handle, start=1)
            if '"label":' in text
        ]

# Report the 1-based line numbers at which a "label" key appears in the sample file.
label_lines = find_labels('first_three_entries.json')
print(f"Labels found at lines: {label_lines}")

In [None]:
# From the compressed pickle made above, write a smaller JSON file that holds only the first 100 entries, because the full files are too large to be easily viewed

import joblib
import json
from tqdm import tqdm

# Load the compressed pickle produced earlier in the notebook.
combined_data_2015 = joblib.load('combined_data_test.pkl.gz')

# Keep the first 100 records (the variable name still says "5" for historical reasons).
first_5_entries = combined_data_2015[:100]

# Write the records one at a time as a hand-assembled JSON array so the
# progress bar can advance per record.
with open('first_entries_2015.json', 'w') as json_file:
    json_file.write('[\n')
    last_index = len(first_5_entries) - 1
    for i, entry in enumerate(tqdm(first_5_entries, desc="Saving entries to JSON")):
        json.dump(entry, json_file, indent=4)
        # a comma separates records; the final record is followed by a bare newline
        json_file.write(',\n' if i < last_index else '\n')
    json_file.write(']\n')

print("done")

In [None]:
# archival testing, or could be used for json testing, prints number of items in each element of the first entry, important for finding out if the data was processed correctly and finding the size of the arrays since I forgot

import json

with open('first_entries_2015.json', 'r') as file:
    data = json.load(file)

# First date record, then the first event row inside it.
first_entry = data[0]
first_data_entry = first_entry['data'][0]

# Pull the three looked-up fields out of that row in one go.
actor1_name, actor2_name, event_code = (
    first_data_entry[field] for field in ('Actor1Name', 'Actor2Name', 'EventCode')
)

# Report how many elements each field holds.
print(f"Number of elements in Actor1Name: {len(actor1_name)}")
print(f"Number of elements in Actor2Name: {len(actor2_name)}")
print(f"Number of elements in EventCode: {len(event_code)}")

In [None]:
# chunks the data as wanted, important cell

import pandas as pd
import os

# Load the full cleaned event table and the full stock table.
gdelt_data = pd.read_csv('gdelt_data_cleaned.csv', low_memory=False)
stock_data = pd.read_csv('stock_data.csv')
gdelt_data = gdelt_data.assign(SQLDATE=pd.to_datetime(gdelt_data['SQLDATE'], format='%Y-%m-%d'))
stock_data = stock_data.assign(Date=pd.to_datetime(stock_data['Date'], format='%Y-%m-%d'))

# Output folders for the two families of chunk files.
for directory in ('gdelt_data_by_chunks', 'stock_data_by_chunks'):
    os.makedirs(directory, exist_ok=True)

# Write the stock table in slices of `chunk_size` rows. For each slice, the
# matching event file holds every event dated between the slice's earliest
# and latest stock date (both inclusive). Chunk files are numbered from 1.
chunk_size = 100
for i, chunk in enumerate(range(0, stock_data.shape[0], chunk_size)):
    chunk_data = stock_data.iloc[chunk:chunk + chunk_size]
    filename = f'stock_data_by_chunks/stock_data_chunk_{i + 1}.csv'
    chunk_data.to_csv(filename, index=False)
    print(f"Saved {filename}")

    start_date, end_date = chunk_data['Date'].min(), chunk_data['Date'].max()
    on_or_after_start = gdelt_data['SQLDATE'] >= start_date
    on_or_before_end = gdelt_data['SQLDATE'] <= end_date
    gdelt_chunk = gdelt_data.loc[on_or_after_start & on_or_before_end]
    gdelt_filename = f'gdelt_data_by_chunks/gdelt_data_chunk_{i + 1}.csv'
    gdelt_chunk.to_csv(gdelt_filename, index=False)
    print(f"Saved {gdelt_filename}")

In [None]:
# This does essentially the same thing as the file above, but it was used for testing earlier, and it only makes one smaller chunk

import pandas as pd
import os

# Load the full cleaned event table and the full stock table.
gdelt_data = pd.read_csv('gdelt_data_cleaned.csv', low_memory=False)
stock_data = pd.read_csv('stock_data.csv')
gdelt_data = gdelt_data.assign(SQLDATE=pd.to_datetime(gdelt_data['SQLDATE'], format='%Y-%m-%d'))
stock_data = stock_data.assign(Date=pd.to_datetime(stock_data['Date'], format='%Y-%m-%d'))

# The ten earliest distinct stock dates define the test subset for both tables.
first_10_dates = stock_data['Date'].sort_values().unique()[:10]
stock_on_test_dates = stock_data['Date'].isin(first_10_dates)
events_on_test_dates = gdelt_data['SQLDATE'].isin(first_10_dates)
stock_data_test = stock_data.loc[stock_on_test_dates]
gdelt_data_test = gdelt_data.loc[events_on_test_dates]

for directory in ('gdelt_data_test', 'stock_data_test'):
    os.makedirs(directory, exist_ok=True)

# These are the files the processing cells read back in.
stock_data_test_filename = 'stock_data_test/stock_data_test.csv'
stock_data_test.to_csv(stock_data_test_filename, index=False)
print(f"Saved {stock_data_test_filename}")

gdelt_data_test_filename = 'gdelt_data_test/gdelt_data_test.csv'
gdelt_data_test.to_csv(gdelt_data_test_filename, index=False)
print(f"Saved {gdelt_data_test_filename}")

In [None]:
# This cell was intended to test how large the pkl file was unzipped, but it failed, and I did not end up using pkl whatsoever, so it is mostly useless

import joblib
import sys
import pandas as pd

# Load the compressed pickle written by the joblib.dump call earlier in this notebook.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code, so only load files from a trusted source.
data = joblib.load('combined_data_test.pkl.gz')

def get_memory_size(obj):
    """Return the approximate deep size of `obj` in bytes.

    ``sys.getsizeof`` alone only measures the outermost object (for a list,
    just the header and pointer array), which grossly underestimates nested
    data. This walks dicts, lists, tuples, sets and frozensets and adds up
    ``sys.getsizeof`` of every distinct object reached; objects referenced
    more than once are counted once. For a non-container the result equals
    ``sys.getsizeof(obj)``.
    """
    seen_ids = set()
    total = 0
    # Explicit stack instead of recursion so deep nesting cannot hit the recursion limit.
    pending = [obj]
    while pending:
        current = pending.pop()
        if id(current) in seen_ids:
            continue
        seen_ids.add(id(current))
        total += sys.getsizeof(current)
        if isinstance(current, dict):
            pending.extend(current.keys())
            pending.extend(current.values())
        elif isinstance(current, (list, tuple, set, frozenset)):
            pending.extend(current)
    return total

def get_pandas_memory_usage(df):
    """Return the total of ``df.memory_usage(deep=True)`` across all entries, in bytes."""
    per_column_bytes = df.memory_usage(deep=True)
    return per_column_bytes.sum()

# Use the pandas-specific measurement for DataFrames and the generic one otherwise.
memory_size = (
    get_pandas_memory_usage(data)
    if isinstance(data, pd.DataFrame)
    else get_memory_size(data)
)

# Reported in decimal megabytes (bytes / 1000**2).
print(f"Estimated memory size: {memory_size / (1000 ** 2):.2f} MB")