Set up the Colab environment (Google authentication and Drive mount)

In [None]:
!pip install google-auth google-auth-oauthlib google-auth-httplib2
!pip install google-api-python-client
# Authenticate this Colab session with the user's Google account.
from google.colab import auth
auth.authenticate_user()

# Mount Google Drive so that files stored there are reachable from the notebook.
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import os
# Make the Drive root the working directory: the graph folders and the results
# CSV are addressed later in the notebook with paths relative to this location.
os.chdir('/content/drive/My Drive')

# Imports

In [None]:
!pip install torch_geometric

import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
import torch.optim as optim
import pandas as pd
from torch_geometric.loader import DataLoader
import itertools
import os
from torch_geometric.data import DataLoader, Data
from glob import glob


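# Import Graph Data Per Panel Data Conversion Method

Run exactly one of the three loading cells below (Euclidean, Cosine, or Pearson Correlation). Each cell redefines `extract_parameters_from_filename`, `load_all_graphs`, and `all_data`, so the cell executed last determines which graphs are used for training.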

Euclidean

In [None]:
import os
import torch
from torch_geometric.data import DataLoader, Data
from glob import glob

def extract_parameters_from_filename(filename):
    """Extract the W, sigma, and theta parameters encoded in a graph filename.

    The name (minus a trailing ``.pt``) is split on ``_`` and every piece is
    inspected for the tokens ``W<int>``, ``sigma<float>`` and ``theta<float>``.
    Pieces that merely share a prefix but carry no number (e.g. ``Weekly``)
    are ignored instead of raising ``ValueError``.

    Args:
        filename: Base name of the file, e.g.
            ``"..._W36_sigma1_theta1e-05.pt"``.

    Returns:
        Tuple ``(W, sigma, theta)``. An element is ``None`` when its token is
        absent; if a token occurs several times the last parseable one wins.
    """
    # Strip only the extension at the very end; '.pt' elsewhere is part of the name.
    stem = filename[:-3] if filename.endswith('.pt') else filename
    W, sigma, theta = None, None, None
    for part in stem.split('_'):
        try:
            if part.startswith('W'):
                W = int(part[1:])
            elif part.startswith('sigma'):
                sigma = float(part[5:])
            elif part.startswith('theta'):
                theta = float(part[5:])
        except ValueError:
            # The prefix matched but the remainder is not numeric, so this
            # piece is not a parameter token.
            continue
    return W, sigma, theta

def load_all_graphs(W, sigma, theta):
    """Load every saved graph whose filename matches W, sigma, and theta.

    Files are searched in the directory
    ``Pair(1963 New)_Min_FILTERED_W{W}_sigma{sigma}_theta{theta}`` (relative to
    the current working directory) and are loaded in sorted path order, so the
    returned list is identical on every run; ``glob`` alone gives no ordering
    guarantee.

    Args:
        W: Window size expected in the filename.
        sigma: Sigma value expected in the filename.
        theta: Theta value expected in the filename.

    Returns:
        List with one ``torch.load`` result per matching file; empty when the
        directory does not exist or nothing matches.
    """
    save_dir = f"Pair(1963 New)_Min_FILTERED_W{W}_sigma{sigma}_theta{theta}"
    pattern = os.path.join(save_dir, 'Pair(1963 New)_Min_FILTERED_*.pt')
    all_data = []

    for file_path in sorted(glob(pattern)):
        basename = os.path.basename(file_path)
        W_file, sigma_file, theta_file = extract_parameters_from_filename(basename)
        if W == W_file and sigma == sigma_file and theta == theta_file:
            # NOTE(review): torch.load unpickles the file; only load graph
            # files that come from a trusted source.
            graph_data = torch.load(file_path)
            all_data.append(graph_data)

    return all_data

# Specify parameters
# These values are interpolated into the graph directory name inside
# load_all_graphs, so they must format exactly like the saved folder does
# (for example, 0.00001 is rendered as "1e-05" by an f-string).
W_target = 36
sigma_target = 1
theta_target = 0.00001

# Load all matching data
# all_data is the list of graphs consumed by the training section further down.
all_data = load_all_graphs(W=W_target, sigma=sigma_target, theta=theta_target)


Cosine

In [None]:
import os
import torch
from torch_geometric.data import DataLoader, Data
from glob import glob

def extract_parameters_from_filename(filename):
    """Extract the W and Percentile parameters encoded in a graph filename.

    The name (minus a trailing ``.pt``) is split on ``_``. A piece of the form
    ``W<int>`` sets W; a piece containing ``Percentile`` sets the percentile to
    the integer that follows that marker. Pieces whose remainder is not an
    integer (e.g. ``Weights``) are ignored instead of raising ``ValueError``.

    Args:
        filename: Base name of the file, e.g. ``"..._W36_Percentile5_Abs.pt"``.

    Returns:
        Tuple ``(W, percentile)``. An element is ``None`` when its token is
        absent; if a token occurs several times the last parseable one wins.
    """
    # Strip only the extension at the very end; '.pt' elsewhere is part of the name.
    stem = filename[:-3] if filename.endswith('.pt') else filename
    marker = 'Percentile'
    W, percentile = None, None
    for part in stem.split('_'):
        try:
            if part.startswith('W'):
                W = int(part[1:])
            elif marker in part:
                # Pieces never contain '_' (the name was split on it), so the
                # value is simply everything after the marker.
                percentile = int(part[part.find(marker) + len(marker):])
        except ValueError:
            # Prefix/marker matched but no integer follows: not a parameter token.
            continue
    return W, percentile

def load_all_graphs(W, percentile):
    """Load every saved graph whose filename matches W and Percentile.

    Files are searched in the directory
    ``Pair-cosine(1963bestSP500)_Min_FILTERED_W{W}_Percentile{percentile}``
    (relative to the current working directory) and are loaded in sorted path
    order, so the returned list is identical on every run; ``glob`` alone
    gives no ordering guarantee. A one-line summary is printed per loaded
    graph.

    Args:
        W: Window size expected in the filename.
        percentile: Percentile value expected in the filename.

    Returns:
        List with one ``torch.load`` result per matching file; empty when the
        directory does not exist or nothing matches.
    """
    save_dir = f"Pair-cosine(1963bestSP500)_Min_FILTERED_W{W}_Percentile{percentile}"
    all_data = []

    for file_path in sorted(glob(os.path.join(save_dir, '*.pt'))):
        basename = os.path.basename(file_path)
        W_file, percentile_file = extract_parameters_from_filename(basename)
        if W == W_file and percentile == percentile_file:
            # NOTE(review): torch.load unpickles the file; only load graph
            # files that come from a trusted source.
            graph_data = torch.load(file_path)
            all_data.append(graph_data)
            # num_edges is halved for display - presumably each undirected
            # edge is stored in both directions; confirm against the saver.
            print(f"Graph loaded from {file_path}: {graph_data.num_nodes} nodes, {graph_data.num_edges // 2} edges.")

    return all_data

# Specify the window size and percentile parameters
# Both values are interpolated into the graph directory name inside
# load_all_graphs and compared with the values parsed from each filename.
W_target = 36  # Update this as needed
percentile_target = 5  # Update this as needed

# Load all matching data
# all_data is the list of graphs consumed by the training section further down.
all_data = load_all_graphs(W=W_target, percentile=percentile_target)

# Optional: Load into a DataLoader if using batches
# data_loader = DataLoader(all_data, batch_size=10)  # Adjust batch size as necessary


Pearson Correlation

In [4]:
import os
import torch
from torch_geometric.data import DataLoader, Data
from glob import glob

def extract_parameters_from_filename(filename):
    """Extract the W and theta parameters encoded in a graph filename.

    The name (minus a trailing ``.pt``) is split on ``_`` and every piece is
    inspected for the tokens ``W<int>`` and ``theta<float>``. Pieces that
    merely share a prefix but carry no number (e.g. ``Weekly``) are ignored
    instead of raising ``ValueError``.

    Args:
        filename: Base name of the file, e.g. ``"..._W36_theta1e-05.pt"``.

    Returns:
        Tuple ``(W, theta)``. An element is ``None`` when its token is absent;
        if a token occurs several times the last parseable one wins.
    """
    # Strip only the extension at the very end; '.pt' elsewhere is part of the name.
    stem = filename[:-3] if filename.endswith('.pt') else filename
    W, theta = None, None
    for part in stem.split('_'):
        try:
            if part.startswith('W'):
                W = int(part[1:])
            elif part.startswith('theta'):
                theta = float(part[5:])
        except ValueError:
            # The prefix matched but the remainder is not numeric, so this
            # piece is not a parameter token.
            continue
    return W, theta

def load_all_graphs(W, theta):
    """Load every saved graph whose filename matches W and theta.

    Files are searched in the directory
    ``Corrsaved(absolute)_graphs_W{W}_theta{theta}`` (relative to the current
    working directory) and are loaded in sorted path order, so the returned
    list is identical on every run; ``glob`` alone gives no ordering
    guarantee.

    Args:
        W: Window size expected in the filename.
        theta: Theta value expected in the filename.

    Returns:
        List with one ``torch.load`` result per matching file; empty when the
        directory does not exist or nothing matches.
    """
    save_dir = f"Corrsaved(absolute)_graphs_W{W}_theta{theta}"
    all_data = []

    for file_path in sorted(glob(os.path.join(save_dir, '*.pt'))):
        basename = os.path.basename(file_path)
        W_file, theta_file = extract_parameters_from_filename(basename)
        if W == W_file and theta == theta_file:
            # NOTE(review): torch.load unpickles the file; only load graph
            # files that come from a trusted source.
            graph_data = torch.load(file_path)
            all_data.append(graph_data)

    return all_data

# Specify parameters
# These values are interpolated into the graph directory name inside
# load_all_graphs, so they must format exactly like the saved folder does
# (for example, 0.00001 is rendered as "1e-05" by an f-string).
W_target = 36
theta_target = 0.00001

# Load all matching data
# all_data is the list of graphs consumed by the training section further down.
all_data = load_all_graphs(W=W_target, theta=theta_target)





# GCN Model

**Data Splitting Function**

This function splits the dataset into training, validation, and test sets based on specified year ranges.

In [21]:
def split_data_by_year(data, train_years, val_years, test_years):
    """Partition graphs into train / validation / test lists by calendar year.

    The year of a graph is the first four characters of the string form of
    its first ``date`` entry. Each ``*_years`` argument is an inclusive
    ``(first, last)`` pair; ranges are tried in the order train, validation,
    test and a graph goes to the first range that contains its year. Graphs
    with an empty ``date`` tensor, or whose year falls in no range, are
    dropped.

    Returns:
        Tuple ``(train_dataset, val_dataset, test_dataset)`` of lists.
    """
    # (inclusive year bounds, destination list), in priority order.
    buckets = (
        (train_years, []),
        (val_years, []),
        (test_years, []),
    )

    for graph in data:
        if graph.date.numel() <= 0:
            continue
        first_date = graph.date[0].item()
        year = int(str(first_date)[:4])
        for bounds, members in buckets:
            if bounds[0] <= year <= bounds[1]:
                members.append(graph)
                break

    return tuple(members for _, members in buckets)


**Custom GCN Layer**


This class defines a custom Graph Convolutional Network (GCN) layer:

__init__: Initializes the GCN layer with input and output channels.
forward: Defines the forward pass with a GCN convolution followed by a ReLU activation.


In [22]:
class CustomGCNLayer(torch.nn.Module):
    """One graph-convolution block: a ``GCNConv`` followed by a ReLU."""

    def __init__(self, in_channels, out_channels):
        """Create the wrapped ``GCNConv(in_channels, out_channels)``."""
        super().__init__()
        self.gcn_conv = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        """Convolve ``x`` over ``edge_index`` and return the rectified result."""
        convolved = self.gcn_conv(x, edge_index)
        activated = F.relu(convolved)
        return activated


**GCN Model**

This class defines a GCN model:

__init__: Initializes the model with multiple GCN layers and a final regression layer.
forward: Processes the input data through the GCN layers and produces a final regression output.

In [23]:
class GCNModel(torch.nn.Module):
    """Stack of ``CustomGCNLayer`` blocks topped by a linear per-node regressor."""

    def __init__(self, num_node_features, base_units, num_layers):
        """Build ``num_layers`` GCN blocks of width ``base_units`` plus the output layer.

        The first block maps ``num_node_features`` inputs to ``base_units``;
        every later block maps ``base_units`` to ``base_units``.
        """
        super().__init__()
        input_widths = [
            num_node_features if depth == 0 else base_units
            for depth in range(num_layers)
        ]
        self.layers = torch.nn.ModuleList(
            [CustomGCNLayer(width, base_units) for width in input_widths]
        )
        self.regressor = torch.nn.Linear(base_units, 1)

    def forward(self, data):
        """Return one regression value per node.

        The last column of ``data.x`` is excluded from the model input (the
        training code uses that column as the target).
        """
        hidden = data.x[:, :-1]
        graph_edges = data.edge_index
        for gcn_block in self.layers:
            hidden = gcn_block(hidden, graph_edges)
        return self.regressor(hidden)


**Training Function**

This function trains the model on the provided data loader:

model.train(): Puts the model in training mode.
for data in loader: Iterates over batches of data to train the model.
optimizer.zero_grad(): Resets gradients.
loss.backward(): Computes gradients.
optimizer.step(): Updates model parameters.

In [24]:
def train(model, loader, optimizer, criterion, l1_lambda, device):
    """Run one training epoch and return the mean per-batch loss.

    For every batch the model output is compared with the last column of
    ``data.x`` (reshaped to ``(-1, 1)``) using ``criterion``. When
    ``l1_lambda`` is non-zero, ``l1_lambda * sum(|p|)`` over all model
    parameters is added to the loss before back-propagation.

    Args:
        model: Module called as ``model(data)``.
        loader: Iterable of batches exposing ``.to(device)`` and ``.x``.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss callable ``criterion(prediction, target)``.
        l1_lambda: Weight of the L1 penalty; ``0`` disables it.
        device: Device the batches are moved to.

    Returns:
        Sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data)
        # The batch was just moved to `device`, so its target slice is there too.
        target = data.x[:, -1].view(-1, 1)
        loss = criterion(out, target)
        if l1_lambda != 0:
            # Only build (and back-propagate through) the penalty when it is
            # enabled; with a zero weight it contributes nothing to the loss.
            l1_loss = sum(p.abs().sum() for p in model.parameters())
            loss = loss + l1_lambda * l1_loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    if num_batches == 0:
        raise ValueError("train() received an empty loader; there is nothing to train on.")
    return total_loss / num_batches


**Validation Function**

This function validates the model:

model.eval(): Puts the model in evaluation mode.
with torch.no_grad(): Disables gradient computation for validation.
for data in loader: Iterates over batches of data to validate the model.

In [25]:
def validate(model, loader, criterion, device):
    """Evaluate the model without gradient tracking and return the mean per-batch loss.

    For every batch the model output is compared with the last column of
    ``data.x`` (reshaped to ``(-1, 1)``) using ``criterion``.

    Args:
        model: Module called as ``model(data)``.
        loader: Iterable of batches exposing ``.to(device)`` and ``.x``.
        criterion: Loss callable ``criterion(prediction, target)``.
        device: Device the batches are moved to.

    Returns:
        Sum of the per-batch losses divided by the number of batches.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            out = model(data)
            # The batch was just moved to `device`, so its target slice is there too.
            target = data.x[:, -1].view(-1, 1)
            total_loss += criterion(out, target).item()
            num_batches += 1
    if num_batches == 0:
        raise ValueError("validate() received an empty loader; there is nothing to evaluate.")
    return total_loss / num_batches


**Training and Evaluating Across Years**

This function trains and evaluates the model across multiple years:

for test_year in range(start_year, end_year + 1): Iterates over each test year.
split_data_by_year: Splits the data for each test year.
for params in itertools.product(*hyperparams.values()): Iterates over all hyperparameter combinations.
train: Trains the model.
validate: Validates the model.
ensemble_predictions: Averages predictions over multiple runs.


In [26]:
def train_and_evaluate_all_years(data, start_year, end_year, num_ensemble_runs=10):
    """Walk-forward model selection, retraining, and ensemble prediction per test year.

    For each test year in ``[start_year, end_year]``:

    1. ``data`` is split with ``split_data_by_year``. For test year 1999 the
       training window is 1969-1986 and the validation window 1987-1998; for
       every later test year the end of the training window and both ends of
       the validation window move forward by the same number of years.
    2. Every combination in the module-level ``hyperparams`` grid is trained
       with early stopping on the validation loss; the combination with the
       lowest validation loss is kept.
    3. ``num_ensemble_runs`` fresh models are trained on train + validation
       data with the selected settings, and their test predictions are
       averaged.

    Test years whose train, validation, or test split is empty, or for which
    no hyperparameter combination could be selected, are skipped with a
    printed message so that results for the remaining years are not lost.

    Reads the module-level names ``hyperparams``, ``num_node_features`` and
    ``device``.

    Args:
        data: Iterable of graphs accepted by ``split_data_by_year``.
        start_year: First test year (inclusive).
        end_year: Last test year (inclusive).
        num_ensemble_runs: Number of independently trained models to average.

    Returns:
        DataFrame with columns ``Permno``, ``Date``, ``Prediction``, ``True``
        and ``Test_Year``; an empty DataFrame when no year produced results.
    """
    # Window layout for the reference test year; later test years shift the
    # train end and the validation window by (test_year - reference_test_year).
    first_train_year = 1969
    reference_test_year = 1999
    reference_train_end = 1986
    reference_val_start, reference_val_end = 1987, 1998
    max_epochs = 500
    patience = 5
    # One stateless loss object serves both the grid search and the retraining.
    criterion = torch.nn.MSELoss()
    yearly_results = []

    for test_year in range(start_year, end_year + 1):
        print(f"\nStarting evaluations for test year: {test_year}")
        shift = test_year - reference_test_year
        train_years = (first_train_year, reference_train_end + shift)
        val_years = (reference_val_start + shift, reference_val_end + shift)
        test_years = (test_year, test_year)

        train_dataset, val_dataset, test_dataset = split_data_by_year(data, train_years, val_years, test_years)
        if not (train_dataset and val_dataset and test_dataset):
            # Bail out before the expensive search instead of failing midway.
            print(f"Skipping test year {test_year}: empty split "
                  f"(train={len(train_dataset)}, val={len(val_dataset)}, test={len(test_dataset)}).")
            continue

        best_loss = float('inf')
        best_params = {}

        for params in itertools.product(*hyperparams.values()):
            # dropout_rate is part of the grid but GCNModel takes no dropout
            # argument, so it currently has no effect on the model.
            lr, num_layers, base_units, batch_size, dropout_rate, l1_lambda = params
            model = GCNModel(num_node_features-1, base_units, num_layers).to(device)
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

            best_val_loss = float('inf')
            epochs_no_improve = 0
            for epoch in range(max_epochs):
                train(model, train_loader, optimizer, criterion, l1_lambda, device)
                val_loss = validate(model, val_loader, criterion, device)
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    epochs_no_improve = 0
                    print(f"Validation improvement at epoch {epoch+1}, loss: {val_loss:.4f}")
                else:
                    epochs_no_improve += 1
                    if epochs_no_improve >= patience:
                        print(f"Early stopping triggered after {epoch+1} epochs for test year {test_year}.")
                        break

            if best_val_loss < best_loss:
                best_loss = best_val_loss
                # NOTE(review): `epochs` is the epoch at which training stopped
                # (it includes the patience epochs after the best one), not the
                # best epoch itself - confirm this is the intended retraining length.
                best_params = dict(zip(hyperparams.keys(), params), epochs=epoch+1)

        if not best_params:
            # Happens when the grid is empty or no validation loss beat infinity (e.g. NaN).
            print(f"Skipping test year {test_year}: no usable hyperparameter combination was found.")
            continue

        print(f"Best parameters for year {test_year}: {best_params}")
        print(f"Best validation loss for year {test_year}: {best_loss:.4f}")

        # Identical for every ensemble run, so build it once.
        full_train_data = train_dataset + val_dataset
        ensemble_predictions = []
        identifiers_list = []
        true_values = []
        for run in range(num_ensemble_runs):
            full_train_loader = DataLoader(full_train_data, batch_size=best_params['batch_size'], shuffle=True)
            model = GCNModel(num_node_features-1, best_params['base_units'], best_params['num_layers']).to(device)
            optimizer = torch.optim.Adam(model.parameters(), lr=best_params['learning_rate'])
            for epoch in range(best_params['epochs']):
                train_loss = train(model, full_train_loader, optimizer, criterion, best_params['l1_lambda'], device)
                print(f"Retraining on full data at epoch {epoch+1}, loss: {train_loss:.4f} for ensemble run {run+1}")

            test_loader = DataLoader(test_dataset, batch_size=best_params['batch_size'], shuffle=False)
            predictions, identifiers, truths = predict_with_truths(model, test_loader, device)
            ensemble_predictions.append(predictions)
            # The test loader is not shuffled, so identifiers repeat on every run.
            identifiers_list = identifiers
            true_values.append(truths)
            print(f"Run {run+1}, Test Prediction Completed. Mean prediction: {sum(predictions)/len(predictions):.4f}")

        mean_predictions = [sum(preds) / num_ensemble_runs for preds in zip(*ensemble_predictions)]
        mean_true_values = [sum(trues) / num_ensemble_runs for trues in zip(*true_values)]
        yearly_results.append(pd.DataFrame({
            'Permno': [ident[0] for ident in identifiers_list],
            'Date': [ident[1] for ident in identifiers_list],
            'Prediction': mean_predictions,
            'True': mean_true_values,
            'Test_Year': [test_year]*len(mean_predictions)
        }))

    # Concatenate once at the end instead of growing a frame inside the loop.
    if not yearly_results:
        return pd.DataFrame()
    return pd.concat(yearly_results, ignore_index=True)


**Predict with Truths Function**

This function generates predictions and collects true values from the data:

model.eval(): Sets the model to evaluation mode.
with torch.no_grad(): Disables gradient computation.
identifiers.extend(zip(permnos, dates)): Collects unique identifiers for each data point.


In [27]:
def predict_with_truths(model, loader, device):
    """Collect model outputs, (permno, date) identifiers, and targets for every batch.

    The model is switched to evaluation mode and run without gradient
    tracking. For each batch the flattened model output is appended to the
    predictions, the last column of ``batch.x`` to the truths, and the pairs
    formed from ``batch.permno`` and ``batch.date`` to the identifiers.

    Returns:
        Tuple ``(predictions, identifiers, truths)`` of Python lists.

    Raises:
        AttributeError: If a batch lacks a ``permno`` or ``date`` attribute.
    """
    model.eval()
    predictions, identifiers, truths = [], [], []

    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            flat_output = model(batch).view(-1)
            predictions += flat_output.tolist()
            truths += batch.x[:, -1].view(-1).tolist()

            has_identifiers = hasattr(batch, 'permno') and hasattr(batch, 'date')
            if not has_identifiers:
                raise AttributeError("Batch does not contain 'permno' or 'date' attributes. Please check your dataset structure.")
            identifiers += zip(batch.permno.tolist(), batch.date.tolist())

    return predictions, identifiers, truths


**Final Execution**

This final section:

train_and_evaluate_all_years: Trains and evaluates the model across the specified years.
file_name: Builds the output CSV path from the graph-construction parameters (`W_target`, `theta_target`).
results_df.to_csv: Saves the aggregated results to a final CSV file.

In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The model is built with num_node_features - 1 inputs because the last column
# of data.x is used as the regression target.
# NOTE(review): 20 is presumably the width of data.x in the saved graphs - confirm.
num_node_features = 20

# Search grid: one model is trained per combination of the listed values.
# The key order matters - train_and_evaluate_all_years unpacks each
# combination positionally in exactly this order.
hyperparams = {
    'learning_rate': [0.001],
    'num_layers': [2],
    'base_units': [8, 16, 32],
    'batch_size': [24],
    'dropout_rate': [0.3],  # unpacked by the training loop but not passed to GCNModel
    'l1_lambda': [0]
}

results_df = train_and_evaluate_all_years(all_data, 1999, 2022)
print("Aggregated results across all years")

# NOTE(review): this name is hard-wired to the Pearson (absolute correlation)
# graphs and needs theta_target, which the Cosine loading cell does not define
# (it defines percentile_target instead).
file_name = f'Thesis Tilburg GNN/ResultsDF/Results_GCN_1963_Correlation(absolute)_Min_W{W_target}_theta{theta_target}.csv'

# Create the output folder first so that a missing directory cannot discard
# the results of the finished run.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
results_df.to_csv(file_name, index=False)



# Constructing the file name dynamically based on the variables
#file_name = f'Thesis Tilburg GNN/ResultsDF/Results_GCN_1963_Correlation(absolute)_Min_W{W_target}_theta{theta_target}.csv'

# Saving the DataFrame to CSV with the dynamically generated file name
#results_df.to_csv(file_name, index=False)

#file_name = f'Thesis Tilburg GNN/ResultsDF/Results_GCN_1963_Cosine_Min_W{W_target}_Percentile{percentile_target}.csv'
#results_df.to_csv(file_name, index=False)

#file_name = f'Thesis Tilburg GNN/ResultsDF/Results_GCN_1963_Correlation(absolute)_Min_W{W_target}_theta{theta_target}.csv'

#results_df.to_csv(file_name, index=False)

