In [None]:
# Import libraries
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
import ast


## 1. Load and encode the data into numbers

In [41]:
# Load the dataset (replace 'your_dataset.csv' with your file path)
df = pd.read_csv('../data/intermig/train_data-medianimputed.csv')
df = df.drop('ind_launch_date', axis=1)


date_cols = ['launch_date', 'date']

# Process date columns
for col in date_cols:
    df[col] = pd.to_datetime(df[col])
    df[f'{col}_year'] = df[col].dt.year
    df[f'{col}_month'] = df[col].dt.month
    df = df.drop(col, axis=1)

categoric_cols = ['brand', 'cluster_nl', 'corporation', 'country', 'drug_id', 'therapeutic_area']

# Process categorical columns
label_encoders = {}
for col in categoric_cols:
    le = LabelEncoder()
    df[f'{col}_encoded'] = le.fit_transform(df[col])
    label_encoders[col] = le
    df = df.drop(col, axis=1)

# Handle indication column (contains lists)
mlb = MultiLabelBinarizer()
# Convert string representations of lists to actual lists
df['indication'] = df['indication'].apply(ast.literal_eval)
# Transform the lists
indication_encoded = pd.DataFrame(
    mlb.fit_transform(df['indication']),
    columns=[f'indication_{i}' for i in range(len(mlb.classes_))],
    index=df.index
)
df = df.drop('indication', axis=1)
df = pd.concat([df, indication_encoded], axis=1)

print("Encoded features shape:", df.shape)
print("\nFeature names:", df.columns.tolist())

Encoded features shape: (118917, 175)

Feature names: ['che_pc_usd', 'che_perc_gdp', 'insurance_perc_che', 'population', 'prev_perc', 'price_month', 'price_unit', 'public_perc_che', 'target', 'months_since_launch', 'launch_date_year', 'launch_date_month', 'date_year', 'date_month', 'brand_encoded', 'cluster_nl_encoded', 'corporation_encoded', 'country_encoded', 'drug_id_encoded', 'therapeutic_area_encoded', 'indication_0', 'indication_1', 'indication_2', 'indication_3', 'indication_4', 'indication_5', 'indication_6', 'indication_7', 'indication_8', 'indication_9', 'indication_10', 'indication_11', 'indication_12', 'indication_13', 'indication_14', 'indication_15', 'indication_16', 'indication_17', 'indication_18', 'indication_19', 'indication_20', 'indication_21', 'indication_22', 'indication_23', 'indication_24', 'indication_25', 'indication_26', 'indication_27', 'indication_28', 'indication_29', 'indication_30', 'indication_31', 'indication_32', 'indication_33', 'indication_34', 'ind

## 2. Build the graph with dataset rows

In [70]:
import torch
import numpy as np
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


# 1. Organize node features
numerical_features = [
    'che_pc_usd', 'che_perc_gdp', 'insurance_perc_che', 'population',
    'prev_perc', 'price_month', 'price_unit', 'public_perc_che',
    'months_since_launch'
]

categorical_features = [
    'brand_encoded', 'cluster_nl_encoded', 'corporation_encoded',
    'country_encoded', 'drug_id_encoded', 'therapeutic_area_encoded'
]

indication_features = [col for col in df.columns if col.startswith('indication_')]


In [88]:
# 2. Construct graph
G = nx.Graph()

# Group by cluster_nl to get time series
cluster_features = {}
for cluster_nl, group in df.groupby('cluster_nl_encoded'):
    # Sort by date
    group = group.sort_values('date_year')
    
    # Get static features (last values)
    static_features = np.concatenate([
        group[numerical_features].iloc[-1].values,
        group[categorical_features].iloc[-1].values,
        group[indication_features].iloc[-1].values
    ])
    
    # Add temporal features
    temporal_features = np.array([
        group['date_year'].min(),     # launch_year
        group['date_month'].min(),    # launch_month
        len(group),                   # duration
        group['target'].mean(),       # mean_target
    ])

    # Combine features for similarity computation
    cluster_features[cluster_nl] = np.concatenate([
        group[numerical_features].iloc[-1].values,
    ])
        
    # Add node
    G.add_node(cluster_nl,
                features=static_features,
                temporal=temporal_features,
                country=group['country_encoded'].iloc[0],
                brand=group['brand_encoded'].iloc[0],
                corporation=group['corporation_encoded'].iloc[0],
                therapeutic_area=group['therapeutic_area_encoded'].iloc[0],
                ts_data=group['target'].values)  # Store full time series

In [90]:
# Add edges between similar time series
clusters = list(G.nodes())
feature_matrix = np.vstack([cluster_features[c] for c in clusters])
similarity_matrix = cosine_similarity(feature_matrix)
similarity_matrix

similarity_threshold=0.99

for i in range(len(clusters)):
    for j in range(i+1, len(clusters)):
        ci, cj = clusters[i], clusters[j]      

        consider_similarity = True 
        # Connect if same therapeutic area
        if G.nodes[ci]['therapeutic_area'] == G.nodes[cj]['therapeutic_area']:
            G.add_edge(ci, cj, edge_type='therapeutic')
            consider_similarity = False 

        # Connect if same corporation
        if G.nodes[ci]['corporation'] == G.nodes[cj]['corporation']:
            G.add_edge(ci, cj, edge_type='corporation')
            consider_similarity = False 

        # Connect if same country
        if G.nodes[ci]['country'] == G.nodes[cj]['country']:
            G.add_edge(ci, cj, edge_type='country')
            consider_similarity = False 

        if not consider_similarity:
            continue
       # Connect if similar features
        if similarity_matrix[i, j] > similarity_threshold:
            G.add_edge(ci, cj, edge_type='similarity')
        
# Print graph statistics
print(f"\nGraph Statistics:")
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")
print("\nEdge types distribution:")
edge_types = [d['edge_type'] for (u, v, d) in G.edges(data=True)]
for edge_type in set(edge_types):
    count = edge_types.count(edge_type)
    print(f"{edge_type}: {count} edges")


Graph Statistics:
Number of nodes: 2716
Number of edges: 1026822

Edge types distribution:
therapeutic: 721865 edges
country: 103988 edges
corporation: 200730 edges
similarity: 239 edges


## 3. Building the GNN

In [None]:
import torch
from torch_geometric.data import Data
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Convert graph to PyTorch Geometric format
def convert_to_pytorch_geometric(G):
    # Prepare edge index and type
    edge_index = []
    edge_type = []
    edge_type_dict = {
        'therapeutic': 0,
        'corporation': 1,
        'country': 2,
        'similarity': 3
    }
    
    for u, v, data in G.edges(data=True):
        # Add both directions for undirected graph
        edge_index.extend([[u, v], [v, u]])
        edge_type.extend([edge_type_dict[data['edge_type']]] * 2)
    
    # Convert node features
    x = torch.tensor([G.nodes[node]['features'] for node in G.nodes()], dtype=torch.float)
    edge_index = torch.tensor(edge_index, dtype=torch.long).t()
    edge_type = torch.tensor(edge_type, dtype=torch.long)
    
    # Create target tensor
    y = torch.tensor([G.nodes[node]['ts_data'][-1] for node in G.nodes()], dtype=torch.float)
    
    return Data(x=x, edge_index=edge_index, edge_type=edge_type, y=y)

# Convert graph and initialize model
data = convert_to_pytorch_geometric(G)

# Move data to device
data = data.to(device)


print(f"\nData Statistics:")
print(f"Number of node features: {data.x.size(1)}")
print(f"Number of edges: {data.edge_index.size(1)}")
print(f"Number of edge types: {len(torch.unique(data.edge_type))}")


Using device: cuda

Data Statistics:
Number of node features: 170
Number of edges: 2053644
Number of edge types: 4


In [None]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, global_mean_pool

class TimeSeriesGNN(nn.Module):
    def __init__(self, in_channels, hidden_channels, forecast_steps=23, num_edge_types=4):
        super().__init__()
        self.num_edge_types = num_edge_types
        
        # Edge type attention weights
        self.edge_type_weights = nn.Parameter(torch.ones(num_edge_types))
        
        # Spatial: GAT layers per edge type
        self.spatial_layers = nn.ModuleList([
            GATConv(in_channels, hidden_channels, heads=4, concat=False) 
            for _ in range(num_edge_types)
        ])
        
        # Temporal: Transformer + Conv with normalization (good suggestion!)
        self.temporal_conv = nn.Sequential(
            nn.LayerNorm(hidden_channels),
            nn.Conv1d(hidden_channels, hidden_channels, kernel_size=3, padding=1)
        )
        self.temporal_attention = nn.MultiheadAttention(hidden_channels, num_heads=4)
        
        # Output layers with regularization (good suggestion!)
        self.decoder = nn.Sequential(
            nn.Linear(hidden_channels * 2, hidden_channels),
            nn.ReLU(),
            nn.LayerNorm(hidden_channels),
            nn.Dropout(0.1),
            nn.Linear(hidden_channels, forecast_steps * 2)
        )

    def nll_loss(self, means, log_vars, targets):
        variances = torch.exp(log_vars)
        loss = ((targets - means)**2 / variances + log_vars).mean()
        return loss
        
    def forward(self, x, edge_index, edge_type, batch=None):
        # 1. Spatial dependencies
        spatial_embeds = []
        for i in range(self.num_edge_types):
            mask = edge_type == i
            if mask.any():
                edge_index_i = edge_index[:, mask]
                spatial_embed = self.spatial_layers[i](x, edge_index_i)
                spatial_embeds.append(spatial_embed)
            else:
                spatial_embeds.append(torch.zeros_like(x))
        
        # Combine spatial embeddings
        spatial_out = sum(spatial_embeds)
        
        # 2. Temporal processing
        # Convolution branch
        temp_conv = self.temporal_conv(spatial_out.unsqueeze(-1)).squeeze(-1)
        
        # Attention branch
        temp_attn, _ = self.temporal_attention(
            spatial_out.unsqueeze(0),
            spatial_out.unsqueeze(0),
            spatial_out.unsqueeze(0)
        )
        temp_attn = temp_attn.squeeze(0)
        
        # Combine temporal features
        combined = torch.cat([temp_conv, temp_attn], dim=-1)
        
        # 3. Generate forecasts with uncertainty
        output = self.decoder(combined)
        means, log_vars = output.chunk(2, dim=-1)
        
        return means, torch.exp(log_vars)  # return mean and variance

# Convert graph and initialize model
model = TimeSeriesGNN(
    in_channels=data.x.size(1),
    hidden_channels=8,
    forecast_steps=2,
    num_edge_types=4
).to(device)

# Print model summary
print(f"\nModel Architecture:")
print(model)


Model Architecture:
TimeSeriesGNN(
  (spatial_layers): ModuleList(
    (0-3): 4 x GATConv(170, 8, heads=4)
  )
  (temporal_conv): Sequential(
    (0): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
    (1): Conv1d(8, 8, kernel_size=(3,), stride=(1,), padding=(1,))
  )
  (temporal_attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=8, out_features=8, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=16, out_features=8, bias=True)
    (1): ReLU()
    (2): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
    (3): Dropout(p=0.1, inplace=False)
    (4): Linear(in_features=8, out_features=4, bias=True)
  )
)


In [105]:
import torch

# Create sample data
batch_size = 4
in_channels = 10  # numerical + categorical + indication features
hidden_channels = 64
forecast_steps = 23
num_edge_types = 4
num_nodes = 8

# Sample node features
x = torch.randn(num_nodes, in_channels)

# Sample edges (2 edges per type)
edge_index = torch.tensor([
    [0, 1, 2, 3, 4, 5, 6, 7],  # Source nodes
    [1, 2, 3, 4, 5, 6, 7, 0]   # Target nodes
], dtype=torch.long)

# Sample edge types
edge_type = torch.tensor([0, 0, 1, 1, 2, 2, 3, 3], dtype=torch.long)

# Sample target values
targets = torch.randn(num_nodes, forecast_steps)

# Initialize model
model = TimeSeriesGNN(
    in_channels=in_channels,
    hidden_channels=hidden_channels,
    forecast_steps=forecast_steps,
    num_edge_types=num_edge_types
)

# Forward pass
means, log_vars = model(x, edge_index, edge_type)

# Calculate loss
loss = model.nll_loss(means, log_vars, targets)

# Print shapes and values
print(f"Input shape: {x.shape}")
print(f"Edge index shape: {edge_index.shape}")
print(f"Predictions shape: {means.shape}")
print(f"Log variances shape: {log_vars.shape}")
print(f"Loss value: {loss.item():.4f}")

RuntimeError: Given normalized_shape=[64], expected input with shape [*, 64], but got input of size[8, 64, 1]