In [24]:
# Data manipulation
import pandas as pd
import numpy as np

# Graph Neural Networks
import torch
import torch.nn.functional as F
from torch_geometric.data import Data as GeometricData
from torch_geometric.nn import GCNConv

# Bayesian Modeling
import pymc as pm

# Visualization
import matplotlib.pyplot as plt
import networkx as nx

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')


## 2. Loading and preprocessing

In [31]:
df = pd.read_csv('../data/raw/train_data.csv')

df.replace(-1.0, np.nan, inplace=True)
df.replace('-1', np.nan, inplace=True)

# Convert date columns to datetime
df['launch_date'] = pd.to_datetime(df['launch_date'])
df['date'] = pd.to_datetime(df['date'])
df['ind_launch_date'] = pd.to_datetime(df['ind_launch_date'], errors='coerce')

# Convert lists from strings to actual lists
import ast
df['indication'] = df['indication'].apply(ast.literal_eval)

df.to_csv('../data/intermig/train_data-withNaNs.csv', index=False)
df.head()

Unnamed: 0,brand,che_pc_usd,che_perc_gdp,cluster_nl,corporation,country,launch_date,date,drug_id,ind_launch_date,indication,insurance_perc_che,population,prev_perc,price_month,price_unit,public_perc_che,therapeutic_area,target
0,BRAND_354E,1.209114,1.665879,BRAND_354E_COUNTRY_88A3,CORP_D524,COUNTRY_88A3,2014-06-01,2014-06-01,DRUG_ID_8795,NaT,[IND_C3B6],1.893333,1.008039,0.028367,1.006444,1.013784,1.835821,THER_AREA_980E,1.000784
1,BRAND_626D,,,BRAND_626D_COUNTRY_8B47,CORP_01C7,COUNTRY_8B47,2014-06-01,2014-06-01,DRUG_ID_E66E,2014-09-01,"[IND_1590, IND_ECAC]",,1.023562,4.7e-05,,1.626677,,THER_AREA_96D7,1.0
2,BRAND_45D9,1.209114,1.665879,BRAND_45D9_COUNTRY_88A3,CORP_39F7,COUNTRY_88A3,2014-06-01,2014-06-01,DRUG_ID_F272,NaT,[IND_B2EF],1.893333,1.008039,0.001502,,3.144874,1.835821,THER_AREA_96D7,1.002258
3,BRAND_D724,1.85128,2.05177,BRAND_D724_COUNTRY_445D,CORP_711A,COUNTRY_445D,2014-06-01,2014-06-01,DRUG_ID_1D4E,NaT,[IND_BAFB],1.0,1.253186,0.001304,,1.213446,1.80597,THER_AREA_6CEE,1.068761
4,BRAND_4887,1.791199,2.05913,BRAND_4887_COUNTRY_D8B0,CORP_443D,COUNTRY_D8B0,2014-06-01,2014-06-01,DRUG_ID_AA88,NaT,[IND_3F31],2.013333,1.639352,0.054467,1.018589,1.008708,1.880597,THER_AREA_6CEE,1.036312


In [None]:
# Fill missing target values with zeros (since target is NaN in the sample)
df['target'] = df['target'].fillna(0)
# Calculate the difference in months
df['months_since_launch'] = (
    (df['date'].dt.year - df['launch_date'].dt.year) * 12 +
    (df['date'].dt.month - df['launch_date'].dt.month)
)

# If you want to account for the day of the month
df['months_since_launch'] -= df['date'].dt.day < df['launch_date'].dt.day

# Convert to integer type
df['months_since_launch'] = df['months_since_launch'].astype(int)

# Display the updated DataFrame
df.head()

Unnamed: 0,brand,che_pc_usd,che_perc_gdp,cluster_nl,corporation,country,launch_date,date,drug_id,ind_launch_date,...,insurance_perc_che,population,prev_perc,price_month,price_unit,public_perc_che,therapeutic_area,target,months_since_launch,tam
0,113,1.209114,1.665879,BRAND_354E_COUNTRY_88A3,116,28,2014-06-01,2014-06-01,121,NaT,...,1.893333,1.008039,0.028367,1.006444,1.013784,1.835821,10,1.000784,0,0.028595
1,223,,,BRAND_626D_COUNTRY_8B47,0,30,2014-06-01,2014-06-01,223,2014-09-01,...,,1.023562,4.7e-05,,1.626677,,9,1.0,0,4.8e-05
2,155,1.209114,1.665879,BRAND_45D9_COUNTRY_88A3,28,28,2014-06-01,2014-06-01,236,NaT,...,1.893333,1.008039,0.001502,,3.144874,1.835821,9,1.002258,0,0.001514
3,489,1.85128,2.05177,BRAND_D724_COUNTRY_445D,55,13,2014-06-01,2014-06-01,25,NaT,...,1.0,1.253186,0.001304,,1.213446,1.80597,7,1.068761,0,0.001634
4,161,1.791199,2.05913,BRAND_4887_COUNTRY_D8B0,34,43,2014-06-01,2014-06-01,149,NaT,...,2.013333,1.639352,0.054467,1.018589,1.008708,1.880597,7,1.036312,0,0.089291


In [None]:

# Calculate Total Addressable Market (TAM)
df['tam'] = df['population'] * df['prev_perc']

# Handle missing values in TAM
df['tam'].fillna(df['tam'].median(), inplace=True)

# Encode categorical variables using Label Encoding for simplicity
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['brand', 'corporation', 'country', 'drug_id', 'therapeutic_area']
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['tam'].fillna(df['tam'].median(), inplace=True)
