In [None]:
# Mount Google Drive at /gdrive; the data files read and written by the
# later cells live under '/gdrive/My Drive/Colab Notebooks'.
# force_remount=True is passed so the mount is requested again on every run.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Import data: read the tab-separated prepared dataset from the mounted Drive folder.
# The folder is stored in `data_dir` (not `dir`, which would shadow the builtin)
# and is actually used to build the path instead of repeating the literal.
data_dir = "/gdrive/My Drive/Colab Notebooks"
df = pd.read_csv(f"{data_dir}/cutoff_case_prepare.txt", delimiter='\t')
print(df.head())
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print()
df.info()

   gender  SITE  CHILDREN  INPT_TX_EVER  OUTPT_TX_EVER  OTHER_TX_EVER  \
0       0     0         1             0              1            1.0   
1       0     0         0             1              1            0.0   
2       0     1         1             0              1            0.0   
3       0     1         0             0              1            0.0   
4       0     1         0             0              1            0.0   

   CURRENT_MH_TX  ALCOHOL_TX  SUBST_TX  CURRENT_ALCOHOL_TX  ...  Factor.1.  \
0              1           0         0                   0  ...   0.013619   
1              1           0         1                   0  ...   0.050918   
2              0           0         0                   0  ...   0.015873   
3              0           0         0                   0  ...  -0.003685   
4              1           0         0                   0  ...   0.036937   

   Factor.2.  Factor.3.  Factor.4.  Factor.5.  Yrsincedep  \
0  -0.008851  -0.009791  -0.001

In [None]:
# Normalization: fit a min-max scaler on df, scale every column with it,
# and wrap the scaled array back into a DataFrame carrying df's column labels.
scaler = MinMaxScaler().fit(df)
normalized_data = scaler.transform(df)
normalized_dataframe = pd.DataFrame(data=normalized_data, columns=df.columns)
print(normalized_dataframe.head())

   gender  SITE  CHILDREN  INPT_TX_EVER  OUTPT_TX_EVER  OTHER_TX_EVER  \
0     0.0   0.0       1.0           0.0            1.0            1.0   
1     0.0   0.0       0.0           1.0            1.0            0.0   
2     0.0   1.0       1.0           0.0            1.0            0.0   
3     0.0   1.0       0.0           0.0            1.0            0.0   
4     0.0   1.0       0.0           0.0            1.0            0.0   

   CURRENT_MH_TX  ALCOHOL_TX  SUBST_TX  CURRENT_ALCOHOL_TX  ...  Factor.1.  \
0            1.0         0.0       0.0                 0.0  ...   0.439127   
1            1.0         0.0       1.0                 0.0  ...   0.589726   
2            0.0         0.0       0.0                 0.0  ...   0.448228   
3            0.0         0.0       0.0                 0.0  ...   0.369260   
4            1.0         0.0       0.0                 0.0  ...   0.533276   

   Factor.2.  Factor.3.  Factor.4.  Factor.5.  Yrsincedep  \
0   0.689603   0.572179   0.407

In [None]:
# Mask out 20 percent of the entries of every column of the complete
# (normalized) dataset. Held-out entries are overwritten with 0 in
# val_dataframe and flagged with 0 in mask (1 = entry kept).
MASK_FRACTION = 0.20
MASK_SEED = 0  # fixed seed so the same entries are held out on every run

val_dataframe = normalized_dataframe.copy()

# Same shape/labels as the data; starts as all ones (nothing held out yet)
mask = pd.DataFrame(1, index=val_dataframe.index, columns=val_dataframe.columns)

# A dedicated generator makes the draw reproducible without touching
# NumPy's global random state.
mask_rng = np.random.default_rng(MASK_SEED)

# Loop-invariant: every column holds out the same number of rows (20% of the rows)
num_rows = val_dataframe.shape[0]
num_to_replace = int(num_rows * MASK_FRACTION)

for column in val_dataframe.columns:
    # Draw distinct row positions, then translate them to index labels so that
    # the label-based .loc below is correct for any index, not only 0..n-1.
    rows_to_replace = val_dataframe.index[
        mask_rng.choice(num_rows, num_to_replace, replace=False)
    ]

    # Replace the selected values with 0 and record them in the mask
    val_dataframe.loc[rows_to_replace, column] = 0
    mask.loc[rows_to_replace, column] = 0

print(val_dataframe.head())
print(mask.head())

# Keep copies of the masked data and its mask at this stage; later cells
# rebind val_dataframe / mask and read these copies instead.
mask_re = mask.copy()
val_re = val_dataframe.copy()

   gender  SITE  CHILDREN  INPT_TX_EVER  OUTPT_TX_EVER  OTHER_TX_EVER  \
0     0.0   0.0       1.0           0.0            1.0            1.0   
1     0.0   0.0       0.0           1.0            1.0            0.0   
2     0.0   1.0       1.0           0.0            1.0            0.0   
3     0.0   1.0       0.0           0.0            1.0            0.0   
4     0.0   1.0       0.0           0.0            1.0            0.0   

   CURRENT_MH_TX  ALCOHOL_TX  SUBST_TX  CURRENT_ALCOHOL_TX  ...  Factor.1.  \
0            1.0         0.0       0.0                 0.0  ...   0.000000   
1            1.0         0.0       1.0                 0.0  ...   0.000000   
2            0.0         0.0       0.0                 0.0  ...   0.000000   
3            0.0         0.0       0.0                 0.0  ...   0.369260   
4            1.0         0.0       0.0                 0.0  ...   0.533276   

   Factor.2.  Factor.3.  Factor.4.  Factor.5.  Yrsincedep  \
0   0.689603   0.572179   0.407

In [None]:
# Set up the autoencoder
class HadamardAutoencoder(nn.Module):
    """Fully connected autoencoder with a sigmoid output layer.

    The encoder maps ``in_features -> hidden_features -> latent_features`` and
    the decoder mirrors it back to ``in_features``. The final Sigmoid keeps
    every reconstructed value in [0, 1].

    Args:
        in_features: Width of the input (and of the reconstruction).
            Defaults to 499, the value previously hard-coded.
        hidden_features: Width of the hidden layer in encoder and decoder
            (default 128).
        latent_features: Width of the encoding returned by ``encode``
            (default 64).
    """

    def __init__(self, in_features=499, hidden_features=128, latent_features=64):
        super().__init__()
        # Encoder part
        self.encoder = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, latent_features)
        )
        # Decoder part
        self.decoder = nn.Sequential(
            nn.Linear(latent_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, in_features),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return the reconstruction of ``x`` (encode, then decode)."""
        return self.decoder(self.encoder(x))

    def encode(self, x):
        """Return only the encoder output for ``x`` (no decoding)."""
        return self.encoder(x)


In [None]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


In [None]:
# Build the autoencoder directly on the selected device (Module.to returns the
# module itself), then create the L1 criterion and the Adam optimizer for it.
model = HadamardAutoencoder().to(device)
print(model)
criterion = nn.L1Loss()
optimizer = optim.Adam(params=model.parameters(), lr=0.0025)

HadamardAutoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=499, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=64, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=499, bias=True)
    (3): Sigmoid()
  )
)


In [None]:
# Self-supervision set
replicate_size = 20            # number of stacked copies of the masked dataset
green_points_proportion = 0.4  # share of mask==1 entries later replaced by the column mean

# Replicate from the saved copies (val_re / mask_re) rather than from
# val_dataframe / mask themselves: rebinding a name from its own previous value
# would multiply the data by replicate_size again on every re-run of this cell.
val_dataframe = pd.concat([val_re] * replicate_size, ignore_index=True)
mask = pd.concat([mask_re] * replicate_size, ignore_index=True)
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print()
val_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3760 entries, 0 to 3759
Columns: 499 entries, gender to A8c_RecentAge
dtypes: float64(499)
memory usage: 14.3 MB
None


In [None]:
# Build the corrupted training input from the replicated validation frame:
# in every column, a random share (green_points_proportion) of the entries whose
# mask is 1 is overwritten with that column's mean.
train_dataframe = val_dataframe.copy()
column_means = train_dataframe.mean()

for column in train_dataframe.columns:
    # Index labels of the rows whose mask is 1 in this column
    observed_rows = mask.index[(mask[column] == 1).to_numpy()]

    # How many of them to overwrite
    num_to_replace = int(len(observed_rows) * green_points_proportion)

    # Sample that many distinct rows and write the column mean into them
    chosen_rows = np.random.choice(observed_rows, num_to_replace, replace=False)
    train_dataframe.loc[chosen_rows, column] = column_means[column]

print(train_dataframe.head())

     gender      SITE  CHILDREN  INPT_TX_EVER  OUTPT_TX_EVER  OTHER_TX_EVER  \
0  0.000000  0.000000  1.000000      0.000000       1.000000       1.000000   
1  0.053191  0.000000  0.000000      0.154255       1.000000       0.113597   
2  0.000000  0.345745  0.569149      0.000000       0.744681       0.000000   
3  0.000000  1.000000  0.000000      0.000000       1.000000       0.000000   
4  0.000000  0.345745  0.569149      0.000000       1.000000       0.113597   

   CURRENT_MH_TX  ALCOHOL_TX  SUBST_TX  CURRENT_ALCOHOL_TX  ...  Factor.1.  \
0       0.420213    0.021277       0.0                 0.0  ...    0.00000   
1       1.000000    0.021277       1.0                 0.0  ...    0.00000   
2       0.000000    0.000000       0.0                 0.0  ...    0.00000   
3       0.000000    0.000000       0.0                 0.0  ...    0.36926   
4       1.000000    0.021277       0.0                 0.0  ...    0.34844   

   Factor.2.  Factor.3.  Factor.4.  Factor.5.  Yrsincede

In [None]:
# Prepare data: cast the three frames to float, convert each to a tensor and
# bundle them so every batch yields (corrupted input, target, mask) together.
val_dataframe, train_dataframe, mask = (
    frame.astype(float) for frame in (val_dataframe, train_dataframe, mask)
)

train = torch.tensor(train_dataframe.to_numpy())
validation = torch.tensor(val_dataframe.to_numpy())
mask_value = torch.tensor(mask.to_numpy())

dataset = TensorDataset(train, validation, mask_value)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

print(train_dataframe.head())
print(val_dataframe.head())

     gender      SITE  CHILDREN  INPT_TX_EVER  OUTPT_TX_EVER  OTHER_TX_EVER  \
0  0.000000  0.000000  1.000000      0.000000       1.000000       1.000000   
1  0.053191  0.000000  0.000000      0.154255       1.000000       0.113597   
2  0.000000  0.345745  0.569149      0.000000       0.744681       0.000000   
3  0.000000  1.000000  0.000000      0.000000       1.000000       0.000000   
4  0.000000  0.345745  0.569149      0.000000       1.000000       0.113597   

   CURRENT_MH_TX  ALCOHOL_TX  SUBST_TX  CURRENT_ALCOHOL_TX  ...  Factor.1.  \
0       0.420213    0.021277       0.0                 0.0  ...    0.00000   
1       1.000000    0.021277       1.0                 0.0  ...    0.00000   
2       0.000000    0.000000       0.0                 0.0  ...    0.00000   
3       0.000000    0.000000       0.0                 0.0  ...    0.36926   
4       1.000000    0.021277       0.0                 0.0  ...    0.34844   

   Factor.2.  Factor.3.  Factor.4.  Factor.5.  Yrsincede

In [None]:
# Train the model
epochs = 100
for epoch in range(epochs):
    for train_data, val_data, mask_data in data_loader:
        # Move the batch to the model's device and match its float32 parameters
        train_data = train_data.to(device).float()
        val_data = val_data.to(device).float()
        mask_data = mask_data.to(device).float()

        optimizer.zero_grad()
        output = model(train_data)
        # Hadamard (element-wise) product with the mask zeroes both sides at
        # mask==0 entries, so those entries contribute no error.
        loss = criterion(output * mask_data, val_data * mask_data)
        loss.backward()
        optimizer.step()

    # `loss` here is the loss of the last mini-batch of the epoch
    print(f'Epoch {epoch}, Loss: {loss.item()}')

# Masked loss over the whole training set after training.
# The input is moved to the model's device (a CPU tensor cannot be fed to a
# model on CUDA), no autograd graph is recorded for this evaluation-only pass,
# and the result is brought back to the CPU where mask_value / validation live.
train = train.float()
with torch.no_grad():
    final = model(train.to(device)).cpu()
loss = criterion(final * mask_value, validation * mask_value)
print(f'Loss: {loss.item()}')

Epoch 0, Loss: 0.10654013603925705
Epoch 1, Loss: 0.0885891243815422
Epoch 2, Loss: 0.09937212616205215
Epoch 3, Loss: 0.0855308473110199
Epoch 4, Loss: 0.07161423563957214
Epoch 5, Loss: 0.07292848080396652
Epoch 6, Loss: 0.06534670293331146
Epoch 7, Loss: 0.06446529179811478
Epoch 8, Loss: 0.07061706483364105
Epoch 9, Loss: 0.058724675327539444
Epoch 10, Loss: 0.057679709047079086
Epoch 11, Loss: 0.05147670581936836
Epoch 12, Loss: 0.05448358878493309
Epoch 13, Loss: 0.05098417401313782
Epoch 14, Loss: 0.04362739995121956
Epoch 15, Loss: 0.04263455048203468
Epoch 16, Loss: 0.040381915867328644
Epoch 17, Loss: 0.04395603761076927
Epoch 18, Loss: 0.04142395779490471
Epoch 19, Loss: 0.04455546662211418
Epoch 20, Loss: 0.042312003672122955
Epoch 21, Loss: 0.03635239973664284
Epoch 22, Loss: 0.037857238203287125
Epoch 23, Loss: 0.03622032701969147
Epoch 24, Loss: 0.037994470447301865
Epoch 25, Loss: 0.03990570083260536
Epoch 26, Loss: 0.029994934797286987
Epoch 27, Loss: 0.037592038512229

In [None]:
# Reference value: masked L1 loss of the corrupted input itself (no model)
# against the validation target.
masked_input = train * mask_value
masked_target = validation * mask_value
loss = criterion(masked_input, masked_target)
print(f'Loss: {loss.item()}')

Loss: 0.05703199348315137


In [None]:
# Try imputing with the column average (baseline).
# Held-out entries (mask_re == 0) were overwritten with 0 in val_re, so they are
# first turned into NaN: the column means are then computed from observed
# entries only instead of being pulled toward 0 by the placeholder zeros.
observed_only = val_re.where(mask_re == 1)

# Mean of the observed entries of each column (NaN is skipped by default)
column_means = observed_only.mean()

# Fill every held-out entry with the mean of its column
mean_dataframe = observed_only.fillna(column_means)

mean = torch.tensor(mean_dataframe.values)
val = torch.tensor(val_re.values)

In [None]:
# Loss of average imputation: L1 distance between the mean-imputed data and the
# complete normalized data, taken over every entry of the two tensors.
norm = torch.tensor(normalized_dataframe.to_numpy())
loss = criterion(mean, norm)
print(f'Loss: {loss.item()}')

Loss: 0.03544224753750929


In [None]:
# Loss of the imputation method we use: keep observed entries, take the
# model's reconstruction for the held-out ones, and compare with the complete data.
mask_select = torch.tensor(mask_re.values)
val = val.float()

# Inference only: run on the model's device without recording an autograd
# graph, then bring the result back to the CPU where val / mask_select live.
with torch.no_grad():
    impute = model(val.to(device)).cpu()

impute_norm = torch.where(mask_select == 1, val, impute)
loss = criterion(impute_norm, norm)
print(f'Loss: {loss.item()}')

Loss: 0.022919794824195212


In [None]:
# Read the original data (tab-separated), which may contain missing values
origin = pd.read_csv('/gdrive/My Drive/Colab Notebooks/cutoff_case_mix.txt', delimiter='\t')

# 1 where a value is present, 0 where it is missing (recorded before filling)
mask_na = origin.notna().astype(int)

# Fill the gaps with the column means of df.
# NOTE(review): the means come from df, not from origin itself — confirm this is intended.
origin = origin.fillna(df.mean())

print(origin.info())
print(origin.head())
print(mask_na.info())
print(mask_na.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462 entries, 0 to 461
Columns: 499 entries, gender to A8c_RecentAge
dtypes: float64(244), int64(255)
memory usage: 1.8 MB
None
   gender  SITE  CHILDREN  INPT_TX_EVER  OUTPT_TX_EVER  OTHER_TX_EVER  \
0       0     0       0.0             0              0            1.0   
1       0     0       1.0             0              1            0.0   
2       0     0       1.0             0              1            1.0   
3       0     0       0.0             1              1            0.0   
4       0     0       0.0             0              1            0.0   

   CURRENT_MH_TX  ALCOHOL_TX  SUBST_TX  CURRENT_ALCOHOL_TX  ...  Factor.1.  \
0              0           0         0                   0  ...   0.061065   
1              1           0         0                   0  ...   0.040712   
2              1           0         0                   0  ...   0.013619   
3              1           0         1                   0  ...   0.0509

In [None]:
# Normalization of the original data.
# NOTE(review): this fits a new MinMaxScaler on origin and rebinds `scaler`, so
# origin is scaled with its own min/max rather than with the scaler fitted on
# df earlier — confirm this is intended for data fed to the trained model.
scaler = MinMaxScaler().fit(origin)
normalized_origin = pd.DataFrame(data=scaler.transform(origin), columns=origin.columns)
print(normalized_origin.head())

   gender  SITE  CHILDREN  INPT_TX_EVER  OUTPT_TX_EVER  OTHER_TX_EVER  \
0     0.0   0.0       0.0           0.0            0.0            1.0   
1     0.0   0.0       1.0           0.0            1.0            0.0   
2     0.0   0.0       1.0           0.0            1.0            1.0   
3     0.0   0.0       0.0           1.0            1.0            0.0   
4     0.0   0.0       0.0           0.0            1.0            0.0   

   CURRENT_MH_TX  ALCOHOL_TX  SUBST_TX  CURRENT_ALCOHOL_TX  ...  Factor.1.  \
0            0.0         0.0       0.0                 0.0  ...   0.633459   
1            1.0         0.0       0.0                 0.0  ...   0.551896   
2            1.0         0.0       0.0                 0.0  ...   0.443323   
3            1.0         0.0       1.0                 0.0  ...   0.592795   
4            1.0         0.0       0.0                 0.0  ...   0.216813   

   Factor.2.  Factor.3.  Factor.4.  Factor.5.  Yrsincedep  \
0   0.563047   0.728071   0.462

In [None]:
# Impute the original data: keep every value that was present (cover == 1)
# and take the model's reconstruction where the value was missing.
origin_tensor = torch.tensor(normalized_origin.values).float()
cover = torch.tensor(mask_na.values)

# Inference only: run on the model's device without recording an autograd
# graph, then return to the CPU so the result can be combined with the CPU
# tensors and converted to NumPy.
with torch.no_grad():
    origin_impute = model(origin_tensor.to(device)).cpu()
new = torch.where(cover == 1, origin_tensor, origin_impute)

new_detached = new.detach()
new_np = new_detached.numpy()
new_dataframe = pd.DataFrame(new_np, columns=origin.columns)
print(new_dataframe.head())
print(normalized_origin.head())

   gender  SITE  CHILDREN  INPT_TX_EVER  OUTPT_TX_EVER  OTHER_TX_EVER  \
0     0.0   0.0       0.0           0.0            0.0            1.0   
1     0.0   0.0       1.0           0.0            1.0            0.0   
2     0.0   0.0       1.0           0.0            1.0            1.0   
3     0.0   0.0       0.0           1.0            1.0            0.0   
4     0.0   0.0       0.0           0.0            1.0            0.0   

   CURRENT_MH_TX  ALCOHOL_TX  SUBST_TX  CURRENT_ALCOHOL_TX  ...  Factor.1.  \
0            0.0         0.0       0.0                 0.0  ...   0.633459   
1            1.0         0.0       0.0                 0.0  ...   0.551896   
2            1.0         0.0       0.0                 0.0  ...   0.443323   
3            1.0         0.0       1.0                 0.0  ...   0.592795   
4            1.0         0.0       0.0                 0.0  ...   0.216813   

   Factor.2.  Factor.3.  Factor.4.  Factor.5.  Yrsincedep  \
0   0.563047   0.728071   0.462

In [None]:
# Get the encoding of the original data from the encoder half of the model.
# Inference only: run on the model's device without recording an autograd
# graph, and move the result to the CPU because Tensor.numpy() requires it.
with torch.no_grad():
    encoded_data = model.encode(origin_tensor.to(device)).cpu()
encoded_data_np = encoded_data.numpy()
encoded_data_df = pd.DataFrame(encoded_data_np)
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print()
encoded_data_df.info()
print(encoded_data_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462 entries, 0 to 461
Data columns (total 64 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       462 non-null    float32
 1   1       462 non-null    float32
 2   2       462 non-null    float32
 3   3       462 non-null    float32
 4   4       462 non-null    float32
 5   5       462 non-null    float32
 6   6       462 non-null    float32
 7   7       462 non-null    float32
 8   8       462 non-null    float32
 9   9       462 non-null    float32
 10  10      462 non-null    float32
 11  11      462 non-null    float32
 12  12      462 non-null    float32
 13  13      462 non-null    float32
 14  14      462 non-null    float32
 15  15      462 non-null    float32
 16  16      462 non-null    float32
 17  17      462 non-null    float32
 18  18      462 non-null    float32
 19  19      462 non-null    float32
 20  20      462 non-null    float32
 21  21      462 non-null    float32
 22  22

In [None]:
# Write the imputed data and its encoding to Drive as tab-separated files,
# without the index column.
for frame, path in (
    (new_dataframe, '/gdrive/My Drive/Colab Notebooks/case_output.txt'),
    (encoded_data_df, '/gdrive/My Drive/Colab Notebooks/case_encode.txt'),
):
    frame.to_csv(path, sep='\t', index=False)