In [None]:
# Mount Google Drive at /gdrive; the read_csv / to_csv calls below use paths under it.
# force_remount=True is passed so the mount is redone even when /gdrive already exists
# (presumably to refresh a stale mount — confirm against the Colab docs).
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Load the tab-separated training table from Google Drive.
# The folder lives in `data_dir` (not `dir`) so the builtin dir() stays usable
# in every later cell, and the path is built from it instead of being repeated.
data_dir = "/gdrive/My Drive/Colab Notebooks"
df = pd.read_csv(f"{data_dir}/cutoff_dataset_prepare.txt", delimiter='\t')
print(df.head())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

   gender  SITE  CHILDREN  INPT_TX_EVER  OUTPT_TX_EVER  OTHER_TX_EVER  \
0       0     0         1             1              1              1   
1       0     0         3             1              3              3   
2       0     0         3             1              1              1   
3       0     0         3             1              3              3   
4       1     0         3             1              1              1   

   CURRENT_MH_TX  ALCOHOL_TX  SUBST_TX  CURRENT_ALCOHOL_TX  ...  \
0              1           1         1                   1  ...   
1              3           1         1                   1  ...   
2              1           1         1                   1  ...   
3              1           1         1                   1  ...   
4              1           1         1                   1  ...   

   Num_Traumatic  IFNPATHSCORE  PPAGE_at_interview  PPEDUC  PPHHSIZE  \
0              0      0.002986                  22      10         4   
1             

In [None]:
# Min-max scale every column of df; the scaler is fitted on df itself.
scaler = MinMaxScaler().fit(df)
normalized_data = scaler.transform(df)
# Re-attach the column labels that the scaler's ndarray output dropped.
normalized_dataframe = pd.DataFrame(data=normalized_data, columns=df.columns)
print(normalized_dataframe.head())

   gender  SITE  CHILDREN  INPT_TX_EVER  OUTPT_TX_EVER  OTHER_TX_EVER  \
0     0.0   0.0       0.0           0.0            0.0            0.0   
1     0.0   0.0       1.0           0.0            1.0            1.0   
2     0.0   0.0       1.0           0.0            0.0            0.0   
3     0.0   0.0       1.0           0.0            1.0            1.0   
4     1.0   0.0       1.0           0.0            0.0            0.0   

   CURRENT_MH_TX  ALCOHOL_TX  SUBST_TX  CURRENT_ALCOHOL_TX  ...  \
0            0.0         0.0       0.0                 0.0  ...   
1            1.0         0.0       0.0                 0.0  ...   
2            0.0         0.0       0.0                 0.0  ...   
3            0.0         0.0       0.0                 0.0  ...   
4            0.0         0.0       0.0                 0.0  ...   

   Num_Traumatic  IFNPATHSCORE  PPAGE_at_interview    PPEDUC  PPHHSIZE  \
0       0.000000      0.409393            0.025641  0.555556  0.230769   
1       0.

In [None]:
# Simulate missingness: hide 20% of the entries in every column of the
# normalized data and remember which entries were hidden.
MASK_FRACTION = 0.20  # share of rows hidden in each column
MASK_SEED = 0         # fixed seed so the same entries are hidden on every run

mask_rng = np.random.default_rng(MASK_SEED)
val_dataframe = normalized_dataframe.copy()

# mask has the same shape as the data: 1 = value kept, 0 = value hidden.
mask = pd.DataFrame(1, index=val_dataframe.index, columns=val_dataframe.columns)

# The row count is the same for every column, so compute it once.
num_rows = val_dataframe.shape[0]
num_to_replace = int(num_rows * MASK_FRACTION)  # 20% of the rows

for column in val_dataframe.columns:
    # Independent draw per column, without replacement (row positions).
    rows_to_replace = mask_rng.choice(num_rows, size=num_to_replace, replace=False)
    # Translate positions to index labels so .loc is correct for any index.
    row_labels = val_dataframe.index[rows_to_replace]

    # Hidden entries are stored as 0 in the data ...
    val_dataframe.loc[row_labels, column] = 0
    # ... and flagged with 0 in the mask.
    mask.loc[row_labels, column] = 0

print(val_dataframe.head())
print(mask.head())

   gender  SITE  CHILDREN  INPT_TX_EVER  OUTPT_TX_EVER  OTHER_TX_EVER  \
0     0.0   0.0       0.0           0.0            0.0            0.0   
1     0.0   0.0       1.0           0.0            1.0            0.0   
2     0.0   0.0       1.0           0.0            0.0            0.0   
3     0.0   0.0       1.0           0.0            1.0            1.0   
4     1.0   0.0       1.0           0.0            0.0            0.0   

   CURRENT_MH_TX  ALCOHOL_TX  SUBST_TX  CURRENT_ALCOHOL_TX  ...  \
0            0.0         0.0       0.0                 0.0  ...   
1            1.0         0.0       0.0                 0.0  ...   
2            0.0         0.0       0.0                 0.0  ...   
3            0.0         0.0       0.0                 0.0  ...   
4            0.0         0.0       0.0                 0.0  ...   

   Num_Traumatic  IFNPATHSCORE  PPAGE_at_interview    PPEDUC  PPHHSIZE  \
0       0.000000      0.409393            0.025641  0.000000  0.000000   
1       0.

In [None]:
# Autoencoder definition
class HadamardAutoencoder(nn.Module):
    """Fully connected autoencoder: Linear -> ReLU -> Linear bottleneck, mirrored decoder.

    The decoder ends in a Sigmoid, so every reconstructed value lies in (0, 1).

    Args:
        in_features: width of the input and of the reconstruction. Defaults to
            428, the width originally hard-coded here.
        hidden_features: width of the hidden layer in encoder and decoder.
        latent_features: width of the bottleneck code.
    """

    def __init__(self, in_features=428, hidden_features=128, latent_features=64):
        super().__init__()
        # Encoder: in_features -> hidden_features -> latent_features
        self.encoder = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, latent_features),
        )
        # Decoder: latent_features -> hidden_features -> in_features
        self.decoder = nn.Sequential(
            nn.Linear(latent_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, in_features),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Encode ``x`` and return its reconstruction (same trailing width as ``x``)."""
        return self.decoder(self.encoder(x))


In [None]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


In [None]:
# Build the network, its optimizer and the loss function.
MODEL_SEED = 0          # fixed seed for torch's global random number generator
LEARNING_RATE = 0.0025  # Adam step size

# Seed before the layers are created so the initial weights are the same on
# every Restart & Run All.
torch.manual_seed(MODEL_SEED)

model = HadamardAutoencoder()
model.to(device)  # move the parameters to the GPU when one is available
print(model)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Mean absolute error between reconstruction and target.
criterion = nn.L1Loss()

HadamardAutoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=428, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=64, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=428, bias=True)
    (3): Sigmoid()
  )
)


In [None]:
# Self-supervision settings
replicate_size = 20            # number of stacked copies of the masked data
green_points_proportion = 0.4  # share of still-visible entries corrupted per column in the training input

# Stack replicate_size copies of the masked data and of its mask, row-aligned.
# WARNING: this rebinds val_dataframe and mask, so running the cell a second
# time without re-running the masking cell replicates them again.
val_dataframe = pd.concat([val_dataframe] * replicate_size, ignore_index=True)
mask = pd.concat([mask] * replicate_size, ignore_index=True)
# DataFrame.info() prints its own report and returns None; wrapping it in
# print() only added a stray "None" line.
val_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11960 entries, 0 to 11959
Columns: 428 entries, gender to Qd_max_weight
dtypes: float64(428)
memory usage: 39.1 MB
None


In [None]:
# Build the corrupted training input: in every column, a random
# green_points_proportion of the entries that are still visible (mask == 1)
# is overwritten with that column's mean.
GREEN_SEED = 1  # fixed seed so the same entries are corrupted on every run
green_rng = np.random.default_rng(GREEN_SEED)

train_dataframe = val_dataframe.copy()

# Column means of the masked data.
# NOTE(review): hidden entries are stored as 0 in val_dataframe, so they are
# included in these means — confirm that this is intended.
column_means = train_dataframe.mean()

for column in train_dataframe.columns:
    # Index labels of the rows whose value in this column is still visible.
    mask_rows = mask[mask[column] == 1].index

    # Number of visible rows to corrupt (40% with the default setting).
    num_to_replace = int(len(mask_rows) * green_points_proportion)

    # Draw that many distinct row labels.
    rows_to_replace = green_rng.choice(mask_rows.to_numpy(), size=num_to_replace, replace=False)

    # Overwrite the drawn entries with the column mean.
    train_dataframe.loc[rows_to_replace, column] = column_means[column]

print(train_dataframe.head())

     gender     SITE  CHILDREN  INPT_TX_EVER  OUTPT_TX_EVER  OTHER_TX_EVER  \
0  0.000000  0.00000  0.538462      0.000000       0.000000       0.000000   
1  0.249164  0.00000  0.538462      0.000000       1.000000       0.000000   
2  0.000000  0.00000  0.538462      0.000000       0.367893       0.000000   
3  0.000000  0.35786  1.000000      0.000000       0.367893       0.088629   
4  1.000000  0.00000  1.000000      0.035117       0.367893       0.000000   

   CURRENT_MH_TX  ALCOHOL_TX  SUBST_TX  CURRENT_ALCOHOL_TX  ...  \
0            0.0    0.016722  0.013378            0.000000  ...   
1            1.0    0.000000  0.000000            0.000000  ...   
2            0.0    0.016722  0.000000            0.005017  ...   
3            0.0    0.000000  0.000000            0.000000  ...   
4            0.0    0.016722  0.000000            0.000000  ...   

   Num_Traumatic  IFNPATHSCORE  PPAGE_at_interview    PPEDUC  PPHHSIZE  \
0       0.071754      0.315685            0.483578  0.

In [None]:
# Turn the three row-aligned frames into tensors and batch them together.
# All three are cast to float first so the tensors share one floating dtype.
val_dataframe, train_dataframe, mask = (
    frame.astype(float) for frame in (val_dataframe, train_dataframe, mask)
)

train = torch.tensor(train_dataframe.to_numpy())     # corrupted model input
validation = torch.tensor(val_dataframe.to_numpy())  # reconstruction target
mask_value = torch.tensor(mask.to_numpy())           # 1 = visible, 0 = hidden

# Every batch yields (input, target, mask) rows taken from the same positions.
dataset = TensorDataset(train, validation, mask_value)
data_loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

print(train_dataframe.head())
print(val_dataframe.head())

     gender     SITE  CHILDREN  INPT_TX_EVER  OUTPT_TX_EVER  OTHER_TX_EVER  \
0  0.000000  0.00000  0.538462      0.000000       0.000000       0.000000   
1  0.249164  0.00000  0.538462      0.000000       1.000000       0.000000   
2  0.000000  0.00000  0.538462      0.000000       0.367893       0.000000   
3  0.000000  0.35786  1.000000      0.000000       0.367893       0.088629   
4  1.000000  0.00000  1.000000      0.035117       0.367893       0.000000   

   CURRENT_MH_TX  ALCOHOL_TX  SUBST_TX  CURRENT_ALCOHOL_TX  ...  \
0            0.0    0.016722  0.013378            0.000000  ...   
1            1.0    0.000000  0.000000            0.000000  ...   
2            0.0    0.016722  0.000000            0.005017  ...   
3            0.0    0.000000  0.000000            0.000000  ...   
4            0.0    0.016722  0.000000            0.000000  ...   

   Num_Traumatic  IFNPATHSCORE  PPAGE_at_interview    PPEDUC  PPHHSIZE  \
0       0.071754      0.315685            0.483578  0.

In [None]:
# Train the autoencoder on (corrupted input, target, mask) mini-batches.
epochs = 100
log_every = 1  # print the loss every `log_every` epochs

for epoch in range(epochs):
    for train_data, val_data, mask_data in data_loader:
        # Move the batch to the model's device and cast to float32.
        train_data = train_data.to(device).float()
        val_data = val_data.to(device).float()
        mask_data = mask_data.to(device).float()

        optimizer.zero_grad()
        output = model(train_data)
        # Hadamard product with the mask: positions where mask is 0 become 0
        # on both sides and therefore add nothing to the L1 loss.
        loss = criterion(output * mask_data, val_data * mask_data)
        loss.backward()
        optimizer.step()

    if epoch % log_every == 0:
        # The value shown is the loss of the last mini-batch of the epoch.
        print(f'Epoch {epoch}, Loss: {loss.item()}')

# Masked loss over the whole dataset in one pass.
# The input is sent to the model's device (the model may be on CUDA while
# `train` is on the CPU), no autograd graph is recorded for this evaluation,
# and the result is brought back to the CPU where mask_value/validation live.
train = train.float()
with torch.no_grad():
    final = model(train.to(device)).cpu()
loss = criterion(final * mask_value, validation * mask_value)
print(f'Loss: {loss.item()}')

Epoch 0, Loss: 0.07354068756103516
Epoch 1, Loss: 0.06960638612508774
Epoch 2, Loss: 0.046210817992687225
Epoch 3, Loss: 0.04968269541859627
Epoch 4, Loss: 0.04692194610834122
Epoch 5, Loss: 0.04717746749520302
Epoch 6, Loss: 0.04546152800321579
Epoch 7, Loss: 0.044867608696222305
Epoch 8, Loss: 0.04134365916252136
Epoch 9, Loss: 0.04170065000653267
Epoch 10, Loss: 0.04074101895093918
Epoch 11, Loss: 0.03683416545391083
Epoch 12, Loss: 0.03378558158874512
Epoch 13, Loss: 0.03409377485513687
Epoch 14, Loss: 0.032355066388845444
Epoch 15, Loss: 0.03192482888698578
Epoch 16, Loss: 0.029738357290625572
Epoch 17, Loss: 0.027549967169761658
Epoch 18, Loss: 0.03090822510421276
Epoch 19, Loss: 0.027327101677656174
Epoch 20, Loss: 0.029516415670514107
Epoch 21, Loss: 0.032110244035720825
Epoch 22, Loss: 0.029304414987564087
Epoch 23, Loss: 0.026119213551282883
Epoch 24, Loss: 0.026797983795404434
Epoch 25, Loss: 0.028857536613941193
Epoch 26, Loss: 0.03053271770477295
Epoch 27, Loss: 0.02462360

In [None]:
# Reference value without the model: the same masked L1 loss, measured
# between the corrupted input and the target directly.
corrupted_visible = train * mask_value
target_visible = validation * mask_value
loss = criterion(corrupted_visible, target_visible)
print(f'Loss: {loss.item()}')

Loss: 0.04603539594306341


In [None]:
# Replicate the unmasked normalized data so it lines up row-for-row with
# `validation`. Note: this rebinds normalized_dataframe, so running the cell
# a second time replicates it again.
normalized_dataframe = pd.concat(
    [normalized_dataframe for _ in range(replicate_size)],
    ignore_index=True,
)
norm = torch.tensor(normalized_dataframe.to_numpy())

# Baseline: L1 distance between the masked data and the ground truth.
# NOTE(review): the masking cell writes 0 into hidden entries, so this is the
# error of zero-filling; the earlier note here called it average imputation —
# confirm which baseline is meant.
loss = criterion(validation, norm)
print(f'Loss: {loss.item()}')

Loss: 0.03536496648385201


In [None]:
# Impute the hidden entries with the trained model and score against the
# ground truth `norm`.
validation = validation.float()
# Run on the model's device (it may be CUDA while `validation` is on the CPU),
# without recording an autograd graph, then bring the result back to the CPU.
with torch.no_grad():
    impute = model(validation.to(device)).cpu()
# Keep visible entries as they are; take the model output only where mask is 0.
impute_norm = torch.where(mask_value == 1, validation, impute)
loss = criterion(impute_norm, norm)
print(f'Loss: {loss.item()}')
# This is the loss of the imputation we use

Loss: 0.01321116346847616


In [None]:
# Read the original table, which contains missing values.
origin = pd.read_csv('/gdrive/My Drive/Colab Notebooks/cutoff_dataset_final.txt', delimiter='\t')
# mask_na: 1 where a value is present, 0 where it is missing.
# It must be computed before the fill below removes the NaNs.
mask_na = origin.notna().astype(int)
# Placeholder fill with the column means of df (the table loaded from
# cutoff_dataset_prepare.txt); assigned back instead of using inplace=True.
origin = origin.fillna(df.mean())
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that only added stray "None" lines).
origin.info()
print(origin.head())
mask_na.info()
print(mask_na.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 921 entries, 0 to 920
Columns: 428 entries, gender to Qd_max_weight
dtypes: float64(145), int64(283)
memory usage: 3.0 MB
None
   gender  SITE  CHILDREN  INPT_TX_EVER  OUTPT_TX_EVER  OTHER_TX_EVER  \
0       0     0       1.0           1.0            1.0       3.000000   
1       0     0       1.0           1.0            1.0       1.000000   
2       1     0       3.0           1.0            3.0       1.000000   
3       0     0       3.0           1.0            3.0       1.214047   
4       0     0       3.0           1.0            3.0       3.000000   

   CURRENT_MH_TX  ALCOHOL_TX  SUBST_TX  CURRENT_ALCOHOL_TX  ...  \
0       1.000000         1.0       1.0                 1.0  ...   
1       1.000000         1.0       1.0                 1.0  ...   
2       1.397993         1.0       1.0                 1.0  ...   
3       3.000000         1.0       1.0                 1.0  ...   
4       3.000000         1.0       1.0           

In [None]:
# Min-max scale the filled original table.
# Note: a new scaler is fitted on `origin` here and bound to the name `scaler`,
# replacing whatever that name referred to before this cell ran.
scaler = MinMaxScaler().fit(origin)
normalized_origin = pd.DataFrame(data=scaler.transform(origin), columns=origin.columns)
print(normalized_origin.head())

   gender  SITE  CHILDREN  INPT_TX_EVER  OUTPT_TX_EVER  OTHER_TX_EVER  \
0     0.0   0.0       0.0           0.0            0.0       1.000000   
1     0.0   0.0       0.0           0.0            0.0       0.000000   
2     1.0   0.0       1.0           0.0            1.0       0.000000   
3     0.0   0.0       1.0           0.0            1.0       0.107023   
4     0.0   0.0       1.0           0.0            1.0       1.000000   

   CURRENT_MH_TX  ALCOHOL_TX  SUBST_TX  CURRENT_ALCOHOL_TX  ...  \
0       0.000000         0.0       0.0                 0.0  ...   
1       0.000000         0.0       0.0                 0.0  ...   
2       0.198997         0.0       0.0                 0.0  ...   
3       1.000000         0.0       0.0                 0.0  ...   
4       1.000000         0.0       0.0                 0.0  ...   

   Num_Traumatic  IFNPATHSCORE  PPAGE_at_interview    PPEDUC  PPHHSIZE  \
0       0.545455      0.322126            0.871795  0.555556  0.000000   
1       0.

In [None]:

# Impute the missing entries of the original table with the trained model.
origin_tensor = torch.tensor(normalized_origin.values)
cover = torch.tensor(mask_na.values)  # 1 = value was present, 0 = value was missing

origin_tensor = origin_tensor.float()
# Run on the model's device (it may be CUDA while origin_tensor is on the CPU),
# without recording an autograd graph, then bring the result back to the CPU
# so that torch.where and .numpy() below operate on CPU tensors.
with torch.no_grad():
    origin_impute = model(origin_tensor.to(device)).cpu()
# Present values are kept unchanged; only missing ones take the model output.
new = torch.where(cover == 1, origin_tensor, origin_impute)

new_detached = new.detach()
new_np = new_detached.numpy()
new_dataframe = pd.DataFrame(new_np, columns=origin.columns)
print(new_dataframe.head())
print(normalized_origin.head())

   gender  SITE  CHILDREN  INPT_TX_EVER  OUTPT_TX_EVER  OTHER_TX_EVER  \
0     0.0   0.0       0.0           0.0            0.0   1.000000e+00   
1     0.0   0.0       0.0           0.0            0.0   0.000000e+00   
2     1.0   0.0       1.0           0.0            1.0   0.000000e+00   
3     0.0   0.0       1.0           0.0            1.0   4.290291e-18   
4     0.0   0.0       1.0           0.0            1.0   1.000000e+00   

   CURRENT_MH_TX  ALCOHOL_TX  SUBST_TX  CURRENT_ALCOHOL_TX  ...  \
0            0.0         0.0       0.0                 0.0  ...   
1            0.0         0.0       0.0                 0.0  ...   
2            0.0         0.0       0.0                 0.0  ...   
3            1.0         0.0       0.0                 0.0  ...   
4            1.0         0.0       0.0                 0.0  ...   

   Num_Traumatic  IFNPATHSCORE  PPAGE_at_interview    PPEDUC  PPHHSIZE  \
0       0.545455      0.322126            0.871795  0.555556  0.000000   
1       0.

In [None]:
# Save the imputed table as tab-separated text, without the index column.
# The values written are the min-max scaled ones held in new_dataframe.
output_path = '/gdrive/My Drive/Colab Notebooks/dataset_output.txt'
new_dataframe.to_csv(output_path, sep='\t', index=False)