In [104]:
import polars as pl
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
from scipy.stats import ttest_ind


In [105]:
df = pl.read_csv('../../Dementia/JanBDRcount.csv')
df = df.to_pandas()

In [106]:
for col in df.columns:
    df[col].fillna(3, inplace=True)

X = df.drop(columns=['FID', 'IID', 'PAT', 'MAT', 'SEX', 'PHENOTYPE'])
y = df['PHENOTYPE']

In [107]:
assert X.isnull().sum().sum() == 0, "There are still missing values in X"
assert y.isnull().sum().sum() == 0, "There are still missing values in y"

In [108]:
print(X.isnull().sum().sum())

0


In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [110]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, encoding_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.Linear(1024, input_dim),
            nn.Sigmoid()  # Assuming normalized input between 0 and 1
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
    

In [111]:
X_train = np.array(X_train)
X_test = np.array(X_test)

In [112]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
train_dataset = TensorDataset(X_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [113]:
input_dim = X_train.shape[1]
encoding_dim = 64  # you can adjust this

autoencoder = Autoencoder(input_dim, encoding_dim)
optimizer = optim.Adam(autoencoder.parameters(), lr=1e-3)
criterion = nn.MSELoss()

num_epochs = 50
for epoch in range(num_epochs):
    autoencoder.train()
    train_loss = 0
    for x_batch, in train_loader:
        optimizer.zero_grad()
        reconstructed = autoencoder(x_batch)
        loss = criterion(reconstructed, x_batch)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    print(f'Epoch {epoch + 1}, Loss: {train_loss / len(train_loader.dataset)}')

Epoch 1, Loss: 0.013830802076303926
Epoch 2, Loss: 0.012580303180633218
Epoch 3, Loss: 0.012559420142033146
Epoch 4, Loss: 0.012652022109913763
Epoch 5, Loss: 0.012548347500629784
Epoch 6, Loss: 0.012547793241352563
Epoch 7, Loss: 0.01254419866579787
Epoch 8, Loss: 0.012542563533015928
Epoch 9, Loss: 0.012550920727425542
Epoch 10, Loss: 0.012540802239732512
Epoch 11, Loss: 0.012539203221292981
Epoch 12, Loss: 0.012539018335035595
Epoch 13, Loss: 0.012539844570466724
Epoch 14, Loss: 0.012538474063131829
Epoch 15, Loss: 0.012537201112460834
Epoch 16, Loss: 0.012533879152251952
Epoch 17, Loss: 0.012533986696289308
Epoch 18, Loss: 0.012534877170506496
Epoch 19, Loss: 0.01263145722908245
Epoch 20, Loss: 0.012535798565631898
Epoch 21, Loss: 0.012534313882643672
Epoch 22, Loss: 0.012533444262062257
Epoch 23, Loss: 0.012534964659897955
Epoch 24, Loss: 0.012531970605134325
Epoch 25, Loss: 0.012632080600983975
Epoch 26, Loss: 0.01253358056973516
Epoch 27, Loss: 0.012535571093214102
Epoch 28, Los

In [114]:
autoencoder.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    reconstructed_X_test = autoencoder(X_test_tensor)
    reconstructed_X_test = reconstructed_X_test.numpy()

In [115]:
print(X_train)

[[1. 1. 0. ... 1. 0. 0.]
 [0. 1. 1. ... 1. 1. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [1. 2. 0. ... 1. 1. 0.]
 [1. 1. 1. ... 1. 1. 0.]
 [0. 0. 1. ... 1. 1. 1.]]


In [116]:
#L1 Loss
if len(X.columns) == reconstructed_X_test.shape[1]:
    reconstruction_error = np.mean((X_test - reconstructed_X_test) ** 2, axis=0)
    feature_importance = pd.DataFrame({'Feature': X.columns, 'Reconstruction_Error': reconstruction_error})
    feature_importance = feature_importance.sort_values(by='Reconstruction_Error', ascending=False)  # Ascending order
    print(feature_importance)
else:
    print("Mismatch in feature dimensions. Check your input data and autoencoder model.")

              Feature  Reconstruction_Error
57217   rs112630218_0              4.000000
216999   rs35019336_0              4.000000
14051     rs6677369_0              4.000000
247769  rs146509793_0              4.000000
115404   rs79018329_0              4.000000
...               ...                   ...
281363  rs199570249_0              0.279503
281365    rs4801331_0              0.279503
63456     rs3009507_A              0.279503
105119   rs17111894_A              0.279503
279973       rs7412_T              0.204969

[297678 rows x 2 columns]


In [102]:
# Scaled MSELoss
if len(X.columns) == reconstructed_X_test.shape[1]:
    reconstruction_error = np.mean((X_test - reconstructed_X_test) ** 2, axis=0)
    scaler = MinMaxScaler()
    scaled_error = scaler.fit_transform(reconstruction_error.reshape(-1, 1)).flatten()
    feature_importance = pd.DataFrame({'Feature': X.columns, 'Reconstruction_Error': scaled_error})
    feature_importance = feature_importance.sort_values(by='Reconstruction_Error', ascending=False)  # Ascending order
    print(feature_importance)
else:
    print("Mismatch in feature dimensions. Check your input data and autoencoder model.")

              Feature  Reconstruction_Error
82127     rs4834691_0          1.000000e+00
251440    rs9923820_0          9.998850e-01
218272   rs28606519_0          9.996258e-01
110349  rs114389430_0          9.995755e-01
294862  rs149277501_0          9.993037e-01
...               ...                   ...
222219   rs17074706_A          5.084034e-07
27782   rs143173314_C          4.429802e-07
264447    rs7216431_A          2.986577e-07
194295   rs16932348_G          1.769945e-07
186103   rs57973253_0          0.000000e+00

[297678 rows x 2 columns]


In [103]:
#MSE Loss
if len(X.columns) == reconstructed_X_test.shape[1]:
    reconstruction_error = np.mean((X_test - reconstructed_X_test) ** 2, axis=0)
    feature_importance = pd.DataFrame({'Feature': X.columns, 'Reconstruction_Error': reconstruction_error})
    feature_importance = feature_importance.sort_values(by='Reconstruction_Error', ascending=False)  # Ascending order
    print(feature_importance)
else:
    print("Mismatch in feature dimensions. Check your input data and autoencoder model.")

              Feature  Reconstruction_Error
82127     rs4834691_0              4.012598
251440    rs9923820_0              4.012150
218272   rs28606519_0              4.011142
110349  rs114389430_0              4.010947
294862  rs149277501_0              4.009890
...               ...                   ...
222219   rs17074706_A              0.124226
27782   rs143173314_C              0.124226
264447    rs7216431_A              0.124225
194295   rs16932348_G              0.124225
186103   rs57973253_0              0.124224

[297678 rows x 2 columns]


In [57]:
print(reconstructed_X_test.shape[1])

297678


In [58]:
print(len(df.columns))

297684


In [95]:
p_values = []

for i in range(X_train.shape[1]):
    group1 = X_train[y_train == 2, i]
    group2 = X_train[y_train == 1, i]

    # Check for constant features in either group
    try:
        t_stat, p_val = ttest_ind(group1, group2)
        p_values.append(p_val)
    except Exception as e:
        # print(f"Error calculating p-value for Feature {i}: {e}")
        p_values.append(np.nan)

feature_importance = pd.DataFrame({'Feature': X.columns, 'p_value': p_values})
feature_importance = feature_importance.sort_values(by='p_value')
print(feature_importance)



  res = hypotest_fun_out(*samples, **kwds)


              Feature       p_value
279972     rs429358_C  2.649056e-10
279960     rs157582_A  5.325411e-10
244229  rs140088880_G  7.067862e-09
60484     rs1492865_G  1.173295e-08
62690     rs2651618_G  1.275272e-08
...               ...           ...
285388  rs117853603_0           NaN
285503  rs118014232_0           NaN
289171   rs10439884_0           NaN
292390  rs112651842_0           NaN
294862  rs149277501_0           NaN

[297678 rows x 2 columns]
