In [1]:
import numpy as np
import pandas as pd
import pickle
from biom import Table
from gemelli.rpca import rpca
# from gemelli.factorization import rpca
from sklearn.preprocessing import StandardScaler


In [2]:
def find_binary_columns(X_train):
    binary_columns = []
    for col in range(X_train.shape[1]):
        unique_values = np.unique(X_train[:, col])
        if set(unique_values).issubset({0, 1}):
            binary_columns.append(col)
    return binary_columns

In [3]:
dataset_path = "dataset_amyloid.pickle"
with open(dataset_path, 'rb') as f:
    X_train, y_train, X_test, y_test = pickle.load(f)
X = np.concatenate((X_train, X_test), axis=0)
Y = np.concatenate((y_train, y_test), axis=0)

binary_columns = find_binary_columns(X)

numerical_columns = []
for i in range(0,10193):
    if i not in binary_columns:
        numerical_columns.append(i)

X_bi = X[:, binary_columns] # Binary dataset
X_nu = X[:, numerical_columns] # Numerical dataset

In [4]:
print(f'Number of Binary columns = {X_bi.shape}')
print(f'Number of Numerical columns = {X_nu.shape}')
print(f'Minimum value of numerical element = {np.min(X_nu)}')
print(f'Maximum value of numerical element = {np.max(X_nu)}')

Number of Binary columns = (190, 3887)
Number of Numerical columns = (190, 6306)
Minimum value of numerical element = 0.0
Maximum value of numerical element = 5347441.0


In [5]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_nu)

print(f'Minimum value of scaled numerical element = {np.min(X_scaled)}')
print(f'Maximum value of scaled numerical element = {np.max(X_scaled)}')

# label_encoder = LabelEncoder()
# Y_numerical = label_encoder.fit_transform(Y)

Minimum value of scaled numerical element = -1.22930590661059
Maximum value of scaled numerical element = 13.74772708486753


In [10]:
sample_ids = list(range(1, X_scaled.shape[0] + 1))  # Sample IDs will be 1, 2, 3, ..., 190
feature_ids = list(range(1, X_scaled.shape[1] + 1))  # Feature IDs will be 1, 2, 3, ..., 6000
bt = Table(X_scaled, feature_ids, sample_ids)

ordination, distance_matrix = rpca(bt, n_components=128) 
reduced_features = ordination.samples
reduced_features_df = pd.DataFrame(reduced_features, index=sample_ids)

print(reduced_features_df)

TableException: Duplicate observation IDs

In [None]:

# # 1. Handling Infinite Values
# print("Checking for infinite values...")
# if np.isinf(X_scaled).any():
#     print("Infinite values found. Replacing with the maximum finite value.")
#     X_scaled[np.isinf(X_scaled)] = np.finfo(np.float64).max
# else:
#     print("No infinite values found.")

# # 2. Handling NaN Values
# print("\nChecking for NaN values...")
# if np.isnan(X_scaled).any():
#     print("NaN values found. Replacing with the mean of the column.")
#     # Calculate column means
#     col_means = np.nanmean(X_scaled, axis=0)
#     # Find indices where NaN values are present
#     inds = np.where(np.isnan(X_scaled))
#     # Replace NaNs with the corresponding column mean
#     X_scaled[inds] = np.take(col_means, inds[1])
# else:
#     print("No NaN values found.")

# # 3. Handling Duplicate Rows/Columns
# print("\nChecking for duplicate rows and columns...")

# # Check for duplicate rows
# unique_rows = np.unique(X_scaled, axis=0)
# if unique_rows.shape[0] != X_scaled.shape[0]:
#     print(f"Duplicate rows found. Removing {X_scaled.shape[0] - unique_rows.shape[0]} duplicate rows.")
#     X_scaled = unique_rows
# else:
#     print("No duplicate rows found.")

# # Check for duplicate columns
# unique_columns = np.unique(X_scaled, axis=1)
# if unique_columns.shape[1] != X_scaled.shape[1]:
#     print(f"Duplicate columns found. Removing {X_scaled.shape[1] - unique_columns.shape[1]} duplicate columns.")
#     X_scaled = unique_columns
# else:
#     print("No duplicate columns found.")

# print("\nData cleaning completed.")


In [None]:
# all_zero_rows = np.all(X_scaled == 0, axis=1)
# all_zero_columns = np.all(X_scaled == 0, axis=0)

# print(f"All-zero rows: {all_zero_rows.sum()}")
# print(f"All-zero columns: {all_zero_columns.sum()}")

# # # Option 2: Replace all-zero rows/columns with a small constant (e.g., 1e-6)
# small_constant = 1e-6  # You can adjust this value if needed

# if all_zero_rows.sum() > 0:
#     print(f"Replacing {all_zero_rows.sum()} all-zero rows with {small_constant}.")
#     X_scaled[all_zero_rows, :] = small_constant

# if all_zero_columns.sum() > 0:
#     print(f"Replacing {all_zero_columns.sum()} all-zero columns with {small_constant}.")
#     X_scaled[:, all_zero_columns] = small_constant

# Create numerical sample and feature IDs
sample_ids = list(range(1, X_scaled.shape[0] + 1))  # Sample IDs will be 1, 2, 3, ..., 190
feature_ids = list(range(1, X_scaled.shape[1] + 1))  # Feature IDs will be 1, 2, 3, ..., 6306

# Convert the numpy array into a biom.Table
bt = Table(X_scaled, feature_ids, sample_ids)

# Run RPCA to reduce the features
ordination, distance_matrix = rpca(bt, n_components=128)  # Reduce to 128 components

# Extract the reduced features (principal components)
reduced_features = ordination.samples

# Convert reduced features to a pandas DataFrame for easier handling
reduced_features_df = pd.DataFrame(reduced_features, index=sample_ids)

# Print the reduced features
print(reduced_features_df)

# If you want to save the reduced features to a CSV file:
reduced_features_df.to_csv('reduced_features.csv')

In [None]:
# sample_ids = list(range(1, X_scaled.shape[0] + 1))  # Sample IDs will be 1, 2, 3, ..., 190
# feature_ids = list(range(1, X_scaled.shape[1] + 1))  # Feature IDs will be 1, 2, 3, ..., 6306

# # Debugging: Print the shapes and IDs to verify
# print(f"Shape of X_scaled: {X_scaled.shape}")
# print(f"Number of sample IDs: {len(sample_ids)}")
# print(f"Number of feature IDs: {len(feature_ids)}")

# # Check if the IDs are unique
# assert len(sample_ids) == len(set(sample_ids)), "Sample IDs are not unique!"
# assert len(feature_ids) == len(set(feature_ids)), "Feature IDs are not unique!"

# # Convert the numpy array into a biom.Table
# bt = Table(X_scaled, feature_ids, sample_ids)

# # Run RPCA to reduce the features
# ordination, distance_matrix = rpca(bt, n_components=128)  # Here, we reduce to 128 components

# # Extract the reduced features (principal components)
# reduced_features = ordination.samples

# # Convert reduced features to a pandas DataFrame for easier handling
# reduced_features_df = pd.DataFrame(reduced_features, index=sample_ids)

# # Print the reduced features
# print(reduced_features_df)

# # If you want to save the reduced features to a CSV file:
# reduced_features_df.to_csv('reduced_features.csv')