## Import libraries

In [None]:
import glob
import pandas as pd
import numpy as np
import plotnine as p9
from sklearn.preprocessing import *
import matplotlib.pyplot as plt
import seaborn as sns

## Initialize constants

In [None]:
# Directory holding the dataset CSVs that the cells below read from and write to.
root_data_dir = '/projects/genomic-ml/da2343/ml_project_1/data'

## Transform OTU data

In [None]:
# Datasets to standardise in this run. Earlier selections, kept for reference:
# dataset_list = ['baxter_crc_data', 'crohns_data', 'glne007_data', 'global_patterns_data', 'esophagus_data', 'enterotype_data', 'hmp2prot_data', 'hmp216S_data', 'mixmpln_real_data', 'soilrep_data', 'ioral_data' ]
# dataset_list = ['amgut1_data', 'amgut2_data']
dataset_list = ['amgut2_data']

# NOTE: only the standard-scaled file is written here. The plotting cell also
# reads '<name>_power_transformed.csv', which this cell does not produce.
for dataset_name in dataset_list:
    # Reads '<name>_update.csv' and writes '<name>_standard_scaled.csv'
    # next to it in root_data_dir.
    input_csv = f'{root_data_dir}/{dataset_name}_update.csv'
    output_csv = f'{root_data_dir}/{dataset_name}_standard_scaled.csv'

    dataset_df = pd.read_csv(input_csv, header=0)

    # Column-wise standardisation with scikit-learn's StandardScaler
    # (default settings).
    scaler = StandardScaler()
    data_transformed = scaler.fit_transform(dataset_df)

    # fit_transform returns a bare array; restore the original column labels
    # before saving, and do not write the row index.
    dataset_df_scaled = pd.DataFrame(data_transformed, columns=dataset_df.columns)
    dataset_df_scaled.to_csv(output_csv, index=False)


## Plot Data Transformation Graph

In [None]:
# Dataset to visualise and how many leading columns (taxa) to show.
dataset_name = 'amgut1_data'
taxa_count = 5

# Seaborn theme for all panels.
sns.set_style("whitegrid")

# Load the raw, standard-scaled and power-transformed versions of the dataset,
# keeping only the first `taxa_count` columns of each.
df_raw = pd.read_csv(f'{root_data_dir}/{dataset_name}_update.csv').iloc[:, :taxa_count]
df_ss = pd.read_csv(f'{root_data_dir}/{dataset_name}_standard_scaled.csv').iloc[:, :taxa_count]
df_pt = pd.read_csv(f'{root_data_dir}/{dataset_name}_power_transformed.csv').iloc[:, :taxa_count]

# One row of three side-by-side panels, 18 x 10 inches overall.
fig, axes = plt.subplots(1, 3)
fig.set_size_inches(18, 10)

# Draw one boxplot panel per version of the data, left to right.
panels = [
    ("Raw", df_raw),
    ("Standard Scaled", df_ss),
    ("Yeo-Johnson Transformation", df_pt),
]
for ax, (title, frame) in zip(axes, panels):
    sns.boxplot(data=frame, ax=ax)
    ax.set_title(title)

# Shared axis labels: y label on the left panel, x label on the middle panel.
axes[0].set_ylabel("Abundance", fontsize=14)
axes[1].set_xlabel("Taxa", fontsize=14)

# Tighten the layout and render the figure.
plt.tight_layout()
plt.show()

# TODO: Uncomment below to save the plot
# fig.savefig(f'{root_data_dir}/{dataset_name}_data_dist_boxplots.png', dpi=1500)


## Sparsity Analysis for Public Datasets

In [None]:
# Public datasets whose '<name>_update.csv' files are checked for sparsity.
dataset_list = [
    'amgut1_data', 'amgut2_data', 'baxter_crc_data', 'crohns_data',
    'glne007_data', 'global_patterns_data', 'esophagus_data',
    'enterotype_data', 'hmp2prot_data', 'hmp216S_data', 'mixmpln_real_data',
    'soilrep_data', 'ioral_data',
]

for dataset_name in dataset_list:
    dataset_df = pd.read_csv(f'{root_data_dir}/{dataset_name}_update.csv', header=0)

    # Sparsity = percentage of cells that are exactly zero.
    nonzero_cells = np.count_nonzero(dataset_df)
    total_cells = float(dataset_df.size)
    sparsity = (1.0 - nonzero_cells / total_cells) * 100.0

    print(f'{dataset_name} sparsity: {sparsity:.2f}%')


## Sparsity Analysis for Necromass Dataset

In [None]:
# Directory holding the necromass CSVs, e.g.
# /projects/genomic-ml/da2343/ml_project_1/data/necromass/Dec22_bacteria_conservative_r_same_raw.csv
necromass_data_dir = '/projects/genomic-ml/da2343/ml_project_1/data/necromass/'

# File stems (without '.csv') of the necromass tables to analyse.
dataset_list = [
    'bacteria_conservative_raw',
    'bacteria_genus_raw',
    'fungi_conservative_raw',
    'fungi_genus_raw',
    'bacteria_fungi_conservative_raw',
    'Dec22_bacteria_conservative_r_same_raw',
    'Dec22_fungi_conservative_r_same_raw',
    'Dec22_bacteria_fungi_conservative_r_same_raw',
]

for dataset_name in dataset_list:
    dataset_df = pd.read_csv(f'{necromass_data_dir}/{dataset_name}.csv')

    # Sparsity = percentage of cells that are exactly zero.
    nonzero_cells = np.count_nonzero(dataset_df)
    total_cells = float(dataset_df.size)
    sparsity = (1.0 - nonzero_cells / total_cells) * 100.0

    print(f'{dataset_name} sparsity: {sparsity:.2f}%')

In [15]:
# Directory holding the necromass CSVs.
necromass_data_dir = '/projects/genomic-ml/da2343/ml_project_1/data/necromass/'

# Named row ranges of the combined bacteria+fungi table. 'start' and 'end' are
# positional row indices, and 'end' is inclusive (see the `+ 1` in the slice).
dataset_list = [
    {
        "name" : "AllSoilM1M3",
        "start" : 0,
        "end" : 22
    },
    {
        'name' : 'LowMelanM1',
        'start' : 23,
        'end' : 34
    },
    {
        'name' : 'HighMelanM1',
        'start' : 35,
        'end' : 45
    },
    {
        'name' : 'LowMelanM3',
        'start' : 46,
        'end' : 56
    },
    {
        'name' : 'HighMelanM3',
        'start' : 57,
        'end' : 68
    }
]

# Every subset is a slice of the same file, so read it once up front instead
# of re-reading it on each loop iteration.
dataset_df = pd.read_csv(f'{necromass_data_dir}/Dec22_bacteria_fungi_conservative_r_same_raw.csv')

for dataset in dataset_list:
    dataset_name = dataset['name']
    dataset_start = dataset['start']
    dataset_end = dataset['end']

    # Rows start..end inclusive, all columns.
    sub_df = dataset_df.iloc[dataset_start:dataset_end+1, :]

    # Sparsity = percentage of cells in the subset that are exactly zero.
    sparsity = 1.0 - ( np.count_nonzero(sub_df) / float(sub_df.size) )
    sparsity = sparsity * 100.0
    print(f'{dataset_name} sparsity: {sparsity:.2f}%')

AllSoilM1M3 sparsity: 44.07%
LowMelanM1 sparsity: 46.62%
HighMelanM1 sparsity: 51.84%
LowMelanM3 sparsity: 43.00%
HighMelanM3 sparsity: 48.87%
