## Import libraries

In [13]:
import pandas as pd
from sklearn.preprocessing import PowerTransformer
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Transform OTU data

In [None]:
# Dataset stems to process (each is read from '<stem>.csv' in the working directory).
# Earlier runs of this cell used the rarefied OTU mapping tables
# ('bacteria_rarefied_otu_mapping_PKfixTrimmed', 'fungi_rarefied_otu_mapping_PKfix')
# and the genus-level tables ('bacteria_genus', 'fungi_genus').
dataset_list = ['bacteria_conservative', 'fungi_conservative']

# Columns removed before saving; presumably sample metadata rather than
# abundance features -- TODO confirm. An earlier version of this list also
# dropped an 'unidentified' column.
metadata_columns = ['Domain', 'Habitat', 'Melanization', 'Sample_ID', 'Necrobag_ID', 'Incubation_time', 'Plot', 'Comp']

for dataset_name in dataset_list:
    df = pd.read_csv(f'{dataset_name}.csv').drop(columns=metadata_columns)
    # The power-transform export is currently disabled. It used to be:
    #   pd.DataFrame(PowerTransformer().fit_transform(df), columns=df.columns)
    #     .to_csv(f'./{dataset_name}_power_transformed.csv', index=False)
    # Only the untransformed table is written now.
    df.to_csv(f'./{dataset_name}_raw.csv', index=False)

In [None]:
data_path = '/projects/genomic-ml/da2343/ml_project_1/data/necromass'
dataset_list = ['Dec22_bacteria_conservative_r_same', 'Dec22_fungi_conservative_r_same']

# Columns stripped from each Dec22 table before inspection.
non_feature_columns = ['sample_ID', 'Necrobag_ID', 'Habitat', 'Melanization', 'Incubation_time', 'Plot', 'Comp']

for dataset_name in dataset_list:
    source_csv = f'{data_path}/{dataset_name}.csv'
    df = pd.read_csv(source_csv).drop(columns=non_feature_columns)
    # Writing is disabled; uncomment to persist each stripped table:
    # df.to_csv(f'./{dataset_name}_raw.csv', index=False)

# `df` is overwritten on every iteration, so this shows only the last
# dataset in `dataset_list`.
df

## Plot Data Transformation Graph

In [None]:
# Box plots of the leading columns of each necromass table: raw data on the
# top row, power-transformed data on the bottom row, bacteria in the left
# column and fungi in the right column.
sns.set_style("whitegrid")

plot_data_dir = '/projects/genomic-ml/da2343/ml_project_1/data/necromass'
N_PLOT_COLUMNS = 10  # only the first columns are drawn so the boxes stay legible
# The figure is 15 x 10 inches. The previous dpi=1500 produced a
# 22500 x 15000 pixel PNG; 300 dpi gives 4500 x 3000 pixels. Raise this value
# if a larger export is really needed.
FIG_DPI = 300


def _load_leading_columns(file_stem, n_columns=N_PLOT_COLUMNS):
    """Read ``{plot_data_dir}/{file_stem}.csv`` and return its first ``n_columns`` columns."""
    return pd.read_csv(f'{plot_data_dir}/{file_stem}.csv').iloc[:, :n_columns]


df_raw_bacteria = _load_leading_columns('bacteria_rarefied_otu_mapping_PKfixTrimmed_raw')
df_transformed_bacteria = _load_leading_columns('bacteria_rarefied_otu_mapping_PKfixTrimmed_power_transformed')
df_raw_fungi = _load_leading_columns('fungi_rarefied_otu_mapping_PKfix_raw')
df_transformed_fungi = _load_leading_columns('fungi_rarefied_otu_mapping_PKfix_power_transformed')

# 2 x 2 grid of axes on a 15 x 10 inch canvas
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (axis, data, title) for each panel
panels = [
    (axes[0, 0], df_raw_bacteria, "Raw necromass bacteria"),
    (axes[1, 0], df_transformed_bacteria, "Transformed necromass bacteria"),
    (axes[0, 1], df_raw_fungi, "Raw necromass fungi"),
    (axes[1, 1], df_transformed_fungi, "Transformed necromass fungi"),
]
for ax, frame, title in panels:
    sns.boxplot(data=frame, ax=ax)
    ax.set_title(title)

# Hide the x tick labels on the top row; the bottom row carries them.
# NOTE(review): this assumes the raw and transformed files list the same
# columns in the same order -- confirm.
for top_ax in axes[0]:
    plt.setp(top_ax.get_xticklabels(), visible=False)

plt.tight_layout()

# Save before showing so the export does not depend on the figure surviving
# the plt.show() call.
fig.savefig('./necromass_data_dist_boxplots.png', dpi=FIG_DPI)
plt.show()


## Combine the bacteria and fungi data

In [3]:
necromass_data_dir = '/projects/genomic-ml/da2343/ml_project_1/data/necromass'

# Both inputs are the `_raw` tables, so the frames are named as raw data
# (they were previously called `df_transformed_*`, which was misleading).
bacteria_raw = pd.read_csv(f'{necromass_data_dir}/Dec22_bacteria_conservative_r_same_raw.csv')
fungi_raw = pd.read_csv(f'{necromass_data_dir}/Dec22_fungi_conservative_r_same_raw.csv')

# concat(axis=1) aligns on the row index and silently pads with NaN when the
# two frames have different lengths, so fail loudly instead.
# NOTE(review): row *order* is not checked here; this assumes row i is the
# same sample in both files -- confirm.
assert len(bacteria_raw) == len(fungi_raw), (
    f'bacteria ({len(bacteria_raw)} rows) and fungi ({len(fungi_raw)} rows) '
    'tables must have the same number of rows to be combined column-wise'
)

# Place the fungi columns to the right of the bacteria columns and save.
bacteria_fungi_raw = pd.concat([bacteria_raw, fungi_raw], axis=1)
bacteria_fungi_raw.to_csv(f'{necromass_data_dir}/Dec22_bacteria_fungi_conservative_r_same_raw.csv', index=False)

In [15]:
# Power-transform the combined raw bacteria + fungi table and save the result.
# Relies on `necromass_data_dir` and `PowerTransformer` having been defined
# by earlier cells.
combined_raw_csv = f'{necromass_data_dir}/Dec22_bacteria_fungi_conservative_r_same_raw.csv'
combined_transformed_csv = f'{necromass_data_dir}/Dec22_bacteria_fungi_conservative_r_same_transformed.csv'

df = pd.read_csv(combined_raw_csv)

# The transformer is used with its default settings; its output is wrapped
# back into a DataFrame so the original column labels are kept.
transformer = PowerTransformer()
df_transformed = transformer.fit_transform(df)
df_new = pd.DataFrame(data=df_transformed, columns=df.columns)

df_new.to_csv(combined_transformed_csv, index=False)

In [11]:
necromass_data_dir = '/projects/genomic-ml/da2343/ml_project_1/data/necromass'


def _power_transform_frame(frame):
    """Fit a default PowerTransformer on ``frame`` and return the result as a DataFrame with the same columns."""
    return pd.DataFrame(PowerTransformer().fit_transform(frame), columns=frame.columns)


# Transform the bacteria and fungi tables independently of each other...
df_bacteria = pd.read_csv(f'{necromass_data_dir}/Dec22_bacteria_conservative_r_same_raw.csv')
df_bacteria_transformed = _power_transform_frame(df_bacteria)

df_fungi = pd.read_csv(f'{necromass_data_dir}/Dec22_fungi_conservative_r_same_raw.csv')
df_fungi_transformed = _power_transform_frame(df_fungi)

# ...then put the fungi columns to the right of the bacteria columns.
df_bacteria_fungi = pd.concat([df_bacteria_transformed, df_fungi_transformed], axis=1)

# This cell only displays the combined frame; it does not write a file.
df_bacteria_fungi



Unnamed: 0,Bacillus,Bradyrhizobium,Burkholderia,Cellvibrio,Chitinophaga,Flavobacterium,Gp16,Gp6,Kaistia,Labrys,...,Mortierella,Mucor,Ovicillium,Penicillium,Phialocephala,Russula,Tomentella,Trichoderma,Umbelopsis,Wilcoxina
0,-1.051310,1.303894,-0.424502,-0.662478,-1.105309,-0.965418,1.026884,1.416732,-0.755748,-1.218715,...,0.311410,-0.387194,-0.546835,1.342957,-0.155797,0.862388,0.497154,-0.207413,0.961926,-0.387296
1,0.418411,1.320025,-0.618416,-0.662478,-1.105309,0.352021,1.435768,1.453572,-0.755748,-1.218715,...,-0.992687,-0.387194,-0.546835,1.583523,0.383227,1.368293,-0.752548,-1.424537,1.369599,-0.387296
2,-0.005731,1.249316,0.172507,-0.662478,-1.105309,-0.965418,1.321678,1.496106,-0.755748,-0.489037,...,-1.208840,-0.387194,-0.546835,1.073953,-0.887091,1.253182,-1.283232,-1.424537,-0.872391,-0.387296
3,0.056990,1.263940,-0.056213,-0.662478,-1.105309,-0.212046,1.457725,1.512844,-0.755748,-1.218715,...,0.179118,-0.387194,-0.546835,1.352706,-0.887091,1.688586,0.712848,-0.748688,1.028229,-0.387296
4,1.424627,1.228629,0.029956,-0.662478,-1.105309,-0.965418,1.440550,1.158578,-0.755748,-1.218715,...,-1.208840,-0.387194,-0.546835,0.813602,0.836798,1.596822,0.590149,-1.424537,0.770741,-0.387296
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,-0.005731,1.188620,0.105424,-0.662478,-1.105309,-0.212046,1.401666,1.450094,-0.755748,-1.218715,...,-0.223561,-0.387194,-0.546835,-0.922557,-0.887091,-0.893952,0.781099,1.268884,-0.872391,-0.387296
65,-0.459163,-1.093004,-1.388676,1.041395,-0.106708,-0.965418,-0.724456,-0.705626,1.683835,1.160502,...,-1.569465,-0.387194,-0.546835,0.179539,1.413702,-0.893952,-1.283232,1.175973,-0.872391,2.567993
66,-1.493826,-0.113313,1.630102,-0.662478,0.910772,-0.965418,-0.724456,-0.705626,-0.755748,1.528557,...,1.228625,-0.387194,-0.546835,-0.169313,1.435482,0.607643,1.415446,0.839282,-0.872391,-0.387296
67,-0.074961,1.320025,0.662026,-0.662478,-1.105309,-0.965418,1.353900,1.360709,-0.755748,-1.218715,...,0.154034,2.431741,-0.546835,-0.922557,1.651623,0.524723,0.318935,0.617299,0.622225,-0.387296


## Combine the genus bacteria and fungi data

In [None]:
# Input and output locations for the genus-level, power-transformed tables.
bacteria_genus_csv = '/projects/genomic-ml/da2343/ml_project_1/data/necromass/bacteria_genus_power_transformed.csv'
fungi_genus_csv = '/projects/genomic-ml/da2343/ml_project_1/data/necromass/fungi_genus_power_transformed.csv'
bacteria_fungi_genus_csv = '/projects/genomic-ml/da2343/ml_project_1/data/necromass/bacteria_fungi_genus_power_transformed.csv'

df_transformed_bacteria = pd.read_csv(bacteria_genus_csv)
df_transformed_fungi = pd.read_csv(fungi_genus_csv)

# Column-wise join: fungi columns are appended to the right of the bacteria columns.
# NOTE(review): assumes both files have the same rows in the same order -- confirm.
df_transformed_bacteria_fungi = pd.concat(
    (df_transformed_bacteria, df_transformed_fungi),
    axis='columns',
)
df_transformed_bacteria_fungi.to_csv(bacteria_fungi_genus_csv, index=False)

## Combine the conservative bacteria and fungi data

In [None]:
# Input and output locations for the conservative, power-transformed tables.
bacteria_conservative_csv = '/projects/genomic-ml/da2343/ml_project_1/data/necromass/bacteria_conservative_power_transformed.csv'
fungi_conservative_csv = '/projects/genomic-ml/da2343/ml_project_1/data/necromass/fungi_conservative_power_transformed.csv'
bacteria_fungi_conservative_csv = '/projects/genomic-ml/da2343/ml_project_1/data/necromass/bacteria_fungi_conservative_power_transformed.csv'

df_transformed_bacteria = pd.read_csv(bacteria_conservative_csv)
df_transformed_fungi = pd.read_csv(fungi_conservative_csv)

# Column-wise join: fungi columns are appended to the right of the bacteria columns.
# NOTE(review): assumes both files have the same rows in the same order -- confirm.
df_transformed_bacteria_fungi = pd.concat(
    (df_transformed_bacteria, df_transformed_fungi),
    axis='columns',
)
df_transformed_bacteria_fungi.to_csv(bacteria_fungi_conservative_csv, index=False)

In [None]:
# Both inputs are the `_raw` tables, so the frames are named as raw data
# (they were previously called `df_transformed_*`, which was misleading).
bacteria_conservative_raw = pd.read_csv('/projects/genomic-ml/da2343/ml_project_1/data/necromass/bacteria_conservative_raw.csv')
fungi_conservative_raw = pd.read_csv('/projects/genomic-ml/da2343/ml_project_1/data/necromass/fungi_conservative_raw.csv')

# concat(axis=1) aligns on the row index and silently pads with NaN when the
# two frames have different lengths, so fail loudly instead.
# NOTE(review): row *order* is not checked here; this assumes row i is the
# same sample in both files -- confirm.
assert len(bacteria_conservative_raw) == len(fungi_conservative_raw), (
    f'bacteria ({len(bacteria_conservative_raw)} rows) and fungi '
    f'({len(fungi_conservative_raw)} rows) tables must have the same number '
    'of rows to be combined column-wise'
)

# Place the fungi columns to the right of the bacteria columns and save.
bacteria_fungi_conservative_raw = pd.concat([bacteria_conservative_raw, fungi_conservative_raw], axis=1)
bacteria_fungi_conservative_raw.to_csv('/projects/genomic-ml/da2343/ml_project_1/data/necromass/bacteria_fungi_conservative_raw.csv', index=False)

In [None]:
import pandas as pd

# Quick look at the raw conservative bacteria table.
bacteria_conservative_raw_csv = '/projects/genomic-ml/da2343/ml_project_1/data/necromass/bacteria_conservative_raw.csv'
df = pd.read_csv(bacteria_conservative_raw_csv)
df