# In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import pdist
from skbio.stats.distance import DistanceMatrix
from skbio import io
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
import statsmodels.api as sm
import itertools
from statsmodels.formula.api import ols
import seaborn as sns

# Seed NumPy's global RNG so any NumPy-based random step is repeatable.
np.random.seed(531)

# Load the input table from a pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# come from a trusted source.
LCPM_all_names_edited_filtered = pd.read_pickle("LCPM_all_names_edited_filtered.pkl")

# Collapse the existing levels of 'type' into the two labels 'no_RS' / 'RS'.
# Category labels must be unique, so handing the repeated label list to
# `cat.set_categories` raises ValueError. Each existing level is instead mapped
# to its new label and the result is rebuilt as a two-level categorical.
# NOTE(review): the mapping is positional over the levels in the order pandas
# infers them (the 4th level becomes 'RS') - confirm against the real levels.
_type_labels = ['no_RS', 'no_RS', 'no_RS', 'RS', 'no_RS']
_type_col = LCPM_all_names_edited_filtered['type'].astype('category')
if len(_type_col.cat.categories) != len(_type_labels):
    raise ValueError(
        f"'type' has {len(_type_col.cat.categories)} levels "
        f"({list(_type_col.cat.categories)}); expected {len(_type_labels)}"
    )
_type_relabel = dict(zip(_type_col.cat.categories, _type_labels))
LCPM_all_names_edited_filtered['type'] = pd.Categorical(
    _type_col.astype(object).map(_type_relabel),
    categories=['no_RS', 'RS'],
)

# Species-level working table.
# A GroupBy object supports neither column assignment nor use as seaborn's
# `data=`, and later statements index this name like a DataFrame, so it is
# kept as a DataFrame (a copy, so the loaded table is left untouched).
LCPM_all_names_edited_filtered_species = LCPM_all_names_edited_filtered.copy()

# For every row, count the rows of its 'Species' group whose 'Genus' equals
# 'g_Runella' (the same per-group quantity the grouped lambda computed).
# NOTE(review): this counts rows, not reads - confirm that is what the
# "Runella reads" figure is meant to show.
LCPM_all_names_edited_filtered_species['Runella'] = (
    LCPM_all_names_edited_filtered_species['Genus']
    .eq('g_Runella')
    .astype(int)
    .groupby(LCPM_all_names_edited_filtered_species['Species'])
    .transform('sum')
)

# Boxplot of the Runella counts per RUN, split by 'type', saved to PDF.
plt.figure(figsize=(6, 4))
sns.boxplot(
    x='RUN',
    y='Runella',
    hue='type',
    data=LCPM_all_names_edited_filtered_species,
    dodge=True,
    palette="Set1",
)
plt.xticks(rotation=45)
plt.title('Runella reads per runs and plates')
plt.savefig('FigSup_RC_plot.pdf', bbox_inches='tight')
plt.close()

# Rarefy the species-level table with a depth argument of 10000
# (intended as 10000 reads per sample).
# NOTE(review): `rarefy_depth` is neither defined nor imported in the visible
# source; it must be supplied before this statement can run, and its return
# type is presumably a DataFrame - confirm.
LCPM_all_names_edited_filtered_species_10K = rarefy_depth(LCPM_all_names_edited_filtered_species, 10000)

# Keep only the entries whose 'type.1' value is not 'stool' (the controls).
LCPM_all_names_edited_filtered_species_10K_Controls = LCPM_all_names_edited_filtered_species_10K[LCPM_all_names_edited_filtered_species_10K['type.1'] != 'stool']

# Pairwise Bray-Curtis dissimilarity between the columns of the controls table
# (the table is transposed so that each column becomes one observation).
# NOTE(review): other statements in this file treat the same table as having
# metadata columns such as 'type.1'; confirm it is purely numeric here.
from sklearn.metrics import pairwise_distances

bray_curtis_dist = pairwise_distances(
    LCPM_all_names_edited_filtered_species_10K_Controls.to_numpy().T,
    metric='braycurtis',
)

# Square, labelled view of the same distances: rows and columns both carry the
# column names of the controls table.
distances_tax = pd.DataFrame(
    data=bray_curtis_dist,
    index=LCPM_all_names_edited_filtered_species_10K_Controls.columns,
    columns=LCPM_all_names_edited_filtered_species_10K_Controls.columns,
)

# Reshape the square distance matrix into one row per unordered pair.
# `np.triu_indices(n, k=1)` walks the strict upper triangle row by row, which
# is the order in which `itertools.combinations` yields the name pairs.
# Flattening the whole matrix gave n*n values for n*(n-1)/2 pairs and raised a
# length-mismatch ValueError.
names_pairs = list(itertools.combinations(LCPM_all_names_edited_filtered_species_10K_Controls.columns, 2))
distances_tax_reshaped = pd.DataFrame(names_pairs, columns=['X1', 'X2'])
_upper_triangle = np.triu_indices(len(distances_tax), k=1)
distances_tax_reshaped['dist'] = distances_tax.to_numpy()[_upper_triangle]

# Attach the metadata of both members of each pair. The second merge suffixes
# the overlapping columns, giving 'FastQ_full_ID_x'/'type.1_x' for X1 and
# 'FastQ_full_ID_y'/'type.1_y' for X2.
# NOTE(review): X1/X2 are column labels of the controls table and are matched
# against 'FastQ_full_ID' values - confirm the two really share identifiers.
metadata_all = LCPM_all_names_edited_filtered_species_10K_Controls[['FastQ_full_ID', 'type.1']]
distances_tax_reshaped = distances_tax_reshaped.merge(
    metadata_all, left_on='X1', right_on='FastQ_full_ID'
)
distances_tax_reshaped = distances_tax_reshaped.merge(
    metadata_all, left_on='X2', right_on='FastQ_full_ID', suffixes=('_x', '_y')
)

# Label every pair: 'Intersample' when the two members differ, otherwise
# '<type>-<type>' (the plotting code selects 'PC-PC'). The merged frame has no
# 'Sample_ID.x'/'Sample_ID.y' columns, so the labels are built from the
# 'type.1' columns the merges actually produce. They are cast to str so that
# concatenation also works when 'type.1' is categorical.
_type_x = distances_tax_reshaped['type.1_x'].astype(str)
_type_y = distances_tax_reshaped['type.1_y'].astype(str)
distances_tax_reshaped['clase'] = np.where(
    _type_x != _type_y, 'Intersample', _type_x + '-' + _type_y
)

# Keep only the pairs that are not 'Intersample'; copy so that columns can be
# added to the subset without chained-assignment warnings.
distances_tax_sub = distances_tax_reshaped[distances_tax_reshaped['clase'] != 'Intersample'].copy()

# Work on an explicit copy: `distances_tax_sub` is a filtered subset, and
# adding columns to such a subset triggers chained-assignment warnings.
distances_tax_sub = distances_tax_sub.copy()

# Derive the run of each pair member as the text before the first 'p' in its
# identifier. 'RUN' is the run of X1; 'run_run' joins the runs of X1 and X2
# and is the column the plot below groups by (it was never created before, so
# the plot could not resolve it).
# NOTE(review): assumes identifiers look like '<run>p<...>' - confirm.
_run_x = distances_tax_sub['X1'].apply(lambda x: x.split('p')[0])
_run_y = distances_tax_sub['X2'].apply(lambda x: x.split('p')[0])
distances_tax_sub['RUN'] = _run_x
distances_tax_sub['run_run'] = _run_x + '-' + _run_y

# Plot Bray-Curtis distances between positive controls ('PC-PC' pairs only):
# box per run pair with the individual distances overlaid in red, plus a
# dotted reference line at 0.2.
_pc_pairs = distances_tax_sub[distances_tax_sub['clase'] == 'PC-PC']
plt.figure(figsize=(8, 4))
sns.boxplot(x='run_run', y='dist', data=_pc_pairs)
sns.scatterplot(x='run_run', y='dist', data=_pc_pairs, color='red', alpha=0.7)
plt.axhline(y=0.2, linestyle='dotted', color='blue')
plt.xticks(rotation=45)
plt.title('Bray-Curtis dissimilarity distances between positive control samples')
plt.savefig('FigSup_PC_bc_final_sp.pdf', bbox_inches='tight')
plt.close()

# Keep the rows whose 'type' is not 'RS'. The subset is copied because later
# statements assign columns on it.
LCPM_all_names_edited_filtered_species_noRS = LCPM_all_names_edited_filtered_species[
    LCPM_all_names_edited_filtered_species['type'] != 'RS'
].copy()

# Two-component PCA. The table also carries label columns ('type', 'type.1'),
# which PCA cannot convert to floats, so only the numeric columns are fitted.
# NOTE(review): PCA does not accept missing values - confirm the numeric
# columns contain none.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
LCPM_noRS_pca = pca.fit_transform(
    LCPM_all_names_edited_filtered_species_noRS.select_dtypes(include='number')
)

# Scatter of the first two components, coloured by 'type.1'.
sns.scatterplot(
    x=LCPM_noRS_pca[:, 0],
    y=LCPM_noRS_pca[:, 1],
    hue=LCPM_all_names_edited_filtered_species_noRS['type.1'],
)
plt.title('PCA of noRS samples')
plt.show()

# Shorten taxa names by removing everything up to and including the last 'g_'.
# The previous pattern contained '\g', which is not a valid regex escape, and
# pandas >= 2.0 treats `str.replace` patterns as literal text unless
# `regex=True` is given - so the names were never actually shortened.
LCPM_all_names_edited_filtered_species_noRS['taxa'] = (
    LCPM_all_names_edited_filtered_species_noRS['taxa'].str.replace(r'.*g_', '', regex=True)
)

# Export the 'type.1' and 'otu_table' columns to CSV, without the index.
NCE_data = LCPM_all_names_edited_filtered_species_noRS[['type.1', 'otu_table']]
NCE_data.to_csv('NCE_dataSp.csv', index=False)

# PERMANOVA (adonis) on the Bray-Curtis distances, grouped by run.
from skbio.stats.distance import DistanceMatrix
from skbio.stats.distance import permanova
from pairwise_adonis import pairwise_adonis

# Label the matrix with the same identifiers used for `distances_tax`, so the
# grouping vector below lines up with it entry by entry.
_sample_ids = [str(sample_id) for sample_id in distances_tax.index]
QC_dist_bc = DistanceMatrix(bray_curtis_dist, ids=_sample_ids)

# `metadata_all` only holds 'FastQ_full_ID' and 'type.1', so looking up 'RUN'
# on it raised KeyError. The run is derived from the identifier instead, using
# the rule the distance table uses for its 'RUN' column (text before the
# first 'p').
# NOTE(review): assumes identifiers look like '<run>p<...>' - confirm.
_run_labels = pd.Series(
    [sample_id.split('p')[0] for sample_id in _sample_ids],
    index=_sample_ids,
    name='RUN',
)
adonis_results = permanova(QC_dist_bc, _run_labels, permutations=9999)

# Pairwise comparisons between runs with Bonferroni adjustment. The result is
# kept in a variable; as a bare expression in the middle of the cell it was
# silently discarded.
pairwise_adonis_results = pairwise_adonis(QC_dist_bc, _run_labels, p_adjust='bonferroni')

# Re-encode 'type.1' as a categorical with the levels 'NCP' and 'NCE', in that
# order; any other value becomes missing.
LCPM_all_names_edited_filtered_species_noRS['type.1'] = pd.Categorical(
    LCPM_all_names_edited_filtered_species_noRS['type.1'],
    categories=['NCP', 'NCE'],
)

# Keep only the rows with a positive 'taxa_sums'.
psq = LCPM_all_names_edited_filtered_species_noRS
psq_filtered = psq[psq['taxa_sums'] > 0]

# Annotated heatmap of the pairwise correlations between numeric columns.
# `numeric_only=True` is required: the table also holds categorical/text
# columns, which `corr()` cannot convert to floats.
plt.figure(figsize=(10, 8))
sns.heatmap(psq_filtered.corr(numeric_only=True), annot=True)
plt.title('Heatmap of taxa correlations')
plt.show()


# Cell output: ModuleNotFoundError: No module named 'skbio'