# Preprocessing ASV Table and Metadata for ML

## KNN imputation for handling zero values in my dataset

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.impute import KNNImputer

In [6]:
# Read the tab-separated OTU count table: one row per OTU, one column per sample,
# with the OTU identifiers stored in the 'OTU_ID' column.
otu_table_path = 'C:/Users/USER/Documents/Coco-ML/OTU-table.tsv'
otu_table = pd.read_csv(otu_table_path, delimiter='\t')

In [7]:
# Number of neighbouring rows used to estimate each missing entry; tune as needed.
N_NEIGHBORS = 5

# NOTE(review): KNNImputer only fills entries equal to `missing_values`, which
# defaults to NaN. Zeros in the OTU table are therefore left untouched; if the
# table contains no NaN, the imputation below changes nothing. Confirm whether
# zeros are really meant to be imputed before relying on this step.
imputer = KNNImputer(n_neighbors=N_NEIGHBORS)

In [8]:
# Set the OTU identifier column aside (it is re-attached after imputation) and
# keep only the per-sample count columns as input for the imputer.
target = otu_table.loc[:, 'OTU_ID']
features_otu_table = otu_table.drop(columns='OTU_ID')

In [9]:
# Fit the KNN imputer on the count columns and fill their missing entries in one step.
# The result is converted back to a DataFrame in a later cell.
imputed_otu_table = imputer.fit_transform(features_otu_table)

In [12]:
# Wrap the imputed values in a DataFrame again, restoring the sample column labels.
sample_columns = features_otu_table.columns
imputed_otu_table = pd.DataFrame(data=imputed_otu_table, columns=sample_columns)

In [13]:
# Re-attach the OTU identifiers (set aside before imputation) as the last column.
# `target` is the 'OTU_ID' column, so its name is used as the column label.
imputed_otu_table[target.name] = target

In [30]:
# Current column order: the sample columns followed by 'OTU_ID' (appended last).
cols = imputed_otu_table.columns.tolist()
# Rotate the list so the last column (OTU_ID) becomes the first one.
cols = [cols[-1], *cols[:-1]]
# Build a reordered view of the table without touching imputed_otu_table itself.
imputed_otu_table_col = imputed_otu_table.loc[:, cols]

In [31]:
# Print the reordered OTU table (OTU_ID first, followed by the sample columns)
print(imputed_otu_table_col)

      OTU_ID     1A      1B     1C       1D   1E       2A       2B      2C  \
0      OTU_1  845.0  2162.0  646.0  10542.0  2.0  13029.0  23283.0  5689.0   
1      OTU_2    0.0    70.0    0.0   1141.0  0.0    762.0    372.0  5109.0   
2      OTU_3    0.0     0.0    0.0      2.0  0.0      0.0      0.0     0.0   
3      OTU_4    0.0     6.0    0.0      4.0  0.0    225.0      0.0  3238.0   
4      OTU_5    0.0   102.0    0.0    203.0  3.0    421.0    515.0    59.0   
..       ...    ...     ...    ...      ...  ...      ...      ...     ...   
280  OTU_281    0.0     1.0    0.0      4.0  0.0      0.0      3.0    32.0   
281  OTU_282    0.0     1.0    0.0      1.0  0.0      8.0      0.0     1.0   
282  OTU_283    0.0     7.0    2.0      0.0  0.0      1.0      0.0     0.0   
283  OTU_284    2.0     3.0    0.0      1.0  0.0      2.0      0.0     1.0   
284  OTU_285    0.0     1.0    0.0      0.0  0.0      2.0      0.0     0.0   

         2D    2E    3A   3B   3C   3D   3E  
0    3149.0  12.0

In [32]:
# Write both versions of the imputed OTU table (original and OTU_ID-first column
# order) as tab-separated files, without the row index.
for frame, destination in (
    (imputed_otu_table, 'C:/Users/USER/Documents/Coco-ML/imputed_otu_table.tsv'),
    (imputed_otu_table_col, 'C:/Users/USER/Documents/Coco-ML/imputed_otu_table_col.tsv'),
):
    frame.to_csv(destination, sep='\t', index=False)

## Transform your data into the "Samples as Rows and OTU IDs as Columns" orientation

In [33]:
# Flip the table so that every sample becomes a row and every OTU a column,
# then preview the first rows of the result.
transposed_otu_table = imputed_otu_table_col.T
transposed_otu_table.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,275,276,277,278,279,280,281,282,283,284
OTU_ID,OTU_1,OTU_2,OTU_3,OTU_4,OTU_5,OTU_6,OTU_7,OTU_8,OTU_9,OTU_10,...,OTU_276,OTU_277,OTU_278,OTU_279,OTU_280,OTU_281,OTU_282,OTU_283,OTU_284,OTU_285
1A,845.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0
1B,2162.0,70.0,0.0,6.0,102.0,33.0,21.0,0.0,0.0,1060.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,7.0,3.0,1.0
1C,646.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
1D,10542.0,1141.0,2.0,4.0,203.0,410.0,466.0,1337.0,1695.0,26.0,...,25.0,30.0,7.0,0.0,2.0,4.0,1.0,0.0,1.0,0.0


In [76]:
# Build the "samples as rows, OTUs as columns" table.
#
# 'OTU_ID' is moved into the index BEFORE transposing. Transposing while the
# string 'OTU_ID' column is still a regular column would force every count to
# object dtype and would require promoting the first row to a header afterwards.
# Done this way the counts stay numeric and nothing is mutated in place, so the
# cell can safely be re-run.
transposed_otu_table = (
    imputed_otu_table_col
    .set_index('OTU_ID')                # OTU identifiers become the column labels after .T
    .T                                  # rows = samples, columns = OTUs
    .rename_axis(index='SampleID')      # name the sample index so reset_index labels it
    .reset_index()                      # 'SampleID' becomes the first regular column
)
transposed_otu_table.head()

OTU_ID,SampleID,OTU_1,OTU_2,OTU_3,OTU_4,OTU_5,OTU_6,OTU_7,OTU_8,OTU_9,...,OTU_276,OTU_277,OTU_278,OTU_279,OTU_280,OTU_281,OTU_282,OTU_283,OTU_284,OTU_285
0,1A,845.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0
1,1B,2162.0,70.0,0.0,6.0,102.0,33.0,21.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,7.0,3.0,1.0
2,1C,646.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
3,1D,10542.0,1141.0,2.0,4.0,203.0,410.0,466.0,1337.0,1695.0,...,25.0,30.0,7.0,0.0,2.0,4.0,1.0,0.0,1.0,0.0
4,1E,2.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
# Persist the sample-by-OTU table as a tab-separated file (row index omitted).
transposed_otu_table_path = 'C:/Users/USER/Documents/Coco-ML/transposed_otu_table.tsv'
transposed_otu_table.to_csv(transposed_otu_table_path, index=False, sep='\t')

## Merge the OTU table with selected columns of the metadata file

In [73]:
import pandas as pd

# Read the tab-separated sample metadata; the transposed OTU table is already
# in memory, so only the metadata has to be loaded here.
metadata_path = 'C:/Users/USER/Documents/Coco-ML/sample-metadata.tsv'
metadata = pd.read_csv(metadata_path, delimiter='\t')
print(metadata)

   #SampleID Condition Counties BarcodeSequence LinkerPrimerSequence  \
0         1A  Diseased    Kwale      TGGCACCACC  GTGCCAGCMGCCGCGGTAA   
1         1B  Diseased    Kwale      AAGCATCATG  GTGCCAGCMGCCGCGGTAA   
2         1C  Diseased    Kwale      CTCCTAGCTT  GTGCCAGCMGCCGCGGTAA   
3         1D  Diseased    Kwale      GACTACGGTA  GTGCCAGCMGCCGCGGTAA   
4         1E   Healthy    Kwale      GATCATATCG  GTGCCAGCMGCCGCGGTAA   
5         2A  Diseased   Kilifi      AATGATAGTG  GTGCCAGCMGCCGCGGTAA   
6         2B  Diseased   Kilifi      CTGATAGATG  GTGCCAGCMGCCGCGGTAA   
7         2C  Diseased   Kilifi      AATCGCGAGA  GTGCCAGCMGCCGCGGTAA   
8         2D  Diseased   Kilifi      CTTGTACAAC  GTGCCAGCMGCCGCGGTAA   
9         2E   Healthy   Kilifi      CTCAGTATTC  GTGCCAGCMGCCGCGGTAA   
10        3A  Diseased     Lamu      TGCTAGCACC  GTGCCAGCMGCCGCGGTAA   
11        3B  Diseased     Lamu      GATCATATCG  GTGCCAGCMGCCGCGGTAA   
12        3C  Diseased     Lamu      CTCAGTATTC  GTGCCAGCMGCCGCG

In [83]:
# Keep only the metadata needed downstream: 'Description' is the join key,
# 'Condition' and 'Counties' are the labels carried into the ML table.
selected_metadata = metadata[['Description', 'Condition', 'Counties']]

# Attach each sample's metadata to its OTU abundances.
# validate='one_to_one' makes pandas raise a MergeError when a key is duplicated
# on either side, instead of silently duplicating sample rows.
combined_data = pd.merge(
    transposed_otu_table,
    selected_metadata,
    how='inner',
    left_on='SampleID',
    right_on='Description',
    validate='one_to_one',
)

# After the join 'Description' holds the same values as 'SampleID', so drop it.
combined_data = combined_data.drop(columns=['Description'])

# An inner join silently discards samples that have no metadata match; report
# them so a shrinking dataset does not go unnoticed.
unmatched_samples = sorted(set(transposed_otu_table['SampleID']) - set(combined_data['SampleID']))
if unmatched_samples:
    print(f"Warning: {len(unmatched_samples)} sample(s) dropped (no matching metadata): {unmatched_samples}")

# Print the combined data: OTU abundances plus the selected metadata columns.
print(combined_data)

   SampleID    OTU_1   OTU_2   OTU_3   OTU_4   OTU_5  OTU_6   OTU_7   OTU_8  \
0        1A    845.0     0.0     0.0     0.0     0.0    0.0     0.0     6.0   
1        1B   2162.0    70.0     0.0     6.0   102.0   33.0    21.0     0.0   
2        1C    646.0     0.0     0.0     0.0     0.0    0.0     0.0     0.0   
3        1D  10542.0  1141.0     2.0     4.0   203.0  410.0   466.0  1337.0   
4        1E      2.0     0.0     0.0     0.0     3.0    0.0     0.0     0.0   
5        2A  13029.0   762.0     0.0   225.0   421.0  477.0   520.0    29.0   
6        2B  23283.0   372.0     0.0     0.0   515.0  515.0   392.0     0.0   
7        2C   5689.0  5109.0     0.0  3238.0    59.0  753.0  1052.0   441.0   
8        2D   3149.0   495.0  3270.0     0.0  1586.0   59.0    96.0     0.0   
9        2E     12.0     7.0     0.0     0.0    33.0    0.0     0.0     0.0   
10       3A     17.0     9.0    15.0     0.0     0.0    0.0     0.0     0.0   
11       3B      2.0     0.0     0.0     0.0     0.0

In [94]:
# Preview the first rows of the merged OTU/metadata table
combined_data.head()

Unnamed: 0,SampleID,OTU_1,OTU_2,OTU_3,OTU_4,OTU_5,OTU_6,OTU_7,OTU_8,OTU_9,...,OTU_278,OTU_279,OTU_280,OTU_281,OTU_282,OTU_283,OTU_284,OTU_285,Condition,Counties
0,1A,845.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,Diseased,Kwale
1,1B,2162.0,70.0,0.0,6.0,102.0,33.0,21.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,7.0,3.0,1.0,Diseased,Kwale
2,1C,646.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,Diseased,Kwale
3,1D,10542.0,1141.0,2.0,4.0,203.0,410.0,466.0,1337.0,1695.0,...,7.0,0.0,2.0,4.0,1.0,0.0,1.0,0.0,Diseased,Kwale
4,1E,2.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Healthy,Kwale


In [84]:
# Persist the merged OTU/metadata table as a tab-separated file (row index omitted).
combined_data_path = 'C:/Users/USER/Documents/Coco-ML/combined_data.tsv'
combined_data.to_csv(combined_data_path, index=False, sep='\t')

In [89]:
# Count samples per health condition to check the class balance of the 'Condition' label
# (the recorded output shows 12 Diseased vs 3 Healthy, i.e. an imbalanced label).
combined_data['Condition'].value_counts()

Condition
Diseased    12
Healthy      3
Name: count, dtype: int64

In [90]:
# Count samples per county (the recorded output shows 5 samples for each of the three counties).
combined_data['Counties'].value_counts()

Counties
Kwale     5
Kilifi    5
Lamu      5
Name: count, dtype: int64

## TSS Normalization

In [93]:
import numpy as np
import pandas as pd

In [97]:
# Identify the OTU abundance columns, i.e. every column of combined_data that is
# not one of the known metadata columns. Selecting by name rather than by
# position keeps this correct even if the column order of combined_data changes.
metadata_columns = {'SampleID', 'Condition', 'Counties'}
is_abundance_column = [name not in metadata_columns for name in combined_data.columns]
data_columns = combined_data.columns[is_abundance_column]
print(data_columns)

Index(['OTU_1', 'OTU_2', 'OTU_3', 'OTU_4', 'OTU_5', 'OTU_6', 'OTU_7', 'OTU_8',
       'OTU_9', 'OTU_10',
       ...
       'OTU_276', 'OTU_277', 'OTU_278', 'OTU_279', 'OTU_280', 'OTU_281',
       'OTU_282', 'OTU_283', 'OTU_284', 'OTU_285'],
      dtype='object', length=285)


In [106]:
# Pull out the identifier and label columns by position:
# first column = sample IDs, second-to-last = conditions, last = counties.
sample_ids, conditions, counties = (
    combined_data.iloc[:, position] for position in (0, -2, -1)
)
# Per-sample sequencing depth: sum the OTU abundance columns across each row,
# then cast the totals to float.
abundance_values = combined_data[data_columns].values
total_counts = abundance_values.sum(axis=1).astype(float)

In [107]:
# Sanity-check the per-sample totals before dividing by them: NaN totals would
# propagate into the normalized table and zero totals would cause division by zero.
for flagged, message in (
    (np.isnan(total_counts), "total_counts contains NaN values"),
    (total_counts == 0, "total_counts contains zero values"),
):
    if flagged.any():
        print(message)

In [127]:
# TSS (total-sum scaling) normalization, expressed as counts per million (CPM):
# each OTU count is divided by the total count of its sample and multiplied by
# 1,000,000, which removes differences in sequencing depth between samples.

# Fail loudly when the totals are unusable. Skipping the division silently would
# leave tss_normalized_data undefined (or stale from an earlier run of this cell)
# and the assembly step below would then fail or use outdated values.
if np.isnan(total_counts).any() or (total_counts == 0).any():
    raise ValueError("Cannot TSS-normalize: total_counts contains NaN or zero values")

# Work on an explicit float matrix so the normalized values are numeric rather
# than object dtype.
otu_counts = combined_data[data_columns].values.astype(float)
tss_normalized_data = (otu_counts / total_counts.reshape(-1, 1)) * 1_000_000

# Assemble the TSS-normalized table: sample IDs and labels first, then the OTU
# columns. The normalized block reuses combined_data's index so that concat
# aligns rows explicitly with the ID/label Series taken from combined_data.
tss_normalized_df = pd.concat(
    [
        sample_ids,
        conditions,
        counties,
        pd.DataFrame(tss_normalized_data, columns=data_columns, index=combined_data.index),
    ],
    axis=1,
)
tss_normalized_df.head()

Unnamed: 0,SampleID,Condition,Counties,OTU_1,OTU_2,OTU_3,OTU_4,OTU_5,OTU_6,OTU_7,...,OTU_276,OTU_277,OTU_278,OTU_279,OTU_280,OTU_281,OTU_282,OTU_283,OTU_284,OTU_285
0,1A,Diseased,Kwale,490708.478513,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,580.720093,0.0,0.0,0.0,1161.440186,0.0
1,1B,Diseased,Kwale,445589.44765,14427.040396,0.0,1236.603462,21022.258862,6801.319044,4328.112119,...,0.0,0.0,0.0,0.0,206.100577,206.100577,206.100577,1442.70404,618.301731,206.100577
2,1C,Diseased,Kwale,607142.857143,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1879.699248,0.0,0.0
3,1D,Diseased,Kwale,482272.748067,52198.17924,91.495494,182.990988,9286.792625,18756.576239,21318.450066,...,1143.693673,1372.432408,320.234228,0.0,91.495494,182.990988,45.747747,0.0,45.747747,0.0
4,1E,Healthy,Kwale,51282.051282,0.0,0.0,0.0,76923.076923,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Scaling

Ensures each feature is given equal importance in the model.

In [130]:
from sklearn.preprocessing import StandardScaler
import numpy as np

In [131]:
# Standardize the OTU abundances (zero mean, unit variance per OTU column).
# The first three columns of tss_normalized_df are SampleID, Condition and
# Counties, so the abundance columns start at column index 3.
scaler = StandardScaler()
otu_abundances = tss_normalized_df.iloc[:, 3:]

# Build a NEW DataFrame from the scaler output instead of writing the scaled
# values back into tss_normalized_df: a plain assignment such as
# `scaled_data = tss_normalized_df` only creates an alias, so the normalized
# table would be overwritten in place.
# NOTE(review): the scaler is fitted on all samples; if a train/test split is
# made later, consider fitting it on the training samples only.
scaled_data = pd.DataFrame(
    scaler.fit_transform(otu_abundances),
    columns=otu_abundances.columns,
    index=otu_abundances.index,
)
scaled_data.head()

Unnamed: 0,OTU_1,OTU_2,OTU_3,OTU_4,OTU_5,OTU_6,OTU_7,OTU_8,OTU_9,OTU_10,...,OTU_276,OTU_277,OTU_278,OTU_279,OTU_280,OTU_281,OTU_282,OTU_283,OTU_284,OTU_285
0,0.764949,-0.561486,-0.334604,-0.290279,-0.628011,-0.66176,-0.600655,-0.144145,-0.267437,-0.278175,...,-0.387402,-0.267261,-0.476581,-0.289967,2.503164,-0.361455,-0.436238,-0.464765,0.2722,-0.274907
1,0.597588,-0.308221,-0.334604,-0.256442,-0.302963,-0.025306,-0.276242,-0.367136,-0.267437,3.739717,...,-0.387402,-0.267261,-0.476581,-0.289967,0.424418,0.211012,1.551792,2.046435,-0.038265,-0.254629
2,1.196843,-0.561486,-0.334604,-0.290279,-0.628011,-0.66176,-0.600655,-0.367136,-0.267437,-0.278175,...,-0.387402,-0.267261,-0.476581,-0.289967,-0.719224,-0.361455,-0.436238,2.807077,-0.391694,-0.274907
3,0.733658,0.354847,-0.333216,-0.285272,-0.484418,1.093444,0.997265,3.547305,3.741657,-0.256299,...,3.634823,3.741657,2.643899,-0.289967,-0.21152,0.146822,0.005041,-0.464765,-0.365544,-0.274907
4,-0.865029,-0.561486,-0.334604,-0.290279,0.561381,-0.66176,-0.600655,-0.367136,-0.267437,-0.278175,...,-0.387402,-0.267261,-0.476581,-0.289967,-0.719224,-0.361455,-0.436238,-0.464765,-0.391694,-0.274907


In [133]:
# Count missing values per scaled OTU column. A count of 0 for every column means
# no imputation or other missing-value handling is needed before analysis and ML;
# any non-zero count must be dealt with first.
scaled_data.isna().sum()

OTU_1      0
OTU_2      0
OTU_3      0
OTU_4      0
OTU_5      0
          ..
OTU_281    0
OTU_282    0
OTU_283    0
OTU_284    0
OTU_285    0
Length: 285, dtype: int64

## Dimensionality Reduction (PCA)

In [135]:
from sklearn.decomposition import PCA
# Number of principal components to keep
n_components = 3

# Fit PCA on the standardized OTU abundances and project the samples onto the
# first n_components principal components (one row per sample in pca_data).
pca = PCA(n_components=n_components)
pca_data = pca.fit_transform(scaled_data)

In [None]:
# Minimum number of principal components to keep, and the share of the total
# variance that the retained components should explain.
n_components = 2
variance_threshold = 0.9

# Fit PCA with every available component first. The cumulative explained
# variance of a 2-component fit can never tell how many components are needed
# to reach the threshold, so the decision has to be made on the full spectrum.
full_pca = PCA()
full_pca.fit(scaled_data)
cumulative_variance = np.cumsum(full_pca.explained_variance_ratio_)

# Zero-based index of the first component at which the cumulative variance
# reaches the threshold. np.argmax returns 0 when no entry is True, so the
# "never reached" case is handled explicitly by falling back to all components.
reaches_threshold = cumulative_variance >= variance_threshold
if reaches_threshold.any():
    min_components = int(np.argmax(reaches_threshold))
else:
    min_components = len(cumulative_variance) - 1

# Keep at least n_components, more if needed to reach the threshold, but never
# more components than PCA can produce for this data.
n_components = min(max(n_components, min_components + 1), len(cumulative_variance))

# Refit with the chosen number of components and project the samples.
pca = PCA(n_components=n_components)
pca_data = pca.fit_transform(scaled_data)

# Label the components PC1..PCn. They are linear combinations of all OTUs, so
# they must not reuse the names of individual OTU columns.
pca_columns = ['PC{}'.format(i) for i in range(1, n_components + 1)]
pca_df = pd.DataFrame(pca_data, columns=pca_columns, index=scaled_data.index)

# Keep every original (scaled) OTU feature; none of them is replaced by a PC.
original_features = scaled_data

# Combine the original features with the principal components.
# pca_df on its own is the dimensionality-reduced representation.
final_df = pd.concat([original_features, pca_df], axis=1)

# Preview the combined table
final_df.head()