<a href="https://colab.research.google.com/github/Cecilia-cmd/2024_MLEES/blob/main/Final_Project/Data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

In [None]:
import pandas as pd
from openpyxl import load_workbook

# Load the input tables
folder_path = '/Users/ceciliatorres/Desktop/UNI/ML/Personal_ML_project/Data/'

# Read the four CSV tables from folder_path, one DataFrame per table.
# otu_abundance holds the OTU abundances (OTU = microbial species).
otu_abundance, taxonomy, sample_codes, metadata = (
    pd.read_csv(folder_path + csv_name)
    for csv_name in (
        'otu_abundance.csv',
        'taxonomy.csv',
        'sample_codes.csv',
        'metadata.csv',
    )
)


#-------------Cleaning the data --------------------------------------------------

# Keep only the rows of the sample-codes table whose identifier starts with 'SRR'.
# na=False treats a missing Sample_code as "not SRR"; without it a NaN in the
# column can end up in the mask, and pandas refuses to index with a mask
# containing NA values (ValueError).
srr_sample_codes = sample_codes[sample_codes['Sample_code'].str.startswith('SRR', na=False)]

print(srr_sample_codes.columns)  # show the column names used for the mapping below

# Mapping: Kraken2 code (the read-column name in the OTU table) -> SRR sample code.
# NOTE(review): the key 'Kraken2_code ' is written with a trailing space,
# presumably to match the CSV header exactly -- keep it unless the file changes.
sample_mapping = dict(zip(srr_sample_codes['Kraken2_code '], srr_sample_codes['Sample_code']))

# Restrict the OTU abundance table to the 'OTU' column plus the mapped read columns
srr_columns = ['OTU'] + [col for col in otu_abundance.columns if col in sample_mapping]
otu_abundance_filtered = otu_abundance[srr_columns]

# Rename the read columns to the corresponding metagenomics sample codes
otu_abundance_filtered = otu_abundance_filtered.rename(columns=sample_mapping)

##### Prepare the metadata rows that go with the filtered OTU table #####
# Column labels of the renamed OTU table: 'OTU' plus the SRR sample codes
srr_sample_codes = otu_abundance_filtered.columns

# Keep only the metadata rows whose 'Run' is one of those labels,
# and only the columns that are needed afterwards
metadata_columns = ['Run', 'biome_clean', 'location', 'lat', 'long']
is_srr_run = metadata['Run'].isin(srr_sample_codes)
filtered_metadata = metadata.loc[is_srr_run, metadata_columns]

# Drop the biomes with too few samples to be useful
# (original note: 'mixed forest' has only 1 sample, tropical forest only 3)
excluded_biomes = ['mixed forest', 'Tropical forest']
filtered_metadata = filtered_metadata.loc[~filtered_metadata['biome_clean'].isin(excluded_biomes)]

# Name the key column 'Sample_code' so it lines up with the OTU table when merging
filtered_metadata = filtered_metadata.rename(columns={'Run': 'Sample_code'})
# Quick look at the result to confirm the expected SRR samples and columns
print(filtered_metadata.head())

# Merge the OTU abundances with filtered_metadata.
# The OTU table is transposed first so that every sample becomes a row and
# its id sits in a 'Sample_code' column, matching the metadata table.
samples_by_otu = otu_abundance_filtered.set_index('OTU').transpose()
samples_by_otu.index.name = 'Sample_code'
otu_abundance_filtered = samples_by_otu.reset_index()
print(otu_abundance_filtered.head())

# Inner join on 'Sample_code': only samples present in both tables are kept
merged_data = otu_abundance_filtered.merge(
    right=filtered_metadata,
    how='inner',
    on='Sample_code',
)
print(merged_data.head())


## Remove samples with missing coordinates
print(merged_data.isnull().sum())  # NA count per column, before dropping anything
# Rows without 'lat' or 'long' are discarded. The explicit .copy() makes the
# result an independent frame, so the in-place column assignment performed in
# the normalization step cannot trigger pandas' SettingWithCopyWarning.
merged_data = merged_data.dropna(subset=['lat', 'long']).copy()

# Summary of what is left: samples per biome and the distinct locations
print(merged_data['biome_clean'].value_counts())
print(merged_data['location'].unique())


#-------------Normalize the data -----------------------------------------

# Convert the OTU values of each sample to relative abundances (row-wise
# division by the sample's OTU total). Author's rationale: the data is
# normalized ahead of the random forest model used later.
otu_columns = [col for col in merged_data.columns if col.startswith('OTU_')]
otu_totals = merged_data[otu_columns].sum(axis=1)
# A sample whose OTU values sum to 0 would be divided 0/0 and turn into a row
# of NaN in the saved file; dividing those rows by 1 keeps them all-zero.
otu_totals = otu_totals.where(otu_totals != 0, 1)
merged_data[otu_columns] = merged_data[otu_columns].div(otu_totals, axis=0)
print(merged_data.head())
# Save the normalized data (no standardization or train/test splitting yet)
merged_data.to_csv('preprocessed_data.csv', index=False)

