# Define file paths
# NOTE(review): neither name below is referenced by the visible code; the gold
# standard path is assigned again as `gold_standard_path` further down.
gold_standard_file = 'data/goldstandard_low_1.profile.txt'
# NOTE(review): no '.tsv' extension here, unlike the 'data/B_PlusPF.tsv' entry
# in the `files` list - confirm which spelling is intended.
kraken_bracken_file = 'data/B_PlusPF'

# In [28]:
import pandas as pd

# Function to read and preprocess the data
def read_data(file_path):
    """Load a tab-separated profile into a DataFrame, skipping '@' metadata.

    Leading lines that start with a single '@' are treated as metadata and
    skipped. The column header is either the first line starting with '@@'
    (the header marker used by CAMI-style profiles, e.g. '@@TAXID') or, when
    no such line precedes it, the first non-blank line that does not start
    with '@'. A leading '@@' is stripped from the first column name so the
    column is addressable as plain 'TAXID'.

    Args:
        file_path: Path of the tab-separated file to read.

    Returns:
        A pandas DataFrame holding the rows that follow the header line.

    Raises:
        ValueError: If the file contains no header line (it is empty or
            consists only of metadata/blank lines).
        OSError: If the file cannot be opened.
    """
    header_index = None
    # Stream the file: only the position of the header line is needed, so
    # there is no reason to hold every line in memory.
    with open(file_path, 'r') as file:
        for i, line in enumerate(file):
            is_marked_header = line.startswith('@@')
            is_plain_header = bool(line.strip()) and not line.startswith('@')
            if is_marked_header or is_plain_header:
                header_index = i
                break

    if header_index is None:
        # Previously this fell through to an UnboundLocalError
        raise ValueError(f"No header line found in {file_path}")

    # Read the data starting from the header line
    df = pd.read_csv(file_path, sep='\t', skiprows=header_index)

    # Drop the '@@' header marker so downstream code can use 'TAXID' directly
    first_column = df.columns[0]
    if isinstance(first_column, str) and first_column.startswith('@@'):
        df = df.rename(columns={first_column: first_column[2:]})
    return df

# Load the reference (gold standard) profile used by the normalization step
gold_standard_path = 'data/goldstandard_low_1.profile.txt'
gold_standard = read_data(gold_standard_path)

# Cast the TAXID column to int so it can serve as an integer merge key
gold_standard = gold_standard.astype({'TAXID': int})

# Normalize function
def normalize_data(df, gold_df):
    """Scale a profile's percentages using the matching gold standard values.

    Each row of `df` is left-joined to `gold_df` on TAXID. The output column
    PERCENTAGE_normalized is computed as
    ``PERCENTAGE * (PERCENTAGE_gold / sum(PERCENTAGE))``, where the sum runs
    over all rows of the merged frame. Rows whose TAXID has no match in
    `gold_df` get NaN.

    NOTE(review): the formula is kept exactly as originally written; confirm
    that multiplying by the gold standard percentage is the intended
    normalization.

    Args:
        df: Profile DataFrame with columns TAXID, RANK, TAXPATH, TAXPATHSN
            and PERCENTAGE. It is not modified.
        gold_df: Gold standard DataFrame with at least TAXID and PERCENTAGE.

    Returns:
        A DataFrame with columns TAXID, RANK, TAXPATH, TAXPATHSN and
        PERCENTAGE_normalized.

    Raises:
        ValueError: If either frame lacks a required column, or if a TAXID
            value in `df` cannot be converted to int.
    """
    output_columns = ['TAXID', 'RANK', 'TAXPATH', 'TAXPATHSN']

    # Validate every column used below, not just the merge key: a missing
    # RANK/TAXPATH/TAXPATHSN would otherwise surface as a KeyError that
    # callers catching ValueError do not handle.
    required_columns = output_columns + ['PERCENTAGE']
    if any(column not in df.columns for column in required_columns):
        print(f"Columns found in data: {df.columns}")
        raise ValueError("Required columns are missing in the dataframe")
    if 'TAXID' not in gold_df.columns or 'PERCENTAGE' not in gold_df.columns:
        print(f"Columns found in gold standard: {gold_df.columns}")
        raise ValueError("Required columns are missing in the gold standard dataframe")

    # Cast TAXID to int on a copy so the caller's DataFrame is left untouched
    typed_df = df.assign(TAXID=df['TAXID'].astype(int))

    # Merge with the gold standard on TAXID to get the gold standard percentages
    normalized_df = pd.merge(
        typed_df,
        gold_df[['TAXID', 'PERCENTAGE']],
        on='TAXID',
        how='left',
        suffixes=('', '_gold'),
    )

    # Calculate the new normalized percentage
    total_percentage = normalized_df['PERCENTAGE'].sum()
    normalized_df['PERCENTAGE_normalized'] = normalized_df['PERCENTAGE'] * (
        normalized_df['PERCENTAGE_gold'] / total_percentage
    )

    # Return only necessary columns
    return normalized_df[output_columns + ['PERCENTAGE_normalized']]

# List of your data files
files = [
    'data/B_C8.tsv',
    'data/B_C16.tsv',
    'data/B_PlusPF.tsv',
    'data/K_C8.tsv',
    'data/K_C16.tsv',
    'data/K_PlusPF.tsv',
    'data/M_UC.tsv',
]

# Process each file, collecting failures so the final message is accurate
failed_files = []
for file_path in files:
    # Reading, normalizing and saving all sit inside the try so that one
    # missing or malformed file is reported and skipped instead of aborting
    # the whole batch.
    try:
        df = read_data(file_path)
        normalized_df = normalize_data(df, gold_standard)
        # Save the normalized data to a new TSV file
        normalized_df.to_csv(file_path.replace('.tsv', '_normalized.tsv'), sep='\t', index=False)
    except (OSError, ValueError, KeyError) as e:
        print(f"Error processing {file_path}: {e}")
        failed_files.append(file_path)
    else:
        print(f"Normalization complete for {file_path}")

if failed_files:
    print(f"Normalization finished with errors; {len(failed_files)} of {len(files)} files failed: {failed_files}")
else:
    print("Normalization complete and files saved.")


# Output from a previous run:
# FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/goldstandard_low_1.profile.txt'