In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [15]:
# Load the dataset.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until DATA_PATH is pointed at a local copy of the CSV.
DATA_PATH = "/Users/estefanos/Desktop/Autoencoder_Grpoup2Project/HIV_ML_ART/NRTI_stanford.csv"
data = pd.read_csv(DATA_PATH)

# Rename the first column to "SeqID" (reassignment instead of inplace=True so
# the statement reads the same on every re-run).
data = data.rename(columns={data.columns[0]: "SeqID"})

# Identify rows with a missing value ("." or NaN) in any column from positional
# index 9 onwards. The whole block is tested at once instead of one scalar
# .iloc lookup per cell.
# NOTE(review): the FASTA writers in this notebook read sequence columns
# starting right after the drug column(s); confirm that index 9 (and not an
# earlier column) is the intended start of this check.
checked = data.iloc[:, 9:]
has_missing = (checked.eq(".") | checked.isna()).any(axis=1)
missing_vals = set(data.loc[has_missing, "SeqID"])  # SeqIDs of the affected rows

# Remove every row whose SeqID was flagged above
data_new = data[~data["SeqID"].isin(missing_vals)].copy()

In [None]:
# Preview the first rows of the filtered data (rows flagged as missing removed)
data_new.head()

### Separate data by drugs

In [24]:
# Build one independent DataFrame per drug, keeping only the rows of data_new
# that have a non-NaN value in that drug's column.
# Order of the targets matches the order of the drug names below
# (the D4T frame is called dft).
ttc, abc, azt, dft, ddi, tdf = (
    data_new.loc[data_new[drug].notna()].copy()
    for drug in ("3TC", "ABC", "AZT", "D4T", "DDI", "TDF")
)

In [None]:
# Preview the 3TC subset (rows with a non-NaN 3TC value)
ttc.head()

### Remove columns for other drugs

In [26]:
# For each drug-specific DataFrame, drop the measurement columns of the other
# five drugs so that only SeqID, the drug's own column and the remaining
# (sequence) columns are left.
#
# The columns are dropped by name rather than by position: a positional drop
# removes *whatever* currently sits at those indices, so re-running the cell
# would silently delete sequence columns. With names (and errors="ignore") a
# second run is a no-op.
# NOTE(review): assumes columns 1-6 of the loaded file are, in order,
# 3TC, ABC, AZT, D4T, DDI, TDF, as the previous positional indices implied.
drug_columns = ["3TC", "ABC", "AZT", "D4T", "DDI", "TDF"]


def _drop_other_drugs(frame, keep):
    """Return a copy of ``frame`` without the columns of every drug except ``keep``."""
    others = [col for col in drug_columns if col != keep]
    # errors="ignore": columns already removed by an earlier run are skipped
    return frame.drop(columns=others, errors="ignore")


ttc = _drop_other_drugs(ttc, "3TC")  # keeps SeqID, 3TC, sequence columns
abc = _drop_other_drugs(abc, "ABC")  # keeps SeqID, ABC, sequence columns
azt = _drop_other_drugs(azt, "AZT")  # keeps SeqID, AZT, sequence columns
dft = _drop_other_drugs(dft, "D4T")  # keeps SeqID, D4T, sequence columns
ddi = _drop_other_drugs(ddi, "DDI")  # keeps SeqID, DDI, sequence columns
tdf = _drop_other_drugs(tdf, "TDF")  # keeps SeqID, TDF, sequence columns

In [None]:
# Preview the 3TC frame after the other drugs' columns were dropped
ttc.head()

### Assign resistance classification

In [28]:
# Default cutoff = 3.5: 1 = resistant, 0 = not resistant
def assign_resistance(df, drug_col, cutoff=3.5):
    """Add a binary ``"resistance"`` column to ``df`` based on ``drug_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column named by ``drug_col``. It is modified in
        place (a ``"resistance"`` column is added or overwritten).
    drug_col : str
        Name of the column holding the values to classify. It is looked up by
        name, so it no longer has to be the second column.
    cutoff : float, optional
        Values ``>= cutoff`` are labelled 1, everything else (including NaN)
        is labelled 0. Defaults to 3.5.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.

    Raises
    ------
    KeyError
        If ``drug_col`` is not a column of ``df``.
    """
    # A comparison with NaN is False, so missing values fall into class 0.
    # .to_numpy() makes the assignment positional, independent of df's index.
    df["resistance"] = (df[drug_col] >= cutoff).astype(int).to_numpy()
    return df

In [29]:
# Apply the resistance classification to the 3TC DataFrame.
# Only ttc is labelled in this cell; the other per-drug frames are not processed here.
ttc = assign_resistance(ttc, "3TC")

In [None]:
# Preview the 3TC frame, now including the "resistance" column
ttc.head()

In [32]:
# Save the labelled 3TC frame as CSV (relative path, row index not written)
ttc.to_csv("ttc_cleaned.csv", index=False)

In [35]:
def create_fasta_RT(data, res_vals, filename):
    """Write one FASTA record per row by applying its mutations to the RT reference.

    Parameters
    ----------
    data : pandas.DataFrame
        Column 0 holds the SeqID. Columns 2 .. 241 hold the entries for the
        240 reference positions: ``'-'`` keeps the reference residue, any
        other string replaces it. Column 1 is not read.
    res_vals : pandas.Series
        One value per row of ``data``, read by integer position; it is only
        used as the suffix of the record name (``>SeqID_value``).
    filename : str
        Output path; an existing file is overwritten.

    Raises
    ------
    IndexError
        If ``data`` has fewer than 242 columns. Checked before the output
        file is opened, so a malformed frame does not leave a truncated file.
    TypeError
        If a position entry is not a string (e.g. NaN).

    Notes
    -----
    Entries longer than one character are cut to their first character; the
    SeqIDs of the affected rows are printed as "Flagged SeqIDs".
    """
    # Normal RT sequence: the reference for the NRTI data
    RT_norm = "PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGKISKIGPENPYNTPVFAIKKKDSTKWRKLVDFRELNKRTQDFWEVQLGIPHPAGLKKKKSVTVLDVGDAYFSVPLDKDFRKYTAFTIPSINNETPGIRYQYNVLPQGWKGSPAIFQSSMTKILEPFRKQNPDIVIYQYMDDLYVGSDLEIGQHRTKIEELRQHLLRWGFTTPDKKHQKEPPFLWMGYELHPDKWT"

    n_positions = len(RT_norm)  # 240 positions in the RT sequence
    first_col = 2               # position columns start after SeqID and the drug column
    last_col = first_col + n_positions

    if data.shape[1] < last_col:
        raise IndexError(
            f"expected at least {last_col} columns (SeqID, drug value, "
            f"{n_positions} positions), got {data.shape[1]}"
        )

    # Pull everything out of pandas once instead of one .iloc lookup per cell.
    # .tolist() yields plain Python scalars, so the printed list stays readable.
    seq_ids = data.iloc[:, 0].tolist()
    calls = data.iloc[:, first_col:last_col].to_numpy(dtype=object)

    flagged = []  # SeqIDs with at least one truncated multi-character entry

    with open(filename, "w") as f:
        for i, (seq_id, row) in enumerate(zip(seq_ids, calls)):
            # Record name: SeqID and resistance value
            name = f"{seq_id}_{res_vals.iloc[i]}"

            # A list of residues keeps every position at a fixed index and
            # avoids rebuilding the whole string for each mutation.
            residues = list(RT_norm)
            truncated = False

            for j, call in enumerate(row):
                if not isinstance(call, str):
                    raise TypeError(
                        f"SeqID {seq_id}: entry at position {j + 1} is "
                        f"{call!r}, expected a string"
                    )
                if len(call) > 1:
                    call = call[0]  # keep only the first character
                    truncated = True
                if call != '-':
                    residues[j] = call

            f.write(f">{name}\n")
            f.write("".join(residues) + "\n")

            if truncated:
                flagged.append(seq_id)

    print(f"FASTA file saved to {filename}")
    print(f"Flagged SeqIDs: {flagged}")

In [1]:
def create_fasta_PI(data, res_vals, filename):
    """Write one FASTA record per row by applying its mutations to the PI reference.

    Parameters
    ----------
    data : pandas.DataFrame
        Column 0 holds the SeqID. Columns 2 .. 100 hold the entries for the
        99 reference positions: ``'-'`` keeps the reference residue, any
        other string replaces it. Column 1 is not read.
    res_vals : pandas.Series
        One value per row of ``data``, read by integer position; it is only
        used as the suffix of the record name (``>SeqID_value``).
    filename : str
        Output path; an existing file is overwritten.

    Raises
    ------
    IndexError
        If ``data`` has fewer than 101 columns. Checked before the output
        file is opened, so a malformed frame does not leave a truncated file.
    TypeError
        If a position entry is not a string (e.g. NaN).

    Notes
    -----
    Entries longer than one character are cut to their first character; the
    SeqIDs of the affected rows are printed as "Flagged SeqIDs".
    """
    # Normal PI sequence used as the reference
    PI_norm = "PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF"

    n_positions = len(PI_norm)  # 99 positions in the PI sequence
    first_col = 2               # position columns start after SeqID and the drug column
    last_col = first_col + n_positions

    if data.shape[1] < last_col:
        raise IndexError(
            f"expected at least {last_col} columns (SeqID, drug value, "
            f"{n_positions} positions), got {data.shape[1]}"
        )

    # Pull everything out of pandas once instead of one .iloc lookup per cell.
    # .tolist() yields plain Python scalars, so the printed list stays readable.
    seq_ids = data.iloc[:, 0].tolist()
    calls = data.iloc[:, first_col:last_col].to_numpy(dtype=object)

    flagged = []  # SeqIDs with at least one truncated multi-character entry

    with open(filename, "w") as f:
        for i, (seq_id, row) in enumerate(zip(seq_ids, calls)):
            # Record name: SeqID and resistance value
            name = f"{seq_id}_{res_vals.iloc[i]}"

            # A list of residues keeps every position at a fixed index and
            # avoids rebuilding the whole string for each mutation.
            residues = list(PI_norm)
            truncated = False

            for j, call in enumerate(row):
                if not isinstance(call, str):
                    raise TypeError(
                        f"SeqID {seq_id}: entry at position {j + 1} is "
                        f"{call!r}, expected a string"
                    )
                if len(call) > 1:
                    call = call[0]  # keep only the first character
                    truncated = True
                if call != '-':
                    residues[j] = call

            f.write(f">{name}\n")
            f.write("".join(residues) + "\n")

            if truncated:
                flagged.append(seq_id)

    print(f"FASTA file saved to {filename}")
    print(f"Flagged SeqIDs: {flagged}")

In [36]:
create_fasta_RT(ttc, ttc["resistance"], "ttc.fasta")

FASTA file saved to ttc.fasta
Flagged SeqIDs: [2997, 4427, 4487, 4663, 4697, 5222, 5280, 5465, 5641, 6519, 6540, 6569, 6859, 7328, 7364, 7377, 7381, 7878, 7888, 7913, 7969, 8044, 8379, 8612, 9342, 9415, 9632, 9650, 9988, 10011, 10126, 10157, 10492, 10496, 10586, 10722, 10723, 11150, 11338, 11501, 11845, 11849, 12270, 12293, 12503, 12528, 12531, 12552, 12929, 14669, 14720, 15633, 16243, 16271, 26028, 26030, 26060, 26072, 26206, 26212, 26469, 26473, 26506, 27618, 27723, 27744, 27749, 28210, 28211, 28214, 28233, 28234, 28236, 28238, 28241, 28242, 28243, 28244, 28247, 38707, 38711, 38719, 38721, 38729, 38741, 38747, 38753, 38755, 38757, 38763, 38765, 38767, 38771, 38773, 38781, 38787, 38791, 38805, 38807, 38821, 38825, 38851, 38853, 38859, 38865, 38871, 38873, 38881, 38883, 38885, 38899, 38907, 38909, 38925, 38927, 39949, 39959, 39977, 40011, 40166, 40381, 40447, 40507, 40525, 40594, 41011, 41031, 41121, 41125, 41211, 41280, 41342, 41466, 41546, 41596, 41677, 41769, 41797, 41809, 44043, 44

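The flagged SeqIDs in the output above refer to sequences in which at least one position (in the NRTI dataset) has an entry with more than one character. Only the first character of such an entry is kept; for example, "AC" becomes "A" and "GT" becomes "G".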