In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem


In [3]:
file_name = 'peptides.csv'

# Read the CSV as UTF-8 first; if the bytes are not valid UTF-8, fall back to
# Latin-1. Latin-1 assigns a character to every byte value (0x00-0xFF), so this
# fallback cannot raise UnicodeDecodeError. 'iso-8859-1' is just another name
# for the same codec, which is why no further fallback is attempted.
try:
    df = pd.read_csv(file_name, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(file_name, encoding='latin1')

# Display the first few rows of the DataFrame
print("Original DataFrame:")
print(df.head())

Original DataFrame:
   Mass_Tag_ID Peptide  Mass_Tag_Mass  Mass_Tag_NET  Charge State  Mobility  \
0     10098159  TAGLVR       615.3704         0.178             2     1.461   
1     10098159  TAGLVR       615.3704         0.178             1     0.936   
2      6702362  LTALTK       645.4061         0.200             2     1.547   
3     20754127  IVEAVK       657.4061         0.183             2     1.413   
4     20754127  IVEAVK       657.4061         0.183             1     0.882   

   Cross-section    Slope  Intercept  R2_Value  Drift time  Organism  \
0        284.569   8031.1       1.54     1.000    20.642744       HP   
1        222.205  12542.1       3.15     1.000    32.975144       HP   
2        268.581   7587.5       1.86     0.999    19.904767       HP   
3        293.921   8306.6       1.73     0.999    21.482614       MP   
4        235.389  13304.7       2.12     1.000    33.758620       MP   

   Unnamed: 12 Unnamed: 13                 Unnamed: 14  
0          NaN 

In [10]:
def peptide_to_smiles(peptide_sequence):
    """Convert a one-letter amino-acid sequence into a SMILES string via RDKit.

    Parameters
    ----------
    peptide_sequence : str
        Peptide sequence such as ``"TAGLVR"``. Surrounding whitespace is
        ignored.

    Returns
    -------
    str or None
        The SMILES string produced by ``Chem.MolToSmiles``, or ``None`` when
        the input is not a usable string (missing cell, blank text) or when
        ``Chem.MolFromSequence`` does not return a molecule.
    """
    # Empty CSV cells reach this function as float NaN (or None). RDKit only
    # accepts text, so return None instead of letting one bad row abort the
    # whole column conversion.
    if not isinstance(peptide_sequence, str):
        return None

    sequence = peptide_sequence.strip()
    if not sequence:
        # A blank sequence has no residues to convert.
        return None

    mol = Chem.MolFromSequence(sequence)
    # Test against None explicitly rather than relying on object truthiness.
    if mol is None:
        return None
    return Chem.MolToSmiles(mol)

# The sequences are expected in the 'Peptide' column; map each one through
# peptide_to_smiles and store the result in a new 'SMILES' column.
df['SMILES'] = df['Peptide'].map(peptide_to_smiles)

# Function to convert charge state to the desired format, with a special case for 1
def charge_state_to_string(charge_state):
    """Format a charge state as a protonated-adduct label.

    Parameters
    ----------
    charge_state : int or float
        Number of charges. Integral floats (e.g. ``2.0``, which pandas
        produces when an integer column contains blanks) are treated as
        their integer value.

    Returns
    -------
    str or None
        ``"[M+H]+"`` for a charge of 1, ``"[M+<n>H]<n>+"`` otherwise, or
        ``None`` when the value is missing (``None``/NaN/``pd.NA``).
    """
    # A missing charge has no meaningful adduct label; previously NaN was
    # rendered as the bogus "[M+nanH]nan+".
    if pd.isna(charge_state):
        return None

    # Normalise 2.0 -> 2 so the label reads "[M+2H]2+" instead of
    # "[M+2.0H]2.0+". numpy.float64 subclasses float, so it is covered too.
    if isinstance(charge_state, float) and charge_state.is_integer():
        charge_state = int(charge_state)

    if charge_state == 1:
        return "[M+H]+"
    return f"[M+{charge_state}H]{charge_state}+"

# The charge states are expected in the 'Charge State' column; map each one
# through charge_state_to_string and store the label in a new 'Adduct' column.
df['Adduct'] = df['Charge State'].map(charge_state_to_string)

# Preview the table now that the SMILES and Adduct columns have been added
print("DataFrame with SMILES and Charge State String:", df.head(), sep="\n")

# Write the augmented table to disk, leaving out the row index
new_file_name = 'Peptides_with_smiles_and_charge.csv'
df.to_csv(path_or_buf=new_file_name, index=False)

print(f"DataFrame with SMILES and Charge State String saved to {new_file_name}")

DataFrame with SMILES and Charge State String:
   Mass_Tag_ID Peptide  Mass_Tag_Mass  Mass_Tag_NET  Charge State  Mobility  \
0     10098159  TAGLVR       615.3704         0.178             2     1.461   
1     10098159  TAGLVR       615.3704         0.178             1     0.936   
2      6702362  LTALTK       645.4061         0.200             2     1.547   
3     20754127  IVEAVK       657.4061         0.183             2     1.413   
4     20754127  IVEAVK       657.4061         0.183             1     0.882   

   Cross-section    Slope  Intercept  R2_Value  Drift time  Organism  \
0        284.569   8031.1       1.54     1.000    20.642744       HP   
1        222.205  12542.1       3.15     1.000    32.975144       HP   
2        268.581   7587.5       1.86     0.999    19.904767       HP   
3        293.921   8306.6       1.73     0.999    21.482614       MP   
4        235.389  13304.7       2.12     1.000    33.758620       MP   

   Unnamed: 12 Unnamed: 13                 Un