In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def preprocess_peptide_data_v1(csv_path='./CycPeptMPDB_Peptide_Assay_PAMPA.csv'):
    """Load the PAMPA assay table and return one cleaned row per SMILES.

    Steps:
      1. Read the CSV at ``csv_path``.
      2. De-duplicate on ``SMILES``, keeping the row with the highest ``Year``.
      3. Keep only rows where both ``Detection_Limit_1`` and
         ``Detection_Limit_2`` are missing.
      4. Drop rows whose ``PAMPA`` equals -10.0 (treated as suspicious by
         this notebook).

    Parameters
    ----------
    csv_path : str or path-like, optional
        Location of the assay CSV. Defaults to the path the notebook has
        always used, so existing zero-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with a fresh 0..n-1 index and all original columns.

    Notes
    -----
    Earlier revisions also built "suspicious" and "duplicates" audit frames
    (including a full outer merge on every column) but never saved or
    returned them; that unused work has been removed.
    """
    df = pd.read_csv(csv_path)

    # Latest entry per SMILES: sort newest-first, then keep the first hit.
    unique_smiles_df = (
        df.sort_values('Year', ascending=False)
          .drop_duplicates(subset='SMILES', keep='first')
    )

    # Build each row mask once instead of recomputing it per filter.
    no_detection_limit = (
        unique_smiles_df['Detection_Limit_1'].isna()
        & unique_smiles_df['Detection_Limit_2'].isna()
    )
    # NaN != -10.0 evaluates True, so rows with a missing PAMPA are retained,
    # exactly as in the original two-step filter.
    plausible_pampa = unique_smiles_df['PAMPA'] != -10.0

    clean_df = unique_smiles_df[no_detection_limit & plausible_pampa].reset_index(drop=True)
    return clean_df

def preprocess_peptide_data_v2(csv_path='./CycPeptMPDB_Peptide_Assay_PAMPA.csv', max_std=1):
    """Load the PAMPA assay table, average consistent replicates, and clean it.

    Steps:
      1. Read the CSV at ``csv_path``.
      2. Split rows into SMILES that occur once and SMILES that occur
         several times.
      3. For repeated SMILES, compute the standard deviation and mean of
         ``PAMPA``; keep only groups whose std is ``<= max_std`` and replace
         their ``PAMPA`` with the group mean.
      4. Collapse each kept group to its row with the highest ``Year``.
      5. Concatenate with the single-occurrence rows, keep rows where both
         ``Detection_Limit_1`` and ``Detection_Limit_2`` are missing, and
         drop rows whose ``PAMPA`` equals -10.0.

    Parameters
    ----------
    csv_path : str or path-like, optional
        Location of the assay CSV. Defaults to the path used previously.
    max_std : float, optional
        Largest allowed standard deviation of ``PAMPA`` within a repeated
        SMILES group. Defaults to 1, the previously hard-coded threshold.

    Returns
    -------
    pandas.DataFrame
        Cleaned rows with a fresh 0..n-1 index and the same columns as the
        input file.

    Notes
    -----
    The previous version called ``.drop(columns=['mean'])`` on a Series,
    where ``columns`` has no effect, so the helper ``mean`` column leaked
    into the result. It is now dropped from the DataFrame itself.
    """
    df = pd.read_csv(csv_path)

    # Partition rows by whether their SMILES occurs exactly once.
    smiles_counts = df['SMILES'].value_counts()
    unique_smiles = smiles_counts[smiles_counts == 1].index
    occurs_once = df['SMILES'].isin(unique_smiles)
    unique_smiles_df = df[occurs_once]
    non_unique_smiles_df = df[~occurs_once]

    # Per-SMILES spread and mean of PAMPA for the repeated entries.
    stats_df = non_unique_smiles_df.groupby('SMILES')['PAMPA'].agg(['std', 'mean']).reset_index()
    valid_smiles = stats_df.loc[stats_df['std'] <= max_std, 'SMILES']

    # Keep only groups whose replicates agree closely enough.
    filtered_non_unique_smiles_df = non_unique_smiles_df[non_unique_smiles_df['SMILES'].isin(valid_smiles)]

    # Attach the group mean, overwrite PAMPA with it (falling back to the
    # original value if the mean is missing), then remove the helper column.
    mean_PAMPA_df = stats_df[['SMILES', 'mean']]
    merged_df = filtered_non_unique_smiles_df.merge(
        mean_PAMPA_df, on='SMILES', how='left', suffixes=('', '_mean')
    )
    merged_df['PAMPA'] = merged_df['mean'].fillna(merged_df['PAMPA'])
    merged_df = merged_df.drop(columns=['mean'])

    # One row per repeated SMILES: the most recent by Year.
    final_df = merged_df.sort_values('Year', ascending=False).drop_duplicates(subset='SMILES', keep='first')

    # Recombine, then apply the detection-limit and suspicious-value filters.
    combined_df = pd.concat([unique_smiles_df, final_df], ignore_index=True)
    no_detection_limit = (
        combined_df['Detection_Limit_1'].isna()
        & combined_df['Detection_Limit_2'].isna()
    )
    clean_combined_df = combined_df[no_detection_limit]
    clean_combined_df = clean_combined_df[clean_combined_df["PAMPA"] != -10.0].reset_index(drop=True)

    return clean_combined_df


In [3]:

# Build the working table with the v2 pipeline (reads the PAMPA CSV from the
# current directory and returns the cleaned rows).
df = preprocess_peptide_data_v2()

  df = pd.read_csv('./CycPeptMPDB_Peptide_Assay_PAMPA.csv')


In [9]:
def cut_df(df):
    """Return a two-column view of ``df`` holding only SMILES and PAMPA.

    The columns come back in that order; a ``KeyError`` is raised by pandas
    if either one is absent from ``df``.
    """
    wanted = ['SMILES', 'PAMPA']
    return df.loc[:, wanted]

In [10]:
# Reduce the table to the SMILES and PAMPA columns used downstream.
df = cut_df(df)


In [11]:
# Show the reduced table (pandas truncates the middle rows in the display).
df

Unnamed: 0,SMILES,PAMPA
0,CC(C)C[C@H]1NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C@...,-7.000
1,CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc...,-7.100
2,CC(C)C[C@@H]1NC(=O)[C@H](CC(C)C)NC(=O)[C@H](Cc...,-7.300
3,CC(C)C[C@H]1NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C@...,-7.300
4,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C...,-7.300
...,...,...
6551,CCCC[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)CN(CC2C...,-5.045
6552,CCCCCN1CC(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H](CC...,-5.360
6553,CC(C)C[C@@H]1NC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[C...,-6.495
6554,CC(C)C[C@@H]1NC(=O)[C@@H](CC(C)C)NC(=O)[C@@H](...,-5.830


In [15]:
# DeepChem is used below to featurize the SMILES strings.
# NOTE(review): the recorded output of this cell is an AttributeError about a
# partially initialized 'deepchem' module. That message is commonly caused by
# a local file or folder named `deepchem` shadowing the installed package, or
# by a broken install — confirm before relying on the cells below.
import tempfile
import deepchem as dc
print(dc.__version__)

AttributeError: partially initialized module 'deepchem' has no attribute 'data' (most likely due to a circular import)

In [13]:

# Round-trip the table through a temporary CSV so DeepChem's CSVLoader can
# featurize it. The task and feature field names must match the columns that
# `df` actually has ("PAMPA" and "SMILES"); the previous placeholders
# ("task1" / "smiles") do not exist in the written file.
with dc.utils.UniversalNamedTemporaryFile(mode='w') as tmpfile:
    df.to_csv(tmpfile.name)
    loader = dc.data.CSVLoader(["PAMPA"], feature_field="SMILES",
                               featurizer=dc.feat.CircularFingerprint())
    dataset = loader.create_dataset(tmpfile.name)
len(dataset)


2024-02-19 17:30:10.370037: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-19 17:30:10.925308: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-19 17:30:10.925387: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-19 17:30:10.997478: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-19 17:30:11.160245: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-19 17:30:11.167657: I tensorflow/core/platform/cpu_feature_guard.cc:1

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


AttributeError: partially initialized module 'deepchem' has no attribute 'data' (most likely due to a circular import)