### CYP P450 2C19, CYP P450 2D6, CYP P450 3A4, CYP P450 1A2, CYP P450 2C9 Inhibition

References:

[1] Veith, Henrike et al. “Comprehensive characterization of cytochrome P450 isozyme selectivity across chemical libraries.” Nature biotechnology vol. 27,11 (2009): 1050-5.

Dataset License: CC BY 4.0.

In [1]:
import pandas as pd

In [17]:
from tdc.single_pred import ADME

# Export every Veith CYP inhibition dataset from TDC as a single flat CSV.
# The train/valid/test partitions are concatenated back together, so the
# split itself does not matter for the exported file.
for data_name in ('CYP2C19_Veith', 'CYP2D6_Veith', 'CYP3A4_Veith', 'CYP1A2_Veith', 'CYP2C9_Veith'):
    data = ADME(name=data_name)
    split = data.get_split()

    train_df = split['train']
    valid_df = split['valid']
    test_df = split['test']

    # Merge the partitions, drop the identifier column, use the project's
    # column names and tag every row with the dataset it came from.
    full_df = (
        pd.concat([train_df, valid_df, test_df], ignore_index=True)
        .drop(columns=['Drug_ID'])
        .rename(columns={'Drug': 'Canonical_Smiles', 'Y': 'Inhibitor'})
        .assign(dataset=data_name)
    )

    # Report the class balance (0 / 1 counts) of the merged table.
    print(f'{data_name}: \n', full_df['Inhibitor'].value_counts())

    full_df.to_csv(f'../data/{data_name}.csv', index=False)

Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!


CYP2C19_Veith: 
 Inhibitor
0    6846
1    5819
Name: count, dtype: int64
CYP2D6_Veith: 
 Inhibitor
0    10616
1     2514
Name: count, dtype: int64
CYP3A4_Veith: 
 Inhibitor
0    7218
1    5110
Name: count, dtype: int64
CYP1A2_Veith: 
 Inhibitor
0    6750
1    5829
Name: count, dtype: int64
CYP2C9_Veith: 
 Inhibitor
0    8047
1    4045
Name: count, dtype: int64


# PubChem

#### Cytochrome P450 3A4 (human)

- https://pubchem.ncbi.nlm.nih.gov/protein/P08684 (main page)
- https://pubchem.ncbi.nlm.nih.gov/bioassay/1851 (AID_1851)

In [2]:
def clean_data(df):
    """Return a copy of ``df`` without rows whose curve description is unreliable.

    A row is removed when its ``Curve_Description`` (compared
    case-insensitively) satisfies at least one of:

    * it contains ``'poor fit'``;
    * it is exactly ``'single point of activity'``;
    * it contains ``'partial'`` two or more times.

    Rows with a missing description match none of the rules and are kept.
    The input frame is not modified; the result has a fresh 0..n-1 index.
    """
    frame = df.copy()
    description = frame['Curve_Description'].str.lower()

    is_poor_fit = description.str.contains('poor fit', na=False)
    is_single_point = description == 'single point of activity'
    has_repeated_partial = description.str.count('partial') >= 2

    # Any one of the three rules is enough to discard the row.
    unreliable = is_poor_fit | is_single_point | has_repeated_partial

    return frame.loc[~unreliable].reset_index(drop=True)

In [3]:
def data_cleaning_AID_1851(df):
    """Clean the raw AID 1851 data table down to one row per CYP3A4 compound.

    Processing order:

    1. Skip the first four rows of the table (presumably the descriptive
       header rows of the PubChem export -- confirm against the raw file)
       and keep only rows whose ``Panel Name`` is ``'p450-cyp3a4'``.
    2. Remove unreliable curves with ``clean_data``.
    3. Remove ``Inconclusive`` rows, ``Active`` rows without a ``Potency``
       and rows without a SMILES string.
    4. Resolve duplicated SMILES:
       * groups that are entirely ``Inactive`` keep only their first row;
       * groups with more than one distinct outcome are removed completely;
       * ``Active`` groups keep only the row with the highest ``Fit_R2``.
    5. Keep the columns used downstream and cast the numeric ones to float.

    Args:
        df: Raw data table (e.g. as returned by ``pd.read_csv``).

    Returns:
        A new, re-indexed DataFrame; ``df`` itself is not modified.
    """
    smiles_col = 'PUBCHEM_EXT_DATASOURCE_SMILES'
    outcome_col = 'PUBCHEM_ACTIVITY_OUTCOME'

    df = df.iloc[4:].reset_index(drop=True)
    # Only the CYP3A4 panel is used.
    df = df.loc[df['Panel Name'] == 'p450-cyp3a4', :].reset_index(drop=True)

    df = clean_data(df)

    # Drop inconclusive measurements.
    df = df.loc[df[outcome_col] != 'Inconclusive', :].reset_index(drop=True)
    # Drop Active rows that have no potency value.
    df = df[~((df[outcome_col] == 'Active') & (df['Potency'].isna()))].reset_index(drop=True)
    df = df.dropna(subset=[smiles_col]).reset_index(drop=True)

    ## Duplicated SMILES that are all Inactive: keep one row, drop the rest.
    # 1) SMILES whose group has 2+ rows and consists only of Inactive rows.
    inactive_groups = (
        df
        .groupby(smiles_col)
        .filter(lambda g: len(g) > 1 and g[outcome_col].eq('Inactive').all())
        [smiles_col]
        .unique()
    )
    # 2) Within those groups keep the first row only.
    mask_to_drop = (
        df[smiles_col].isin(inactive_groups)
        & df.duplicated(smiles_col, keep='first')
    )
    df = df.loc[~mask_to_drop].reset_index(drop=True)

    ## Duplicated SMILES with conflicting outcomes: drop every row.
    conflicted_smiles = (
        df
        .groupby(smiles_col)[outcome_col]
        .nunique()                 # number of distinct outcomes per SMILES
        .loc[lambda s: s > 1]      # more than one outcome => conflict
        .index
    )
    df = df.loc[~df[smiles_col].isin(conflicted_smiles)].reset_index(drop=True)

    ## Duplicated Active SMILES: keep only the row with the highest Fit_R2.
    df['Fit_R2'] = pd.to_numeric(df['Fit_R2'], errors='coerce')
    is_active = df[outcome_col] == 'Active'
    # Active rows without a usable Fit_R2 cannot be ranked. Calling idxmax()
    # on a group that is entirely NaN yields NaN (dropping the rows
    # implicitly, with a FutureWarning) on older pandas and raises
    # ValueError on newer pandas, so such rows are excluded explicitly.
    rankable = df.loc[is_active & df['Fit_R2'].notna()]
    idx_best_active = rankable.groupby(smiles_col)['Fit_R2'].idxmax()
    # Keep every non-Active row plus the best-ranked Active row per SMILES.
    df = df.loc[~is_active | df.index.isin(idx_best_active)].reset_index(drop=True)

    cols_to_use = [
        'PUBCHEM_EXT_DATASOURCE_SMILES', 'PUBCHEM_ACTIVITY_OUTCOME', 'Potency',
        'Fit_LogAC50', 'Fit_R2', 'Fit_HillSlope', 'Fit_InfiniteActivity', 'Fit_ZeroActivity', 'Fit_CurveClass',
        'Max_Response', 'Curve_Description'
    ]
    df = df[cols_to_use].copy()

    cols_to_numeric = [
        'Potency',
        'Fit_LogAC50', 'Fit_R2', 'Fit_HillSlope', 'Fit_InfiniteActivity', 'Fit_ZeroActivity', 'Fit_CurveClass',
        'Max_Response'
    ]
    df[cols_to_numeric] = df[cols_to_numeric].astype(float)

    return df

In [4]:
def convert_potency_to_inhibition(
    potency_uM: float,
    concentration: float = 10.0,
    hill_slope: float = 1.0,
    max_inhibition: float = 100.0
) -> float:
    """
    Convert a single potency (AC50/IC50) value into percent inhibition at a
    given concentration, using the Hill equation.

    Args:
        potency_uM (float): Potency to convert (AC50 or IC50, in uM).
        concentration (float): Assumed assay concentration (in uM). Defaults to 10.0.
        hill_slope (float): Assumed Hill coefficient. Defaults to 1.0.
        max_inhibition (float): Maximum inhibition (%). Defaults to 100.0.

    Returns:
        float: Inhibition (%). Missing, zero or negative potencies give 0.0.
    """
    # An unusable potency (NaN/None, zero or negative) counts as no inhibition.
    potency_is_unusable = pd.isna(potency_uM) or potency_uM <= 0
    if potency_is_unusable:
        return 0.0

    # Hill equation:
    #   inhibition = max / (1 + (potency / concentration) ** hill_slope)
    relative_potency = potency_uM / concentration
    return max_inhibition / (1 + relative_potency ** hill_slope)

def calculate_inhibition(row):
    """Return the percent inhibition for one assay row.

    * ``Active`` rows: ``Potency`` converted with
      ``convert_potency_to_inhibition`` (default concentration and slope).
    * ``Inactive`` rows: ``0.0``.
    * Any other outcome: NaN.

    Args:
        row: Mapping or Series with ``PUBCHEM_ACTIVITY_OUTCOME`` and, for
            Active rows, ``Potency``.
    """
    outcome = row['PUBCHEM_ACTIVITY_OUTCOME']
    if outcome == 'Active':
        return convert_potency_to_inhibition(row['Potency'])
    if outcome == 'Inactive':
        return 0.0
    # Built-in NaN: numpy is only imported in a later cell, so referring to
    # np here could raise NameError depending on execution order.
    return float('nan')

def sophisticated_potency_to_inhibition(row, concentration=10.0):
    """Estimate the inhibition at ``concentration`` from a row's fitted curve.

    Reads ``Fit_LogAC50``, ``Fit_HillSlope`` and ``Fit_InfiniteActivity``
    from ``row`` and evaluates

        inhibition = -Fit_InfiniteActivity / (1 + (AC50 / concentration) ** slope)

    ``Fit_LogAC50`` is treated as log10 of the AC50 in mol/L and converted to
    uM, so ``concentration`` is expected in uM. The result is returned as is;
    it is not clipped to the 0-100 range.
    """
    slope = row['Fit_HillSlope']
    # A missing, zero or negative Hill coefficient is replaced by the
    # standard value of 1.0.
    if pd.isna(slope) or slope <= 0:
        slope = 1.0

    # log10(AC50 [M]) -> AC50 [uM]
    ac50_uM = (10 ** row['Fit_LogAC50']) * 1_000_000

    # The fitted activity is negated so that it reads as an inhibition value.
    ceiling = -row['Fit_InfiniteActivity']

    return ceiling / (1 + (ac50_uM / concentration) ** slope)

# def sophisticated_calculate_inhibition(row):
#     if row['PUBCHEM_ACTIVITY_OUTCOME'] == 'Active':
#         return sophisticated_potency_to_inhibition(row)
#     elif row['PUBCHEM_ACTIVITY_OUTCOME'] == 'Inactive':
#         return 0.0
#     else:
#         return np.nan

def sophisticated_calculate_inhibition_1851(row):
    """Return the Hill-model inhibition for one AID 1851 row.

    The decision is based on ``PUBCHEM_ACTIVITY_OUTCOME``:

    * ``Active``: value from ``sophisticated_potency_to_inhibition``.
    * ``Inactive``: ``0.0``.
    * Any other outcome: NaN.
    """
    outcome = row['PUBCHEM_ACTIVITY_OUTCOME']
    if outcome == 'Active':
        return sophisticated_potency_to_inhibition(row)
    if outcome == 'Inactive':
        return 0.0
    # Built-in NaN: this function is applied before the notebook imports
    # numpy, so referring to np here would raise NameError.
    return float('nan')


def sophisticated_calculate_inhibition(row):
    """Return the Hill-model inhibition for one row, keyed on ``Phenotype``.

    * ``Inhibitor``: value from ``sophisticated_potency_to_inhibition``.
    * ``Inactive``: ``0.0``.
    * Any other phenotype: NaN.
    """
    phenotype = row['Phenotype']
    if phenotype == 'Inhibitor':
        return sophisticated_potency_to_inhibition(row)
    if phenotype == 'Inactive':
        return 0.0
    # Built-in NaN: numpy is not imported in the cell that defines this
    # function, so np would only resolve if a later cell ran first.
    return float('nan')

In [5]:
# Load the raw AID 1851 data table and reduce it to one row per compound.
AID_1851 = data_cleaning_AID_1851(
    pd.read_csv('../data/raw/AID_1851_datatable_all.csv', low_memory=False)
)

# Three inhibition estimates per compound:
#   Inhibition_hill - Hill-model estimate (0.0 for Inactive rows)
#   Inhibition_fit  - negated Fit_InfiniteActivity
#   Inhibition_max  - negated Max_Response
# (AID 1851 is cleaned without an 'Efficacy' column, so no Inhibition_eff.)
AID_1851['Inhibition_hill'] = AID_1851.apply(sophisticated_calculate_inhibition_1851, axis=1)
AID_1851['Inhibition_fit'] = AID_1851['Fit_InfiniteActivity'].mul(-1)
AID_1851['Inhibition_max'] = AID_1851['Max_Response'].mul(-1)

AID_1851 = AID_1851.rename(columns={'PUBCHEM_EXT_DATASOURCE_SMILES': 'Canonical_Smiles'})

# Only the SMILES and the derived inhibition columns are exported.
cols = ['Canonical_Smiles', 'Inhibition_hill', 'Inhibition_fit', 'Inhibition_max']
AID_1851[cols].to_csv('../data/external/AID_1851.csv', index=False)

AID_1851

Unnamed: 0,Canonical_Smiles,PUBCHEM_ACTIVITY_OUTCOME,Potency,Fit_LogAC50,Fit_R2,Fit_HillSlope,Fit_InfiniteActivity,Fit_ZeroActivity,Fit_CurveClass,Max_Response,Curve_Description,Inhibition_hill,Inhibition_fit,Inhibition_max
0,CCCC(=O)NC1=CC(=C(C=C1)N2CCN(CC2)CC)Cl.Cl,Inactive,,,,,,,4.0,-2.2935,,0.000000,,2.2935
1,C1OC2=C(O1)C=C3C(=C2)C=C(C(=O)N3)CN(CCCO)CC4=N...,Active,3.16228,-5.50,0.9914,0.8000,-105.4660,-15.4027,-1.1,-97.4718,Complete curve; high efficacy,75.434847,105.4660,97.4718
2,CC(=O)N(C1=CC2=C(C=C1)OC(=O)S2)S(=O)(=O)C3=CC=CS3,Active,14.12540,-4.85,0.9944,1.0100,-113.7150,1.0042,-2.1,-94.7622,Partial curve; high efficacy,47.039728,113.7150,94.7622
3,CC(C)(C)N1C(=NN=N1)C(C2=CC=CC=C2OC)N3CCN(CC3)C...,Active,3.16228,-5.50,0.9989,1.7885,-96.2811,-2.0298,-1.1,-97.0468,Complete curve; high efficacy,85.388103,96.2811,97.0468
4,CC1=CC=CC=C1NC(=O)N2CCCC2C(=O)NC3=CC=CC(=C3)NC...,Inactive,,,,,,,4.0,0.0054,,0.000000,,-0.0054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11002,C=CC1=C[C@H]([C@H]2[C@@H](C13OCCCO3)O2)O,Inactive,,,,,,,4.0,-4.0397,,0.000000,,4.0397
11003,C1COC2([C@@H]3[C@H](O3)[C@H]([C@@H]4C2=CC[C@H]...,Inactive,,,,,,,4.0,-21.2044,,0.000000,,21.2044
11004,CCN1C(=O)[C@H]2CC=C3[C@H]([C@H]2C1=O)[C@@H]([C...,Inactive,,,,,,,4.0,-21.3967,,0.000000,,21.3967
11005,C[C@H](C1=CC=CC=C1)N2C(=O)[C@@H]3CC[C@H]4[C@H]...,Inactive,,,,,,,4.0,-18.1157,,0.000000,,18.1157


- AID_884

In [6]:
def data_cleaning_AID_884(df):
    """Clean a raw AID 884-style data table down to one row per compound.

    Processing order:

    1. Skip the first five rows of the table (presumably the descriptive
       header rows of the PubChem export -- confirm against the raw file).
    2. Remove ``Inconclusive`` rows, ``Active`` rows without a ``Potency``
       and rows without a SMILES string.
    3. Remove rows whose ``Phenotype`` is ``'Activator'``.
    4. Resolve duplicated SMILES:
       * groups that are entirely ``Inactive`` keep only their first row;
       * groups with more than one distinct outcome are removed completely;
       * ``Active`` groups keep only the row with the highest ``Fit_R2``.
    5. Keep the columns used downstream and cast the numeric ones to float.

    Args:
        df: Raw data table (e.g. as returned by ``pd.read_csv``).

    Returns:
        A new, re-indexed DataFrame; ``df`` itself is not modified.
    """
    smiles_col = 'PUBCHEM_EXT_DATASOURCE_SMILES'
    outcome_col = 'PUBCHEM_ACTIVITY_OUTCOME'

    df = df.iloc[5:].reset_index(drop=True)
    # Drop inconclusive measurements.
    df = df.loc[df[outcome_col] != 'Inconclusive', :].reset_index(drop=True)
    # Drop Active rows that have no potency value.
    df = df[~((df[outcome_col] == 'Active') & (df['Potency'].isna()))].reset_index(drop=True)
    df = df.dropna(subset=[smiles_col]).reset_index(drop=True)

    # Exclude compounds that activate rather than inhibit (optional step).
    df = df.loc[df['Phenotype'] != 'Activator', :].reset_index(drop=True)

    ## Duplicated SMILES that are all Inactive: keep one row, drop the rest.
    # 1) SMILES whose group has 2+ rows and consists only of Inactive rows.
    inactive_groups = (
        df
        .groupby(smiles_col)
        .filter(lambda g: len(g) > 1 and g[outcome_col].eq('Inactive').all())
        [smiles_col]
        .unique()
    )
    # 2) Within those groups keep the first row only.
    mask_to_drop = (
        df[smiles_col].isin(inactive_groups)
        & df.duplicated(smiles_col, keep='first')
    )
    df = df.loc[~mask_to_drop].reset_index(drop=True)

    ## Duplicated SMILES with conflicting outcomes: drop every row.
    conflicted_smiles = (
        df
        .groupby(smiles_col)[outcome_col]
        .nunique()                 # number of distinct outcomes per SMILES
        .loc[lambda s: s > 1]      # more than one outcome => conflict
        .index
    )
    df = df.loc[~df[smiles_col].isin(conflicted_smiles)].reset_index(drop=True)

    ## Duplicated Active SMILES: keep only the row with the highest Fit_R2.
    df['Fit_R2'] = pd.to_numeric(df['Fit_R2'], errors='coerce')
    is_active = df[outcome_col] == 'Active'
    # Active rows without a usable Fit_R2 cannot be ranked. Calling idxmax()
    # on a group that is entirely NaN yields NaN (dropping the rows
    # implicitly, with a FutureWarning) on older pandas and raises
    # ValueError on newer pandas, so such rows are excluded explicitly.
    rankable = df.loc[is_active & df['Fit_R2'].notna()]
    idx_best_active = rankable.groupby(smiles_col)['Fit_R2'].idxmax()
    # Keep every non-Active row plus the best-ranked Active row per SMILES.
    df = df.loc[~is_active | df.index.isin(idx_best_active)].reset_index(drop=True)

    cols_to_use = [
        'PUBCHEM_EXT_DATASOURCE_SMILES', 'PUBCHEM_ACTIVITY_OUTCOME', 'Phenotype',
        'Potency', 'Efficacy', 'Fit_LogAC50', 'Fit_HillSlope', 'Fit_R2', 'Fit_InfiniteActivity', 'Fit_ZeroActivity', 'Max_Response', 'Curve_Description'
    ]
    df = df[cols_to_use].copy()

    cols_to_numeric = [
        'Potency', 'Efficacy', 'Fit_LogAC50', 'Fit_HillSlope', 'Fit_R2', 'Fit_InfiniteActivity', 'Fit_ZeroActivity',
        'Max_Response'
    ]
    df[cols_to_numeric] = df[cols_to_numeric].astype(float)

    return df

In [7]:
import pandas as pd
import numpy as np

# Load the raw AID 884 data table, de-duplicate it, then drop unreliable curves.
AID_884 = clean_data(
    data_cleaning_AID_884(
        pd.read_csv('../data/raw/AID_884_datatable_all.csv', low_memory=False)
    )
)

# Four inhibition estimates per compound:
#   Inhibition_hill - Hill-model estimate (0.0 for Inactive rows)
#   Inhibition_eff  - the reported Efficacy, taken as is
#   Inhibition_fit  - negated Fit_InfiniteActivity
#   Inhibition_max  - negated Max_Response
AID_884['Inhibition_hill'] = AID_884.apply(sophisticated_calculate_inhibition, axis=1)
AID_884['Inhibition_eff'] = AID_884['Efficacy']
AID_884['Inhibition_fit'] = AID_884['Fit_InfiniteActivity'].mul(-1)
AID_884['Inhibition_max'] = AID_884['Max_Response'].mul(-1)

AID_884 = AID_884.rename(columns={'PUBCHEM_EXT_DATASOURCE_SMILES': 'Canonical_Smiles'})

# Only the SMILES and the derived inhibition columns are exported.
cols = ['Canonical_Smiles', 'Inhibition_hill', 'Inhibition_eff', 'Inhibition_fit', 'Inhibition_max']
AID_884[cols].to_csv('../data/external/AID_884.csv', index=False)


AID_884

Unnamed: 0,Canonical_Smiles,PUBCHEM_ACTIVITY_OUTCOME,Phenotype,Potency,Efficacy,Fit_LogAC50,Fit_HillSlope,Fit_R2,Fit_InfiniteActivity,Fit_ZeroActivity,Max_Response,Curve_Description,Inhibition_hill,Inhibition_eff,Inhibition_fit,Inhibition_max
0,C1=CC(=CC=C1C(=O)N[C@@H](CCC(=O)O)C(=O)O)NCC2=...,Inactive,Inactive,,,,,,,,-24.5753,,0.000000,,,24.5753
1,C1CCC(C1)(C#N)NC2=CC=C(C=C2)C3=CC=C(C=C3)NC4(C...,Inactive,Inactive,,,,,,,,-20.8980,,0.000000,,,20.8980
2,COC1=CC=C(C=C1)C2=NC3=CN=C(N=C3N(C2=O)CC4=CC=C...,Active,Inhibitor,3.1623,84.3805,-5.5,4.4495,0.9689,-88.9174,-4.5368,-76.6849,Complete curve; high efficacy,88.390588,84.3805,88.9174,76.6849
3,C(=S)=S,Inactive,Inactive,,,,,,,,-10.8373,,0.000000,,,10.8373
4,C[C@@H]([C@@H](C(=O)O)N)OP(=O)(O)O,Inactive,Inactive,,,,,,,,-9.7879,,0.000000,,,9.7879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8980,COC1=CC=CC=C1C2=NC3=CC=CC=C3C(=N2)NCC4=CC=CC=C4,Active,Inhibitor,3.9811,130.9490,-5.4,0.5000,0.9270,-132.4370,-1.4875,-103.6510,Partial curve; high efficacy,81.202001,130.9490,132.4370,103.6510
8981,CC(C)NC(=O)N1CC2(C1)CCN(CC2)C(=O)C3=CC(=CC(=C3...,Inactive,Inactive,,,,,,,,-10.4924,,0.000000,,,10.4924
8982,C1=CC(=CC=C1C(F)(F)F)Cl,Inactive,Inactive,,,,,,,,-21.4120,,0.000000,,,21.4120
8983,CCN(CC)CCOC(=O)C(CC1CCCO1)CC2=CC=CC3=CC=CC=C32...,Active,Inhibitor,1.0000,61.1351,-6.0,1.1000,0.9743,-79.6557,-18.5207,-81.8361,Complete curve; high efficacy,73.794032,61.1351,79.6557,81.8361


- AID_885

In [8]:
import pandas as pd
import numpy as np

# Load the raw AID 885 data table. It goes through the same cleaning routine
# as AID 884, followed by the curve-quality filter.
AID_885 = clean_data(
    data_cleaning_AID_884(
        pd.read_csv('../data/raw/AID_885_datatable_all.csv', low_memory=False)
    )
)

AID_885.head()

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME,Phenotype,Potency,Efficacy,Fit_LogAC50,Fit_HillSlope,Fit_R2,Fit_InfiniteActivity,Fit_ZeroActivity,Max_Response,Curve_Description
0,C1=CC(=CC=C1C(=O)N[C@@H](CCC(=O)O)C(=O)O)NCC2=...,Inactive,Inactive,,,,,,,,-24.5753,
1,C1CCC(C1)(C#N)NC2=CC=C(C=C2)C3=CC=C(C=C3)NC4(C...,Inactive,Inactive,,,,,,,,-20.898,
2,COC1=CC=C(C=C1)C2=NC3=CN=C(N=C3N(C2=O)CC4=CC=C...,Inactive,Inhibitor,3.1623,84.3805,-5.5,4.4495,0.9689,-88.9174,-4.5368,-76.6849,Complete curve; high efficacy
3,C(=S)=S,Inactive,Inactive,,,,,,,,-10.8373,
4,C[C@@H]([C@@H](C(=O)O)N)OP(=O)(O)O,Inactive,Inactive,,,,,,,,-9.7879,


In [9]:
def sophisticated_potency_to_inhibition(row, concentration=10.0):
    """Estimate the inhibition at ``concentration`` from a row's fitted curve.

    Reads ``Fit_LogAC50``, ``Fit_HillSlope`` and ``Fit_InfiniteActivity``
    from ``row`` and evaluates

        inhibition = -Fit_InfiniteActivity / (1 + (AC50 / concentration) ** slope)

    ``Fit_LogAC50`` is treated as log10 of the AC50 in mol/L and converted to
    uM, so ``concentration`` is expected in uM. The result is returned as is;
    it is not clipped to the 0-100 range.
    """
    fitted_log_ac50 = row['Fit_LogAC50']
    fitted_slope = row['Fit_HillSlope']
    fitted_plateau = row['Fit_InfiniteActivity']

    # A missing, zero or negative Hill coefficient is replaced by the
    # standard value of 1.0.
    usable_slope = 1.0 if (pd.isna(fitted_slope) or fitted_slope <= 0) else fitted_slope

    # log10(AC50 [M]) -> AC50 [uM]
    ac50_uM = (10 ** fitted_log_ac50) * 1_000_000

    # The fitted activity is negated so that it reads as an inhibition value.
    numerator = -fitted_plateau
    hill_term = (ac50_uM / concentration) ** usable_slope

    return numerator / (1 + hill_term)

def sophisticated_calculate_inhibition(row):
    """Return the Hill-model inhibition for one row, keyed on ``Phenotype``.

    * ``Inactive``: ``0.0``.
    * ``Inhibitor``: value from ``sophisticated_potency_to_inhibition``.
    * Any other phenotype: NaN.
    """
    phenotype = row['Phenotype']
    if phenotype == 'Inactive':
        return 0.0
    if phenotype == 'Inhibitor':
        return sophisticated_potency_to_inhibition(row)
    return np.nan


# Four inhibition estimates per compound:
#   Inhibition_hill - Hill-model estimate (0.0 for Inactive rows)
#   Inhibition_eff  - the reported Efficacy, taken as is
#   Inhibition_fit  - negated Fit_InfiniteActivity
#   Inhibition_max  - negated Max_Response
AID_885['Inhibition_hill'] = AID_885.apply(sophisticated_calculate_inhibition, axis=1)
AID_885['Inhibition_eff'] = AID_885['Efficacy']
AID_885['Inhibition_fit'] = AID_885['Fit_InfiniteActivity'] * -1
# Inhibition_max is exported unclipped (values may fall outside 0-100).
# A preceding np.clip(...) assignment to this column was a dead store: it was
# overwritten by this line before ever being read, so it has been removed.
AID_885['Inhibition_max'] = AID_885['Max_Response'] * -1

AID_885 = AID_885.rename(columns={
    'PUBCHEM_EXT_DATASOURCE_SMILES': 'Canonical_Smiles',
})
# Only the SMILES and the derived inhibition columns are exported.
cols = [
    'Canonical_Smiles',
    'Inhibition_hill', 'Inhibition_eff', 'Inhibition_fit', 'Inhibition_max'
]
AID_885[cols].to_csv('../data/external/AID_885.csv', index=False)

In [10]:
# Show the processed AID_885 table as the cell output.
AID_885

Unnamed: 0,Canonical_Smiles,PUBCHEM_ACTIVITY_OUTCOME,Phenotype,Potency,Efficacy,Fit_LogAC50,Fit_HillSlope,Fit_R2,Fit_InfiniteActivity,Fit_ZeroActivity,Max_Response,Curve_Description,Inhibition_hill,Inhibition_eff,Inhibition_fit,Inhibition_max
0,C1=CC(=CC=C1C(=O)N[C@@H](CCC(=O)O)C(=O)O)NCC2=...,Inactive,Inactive,,,,,,,,-24.5753,,0.000000,,,24.5753
1,C1CCC(C1)(C#N)NC2=CC=C(C=C2)C3=CC=C(C=C3)NC4(C...,Inactive,Inactive,,,,,,,,-20.8980,,0.000000,,,20.8980
2,COC1=CC=C(C=C1)C2=NC3=CN=C(N=C3N(C2=O)CC4=CC=C...,Inactive,Inhibitor,3.1623,84.3805,-5.5,4.4495,0.9689,-88.9174,-4.5368,-76.6849,Complete curve; high efficacy,88.390588,84.3805,88.9174,76.6849
3,C(=S)=S,Inactive,Inactive,,,,,,,,-10.8373,,0.000000,,,10.8373
4,C[C@@H]([C@@H](C(=O)O)N)OP(=O)(O)O,Inactive,Inactive,,,,,,,,-9.7879,,0.000000,,,9.7879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9182,COC1=CC=CC=C1C2=NC3=CC=CC=C3C(=N2)NCC4=CC=CC=C4,Inactive,Inhibitor,3.9811,130.9490,-5.4,0.5000,0.9270,-132.4370,-1.4875,-103.6510,Partial curve; high efficacy,81.202001,130.9490,132.4370,103.6510
9183,CC(C)NC(=O)N1CC2(C1)CCN(CC2)C(=O)C3=CC(=CC(=C3...,Inactive,Inactive,,,,,,,,-10.4924,,0.000000,,,10.4924
9184,C1=CC(=CC=C1C(F)(F)F)Cl,Inactive,Inactive,,,,,,,,-21.4120,,0.000000,,,21.4120
9185,CCN(CC)CCOC(=O)C(CC1CCCO1)CC2=CC=CC3=CC=CC=C32...,Inactive,Inhibitor,1.0000,61.1351,-6.0,1.1000,0.9743,-79.6557,-18.5207,-81.8361,Complete curve; high efficacy,73.794032,61.1351,79.6557,81.8361
