### Objective: Train classification models for Heparin RRT (binary target: `HeparinRRT_scaled >= 1`) using data from multiple antibodies.


In [1]:
import tempfile
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
from mlflow.models import infer_signature
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, average_precision_score, roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.model_selection import GridSearchCV, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm

from developability.utils import ls
from developability.modeling import  *
from developability import data as data_

## Display settings: render inline figures in 'retina' (high-resolution) format
%config InlineBackend.figure_format = 'retina'
# Raise the pandas display limits to 999 rows / 999 columns so the wide
# descriptor tables below are shown without truncation.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

# Paths: root of the packaged data directory, taken from the first entry of
# the `developability.data` package's __path__.

data_path = Path(data_.__path__[0])

### Load training data

In [2]:
# Training snapshots live in the 'train' sub-directory of the packaged data.
training_path = data_path.joinpath('train')
# NOTE: despite its name, `today` is a fixed snapshot date, not the current
# date; it selects which dated parquet file is loaded.
today = '04-26-2024'
filename = training_path.joinpath(f'train_{today}.parquet')
train = pd.read_parquet(filename)
# Preview the first rows of the loaded table.
train.head()

Unnamed: 0,HeparinRRT_scaled,HCDR1_APBS_pos,HCDR1_APBS_neg,HCDR1_APBS_net,HCDR2_APBS_pos,HCDR2_APBS_neg,HCDR2_APBS_net,HCDR3_APBS_pos,HCDR3_APBS_neg,HCDR3_APBS_net,HFR1_APBS_pos,HFR1_APBS_neg,HFR1_APBS_net,HFR2_APBS_pos,HFR2_APBS_neg,HFR2_APBS_net,HFR3_APBS_pos,HFR3_APBS_neg,HFR3_APBS_net,HFR4_APBS_pos,HFR4_APBS_neg,HFR4_APBS_net,LCDR1_APBS_pos,LCDR1_APBS_neg,LCDR1_APBS_net,LCDR2_APBS_pos,LCDR2_APBS_neg,LCDR2_APBS_net,LCDR3_APBS_pos,LCDR3_APBS_neg,LCDR3_APBS_net,LFR1_APBS_pos,LFR1_APBS_neg,LFR1_APBS_net,LFR2_APBS_pos,LFR2_APBS_neg,LFR2_APBS_net,LFR3_APBS_pos,LFR3_APBS_neg,LFR3_APBS_net,LFR4_APBS_pos,LFR4_APBS_neg,LFR4_APBS_net,HCDR_APBS_pos,HCDR_APBS_neg,HCDR_APBS_net,LCDR_APBS_pos,LCDR_APBS_neg,LCDR_APBS_net,HFR_APBS_pos,HFR_APBS_neg,HFR_APBS_net,LFR_APBS_pos,LFR_APBS_neg,LFR_APBS_net,HC_APBS_pos,HC_APBS_neg,HC_APBS_net,LC_APBS_pos,LC_APBS_neg,LC_APBS_net,TOTAL_CDR_APBS_pos,TOTAL_CDR_APBS_neg,TOTAL_CDR_APBS_net,TOTAL_FR_APBS_pos,TOTAL_FR_APBS_neg,TOTAL_FR_APBS_net,TOTAL_APBS_pos,TOTAL_APBS_neg,TOTAL_APBS_net
VIRMAB-MPK65-R1-19,0.94363,514.043614,-226.497693,287.545921,957.03916,-2774.632167,-1817.593007,474.052607,-11631.036286,-11156.983678,9187.47317,-1570.692614,7616.780557,2185.065987,-203.909684,1981.156303,4158.978221,-1759.85287,2399.125351,772.320449,-949.248069,-176.92762,497.569119,-1568.220608,-1070.651489,2535.649024,-371.314366,2164.334658,430.989836,-729.034198,-298.044362,3267.75769,-1719.993507,1547.764183,5513.449735,-22.10747,5491.342266,4917.313653,-2783.716576,2133.597077,1322.69265,-907.0956,415.59705,1945.135381,-14632.166146,-12687.030764,3464.207979,-2668.569172,795.638807,16303.837827,-4483.703237,11820.13459,15021.213728,-5432.913152,9588.300576,18248.973209,-19115.869383,-866.896174,18485.421707,-8101.482324,10383.939383,5409.34336,-17300.735317,-11891.391957,31325.051555,-9916.61639,21408.435166,36734.394916,-27217.351707,9517.043209
VIRMAB-MPK65-R1-21,0.944588,242.08893,-155.894361,86.194569,1144.556373,-2836.386774,-1691.830401,241.478273,-11589.613907,-11348.135635,8455.836708,-1173.958997,7281.877712,2439.492207,-316.636589,2122.855618,6257.033415,-286.276976,5970.756439,257.495449,-927.100073,-669.604624,600.417592,-1436.307811,-835.890219,2255.001478,-333.490586,1921.510893,413.984661,-705.248976,-291.264315,2982.452006,-2070.164975,912.28703,2189.138402,-609.270846,1579.867556,3574.310194,-2720.005493,854.304702,980.825967,-1350.726489,-369.900522,1628.123576,-14581.895043,-12953.771467,3269.403732,-2475.047373,794.356359,17409.85778,-2703.972634,14705.885145,9726.726569,-6750.167803,2976.558766,19037.981355,-17285.867677,1752.113678,12996.130301,-9225.215175,3770.915125,4897.527308,-17056.942415,-12159.415108,27136.584348,-9454.140437,17682.443911,32034.111656,-26511.082852,5523.028804
VIRMAB-MPK65-R1-27,0.944588,464.560335,-58.598476,405.961859,865.244076,-2724.851363,-1859.607288,423.496289,-11045.290265,-10621.793976,9069.096991,-1690.99355,7378.10344,2381.159008,-213.788903,2167.370104,3713.731344,-2169.095956,1544.635388,808.693052,-1036.254124,-227.561072,548.447,-1413.785404,-865.338404,2539.518784,-305.141182,2234.377601,546.423271,-638.26571,-91.842439,3479.121146,-1744.15875,1734.962396,5419.619395,-17.908031,5401.711364,4822.124473,-2612.472569,2209.651903,1460.308495,-951.52045,508.788045,1753.3007,-13828.740105,-12075.439405,3634.389055,-2357.192296,1277.196759,15972.680394,-5110.132534,10862.54786,15181.17351,-5326.059801,9855.113709,17725.981094,-18938.872638,-1212.891545,18815.562564,-7683.252097,11132.310467,5387.689754,-16185.932401,-10798.242646,31153.853903,-10436.192335,20717.661569,36541.543658,-26622.124735,9919.418923
VIRMAB-MPK65-R1-24,0.945546,291.646471,-147.543778,144.102693,820.74161,-2737.173248,-1916.431638,418.326279,-11228.470281,-10810.144003,9326.261275,-1077.094244,8249.167032,373.585959,-904.631815,-531.045856,5655.61545,-298.212578,5357.402871,588.545738,-885.861933,-297.316195,421.403946,-1570.19175,-1148.787804,2487.563063,-391.122694,2096.440369,476.021169,-727.770477,-251.749308,2494.457975,-2282.583853,211.874121,4551.228647,-33.474826,4517.753821,4311.286793,-3212.986209,1098.300584,883.860993,-1370.737351,-486.876358,1530.71436,-14113.187308,-12582.472948,3384.988179,-2689.084922,695.903257,15944.008422,-3165.800571,12778.207851,12240.834408,-6899.782239,5341.052169,17474.722782,-17278.987878,195.734904,15625.822587,-9588.867161,6036.955427,4915.702539,-16802.272229,-11886.56969,28184.84283,-10065.58281,18119.260021,33100.545369,-26867.855039,6232.69033
VIRMAB-MPK65-R1-22,0.945546,366.064822,-108.70411,257.360713,1468.203155,-2324.547613,-856.344458,377.512744,-11357.946167,-10980.433422,9703.61638,-1144.026385,8559.589996,2902.108756,-244.230142,2657.878614,6786.561876,-100.900932,6685.660944,761.538236,-751.449461,10.088775,693.839926,-1396.639095,-702.79917,1805.883533,-559.961224,1245.92231,489.324095,-631.560859,-142.236764,3623.278379,-1924.117822,1699.160557,4772.156701,-29.028735,4743.127966,1996.512878,-3657.939415,-1661.426537,1162.808584,-1365.885758,-203.077174,2211.780721,-13791.197889,-11579.417167,2989.047554,-2588.161178,400.886376,20153.825249,-2240.60692,17913.218329,11554.756542,-6976.97173,4577.784812,22365.60597,-16031.804809,6333.801162,14543.804096,-9565.132909,4978.671188,5200.828276,-16379.359067,-11178.530791,31708.581791,-9217.57865,22491.003141,36909.410067,-25596.937717,11312.472349


### Features

In [3]:
def has_number(s):
    """Return True if the string `s` contains at least one digit character.

    Used below to tell numbered per-region columns (e.g. 'HCDR1_APBS_pos')
    apart from digit-free aggregate columns (e.g. 'TOTAL_APBS_pos').
    An empty string yields False.
    """
    return any(ch.isdigit() for ch in s)

def _apbs_columns(suffix, per_region):
    """Select the columns of `train` whose names end with `suffix`.

    per_region=True  -> keep names that contain a digit (e.g. 'HCDR1_APBS_pos').
    per_region=False -> keep digit-free names that start with 'TOTAL'.
    Column order follows `train.columns`.
    """
    if per_region:
        return [name for name in train.columns
                if name.endswith(suffix) and has_number(name)]
    return [name for name in train.columns
            if not has_number(name) and name.endswith(suffix) and name.startswith('TOTAL')]


# Named groups of descriptor columns, passed to the experiment as `feature_sets`.
# NOTE(review): despite the 'total_cdr_*' key names, the filter keeps every
# digit-free column starting with 'TOTAL' (TOTAL_CDR_*, TOTAL_FR_* and TOTAL_*
# in the table preview above) -- confirm this is intended.
feature_sets = {
    'positive_regions': _apbs_columns('pos', per_region=True),
    'negative_regions': _apbs_columns('neg', per_region=True),
    'net_regions': _apbs_columns('net', per_region=True),
    'total_cdr_pos': _apbs_columns('pos', per_region=False),
    'total_cdr_neg': _apbs_columns('neg', per_region=False),
    'total_cdr_net': _apbs_columns('net', per_region=False),
    # Every APBS descriptor column, regardless of region or sign.
    'all': [name for name in train.columns if 'APBS' in name],
}

### Set up the target and save it back to the training parquet file

In [7]:
# Binary target: 1 where the scaled Heparin RRT is >= 1, otherwise 0.
# NOTE(review): a missing HeparinRRT_scaled value would compare False and be
# labelled 0 -- confirm the column has no missing values.
label = train['HeparinRRT_scaled'].ge(1) * 1

# Add the target column only once, so re-running this cell is harmless.
# NOTE: this writes to `filename`, i.e. it overwrites the same parquet file the
# training table is read from, so the target column is stored in that file.
if 'HeparinRRT>=1' not in train.columns:
    train.insert(loc=1, column='HeparinRRT>=1', value=label)
    train.to_parquet(filename)

train.head()


Unnamed: 0,HeparinRRT_scaled,HeparinRRT>=1,HCDR1_APBS_pos,HCDR1_APBS_neg,HCDR1_APBS_net,HCDR2_APBS_pos,HCDR2_APBS_neg,HCDR2_APBS_net,HCDR3_APBS_pos,HCDR3_APBS_neg,HCDR3_APBS_net,HFR1_APBS_pos,HFR1_APBS_neg,HFR1_APBS_net,HFR2_APBS_pos,HFR2_APBS_neg,HFR2_APBS_net,HFR3_APBS_pos,HFR3_APBS_neg,HFR3_APBS_net,HFR4_APBS_pos,HFR4_APBS_neg,HFR4_APBS_net,LCDR1_APBS_pos,LCDR1_APBS_neg,LCDR1_APBS_net,LCDR2_APBS_pos,LCDR2_APBS_neg,LCDR2_APBS_net,LCDR3_APBS_pos,LCDR3_APBS_neg,LCDR3_APBS_net,LFR1_APBS_pos,LFR1_APBS_neg,LFR1_APBS_net,LFR2_APBS_pos,LFR2_APBS_neg,LFR2_APBS_net,LFR3_APBS_pos,LFR3_APBS_neg,LFR3_APBS_net,LFR4_APBS_pos,LFR4_APBS_neg,LFR4_APBS_net,HCDR_APBS_pos,HCDR_APBS_neg,HCDR_APBS_net,LCDR_APBS_pos,LCDR_APBS_neg,LCDR_APBS_net,HFR_APBS_pos,HFR_APBS_neg,HFR_APBS_net,LFR_APBS_pos,LFR_APBS_neg,LFR_APBS_net,HC_APBS_pos,HC_APBS_neg,HC_APBS_net,LC_APBS_pos,LC_APBS_neg,LC_APBS_net,TOTAL_CDR_APBS_pos,TOTAL_CDR_APBS_neg,TOTAL_CDR_APBS_net,TOTAL_FR_APBS_pos,TOTAL_FR_APBS_neg,TOTAL_FR_APBS_net,TOTAL_APBS_pos,TOTAL_APBS_neg,TOTAL_APBS_net
VIRMAB-MPK65-R1-19,0.94363,0,514.043614,-226.497693,287.545921,957.03916,-2774.632167,-1817.593007,474.052607,-11631.036286,-11156.983678,9187.47317,-1570.692614,7616.780557,2185.065987,-203.909684,1981.156303,4158.978221,-1759.85287,2399.125351,772.320449,-949.248069,-176.92762,497.569119,-1568.220608,-1070.651489,2535.649024,-371.314366,2164.334658,430.989836,-729.034198,-298.044362,3267.75769,-1719.993507,1547.764183,5513.449735,-22.10747,5491.342266,4917.313653,-2783.716576,2133.597077,1322.69265,-907.0956,415.59705,1945.135381,-14632.166146,-12687.030764,3464.207979,-2668.569172,795.638807,16303.837827,-4483.703237,11820.13459,15021.213728,-5432.913152,9588.300576,18248.973209,-19115.869383,-866.896174,18485.421707,-8101.482324,10383.939383,5409.34336,-17300.735317,-11891.391957,31325.051555,-9916.61639,21408.435166,36734.394916,-27217.351707,9517.043209
VIRMAB-MPK65-R1-21,0.944588,0,242.08893,-155.894361,86.194569,1144.556373,-2836.386774,-1691.830401,241.478273,-11589.613907,-11348.135635,8455.836708,-1173.958997,7281.877712,2439.492207,-316.636589,2122.855618,6257.033415,-286.276976,5970.756439,257.495449,-927.100073,-669.604624,600.417592,-1436.307811,-835.890219,2255.001478,-333.490586,1921.510893,413.984661,-705.248976,-291.264315,2982.452006,-2070.164975,912.28703,2189.138402,-609.270846,1579.867556,3574.310194,-2720.005493,854.304702,980.825967,-1350.726489,-369.900522,1628.123576,-14581.895043,-12953.771467,3269.403732,-2475.047373,794.356359,17409.85778,-2703.972634,14705.885145,9726.726569,-6750.167803,2976.558766,19037.981355,-17285.867677,1752.113678,12996.130301,-9225.215175,3770.915125,4897.527308,-17056.942415,-12159.415108,27136.584348,-9454.140437,17682.443911,32034.111656,-26511.082852,5523.028804
VIRMAB-MPK65-R1-27,0.944588,0,464.560335,-58.598476,405.961859,865.244076,-2724.851363,-1859.607288,423.496289,-11045.290265,-10621.793976,9069.096991,-1690.99355,7378.10344,2381.159008,-213.788903,2167.370104,3713.731344,-2169.095956,1544.635388,808.693052,-1036.254124,-227.561072,548.447,-1413.785404,-865.338404,2539.518784,-305.141182,2234.377601,546.423271,-638.26571,-91.842439,3479.121146,-1744.15875,1734.962396,5419.619395,-17.908031,5401.711364,4822.124473,-2612.472569,2209.651903,1460.308495,-951.52045,508.788045,1753.3007,-13828.740105,-12075.439405,3634.389055,-2357.192296,1277.196759,15972.680394,-5110.132534,10862.54786,15181.17351,-5326.059801,9855.113709,17725.981094,-18938.872638,-1212.891545,18815.562564,-7683.252097,11132.310467,5387.689754,-16185.932401,-10798.242646,31153.853903,-10436.192335,20717.661569,36541.543658,-26622.124735,9919.418923
VIRMAB-MPK65-R1-24,0.945546,0,291.646471,-147.543778,144.102693,820.74161,-2737.173248,-1916.431638,418.326279,-11228.470281,-10810.144003,9326.261275,-1077.094244,8249.167032,373.585959,-904.631815,-531.045856,5655.61545,-298.212578,5357.402871,588.545738,-885.861933,-297.316195,421.403946,-1570.19175,-1148.787804,2487.563063,-391.122694,2096.440369,476.021169,-727.770477,-251.749308,2494.457975,-2282.583853,211.874121,4551.228647,-33.474826,4517.753821,4311.286793,-3212.986209,1098.300584,883.860993,-1370.737351,-486.876358,1530.71436,-14113.187308,-12582.472948,3384.988179,-2689.084922,695.903257,15944.008422,-3165.800571,12778.207851,12240.834408,-6899.782239,5341.052169,17474.722782,-17278.987878,195.734904,15625.822587,-9588.867161,6036.955427,4915.702539,-16802.272229,-11886.56969,28184.84283,-10065.58281,18119.260021,33100.545369,-26867.855039,6232.69033
VIRMAB-MPK65-R1-22,0.945546,0,366.064822,-108.70411,257.360713,1468.203155,-2324.547613,-856.344458,377.512744,-11357.946167,-10980.433422,9703.61638,-1144.026385,8559.589996,2902.108756,-244.230142,2657.878614,6786.561876,-100.900932,6685.660944,761.538236,-751.449461,10.088775,693.839926,-1396.639095,-702.79917,1805.883533,-559.961224,1245.92231,489.324095,-631.560859,-142.236764,3623.278379,-1924.117822,1699.160557,4772.156701,-29.028735,4743.127966,1996.512878,-3657.939415,-1661.426537,1162.808584,-1365.885758,-203.077174,2211.780721,-13791.197889,-11579.417167,2989.047554,-2588.161178,400.886376,20153.825249,-2240.60692,17913.218329,11554.756542,-6976.97173,4577.784812,22365.60597,-16031.804809,6333.801162,14543.804096,-9565.132909,4978.671188,5200.828276,-16379.359067,-11178.530791,31708.581791,-9217.57865,22491.003141,36909.410067,-25596.937717,11312.472349


### Look at the labels. 

### Set up and run the experiment. 

In [13]:
# Experiment configuration.
experiment_name = 'MultipleParentalAntibodyClassificationExp1'
target = 'HeparinRRT>=1'
# Assumes an MLflow tracking server is reachable at this local address.
tracking_uri = 'http://127.0.0.1:5000'

# Classification run (regression=False) over every feature set defined above.
experiment = MLFlowExperiment(
    filename,
    target=target,
    experiment_name=experiment_name,
    regression=False,
    feature_sets=feature_sets,
    tracking_uri=tracking_uri,
)
# NOTE(review): in the recorded output this call stops at 0/7 with
# "AttributeError: 'int' object has no attribute 'split'". Presumably the
# cross-validation splitter used for classification is an integer rather than
# a splitter object -- TODO confirm and fix in developability.modeling.
experiment.train_models()

  0%|          | 0/7 [00:00<?, ?it/s]


AttributeError: 'int' object has no attribute 'split'

In [20]:
# Reproduce the train_models() failure in isolation: build the feature matrix
# and target from the experiment's own data, then request its CV splitter.
X = experiment.data[experiment.feature_sets['all']]
y = experiment.data[experiment.target]
# The recorded output shows this call raising the same AttributeError.
experiment.get_cv_splitter(X, y)

AttributeError: 'int' object has no attribute 'split'

In [22]:
# Control check: scikit-learn's stratified splitter does expose .split(X, y),
# whereas experiment.get_cv_splitter(X, y) raised AttributeError in the cell
# above. RepeatedStratifiedKFold is imported explicitly in the imports cell so
# this cell does not depend on the name having reached the kernel some other way.
# .split() returns a lazy generator, so the displayed value is only its repr.
RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42).split(X, y)

<generator object _RepeatedSplits.split at 0x7f9376fe7810>