In [1]:
import numpy as np
import pandas as pd

import re
from typing import Dict
from data_prep import batch_encode, pad_sequences

from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, SGDRegressor, BayesianRidge

### Pre-Processing (Our Data)

In [2]:
# Sensor table for our own dataset; the columns used further down are
# 'SensorSequence', 'FoldIncrease' and the remaining numeric descriptors.
sensor_csv = 'data/Sensor-Data-Update-9-11-23.csv'
data = pd.read_csv(sensor_csv)
data

Unnamed: 0,SensorName,SensorSequence,Length,Entropy,DG,H-bond,Tm,TmSensor,FoldIncrease
0,AdeS1,GACGCGACUGAAUGAAAUGGUGAAGGACGGGUCCAGCUGCGGAAGA...,5,-129.6,-6.9,14,30.2,81.4,9.0
1,AdeS2,GACGCGACUGAAUGAAAUGGUGAAGGACGGGUCCAGUAAUGGGAAG...,6,-109.9,-3.1,14,1.2,82.0,10.0
2,Ade S3,GACGCGACUGAAUGAAAUGGUGAAGGACGGGUCCAGUUGUGGAAGA...,5,-91.6,-3.1,12,-4.4,80.0,1.0
3,Ade S4,GACGCGACUGAAUGAAAUGGUGAAGGACGGGUCCAGUGUGUGGGAA...,7,-103.5,-0.6,16,-19.9,74.0,8.0
4,Ade S5,GACGCGACUGAAUGAAAUGGUGAAGGACGGGUCCAGUAUGUGGGAA...,7,-118.3,-2.6,14,-0.2,75.4,1.0
...,...,...,...,...,...,...,...,...,...
98,DGR-III/CM-5,GCCCGGAUAGCUCAGUCGGUAGAGCAGCGGAGACGGUCGGGUCUAU...,5,-75.8,0.3,10,-44.4,82.6,0.8
99,DGR-IV/CM-2,GCCCGGAUAGCUCAGUCGGUAGAGCAGCGGAGACGGUCGGGUCUAA...,2,-3.0,4.2,4,-400.7,79.7,1.7
100,DGR-IV/CM-3,GCCCGGAUAGCUCAGUCGGUAGAGCAGCGGAGACGGUCGGGUCUAU...,3,-28.4,2.7,6,-160.2,81.2,0.7
101,DGR-IV/CM-4,GCCCGGAUAGCUCAGUCGGUAGAGCAGCGGAGACGGUCGGGUCUAU...,4,-50.4,1.8,8,-84.8,80.7,1.3


In [3]:
# Raw sensor sequences with any embedded spaces stripped out
sequences = [raw.replace(' ', '') for raw in data['SensorSequence'].to_list()]

# Pad, then encode the whole batch.
# NOTE(review): presumably pads at the end up to the longest sequence --
# confirm against data_prep.pad_sequences.
sequences = batch_encode(pad_sequences(sequences, 'end', pad_length='max'))

# Flatten every encoded sequence and stack the results into one 2-D array
# (one row per sensor)
sequences = np.vstack([encoded.flatten() for encoded in sequences])

# Key each column of the matrix by its integer position
sequences_dict = dict(enumerate(sequences.T))

# Build the feature frame from those columns
SequenceDF = pd.DataFrame.from_dict(sequences_dict)
SequenceDF

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,750,751,752,753,754,755,756,757,758,759
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
99,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
100,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
101,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [4]:
# Regression target for our data: the 'FoldIncrease' column, kept as a
# one-column frame named 'target'.
TargetDF = (
    data[['FoldIncrease']]
    .rename(columns={'FoldIncrease': 'target'})
)
TargetDF

Unnamed: 0,target
0,9.0
1,10.0
2,1.0
3,8.0
4,1.0
...,...
98,0.8
99,1.7
100,0.7
101,1.3


In [5]:
# Everything except the label, the sensor name and the raw sequence is kept
# as a thermodynamic feature column.
non_feature_columns = ['FoldIncrease', 'SensorName', 'SensorSequence']
ThermoDynamicDF = data.drop(columns=non_feature_columns)
ThermoDynamicDF

Unnamed: 0,Length,Entropy,DG,H-bond,Tm,TmSensor
0,5,-129.6,-6.9,14,30.2,81.4
1,6,-109.9,-3.1,14,1.2,82.0
2,5,-91.6,-3.1,12,-4.4,80.0
3,7,-103.5,-0.6,16,-19.9,74.0
4,7,-118.3,-2.6,14,-0.2,75.4
...,...,...,...,...,...,...
98,5,-75.8,0.3,10,-44.4,82.6
99,2,-3.0,4.2,4,-400.7,79.7
100,3,-28.4,2.7,6,-160.2,81.2
101,4,-50.4,1.8,8,-84.8,80.7


In [6]:
# Bundle the three frames of our dataset under the keys that
# compare_data_types() looks up ('thermo', 'sequence', 'target').
OurData = {
    'thermo': ThermoDynamicDF,
    'sequence': SequenceDF,
    'target': TargetDF,
}

### Pre-Processing (MIT)

In [7]:
# MIT dataset: sequence-segment columns, thermodynamic columns and the 'ON'
# measurement are all read from this one file.
mit_csv = 'data/mit-data.csv'
mit_data = pd.read_csv(mit_csv)

In [8]:
# Columns that each hold one segment of the full sequence
sequence_cols = {
    'pre_seq',
    'promoter',
    'trigger',
    'loop1',
    'switch',
    'loop2',
    'stem1',
    'atg',
    'stem2',
    'linker',
    'post_linker'
}

# Fail loudly if a segment column is absent: dropping "everything else" (as
# this cell used to) would silently build shorter sequences instead.
missing_cols = sequence_cols - set(mit_data.columns)
if missing_cols:
    raise KeyError(f'mit_data is missing sequence columns: {sorted(missing_cols)}')

# Keep only the sequence columns. A set has no order of its own, so the
# order in which the segments get concatenated is the column order of the
# file -- exactly what dropping the other columns produced before.
ordered_cols = [column for column in mit_data.columns if column in sequence_cols]
sequences = mit_data[ordered_cols]

# Join the segments of every row into a single string. Walking the rows of the
# underlying array avoids transposing the whole frame just to iterate over it.
sequences = [''.join(segments) for segments in sequences.to_numpy()]

# Encode the full batch of joined sequences
sequences = batch_encode(sequences)

# Flatten each encoding and stack them into a 2d array (one row per sequence)
sequences = np.vstack([encoded.flatten() for encoded in sequences])

# Give each column of the matrix a numerical key in the dictionary
sequences_dict = dict(enumerate(sequences.T))

# Instantiate a DataFrame from the dictionary of sequences
SequenceDF = pd.DataFrame.from_dict(sequences_dict)
SequenceDF

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,730,731,732,733,734,735,736,737,738,739
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97431,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
97432,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
97433,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
97434,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [9]:
# Regression target for the MIT data: the 'ON' column, kept as a one-column
# frame named 'target'. Missing values are still present at this point.
TargetDF = (
    mit_data[['ON']]
    .rename(columns={'ON': 'target'})
)
TargetDF

Unnamed: 0,target
0,
1,
2,0.068295
3,0.000000
4,0.080666
...,...
97431,
97432,
97433,
97434,0.749523


In [10]:
# Thermodynamic feature columns are recognised by their name prefix.
# The earlier pattern '(i_ed_*|mfe_*|n_ed_*)' was written like a glob, but in
# a regular expression `_*` only means "zero or more underscores", so what it
# really matched were these bare prefixes. Spelling them out selects exactly
# the same columns without the misleading pattern.
thermo_prefixes = ('i_ed', 'mfe', 'n_ed')

# Select the matching columns directly, preserving their order in the file
thermo_columns = [column for column in mit_data.columns if column.startswith(thermo_prefixes)]
ThermoDynamicDF = mit_data[thermo_columns]
ThermoDynamicDF

Unnamed: 0,i_ed_seq_Stem,i_ed_seq_StemTop,i_ed_seq_SwitchOFF,i_ed_seq_SwitchOFF_GFP,i_ed_seq_SwitchOFF_NoTo,i_ed_seq_SwitchON,i_ed_seq_SwitchON_GFP,i_ed_seq_ToeholdON,mfe_seq_AscendingStem,mfe_seq_DescendingStem,...,mfe_seq_ToeholdON,mfe_seq_Trigger,n_ed_seq_Stem,n_ed_seq_StemTop,n_ed_seq_SwitchOFF,n_ed_seq_SwitchOFF_GFP,n_ed_seq_SwitchOFF_NoTo,n_ed_seq_SwitchON,n_ed_seq_SwitchON_GFP,n_ed_seq_ToeholdON
0,0.045594,0.107483,0.045629,0.220045,0.176445,0.098725,0.206450,0.464523,-3.9,-2.5,...,-6.000000,-3.3,0.045594,0.107483,0.045629,0.094631,0.115150,0.087723,0.118163,0.366573
1,0.116376,0.103055,0.085353,0.247558,0.215779,0.165060,0.207707,0.355484,-3.8,-4.2,...,-3.400000,-0.7,0.046177,0.103055,0.045117,0.092424,0.115289,0.070147,0.120335,0.328411
2,0.077448,0.194465,0.064785,0.237387,0.194620,0.073367,0.149128,0.133784,-2.3,-0.1,...,-3.700000,-0.4,0.039217,0.194465,0.042872,0.099625,0.112484,0.075111,0.117784,0.121668
3,0.067319,0.047449,0.059073,0.231989,0.189072,0.114928,0.169648,0.134479,-6.5,-2.7,...,-7.500000,-2.3,0.025179,0.047449,0.034920,0.083096,0.104712,0.040880,0.197496,0.062479
4,0.081366,0.080877,0.077610,0.232337,0.196834,0.111117,0.162888,0.125176,-4.8,-0.9,...,-8.300000,-3.1,0.039036,0.080877,0.053348,0.163261,0.111635,0.101242,0.094598,0.144753
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97431,0.106791,0.071546,0.155707,0.266451,0.211119,0.098958,0.159071,0.157733,-5.1,-2.1,...,-24.600000,-5.6,0.023890,0.071546,0.109744,0.190693,0.103692,0.088115,0.135301,0.012329
97432,0.099065,0.049921,0.133889,0.277358,0.207044,0.101572,0.149023,0.141199,-9.8,-3.4,...,-27.500000,-7.6,0.019232,0.049921,0.048357,0.148788,0.101348,0.012747,0.126926,0.112055
97433,0.105998,0.035708,0.151451,0.244402,0.210563,0.103103,0.183708,0.091269,-8.0,-1.5,...,-18.500000,-3.3,0.037524,0.035708,0.108178,0.111214,0.111301,0.091882,0.185176,0.091269
97434,0.080427,0.339341,0.159152,0.224620,0.194920,0.129650,0.187160,0.114606,-0.9,-2.0,...,-17.700001,-4.5,0.038175,0.274887,0.117673,0.162247,0.110500,0.111933,0.234785,0.114606


In [11]:
MitData: Dict[str, pd.DataFrame] = {'thermo': ThermoDynamicDF, 'sequence': SequenceDF, 'target': TargetDF}

# Rows with a missing target cannot be used for fitting; remove the same index
# labels from all three frames so features and target stay row-aligned.
rows_without_target = TargetDF[TargetDF['target'].isnull()].index

# The loop variable is deliberately not named `data`: at notebook scope that
# would overwrite the sensor DataFrame loaded in the first pre-processing cell.
# list(...) snapshots the items so reassigning values while looping is safe.
for key, frame in list(MitData.items()):
    MitData[key] = frame.drop(index=rows_without_target)
    print(MitData[key].shape)

(60983, 30)
(60983, 740)
(60983, 1)


### Models and Training

In [12]:
def compare_data_types(data: Dict[str, pd.DataFrame], data_name: str,
                       cv: int = 5, random_state: int = 0) -> Dict[str, np.ndarray]:
    """Cross-validate a fixed panel of regressors on each feature view of a dataset.

    Parameters
    ----------
    data : dict of str -> DataFrame
        Must provide the feature frames 'thermo' and 'sequence' and a 'target'
        frame; the target values are flattened to 1-D before fitting.
    data_name : str
        Label used as the first component of every key in the result.
    cv : int, default 5
        Forwarded to ``cross_val_score``. 5 is also what scikit-learn uses
        when ``cv`` is omitted, so existing callers get the same fold count.
    random_state : int, default 0
        Seed passed to every estimator that accepts one, so repeated runs of
        the notebook give the same scores.

    Returns
    -------
    dict of str -> ndarray
        Maps ``'<data_name>-<data_type>-<model_name>'`` to the array of
        per-fold scores returned by ``cross_val_score``. No ``scoring`` is
        given, so each estimator's own ``score`` method is used (R^2 for
        scikit-learn regressors).
    """
    models = {
        'SVM': LinearSVR(max_iter=10_000, dual=True, random_state=random_state),
        'Decision Tree': DecisionTreeRegressor(random_state=random_state),
        'MLP': MLPRegressor(activation='relu',
                            hidden_layer_sizes=(50, 50, 50),
                            learning_rate='adaptive',
                            max_iter=1000,
                            solver='adam',
                            random_state=random_state),
        'Random Forest': RandomForestRegressor(random_state=random_state),
        'Linear': LinearRegression(n_jobs=-1),
        'SGD': SGDRegressor(random_state=random_state),
        'Bayesian Ridge': BayesianRidge()
    }

    y = data['target'].values.ravel()
    r_val = {}
    for data_type in ['thermo', 'sequence']:
        x = data[data_type]
        assert isinstance(x, pd.DataFrame)
        for model_name, model in models.items():
            print(f'Training {model_name} on {data_type} data...')
            # cross_val_score clones the estimator for every fold, so sharing
            # one instance across both data types does not leak fitted state.
            # NOTE(review): an integer `cv` gives consecutive, unshuffled folds;
            # if the rows are grouped (e.g. by sensor family) consider passing
            # a shuffled splitter instead.
            r_val[f'{data_name}-{data_type}-{model_name}'] = cross_val_score(
                model, x, y, cv=cv, n_jobs=-1)

    return r_val

In [13]:
# Per-fold scores for every model / data-type combination on the MIT data
scores: Dict[str, np.ndarray] = compare_data_types(data=MitData, data_name='MIT')

Training SVM on thermo data...




Training Decision Tree on thermo data...
Training MLP on thermo data...
Training Random Forest on thermo data...
Training Linear on thermo data...
Training SGD on thermo data...
Training Bayesian Ridge on thermo data...
Training SVM on sequence data...




Training Decision Tree on sequence data...
Training MLP on sequence data...
Training Random Forest on sequence data...
Training Linear on sequence data...
Training SGD on sequence data...
Training Bayesian Ridge on sequence data...


In [14]:
# Same comparison on our own data; results are merged into the MIT scores
# under keys prefixed with 'Fernandez'.
fernandez_scores = compare_data_types(data=OurData, data_name='Fernandez')
scores.update(fernandez_scores)

Training SVM on thermo data...




Training Decision Tree on thermo data...
Training MLP on thermo data...




Training Random Forest on thermo data...
Training Linear on thermo data...
Training SGD on thermo data...
Training Bayesian Ridge on thermo data...
Training SVM on sequence data...
Training Decision Tree on sequence data...
Training MLP on sequence data...
Training Random Forest on sequence data...
Training Linear on sequence data...
Training SGD on sequence data...
Training Bayesian Ridge on sequence data...


### Cleaning scores

In [19]:
# One row per '<dataset>-<data type>-<model>' key, one column per CV fold.
fold_scores = pd.DataFrame.from_dict(scores).T

# Number the folds from 1 instead of 0, however many folds there are
fold_scores = fold_scores.rename(columns={fold: fold + 1 for fold in fold_scores.columns})

# The keys were built as f'{data_name}-{data_type}-{model_name}', so split
# them back into their three components instead of substring-matching.
# (Assumes neither the dataset label nor the data type contains a '-'.)
key_parts = fold_scores.index.to_series().str.split('-', n=2, expand=True)
if key_parts.shape[1] != 3 or key_parts.isna().to_numpy().any():
    raise ValueError("score keys must look like '<dataset>-<data type>-<model>'")
key_parts.columns = ['Dataset', 'Data Type', 'Model Type']

# Human-readable names for the two feature views
data_type_labels = {'thermo': 'Thermodynamic Parameters', 'sequence': 'Sequences'}
key_parts['Data Type'] = key_parts['Data Type'].replace(data_type_labels)

# Building the label columns as strings from the start avoids writing strings
# into integer-initialised columns, which recent pandas versions deprecate.
# NOTE: this rebinds `scores` from the dict produced by the training cells to
# a DataFrame, so those cells must be re-run before running this cell again.
scores = pd.concat([fold_scores, key_parts], axis=1).reset_index(drop=True)

In [72]:
# mode='x' is exclusive creation: the export fails rather than overwriting a
# comparison file written by an earlier run.
comparison_csv = 'data/comparison.csv'
try:
    scores.to_csv(comparison_csv, mode='x')
except FileExistsError:
    print('File already exists, not overwriting it.')