# Identification of enhancers case study 

This section will present a comparative analysis to demonstrate the application and performance of proPythia for addressing sequence-based prediction problems.

We'll try to replicate one of the [BioSeq-Analysis](https://academic.oup.com/nar/article/47/20/e127/5559689?login=true) case studies for identifying [enhancers](https://academic.oup.com/bioinformatics/article/32/3/362/1744331?login=true#btv604-M1).

In [138]:
%load_ext autoreload

import sys
import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.preprocessing import StandardScaler

sys.path.append('../../../../src/')
from propythia.shallow_ml import ShallowML

from descriptors import DNADescriptor

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [139]:
import csv

def write_dict_to_csv(d: dict, filename: str):
    """
    Writes a dictionary to a csv file.
    """
    with open(filename, 'w') as csv_file:
        writer = csv.writer(csv_file)
        headers = ["id", "sequence"]
        writer.writerow(headers)
        for key, val in d.items():
            writer.writerow([key, val])

In [140]:
from sequence import ReadDNA

dna = ReadDNA()
dna.read_fasta_in_folder('enhancer_dataset')
for i in dna.d:
    write_dict_to_csv(dna.d[i], f'enhancer_dataset/{i}.csv')

This dataset has **742** weak enhancers, **742** strong enhancers, and **1484** non-enhancers.

In [141]:
strong_file =  r'enhancer_dataset/strong.csv'
weak_file =  r'enhancer_dataset/weak.csv'
non_file =  r'enhancer_dataset/non-enhancers.csv'

strong = pd.read_csv(strong_file)
weak = pd.read_csv(weak_file)
non = pd.read_csv(non_file)

print('strong', strong.shape)
print('weak', weak.shape)
print('non', non.shape)

strong (742, 2)
weak (742, 2)
non (1484, 2)


To calculate features, and to be more easy, we create a function to calculate features, calculating all available DNA features.

In [142]:
# List of descriptors to be calculed. If empty, all descriptors will be calculated.
specifics = ["binary"]

In [143]:
def calculate_feature(data):
    list_feature = []
    count = 0
    for seq in data['sequence']:
        res = {'sequence': seq}
        dna = DNADescriptor(seq)
        feature = dna.get_descriptors(specifics=specifics)
        res.update(feature)
        list_feature.append(res)
        
        # print progress every 100 sequences
        if count % 100 == 0:
            print(count, '/', len(data))

        count += 1
    print("Done!")
    df = pd.DataFrame(list_feature)
    return df

strong_feature = calculate_feature(strong)
weak_feature = calculate_feature(weak)
non_feature = calculate_feature(non)

0 / 742
100 / 742
200 / 742
300 / 742
400 / 742
500 / 742
600 / 742
700 / 742
Done!
0 / 742
100 / 742
200 / 742
300 / 742
400 / 742
500 / 742
600 / 742
700 / 742
Done!
0 / 1484
100 / 1484
200 / 1484
300 / 1484
400 / 1484
500 / 1484
600 / 1484
700 / 1484
800 / 1484
900 / 1484
1000 / 1484
1100 / 1484
1200 / 1484
1300 / 1484
1400 / 1484
Done!


- In the dataframe, each row is a sequence and each column is a feature.
- There are 19 different features for each sequence.

In [144]:
# put labels for each dataset   
strong_feature['label'] = 2
weak_feature['label'] = 1
non_feature['label'] = 0

print(strong_feature.shape)
print(weak_feature.shape)
print(non_feature.shape)

(742, 22)
(742, 22)
(1484, 22)


In [145]:
dataset = pd.concat([strong_feature, weak_feature, non_feature])

fps_y = dataset['label']
fps_x = dataset.loc[:, dataset.columns != 'label']
fps_x = fps_x.loc[:, fps_x.columns != 'sequence']

print(fps_x.shape)

(2968, 20)


Unnamed: 0,sequence,length,gc_content,at_content,nucleic_acid_composition,enhanced_nucleic_acid_composition,dinucleotide_composition,trinucleotide_composition,k_spaced_nucleic_acid_pairs,kmer,...,DAC,DCC,DACC,TAC,TCC,TACC,PseDNC,PseKNC,binary,label
0,CACAATGTAGAAGCAGAGACACAGGAACCAGGCTTGGTGATGGCTC...,200,0.505,0.495,"{'A': 44, 'C': 36, 'G': 65, 'T': 55}","[{'A': 3, 'C': 2, 'G': 0, 'T': 0}, {'A': 3, 'C...","{'AA': 6, 'AC': 9, 'AG': 19, 'AT': 10, 'CA': 1...","{'AAA': 0, 'AAC': 1, 'AAG': 4, 'AAT': 1, 'ACA'...","{'AA': 6, 'AC': 9, 'AG': 19, 'AT': 10, 'CA': 1...","{'AA': 6, 'AC': 9, 'AG': 19, 'AT': 10, 'CA': 1...",...,"[[-0.382, -0.12, 0.021, 0.018]]","[[-0.288, -0.216, 0.055, -0.006]]","[[-0.382, -0.12, 0.021, 0.018, -0.288, -0.216,...","[[-0.048, 0.336, -0.15, 0.03]]","[[-0.072, -0.024, 0.083, -0.157]]","[[-0.048, 0.336, -0.15, 0.03, -0.072, -0.024, ...","{'AA': 0.025, 'AC': 0.037, 'AG': 0.078, 'AT': ...","{'AAA': 0.0, 'AAC': 0.002, 'AAG': 0.01, 'AAT':...","[[0, 1, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [1,...",2
1,GCCCTCACATTCCCTGGCCCATCCCCTCCACCTCAAAATTTACAAA...,200,0.570,0.430,"{'A': 57, 'C': 46, 'G': 68, 'T': 29}","[{'A': 0, 'C': 3, 'G': 1, 'T': 1}, {'A': 0, 'C...","{'AA': 20, 'AC': 11, 'AG': 18, 'AT': 8, 'CA': ...","{'AAA': 10, 'AAC': 1, 'AAG': 6, 'AAT': 3, 'ACA...","{'AA': 20, 'AC': 11, 'AG': 18, 'AT': 8, 'CA': ...","{'AA': 20, 'AC': 11, 'AG': 18, 'AT': 8, 'CA': ...",...,"[[-0.285, -0.066, 0.075, -0.023]]","[[-0.263, -0.145, 0.01, 0.071]]","[[-0.285, -0.066, 0.075, -0.023, -0.263, -0.14...","[[0.182, 0.49, 0.028, 0.051]]","[[-0.238, -0.372, -0.012, -0.066]]","[[0.182, 0.49, 0.028, 0.051, -0.238, -0.372, -...","{'AA': 0.082, 'AC': 0.045, 'AG': 0.074, 'AT': ...","{'AAA': 0.026, 'AAC': 0.003, 'AAG': 0.016, 'AA...","[[0, 0, 1, 0], [0, 1, 0, 0], [0, 1, 0, 0], [0,...",2
2,GAGCAGGAGGCCAGTCACCCTGAGTCAGCCACGGGGAGACGCTGCA...,200,0.640,0.360,"{'A': 52, 'C': 71, 'G': 57, 'T': 20}","[{'A': 2, 'C': 1, 'G': 2, 'T': 0}, {'A': 2, 'C...","{'AA': 12, 'AC': 16, 'AG': 23, 'AT': 0, 'CA': ...","{'AAA': 2, 'AAC': 6, 'AAG': 4, 'AAT': 0, 'ACA'...","{'AA': 12, 'AC': 16, 'AG': 23, 'AT': 0, 'CA': ...","{'AA': 12, 'AC': 16, 'AG': 23, 'AT': 0, 'CA': ...",...,"[[-0.414, -0.14, 0.121, 0.072]]","[[-0.266, -0.304, 0.101, 0.075]]","[[-0.414, -0.14, 0.121, 0.072, -0.266, -0.304,...","[[0.09, 0.238, 0.039, -0.033]]","[[-0.158, -0.157, -0.044, 0.015]]","[[0.09, 0.238, 0.039, -0.033, -0.158, -0.157, ...","{'AA': 0.05, 'AC': 0.066, 'AG': 0.095, 'AT': 0...","{'AAA': 0.005, 'AAC': 0.015, 'AAG': 0.01, 'AAT...","[[0, 0, 1, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0,...",2
3,CCTCTGCTGAGAACAGGACTGGGGCTTCCAGGGCAACAGGAAGGGT...,200,0.570,0.430,"{'A': 47, 'C': 48, 'G': 66, 'T': 39}","[{'A': 0, 'C': 3, 'G': 0, 'T': 2}, {'A': 0, 'C...","{'AA': 13, 'AC': 10, 'AG': 19, 'AT': 5, 'CA': ...","{'AAA': 1, 'AAC': 4, 'AAG': 7, 'AAT': 1, 'ACA'...","{'AA': 13, 'AC': 10, 'AG': 19, 'AT': 5, 'CA': ...","{'AA': 13, 'AC': 10, 'AG': 19, 'AT': 5, 'CA': ...",...,"[[-0.21, -0.077, -0.122, -0.112]]","[[-0.203, -0.122, -0.058, -0.112]]","[[-0.21, -0.077, -0.122, -0.112, -0.203, -0.12...","[[-0.074, 0.223, 0.005, 0.005]]","[[-0.159, -0.163, -0.118, -0.071]]","[[-0.074, 0.223, 0.005, 0.005, -0.159, -0.163,...","{'AA': 0.055, 'AC': 0.043, 'AG': 0.081, 'AT': ...","{'AAA': 0.003, 'AAC': 0.011, 'AAG': 0.02, 'AAT...","[[0, 1, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1], [0,...",2
4,ACAGCCTTAAAGGGAGCTTTTCAGGGACCTCTGGCCAGTGGGGGAT...,200,0.585,0.415,"{'A': 43, 'C': 52, 'G': 65, 'T': 40}","[{'A': 2, 'C': 2, 'G': 1, 'T': 0}, {'A': 1, 'C...","{'AA': 8, 'AC': 7, 'AG': 24, 'AT': 3, 'CA': 13...","{'AAA': 3, 'AAC': 1, 'AAG': 3, 'AAT': 1, 'ACA'...","{'AA': 8, 'AC': 7, 'AG': 24, 'AT': 3, 'CA': 13...","{'AA': 8, 'AC': 7, 'AG': 24, 'AT': 3, 'CA': 13...",...,"[[-0.164, -0.042, -0.017, -0.014]]","[[-0.081, -0.13, -0.036, -0.006]]","[[-0.164, -0.042, -0.017, -0.014, -0.081, -0.1...","[[0.037, 0.386, 0.054, 0.069]]","[[-0.044, -0.105, 0.032, -0.051]]","[[0.037, 0.386, 0.054, 0.069, -0.044, -0.105, ...","{'AA': 0.035, 'AC': 0.031, 'AG': 0.105, 'AT': ...","{'AAA': 0.009, 'AAC': 0.003, 'AAG': 0.009, 'AA...","[[1, 0, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0], [0,...",2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1479,TGAGGACCTACTTTGTGCTAGGAACTTTATGTAAATGTGTTCTTTT...,200,0.295,0.705,"{'A': 63, 'C': 32, 'G': 27, 'T': 78}","[{'A': 1, 'C': 0, 'G': 3, 'T': 1}, {'A': 2, 'C...","{'AA': 23, 'AC': 12, 'AG': 7, 'AT': 20, 'CA': ...","{'AAA': 8, 'AAC': 5, 'AAG': 2, 'AAT': 7, 'ACA'...","{'AA': 23, 'AC': 12, 'AG': 7, 'AT': 20, 'CA': ...","{'AA': 23, 'AC': 12, 'AG': 7, 'AT': 20, 'CA': ...",...,"[[-0.425, -0.286, 0.061, 0.053]]","[[-0.358, -0.375, 0.13, 0.028]]","[[-0.425, -0.286, 0.061, 0.053, -0.358, -0.375...","[[0.261, 0.564, 0.134, 0.132]]","[[-0.088, -0.185, -0.044, -0.066]]","[[0.261, 0.564, 0.134, 0.132, -0.088, -0.185, ...","{'AA': 0.088, 'AC': 0.046, 'AG': 0.027, 'AT': ...","{'AAA': 0.017, 'AAC': 0.011, 'AAG': 0.004, 'AA...","[[0, 0, 0, 1], [0, 0, 1, 0], [1, 0, 0, 0], [0,...",0
1480,ACTATACTTTACTTAGCTCCTTTTGGACTGCTGTTGGCTTCAAGTA...,200,0.390,0.610,"{'A': 72, 'C': 53, 'G': 25, 'T': 50}","[{'A': 2, 'C': 1, 'G': 0, 'T': 2}, {'A': 2, 'C...","{'AA': 19, 'AC': 29, 'AG': 12, 'AT': 12, 'CA':...","{'AAA': 5, 'AAC': 5, 'AAG': 5, 'AAT': 4, 'ACA'...","{'AA': 19, 'AC': 29, 'AG': 12, 'AT': 12, 'CA':...","{'AA': 19, 'AC': 29, 'AG': 12, 'AT': 12, 'CA':...",...,"[[-0.61, -0.266, 0.288, 0.168]]","[[-0.429, -0.414, 0.171, 0.193]]","[[-0.61, -0.266, 0.288, 0.168, -0.429, -0.414,...","[[0.114, 0.306, -0.017, 0.104]]","[[-0.083, -0.189, -0.034, -0.141]]","[[0.114, 0.306, -0.017, 0.104, -0.083, -0.189,...","{'AA': 0.075, 'AC': 0.114, 'AG': 0.047, 'AT': ...","{'AAA': 0.011, 'AAC': 0.011, 'AAG': 0.011, 'AA...","[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 0, 1], [1,...",0
1481,CATGTATGCATTTTATTTAAAGATAAATCTTATTTAAAAATAAACT...,200,0.260,0.740,"{'A': 68, 'C': 26, 'G': 26, 'T': 80}","[{'A': 1, 'C': 1, 'G': 1, 'T': 2}, {'A': 2, 'C...","{'AA': 21, 'AC': 7, 'AG': 13, 'AT': 27, 'CA': ...","{'AAA': 8, 'AAC': 3, 'AAG': 3, 'AAT': 7, 'ACA'...","{'AA': 21, 'AC': 7, 'AG': 13, 'AT': 27, 'CA': ...","{'AA': 21, 'AC': 7, 'AG': 13, 'AT': 27, 'CA': ...",...,"[[-0.366, -0.235, 0.097, -0.034]]","[[-0.269, -0.342, 0.095, 0.077]]","[[-0.366, -0.235, 0.097, -0.034, -0.269, -0.34...","[[0.112, 0.701, 0.002, 0.272]]","[[-0.104, 0.119, -0.05, -0.031]]","[[0.112, 0.701, 0.002, 0.272, -0.104, 0.119, -...","{'AA': 0.079, 'AC': 0.026, 'AG': 0.049, 'AT': ...","{'AAA': 0.017, 'AAC': 0.006, 'AAG': 0.006, 'AA...","[[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0,...",0
1482,AACTCTCATATTAAATAATCGTTATGCTGAATACAGAGCTCTAGGC...,200,0.360,0.640,"{'A': 60, 'C': 42, 'G': 30, 'T': 68}","[{'A': 2, 'C': 2, 'G': 0, 'T': 1}, {'A': 1, 'C...","{'AA': 18, 'AC': 11, 'AG': 7, 'AT': 24, 'CA': ...","{'AAA': 8, 'AAC': 2, 'AAG': 2, 'AAT': 6, 'ACA'...","{'AA': 18, 'AC': 11, 'AG': 7, 'AT': 24, 'CA': ...","{'AA': 18, 'AC': 11, 'AG': 7, 'AT': 24, 'CA': ...",...,"[[-0.445, -0.231, 0.006, -0.039]]","[[-0.289, -0.4, -0.094, 0.06]]","[[-0.445, -0.231, 0.006, -0.039, -0.289, -0.4,...","[[0.057, 0.561, 0.119, 0.201]]","[[-0.304, -0.248, -0.065, -0.095]]","[[0.057, 0.561, 0.119, 0.201, -0.304, -0.248, ...","{'AA': 0.07, 'AC': 0.043, 'AG': 0.027, 'AT': 0...","{'AAA': 0.017, 'AAC': 0.004, 'AAG': 0.004, 'AA...","[[1, 0, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0,...",0


In [146]:
no_need_normalization = ["length", "at_content", "gc_content"]

need_dict_normalization = ["nucleic_acid_composition", "enhanced_nucleic_acid_composition","dinucleotide_composition","trinucleotide_composition","k_spaced_nucleic_acid_pairs","kmer","PseDNC", "PseKNC"]

need_list_normalization = ["nucleotide_chemical_property", "accumulated_nucleotide_frequency", "DAC", "DCC", "DACC", "TAC","TCC","TACC", "binary"]

def normalize_dict(d, field):
    df = pd.json_normalize(d)
    df.columns = [str(field) + "_" + str(i) for i in df.columns]
    
    for f in df.columns:
        if isinstance(df[f][0], dict):
            df = pd.concat([df, normalize_dict(df[f], f)], axis=1)
            df.drop(f, axis=1, inplace=True)
    return df

def normalize_list(l, field):
    df = pd.DataFrame(l.to_list())
    df.columns = [str(field) + "_" + str(i) for i in df.columns]
    
    for f in df.columns:
        if isinstance(df[f][0], list):
            df = pd.concat([df, normalize_list(df[f], f)], axis=1)
            df.drop(f, axis=1, inplace=True)
    return df

new_fps_x = pd.DataFrame()

for col in fps_x.columns:
    if col in need_dict_normalization:
        new_fps_x = pd.concat([new_fps_x, normalize_dict(fps_x[col], col)], axis=1)
    elif col in need_list_normalization:
        new_fps_x = pd.concat([new_fps_x, normalize_list(fps_x[col], col)], axis=1)
    else:
        new_fps_x[col] = fps_x[col].to_numpy()
        
new_fps_x

Unnamed: 0,length,gc_content,at_content,nucleic_acid_composition_A,nucleic_acid_composition_C,nucleic_acid_composition_G,nucleic_acid_composition_T,enhanced_nucleic_acid_composition_0_A,enhanced_nucleic_acid_composition_0_C,enhanced_nucleic_acid_composition_0_G,...,binary_197_2,binary_197_3,binary_198_0,binary_198_1,binary_198_2,binary_198_3,binary_199_0,binary_199_1,binary_199_2,binary_199_3
0,200,0.505,0.495,44,36,65,55,3,2,0,...,0,0,0,0,0,1,0,0,0,1
1,200,0.570,0.430,57,46,68,29,0,3,1,...,0,0,0,1,0,0,0,1,0,0
2,200,0.640,0.360,52,71,57,20,2,1,2,...,0,0,0,1,0,0,1,0,0,0
3,200,0.570,0.430,47,48,66,39,0,3,0,...,0,0,1,0,0,0,0,0,1,0
4,200,0.585,0.415,43,52,65,40,2,2,1,...,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2963,200,0.295,0.705,63,32,27,78,1,0,3,...,0,1,1,0,0,0,1,0,0,0
2964,200,0.390,0.610,72,53,25,50,2,1,0,...,0,0,1,0,0,0,0,0,0,1
2965,200,0.260,0.740,68,26,26,80,1,1,1,...,0,0,1,0,0,0,0,0,1,0
2966,200,0.360,0.640,60,42,30,68,2,2,0,...,0,0,0,1,0,0,0,0,0,1


In [147]:
X_train, X_test, y_train, y_test = train_test_split(new_fps_x, fps_y, stratify=fps_y)

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# open a ShallowML object
ml = ShallowML(X_train, X_test, y_train, y_test, report_name=None, columns_names=new_fps_x.columns)

# define param grid as article, here we will search in 100, 200 and 500 estimators
param_grid = [{'clf__n_estimators': [100, 200, 500], 'clf__max_features': ['sqrt']}]

# rain_best_model will perform a GRIDSEARCHCV optimizing MCC with a cv = 10
best_rf_model_enhancers = ml.train_best_model(model_name=None, model='rf', score=make_scorer(matthews_corrcoef), param_grid=param_grid, cv=10)

performing gridSearch...
GridSearchCV took 23.11 seconds for 3 candidate parameter settings.
GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('scl', None),
                                       ('clf',
                                        RandomForestClassifier(random_state=1))]),
             n_jobs=10,
             param_grid=[{'clf__max_features': ['sqrt'],
                          'clf__n_estimators': [100, 200, 500]}],
             scoring=make_scorer(matthews_corrcoef))
Model with rank: 1
 Mean validation score: 0.417 (std: 0.030)
 Parameters: {'clf__max_features': 'sqrt', 'clf__n_estimators': 500}
 

Model with rank: 2
 Mean validation score: 0.405 (std: 0.041)
 Parameters: {'clf__max_features': 'sqrt', 'clf__n_estimators': 200}
 

Model with rank: 3
 Mean validation score: 0.403 (std: 0.039)
 Parameters: {'clf__max_features': 'sqrt', 'clf__n_estimators': 100}
 

make_scorer(matthews_corrcoef)
10
Best score (scorer: make_scorer(matthews_corrcoef)) and parameters 

In [148]:
scores, report, cm, cm2 = ml.score_testset(best_rf_model_enhancers)
print(report)
print(cm)  
scores

              precision    recall  f1-score   support

           0       0.67      0.86      0.75       371
           1       0.35      0.06      0.11       185
           2       0.51      0.64      0.57       186

    accuracy                           0.61       742
   macro avg       0.51      0.52      0.48       742
weighted avg       0.55      0.61      0.55       742

[[318   9  44]
 [102  12  71]
 [ 54  13 119]]


{'Accuracy': 0.605121293800539,
 'MCC': 0.3526646140159632,
 'log_loss': 0.8517542375525492,
 'f1 score weighted': 0.5457032916782006,
 'f1 score macro': 0.47630614321868264,
 'f1 score micro': 0.605121293800539,
 'roc_auc ovr': 0.7616846494737284,
 'roc_auc ovo': 0.7371287769102848,
 'precision': 0.550919939911075,
 'recall': 0.605121293800539}

In [149]:
# SVM MODEL

# best_rf_model_enhancers = ml.train_best_model(model_name=None,model='svm', scaler=None,
#                 score=make_scorer(matthews_corrcoef),
#                 cv=10, optType='gridSearch', param_grid=None,
#                 n_jobs=10,
#                 random_state=1, n_iter=15, refit=True)