# Identification of enhancers case study 

This section will present a comparative analysis to demonstrate the application and performance of proPythia for addressing sequence-based prediction problems.

We'll try to replicate one of the [BioSeq-Analysis](https://academic.oup.com/nar/article/47/20/e127/5559689?login=true) case studies for identifying [enhancers](https://academic.oup.com/bioinformatics/article/32/3/362/1744331?login=true#btv604-M1).

In [2]:
%load_ext autoreload

import sys
import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.preprocessing import StandardScaler

sys.path.append('../../../../src/')
from propythia.shallow_ml import ShallowML

from descriptors import DNADescriptor

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import csv

def write_dict_to_csv(d: dict, filename: str):
    """
    Writes a dictionary to a csv file.
    """
    with open(filename, 'w') as csv_file:
        writer = csv.writer(csv_file)
        headers = ["id", "sequence"]
        writer.writerow(headers)
        for key, val in d.items():
            writer.writerow([key, val])

In [4]:
from sequence import ReadDNA

dna = ReadDNA()
dna.read_fasta_in_folder('enhancer_dataset')
for i in dna.d:
    write_dict_to_csv(dna.d[i], f'enhancer_dataset/{i}.csv')

This dataset has **742** weak enhancers, **742** strong enhancers, and **1484** non-enhancers.

In [5]:
strong_file =  r'enhancer_dataset/strong.csv'
weak_file =  r'enhancer_dataset/weak.csv'
non_file =  r'enhancer_dataset/non-enhancers.csv'

strong = pd.read_csv(strong_file)
weak = pd.read_csv(weak_file)
non = pd.read_csv(non_file)

print('strong', strong.shape)
print('weak', weak.shape)
print('non', non.shape)

strong (742, 2)
weak (742, 2)
non (1484, 2)


To calculate features, and to be more easy, we create a function to calculate features, calculating all available DNA features.

In [6]:
def calculate_feature(data):
    list_feature = []
    count = 0
    for seq in data['sequence']:
        res = {'sequence': seq}
        dna = DNADescriptor(seq)
        feature = dna.get_all_descriptors()
        res.update(feature)
        list_feature.append(res)
        
        # print progress every 100 sequences
        if count % 100 == 0:
            print(count, '/', len(data))

        count += 1
    print("Done!")
    df = pd.DataFrame(list_feature)
    return df

strong_feature = calculate_feature(strong)
weak_feature = calculate_feature(weak)
non_feature = calculate_feature(non)

0 / 742
100 / 742
200 / 742
300 / 742
400 / 742
500 / 742
600 / 742
700 / 742
Done!
0 / 742
100 / 742
200 / 742
300 / 742
400 / 742
500 / 742
600 / 742
700 / 742
Done!
0 / 1484
100 / 1484
200 / 1484
300 / 1484
400 / 1484
500 / 1484
600 / 1484
700 / 1484
800 / 1484
900 / 1484
1000 / 1484
1100 / 1484
1200 / 1484
1300 / 1484
1400 / 1484
Done!


- In the dataframe, each row is a sequence and each column is a feature.
- There are 19 different features for each sequence.

In [7]:
# put labels for each dataset   
strong_feature['label'] = 2
weak_feature['label'] = 1
non_feature['label'] = 0

print(strong_feature.shape)
print(weak_feature.shape)
print(non_feature.shape)

(742, 21)
(742, 21)
(1484, 21)


In [8]:
dataset = pd.concat([strong_feature, weak_feature, non_feature])

fps_y = dataset['label']
fps_x = dataset.loc[:, dataset.columns != 'label']
fps_x = fps_x.loc[:, fps_x.columns != 'sequence']

print(fps_x.shape)

(2968, 19)


In [9]:
no_need_normalization = ["length", "at_content", "gc_content"]

need_dict_normalization = ["nucleic_acid_composition", "enhanced_nucleic_acid_composition","dinucleotide_composition","trinucleotide_composition","k_spaced_nucleic_acid_pairs","kmer","PseDNC", "PseKNC"]

need_list_normalization = ["nucleotide_chemical_property", "accumulated_nucleotide_frequency", "DAC", "DCC", "DACC", "TAC","TCC","TACC"]

def normalize_dict(d, field):
    df = pd.json_normalize(d)
    df.columns = [str(field) + "_" + str(i) for i in df.columns]
    
    for f in df.columns:
        if isinstance(df[f][0], dict):
            df = pd.concat([df, normalize_dict(df[f], f)], axis=1)
            df.drop(f, axis=1, inplace=True)
    return df

def normalize_list(l, field):
    df = pd.DataFrame(l.to_list())
    df.columns = [str(field) + "_" + str(i) for i in df.columns]
    
    for f in df.columns:
        if isinstance(df[f][0], list):
            df = pd.concat([df, normalize_list(df[f], f)], axis=1)
            df.drop(f, axis=1, inplace=True)
    return df

new_fps_x = pd.DataFrame()

for col in fps_x.columns:
    if col in need_dict_normalization:
        new_fps_x = pd.concat([new_fps_x, normalize_dict(fps_x[col], col)], axis=1)
    elif col in need_list_normalization:
        new_fps_x = pd.concat([new_fps_x, normalize_list(fps_x[col], col)], axis=1)
    else:
        new_fps_x[col] = fps_x[col].to_numpy()

field nucleic_acid_composition
d 0       {'A': 44, 'C': 36, 'G': 65, 'T': 55}
1       {'A': 57, 'C': 46, 'G': 68, 'T': 29}
2       {'A': 52, 'C': 71, 'G': 57, 'T': 20}
3       {'A': 47, 'C': 48, 'G': 66, 'T': 39}
4       {'A': 43, 'C': 52, 'G': 65, 'T': 40}
                        ...                 
1479    {'A': 63, 'C': 32, 'G': 27, 'T': 78}
1480    {'A': 72, 'C': 53, 'G': 25, 'T': 50}
1481    {'A': 68, 'C': 26, 'G': 26, 'T': 80}
1482    {'A': 60, 'C': 42, 'G': 30, 'T': 68}
1483    {'A': 72, 'C': 33, 'G': 53, 'T': 42}
Name: nucleic_acid_composition, Length: 2968, dtype: object
field enhanced_nucleic_acid_composition
d 0       [{'A': 3, 'C': 2, 'G': 0, 'T': 0}, {'A': 3, 'C...
1       [{'A': 0, 'C': 3, 'G': 1, 'T': 1}, {'A': 0, 'C...
2       [{'A': 2, 'C': 1, 'G': 2, 'T': 0}, {'A': 2, 'C...
3       [{'A': 0, 'C': 3, 'G': 0, 'T': 2}, {'A': 0, 'C...
4       [{'A': 2, 'C': 2, 'G': 1, 'T': 0}, {'A': 1, 'C...
                              ...                        
1479    [{'A': 1, 'C':

In [10]:
X_train, X_test, y_train, y_test = train_test_split(new_fps_x, fps_y, stratify=fps_y)

# standard scaler article does not refer scaling and do not validate in x_test, however, we do it anyway

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# open a ShallowML object
ml = ShallowML(X_train, X_test, y_train, y_test, report_name=None, columns_names=new_fps_x.columns)

# define param grid as article, here we will search in 100, 200 and 500 estimators
param_grid = [{'clf__n_estimators': [100, 200, 500], 'clf__max_features': ['sqrt']}]

# rain_best_model will perform a GRIDSEARCHCV optimizing MCC with a cv = 10
best_rf_model_enhancers = ml.train_best_model(model_name=None, model='rf', score=make_scorer(matthews_corrcoef), param_grid=param_grid, cv=10)

# best_rf_model_enhancers = ml.train_best_model(model_name=None,model='svm', scaler=None,
#                  score=make_scorer(matthews_corrcoef),
#                  cv=10, optType='gridSearch', param_grid=None,
#                  n_jobs=10,
#                  random_state=1, n_iter=15, refit=True)

performing gridSearch...
GridSearchCV took 22.24 seconds for 3 candidate parameter settings.
GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('scl', None),
                                       ('clf',
                                        RandomForestClassifier(random_state=1))]),
             n_jobs=10,
             param_grid=[{'clf__max_features': ['sqrt'],
                          'clf__n_estimators': [100, 200, 500]}],
             scoring=make_scorer(matthews_corrcoef))
Model with rank: 1
 Mean validation score: 0.409 (std: 0.025)
 Parameters: {'clf__max_features': 'sqrt', 'clf__n_estimators': 500}
 

Model with rank: 2
 Mean validation score: 0.401 (std: 0.019)
 Parameters: {'clf__max_features': 'sqrt', 'clf__n_estimators': 200}
 

Model with rank: 3
 Mean validation score: 0.395 (std: 0.034)
 Parameters: {'clf__max_features': 'sqrt', 'clf__n_estimators': 100}
 

make_scorer(matthews_corrcoef)
10
Best score (scorer: make_scorer(matthews_corrcoef)) and parameters 

KeyboardInterrupt: 

In [None]:
scores, report, cm, cm2 = ml.score_testset(best_rf_model_enhancers)
print(report)
print(cm)  
scores

              precision    recall  f1-score   support

           0       0.69      0.87      0.77       371
           1       0.39      0.05      0.09       185
           2       0.52      0.70      0.59       186

    accuracy                           0.62       742
   macro avg       0.53      0.54      0.48       742
weighted avg       0.57      0.62      0.56       742

[[324   6  41]
 [ 96   9  80]
 [ 48   8 130]]


{'Accuracy': 0.623989218328841,
 'MCC': 0.3917814923704463,
 'log_loss': 0.8292365194048797,
 'f1 score weighted': 0.5568926567744779,
 'f1 score macro': 0.4846173899895776,
 'f1 score micro': 0.623989218328841,
 'roc_auc ovr': 0.7777556648074809,
 'roc_auc ovo': 0.754091381354792,
 'precision': 0.5735473309279806,
 'recall': 0.623989218328841}