In [1]:
import os
import sys
import argparse
import pandas as pd
import numpy as np
from scipy import interpolate
from collections import Counter
import scipy.sparse as sp
import math
import json
import random
import pickle
import regex
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, mean_squared_error

"""
This script trains a MLP classifier to predict the binding affinity of a 30 bases sequence w.r.t. the obtained priming window.

The training data is obtained from the upstream 30 bases of the alignments of R1 in our paired-end read alignment results from the genome. The training data has the same orientation as its read2, so, it should contain a polyA site.
As reads might come from polyA tails, which are not included in the genome, we would expect that some of the input data come from intergenic regions and do not correspond to a polyA site. This is saying our labels are noisy, so we would not expect to obtain perfect accuracy.

We train a MLP classifier by reading the training data from each SRR and call `partial_fit` to update the model. 

"""

parser = json.load(open("params.json", "r"))
parent_dir = parser["parent_dir"]
PE_sheet = parser["PE_sheet"]
outdir = parser["outdir"]
random_seed = parser["random_seed"]
snr_len = parser["snr_len"]
snr_mismatch = parser["snr_mismatch"]

outdir = outdir
print(parser)

os.makedirs(outdir, exist_ok=True)


{'parent_dir': 'workflow_output', 'PE_sheet': 'sample_url_sheet.csv', 'outdir': 'mlp_model', 'random_seed': 1, 'snr_len': 6, 'snr_mismatch': 1}


In [2]:

encoder = OneHotEncoder(categories=[['A', 'C', 'G', 'T', 'N']] * 30, handle_unknown='ignore')

parent_dir = parent_dir
# 1----- Get the PE datasets spreadsheet
PE_sheet = pd.read_csv(PE_sheet)

# 2----- loop through GSE(s), combine all tlen from its SRR
# check if we have all datasets processed
missing_files = []

for (GSE, group_gse_lst) in PE_sheet.groupby('GSE'):
    SRR_lst = group_gse_lst['SRR']
    for SRR in SRR_lst:
        polya_path = os.path.join(
            parent_dir, "process_data", "frag_len_dist", GSE, SRR, "priming_site_seqs", "polya_seq.txt")

        if os.path.exists(polya_path):
            check_file = os.path.getsize(polya_path)
            if (check_file == 0):
                missing_files.append(f"{GSE}-{SRR}")
                error_occur = True
        else:
            missing_files.append(f"{GSE}-{SRR}")
            error_occur = True

if missing_files:
    raise ValueError(f"Please re-run the previous step, the output of following dataset(s) is either missing or empty: {missing_files}")


In [3]:

polya_mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, alpha=1e-4,
                    solver='adam', verbose=10, random_state=random_seed, shuffle=True,
                    learning_rate_init=.001)

held_out_polya = sp.csr_matrix((0, 150))
held_out_polya_bg = sp.csr_matrix((0, 150)) 

random.seed(random_seed)
train_list = PE_sheet.loc[PE_sheet['GSE'].isin(random.sample(PE_sheet['GSE'].unique().tolist(), 8))]
num_srr = {GSE: group_gse_lst.shape[0] for (GSE, group_gse_lst) in train_list.groupby('GSE')}
# initialize the sparse matrix 
training_batch = sp.csr_matrix((0, 150))
held_out_polya_bg = sp.csr_matrix((0, 150)) 

training_data_acc_list = []

# we group by GSE, then loop through SRRs to get the polya seq
for i, (GSE, group_gse_lst) in enumerate(train_list.groupby('GSE')):
    SRR_lst = group_gse_lst['SRR']
    num_held_out = 1000//len(SRR_lst)
    
    for SRR in SRR_lst:
        # The 30 bases are defined according to R2. So, they should always be polyA

        polya_path = os.path.join(
            parent_dir, "process_data", "frag_len_dist", GSE, SRR, "priming_site_seqs", "polya_seq.txt")
        bg_path = os.path.join(
            parent_dir, "process_data", "frag_len_dist", GSE, SRR, "priming_site_seqs", "polya_bg_seq.txt")

        # get polya and bg seq
        # we want to make sure the polya seq has a polyA six mer with at most one mismatch
        polya_batch = encoder.fit_transform([list(x.strip()) for x in open(polya_path).readlines() if len(x.strip()) == 30 and regex.search("A(" + 'A' * (snr_len-2)+ "){s<=" + str(snr_mismatch) +"}A", x.strip())])
        # polya_batch = encoder.fit_transform([list(x.strip()) for x in open(polya_path).readlines() if len(x.strip()) == 30])
        
        # if we do not have enough data, then skip this SRR
        if polya_batch.shape[0] < num_held_out * 2:
            print("Not enough training examples for", GSE, SRR)
            continue
        
        bg_batch = encoder.fit_transform([list(x.strip()) for x in open(bg_path).readlines() if len(x.strip()) == 30])
        
        # if we do not have enough data, then skip this SRR
        if bg_batch.shape[0] < num_held_out * 2:
            print("Not enough background examples for", GSE, SRR)
            continue
        
        # num_held_out = min(math.ceil(polya_batch.shape[0] * 0.1), 20)
        num_train_fg = polya_batch.shape[0]
        num_train_bg = min(num_train_fg, bg_batch.shape[0])

        polya_batch = polya_batch[np.random.randint(0, polya_batch.shape[0], size = num_train_fg), : ]
        bg_batch = bg_batch[np.random.randint(0, bg_batch.shape[0], size = num_train_bg), : ]

        # append held out data
        polya_held_out = polya_batch[:num_held_out,]
        bg_held_out = bg_batch[:num_held_out,]
        
        held_out_polya = sp.vstack([held_out_polya, polya_held_out])
        held_out_polya_bg = sp.vstack([held_out_polya_bg, bg_held_out])
        
        fg_shuf_id = np.arange((num_train_fg+num_train_bg-num_held_out*2))
        np.random.shuffle(fg_shuf_id)

        # build training data
        train_polya = sp.vstack([
            polya_batch[num_held_out:],
            bg_batch[num_held_out:]
        ])[fg_shuf_id,:]
        
        del polya_batch
        del bg_batch
        
        label_polya = np.hstack([
            np.ones(num_train_fg - num_held_out), 
            np.zeros(num_train_bg - num_held_out)
        ])[fg_shuf_id]
        
        # fit the model using the data from this SRR
        polya_mlp.partial_fit(
            train_polya,
            label_polya,
            classes=[0,1]
        )
        
        acc = accuracy_score(
            np.hstack(
                [np.ones(num_held_out), 
                np.zeros(num_held_out)
                ]
            ), 
            polya_mlp.predict(
                sp.vstack([
                    polya_held_out, 
                    bg_held_out]
                )
            )
        )
        training_data_acc_list.append(acc)
        # get the accuracy on the held out data
        print("Accuracy:", acc
        )

print("The mean accuracy on the training datasets is:", sum(training_data_acc_list)/len(training_data_acc_list))


Iteration 1, loss = 0.23050745
Accuracy: 0.918


Iteration 1, loss = 0.22794548
Accuracy: 0.9225352112676056


Iteration 1, loss = 0.23990190
Accuracy: 0.9084507042253521


Iteration 1, loss = 0.24675366
Accuracy: 0.8450704225352113


Iteration 1, loss = 0.24989827
Accuracy: 0.9154929577464789


Iteration 1, loss = 0.24654350
Accuracy: 0.9014084507042254


Iteration 1, loss = 0.23868496
Accuracy: 0.8802816901408451


Iteration 1, loss = 0.22755573
Accuracy: 0.9225352112676056


Iteration 1, loss = 0.23264007
Accuracy: 0.9014084507042254


Iteration 1, loss = 0.24192234
Accuracy: 0.9225352112676056


Iteration 1, loss = 0.23932190
Accuracy: 0.8943661971830986


Iteration 1, loss = 0.24625334
Accuracy: 0.9014084507042254


Iteration 1, loss = 0.24967864
Accuracy: 0.8943661971830986


Iteration 1, loss = 0.25003547
Accuracy: 0.9084507042253521


Iteration 1, loss = 0.22184379
Accuracy: 0.8802816901408451


Iteration 1, loss = 0.23905608
Accuracy: 0.9054054054054054


Iteration 1, loss = 0.24733760
Accuracy: 0.9324324324324325


Iteration 1, loss = 0.24734331
Accuracy: 0.8828828828828829


Iteration 1, loss = 0.23523795
Accuracy: 0.8783783783783784


Iteration 1, loss = 0.24034690
Accuracy: 0.9099099099099099


Iteration 1, loss = 0.23539133
Accuracy: 0.8828828828828829


Iteration 1, loss = 0.22957478
Accuracy: 0.9144144144144144


Iteration 1, loss = 0.23635629
Accuracy: 0.8963963963963963


Iteration 1, loss = 0.22892798
Accuracy: 0.8738738738738738


Iteration 1, loss = 0.21747809
Accuracy: 0.9246987951807228


Iteration 1, loss = 0.23248102
Accuracy: 0.8975903614457831


Iteration 1, loss = 0.21484872
Accuracy: 0.9216867469879518


Iteration 1, loss = 0.22581230
Accuracy: 0.9126506024096386


Iteration 1, loss = 0.21462839
Accuracy: 0.8975903614457831


Iteration 1, loss = 0.21187902
Accuracy: 0.8855421686746988


Iteration 1, loss = 0.26421668
Accuracy: 0.8873873873873874


Iteration 1, loss = 0.25022885
Accuracy: 0.8858858858858859


Iteration 1, loss = 0.26415883
Accuracy: 0.9084084084084084
Iteration 1, loss = 0.33464810
Accuracy: 0.8125
Iteration 1, loss = 0.30949506
Accuracy: 0.9375
Iteration 1, loss = 0.32599999
Accuracy: 0.875
Iteration 1, loss = 0.28125567
Accuracy: 1.0


Iteration 1, loss = 0.26058216
Accuracy: 0.8125
Iteration 1, loss = 0.31167263
Accuracy: 0.9375
Iteration 1, loss = 0.29960605
Accuracy: 0.875
Iteration 1, loss = 0.29171059
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.29103689
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875


Iteration 1, loss = 0.34728059
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.39410426
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.625
Iteration 1, loss = 0.35770228
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.37983913
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375


Iteration 1, loss = 0.34331456
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.6875
Iteration 1, loss = 0.30495622
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.33520041
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.75
Iteration 1, loss = 0.27048017
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.24116651
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.32507110
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.75


Iteration 1, loss = 0.31139178
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.33054030
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.31409262
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.27469125
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375


Iteration 1, loss = 0.31933778
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.75
Iteration 1, loss = 0.27037652
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.28145712
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.23609085
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875


Iteration 1, loss = 0.29781502
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.25084802
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.28724942
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.6875


Iteration 1, loss = 0.36170167
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.75
Iteration 1, loss = 0.30990570
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.6875
Iteration 1, loss = 0.37563491
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.24583507
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.37263483
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375


Iteration 1, loss = 0.34679929
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.26126076
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.33442162
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.36637953
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.32785751
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.29891650
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.75


Iteration 1, loss = 0.33382348
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.27169517
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.31343610
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.29901394
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.31068224
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375


Iteration 1, loss = 0.29254607
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.30640806
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.5625
Iteration 1, loss = 0.29265684
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 1.0
Iteration 1, loss = 0.30580963
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.34351309
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375


Iteration 1, loss = 0.32600695
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.35681786
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.36521041
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.26770398
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 1.0
Iteration 1, loss = 0.28805839
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375


Iteration 1, loss = 0.35153854
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.24604828
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.24677714
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.26842457
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.28886874
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.75
Iteration 1, loss = 0.33909840
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 1.0


Iteration 1, loss = 0.31778601
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.29514428
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.27044629
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 1.0
Iteration 1, loss = 0.31708897
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.38996505
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.33599557
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375


Iteration 1, loss = 0.37715880
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.33448420
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.75
Iteration 1, loss = 0.34766906
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.35960368
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.32047473
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.75


Iteration 1, loss = 0.30097593
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.26321330
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.26830478
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.36505336
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.26387508
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.26497259
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.24010553
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125


Iteration 1, loss = 0.29574111
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.28919040
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.33738400
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.27458701
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.29806165
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.31315156
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.6875


Iteration 1, loss = 0.28271910
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.29726146
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.27384683
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.32638458
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875


Iteration 1, loss = 0.31809062
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.34619576
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.32252653
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.32330408
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.30615711
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.6875


Iteration 1, loss = 0.25517588
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.31708440
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.35019641
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.26821396
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.625
Iteration 1, loss = 0.25528292
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125


Iteration 1, loss = 0.29393500
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.36403192
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 1.0
Iteration 1, loss = 0.28788240
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 1.0
Iteration 1, loss = 0.28408698
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.75
Iteration 1, loss = 0.28521845
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.32143389
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375


Iteration 1, loss = 0.30561562
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.33609540
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.26389264
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.9375
Iteration 1, loss = 0.33351609
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875
Iteration 1, loss = 0.33594975
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.875


Iteration 1, loss = 0.32904513
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125
Iteration 1, loss = 0.41529830
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.625
Iteration 1, loss = 0.32728852
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.6875
Iteration 1, loss = 0.27813794
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 1.0
Iteration 1, loss = 0.26890871
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy: 0.8125


Iteration 1, loss = 0.16499810
Accuracy: 0.936


Iteration 1, loss = 0.22934368
Accuracy: 0.89


Iteration 1, loss = 0.16683318
Accuracy: 0.926


Iteration 1, loss = 0.22210650
Accuracy: 0.914


Iteration 1, loss = 0.20361029
Accuracy: 0.93


Iteration 1, loss = 0.20011299
Accuracy: 0.915


Iteration 1, loss = 0.19937842
Accuracy: 0.925


Iteration 1, loss = 0.20044782
Accuracy: 0.935
Not enough training examples for GSE148504 SRR11538631
The mean accuracy on the training datasets is: 0.8707682091955324


In [4]:

# finally, we want to get the overall accuracy on all held out data
predictions = polya_mlp.predict(sp.vstack([held_out_polya, held_out_polya_bg]) )
print("The mean accuracy on the holdout training examples is :", accuracy_score(np.hstack([np.ones(held_out_polya.shape[0]), np.zeros(held_out_polya_bg.shape[0])]), predictions))


The mean accuracy on the holdout training examples is : 0.8635951270088128


In [5]:

# we a;sp also want to test the accuracy on test data
# we read in the test data
test_list = PE_sheet.loc[~PE_sheet['GSE'].isin(random.sample(PE_sheet['GSE'].unique().tolist(), 8))]
test_acc_list = []
# we group by GSE, then loop through SRRs to get the polya seq
for i, (GSE, group_gse_lst) in enumerate(test_list.groupby('GSE')):
    SRR_lst = group_gse_lst['SRR']
    for SRR in SRR_lst:
        # The 30 bases are defined according to R2. So, they should always be polyA

        polya_path = os.path.join(
            parent_dir, "process_data", "frag_len_dist", GSE, SRR, "priming_site_seqs", "polya_seq.txt")
        bg_path = os.path.join(
            parent_dir, "process_data", "frag_len_dist", GSE, SRR, "priming_site_seqs", "polya_bg_seq.txt")

        # get polya and bg seq
        # we want to make sure the polya seq has a polyA six mer with at most one mismatch
        polya_batch = encoder.fit_transform([list(x.strip()) for x in open(polya_path).readlines() if len(x.strip()) == 30 and regex.search("A(" + 'A' * (snr_len-2)+ "){s<=" + str(snr_mismatch) +"}A", x.strip())])
        # polya_batch = encoder.fit_transform([list(x.strip()) for x in open(polya_path).readlines() if len(x.strip()) == 30])
        
        # if we do not have enough data, then skip this SRR
        if polya_batch.shape[0] < num_held_out * 2:
            print("Not enough training examples for", GSE, SRR)
            continue
        
        bg_batch = encoder.fit_transform([list(x.strip()) for x in open(bg_path).readlines() if len(x.strip()) == 30])
        
        # if we do not have enough data, then skip this SRR
        if bg_batch.shape[0] < num_held_out * 2:
            print("Not enough background examples for", GSE, SRR)
            continue
        
        # build training data
        train_polya = sp.vstack([
            polya_batch[random.sample(range(polya_batch.shape[0]), min(polya_batch.shape[0], bg_batch.shape[0])), : ],
            bg_batch[random.sample(range(bg_batch.shape[0]), min(polya_batch.shape[0], bg_batch.shape[0])), : ]
        ])
        label_polya = np.hstack([
            np.ones(min(polya_batch.shape[0], bg_batch.shape[0])), 
            np.zeros(min(polya_batch.shape[0], bg_batch.shape[0]))
        ])
        
        
        del polya_batch
        del bg_batch
        
        # get the accuracy on the held out data
        acc_scores = accuracy_score(
            label_polya, 
            polya_mlp.predict(
                train_polya
            )
        )
        test_acc_list.append(acc_scores)
        
        print("Accuracy:", acc_scores)


print("The mean accuracy on the test datasets is:", sum(test_acc_list)/len(test_acc_list))



Accuracy: 0.8890777242229756


Accuracy: 0.8802900907488554


Accuracy: 0.8788523114652818


Accuracy: 0.8890574450387473


Accuracy: 0.8844824799553176


Accuracy: 0.8868484848151765


Accuracy: 0.8887398646135535


Accuracy: 0.8865105081756116


Accuracy: 0.8901097556038253


Accuracy: 0.8198358564853324


Accuracy: 0.8423008967563423


Accuracy: 0.778667760810618
The mean accuracy on the test datasets is: 0.8678977648909697


In [6]:

model_pkl_path = os.path.join(outdir, 'mlp.pkl')
with open(model_pkl_path, 'wb') as file_model:
    pickle.dump(polya_mlp, file_model)