In [1]:
import pandas as pd

In [2]:
# Load the tab-separated training set into a DataFrame and display it.
# NOTE(review): the displayed frame starts with an "Unnamed: 0" column, presumably a
# row index that was written into this file by whatever produced it. Confirm whether
# it should be dropped (e.g. index_col=0), since it is otherwise carried along into
# the file written at the end of this notebook.
df_merged = pd.read_csv("data/15_trainset_populated.tsv", sep="\t")
df_merged

Unnamed: 0,chr,start,end,strand,class,recount3_score,antisense_exon_start_ss,antisense_exon_end_ss,maxentscan_sequence,start_ss,...,repeat_features_end_site_Type I Transposons/SINE,repeat_features_end_site_RNA repeats,repeat_features_end_site_Unknown,repeat_features_end_site_Low complexity regions,repeat_features_end_site_Type I Transposons/LINE,repeat_features_end_site_Centromere,repeat_features_end_site_Dust,repeat_features_end_site_Satellite repeats,repeat_features_end_site_Tandem repeats,repeat_features_end_site_LTRs
0,chr1,12227,12612,+,1,59151,False,False,ccaGTAAGTAGTGCTTGTGCTCATCTCCTTGGCTGTGATACGTGGC...,GT,...,0,0,0,0,0,0,0,0,0,0
1,chr1,12721,13220,+,1,61021,False,False,gagGTGAGAGGAGAGTAGACAGTGAGTGGGAGTGGCGTCGCCCCTA...,GT,...,0,0,0,0,0,0,0,0,0,0
2,chr1,12057,12178,+,1,0,False,False,gagCACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAG...,CA,...,0,0,0,0,0,0,0,0,0,0
3,chr1,12697,12974,+,1,21,False,False,cttGTGAGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAG...,GT,...,0,0,0,0,0,0,0,0,0,0
4,chr1,13052,13220,+,1,526,False,False,tagGCAAGCCTGGCTGCCTCCAGCTGGGTCGACAGACAGGGGCTGG...,GC,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
519029,chrY,25464577,25465486,+,0,0,False,False,cacGTTGAGGCGCCCAGTGGCGGCCTCACGGGGCAGGGCGAGGGCG...,GT,...,0,0,0,0,0,0,0,0,0,0
519030,chrY,25513173,25513588,-,0,31374,False,False,aggGTAAGATCAGTGCTATTGTCAGAGGAAAAACTCCTGGCCATCA...,GT,...,0,0,0,0,0,0,0,0,0,0
519031,chrY,25513745,25516715,-,0,67,False,False,gctGTAAGTTCCACATTGATTATCATAGGCTAACCATGGGCCAGGC...,GT,...,0,0,0,0,0,0,0,0,0,0
519032,chrY,25525288,25527646,-,0,18958,False,False,tgtGTGTATAAATATCTGGACTTTTTGGTTAAGTAATTATAGTTAA...,GT,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Drop the first three and last three characters of every maxentscan_sequence entry
# (shown in lowercase in the displayed sample) and keep the remainder as "sequence".
df_merged["sequence"] = df_merged["maxentscan_sequence"].str.slice(3, -3)

In [4]:
def calculate_gc_content(sequence):
    """Return the fraction of characters in ``sequence`` that are 'G' or 'C'.

    Only uppercase 'G' and 'C' are counted; any other character (including
    lowercase bases) contributes to the length but not to the GC count.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence.

    Returns
    -------
    float
        GC fraction between 0 and 1. An empty sequence yields 0.0 instead of
        raising ``ZeroDivisionError``.
    """
    total_count = len(sequence)
    if total_count == 0:
        # Nothing to measure. Returning 0.0 keeps a single empty sequence from
        # aborting a whole-column apply, consistent with calculate_cpg_island,
        # which also tolerates empty input.
        return 0.0
    return (sequence.count('G') + sequence.count('C')) / total_count

# GC fraction of every extracted sequence, computed element by element.
df_merged["GC_content"] = df_merged["sequence"].map(calculate_gc_content)

In [5]:
def calculate_cpg_island(sequence, min_obs_exp_ratio=0.6, min_gc_content=0.5):
    """Flag a sequence as CpG-island-like from its overall base composition.

    Two statistics are computed over the whole sequence (uppercase bases only):

    * observed/expected CpG ratio:
      ``count('CG') * len(sequence) / (count('C') * count('G'))``
    * GC content: ``(count('C') + count('G')) / len(sequence)``

    Parameters
    ----------
    sequence : str
        Nucleotide sequence.
    min_obs_exp_ratio : float, optional
        Threshold the observed/expected CpG ratio must strictly exceed.
        Defaults to 0.6.
    min_gc_content : float, optional
        Threshold the GC content must strictly exceed. Defaults to 0.5.

    Returns
    -------
    bool
        True when both statistics exceed their thresholds. Sequences that
        contain no 'C' or no 'G' (including the empty sequence) return False.
    """
    c_count = sequence.count('C')
    g_count = sequence.count('G')

    # The observed/expected ratio divides by c_count * g_count, so it is
    # undefined without at least one C and one G. Such sequences are never
    # flagged (this also covers the empty sequence).
    if c_count == 0 or g_count == 0:
        return False

    total_count = len(sequence)
    cpg_ratio = (sequence.count('CG') * total_count) / (c_count * g_count)
    gc_content = (c_count + g_count) / total_count

    return cpg_ratio > min_obs_exp_ratio and gc_content > min_gc_content

# Boolean CpG-island flag for every extracted sequence, computed element by element.
df_merged["CpG_island"] = df_merged["sequence"].map(calculate_cpg_island)

In [6]:
def calculate_intron_length(row):
    """Return the span of a single record, computed as ``end - start``.

    ``row`` may be any object exposing ``start`` and ``end`` attributes, for
    example a row handed over by ``DataFrame.apply(..., axis=1)``.
    """
    intron_start, intron_end = row.start, row.end
    return intron_end - intron_start

# Vectorised column subtraction: yields the same end - start values that
# calculate_intron_length returns for each row, without the per-row Python
# overhead of DataFrame.apply(axis=1) on a frame of this size.
df_merged["intron_length"] = df_merged["end"] - df_merged["start"]

In [7]:
# Display the frame again; sequence, GC_content, CpG_island and intron_length
# now appear as its last columns.
df_merged

Unnamed: 0,chr,start,end,strand,class,recount3_score,antisense_exon_start_ss,antisense_exon_end_ss,maxentscan_sequence,start_ss,...,repeat_features_end_site_Type I Transposons/LINE,repeat_features_end_site_Centromere,repeat_features_end_site_Dust,repeat_features_end_site_Satellite repeats,repeat_features_end_site_Tandem repeats,repeat_features_end_site_LTRs,sequence,GC_content,CpG_island,intron_length
0,chr1,12227,12612,+,1,59151,False,False,ccaGTAAGTAGTGCTTGTGCTCATCTCCTTGGCTGTGATACGTGGC...,GT,...,0,0,0,0,0,0,GTAAGTAGTGCTTGTGCTCATCTCCTTGGCTGTGATACGTGGCCGG...,0.638961,False,385
1,chr1,12721,13220,+,1,61021,False,False,gagGTGAGAGGAGAGTAGACAGTGAGTGGGAGTGGCGTCGCCCCTA...,GT,...,0,0,0,0,0,0,GTGAGAGGAGAGTAGACAGTGAGTGGGAGTGGCGTCGCCCCTAGGG...,0.597194,False,499
2,chr1,12057,12178,+,1,0,False,False,gagCACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAG...,CA,...,0,0,0,0,0,0,CACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAGTGG...,0.512397,False,121
3,chr1,12697,12974,+,1,21,False,False,cttGTGAGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAG...,GT,...,0,0,0,0,0,0,GTGAGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAGTGA...,0.595668,False,277
4,chr1,13052,13220,+,1,526,False,False,tagGCAAGCCTGGCTGCCTCCAGCTGGGTCGACAGACAGGGGCTGG...,GC,...,0,0,0,0,0,0,GCAAGCCTGGCTGCCTCCAGCTGGGTCGACAGACAGGGGCTGGAGA...,0.595238,False,168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
519029,chrY,25464577,25465486,+,0,0,False,False,cacGTTGAGGCGCCCAGTGGCGGCCTCACGGGGCAGGGCGAGGGCG...,GT,...,0,0,0,0,0,0,GTTGAGGCGCCCAGTGGCGGCCTCACGGGGCAGGGCGAGGGCGGAG...,0.628163,True,909
519030,chrY,25513173,25513588,-,0,31374,False,False,aggGTAAGATCAGTGCTATTGTCAGAGGAAAAACTCCTGGCCATCA...,GT,...,0,0,0,0,0,0,GTAAGATCAGTGCTATTGTCAGAGGAAAAACTCCTGGCCATCACAG...,0.325301,False,415
519031,chrY,25513745,25516715,-,0,67,False,False,gctGTAAGTTCCACATTGATTATCATAGGCTAACCATGGGCCAGGC...,GT,...,0,0,0,0,0,0,GTAAGTTCCACATTGATTATCATAGGCTAACCATGGGCCAGGCACG...,0.383165,False,2970
519032,chrY,25525288,25527646,-,0,18958,False,False,tgtGTGTATAAATATCTGGACTTTTTGGTTAAGTAATTATAGTTAA...,GT,...,0,0,0,0,0,0,GTGTATAAATATCTGGACTTTTTGGTTAAGTAATTATAGTTAATAC...,0.384648,False,2358


In [8]:
# Write the frame, including the feature columns added above, to a tab-separated
# file. index=False keeps the DataFrame's row index out of the output.
df_merged.to_csv("data/16_basic_features_added_trainset.tsv", sep="\t", index=False)

In [9]:
# Print the MD5 checksum of the file written above (shell command run via `!`).
!md5sum data/16_basic_features_added_trainset.tsv

5e3b014194142188e8b4dca9263193b6  data/16_basic_features_added_trainset.tsv
