In [1]:
import pandas as pd

In [3]:
# Read the tab-separated training set (with sequences) and show it.
df_merged = pd.read_csv(
    filepath_or_buffer="data/08_trainset_with_seq.tsv",
    sep="\t",
)
df_merged

Unnamed: 0,chr,start,end,strand,class,maxentscan_sequence,start_ss,end_ss
0,chr1,12227,12612,+,1,ccaGTAAGTAGTGCTTGTGCTCATCTCCTTGGCTGTGATACGTGGC...,GT,AG
1,chr1,12721,13220,+,1,gagGTGAGAGGAGAGTAGACAGTGAGTGGGAGTGGCGTCGCCCCTA...,GT,AG
2,chr1,12057,12178,+,1,gagCACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAG...,CA,GA
3,chr1,12697,12974,+,1,cttGTGAGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAG...,GT,AG
4,chr1,13052,13220,+,1,tagGCAAGCCTGGCTGCCTCCAGCTGGGTCGACAGACAGGGGCTGG...,GC,AG
...,...,...,...,...,...,...,...,...
519029,chrY,25464577,25465486,+,0,cacGTTGAGGCGCCCAGTGGCGGCCTCACGGGGCAGGGCGAGGGCG...,GT,AG
519030,chrY,25513173,25513588,-,0,aggGTAAGATCAGTGCTATTGTCAGAGGAAAAACTCCTGGCCATCA...,GT,AG
519031,chrY,25513745,25516715,-,0,gctGTAAGTTCCACATTGATTATCATAGGCTAACCATGGGCCAGGC...,GT,AG
519032,chrY,25525288,25527646,-,0,tgtGTGTATAAATATCTGGACTTTTTGGTTAAGTAATTATAGTTAA...,GT,AG


In [4]:
# Drop the first three and last three characters of every MaxEntScan
# sequence and keep the remainder in a new "sequence" column.
df_merged["sequence"] = df_merged["maxentscan_sequence"].str.slice(3, -3)

In [5]:
def calculate_gc_content(sequence):
    """Return the fraction of characters in ``sequence`` that are "G" or "C".

    Only upper-case "G" and "C" are counted; every other character
    contributes to the length but not to the GC count.

    Args:
        sequence: Nucleotide string.

    Returns:
        A float in [0, 1]. An empty sequence yields 0.0 instead of raising
        ZeroDivisionError.
    """
    total_count = len(sequence)
    if total_count == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0
    return (sequence.count("G") + sequence.count("C")) / total_count

# Compute the GC fraction of each sequence, one element at a time.
df_merged["GC_content"] = df_merged["sequence"].map(calculate_gc_content)

In [6]:
def calculate_cpg_island(sequence, min_cpg_ratio=0.6, min_gc_content=0.5):
    """Flag a sequence as CpG-island-like from its CpG ratio and GC content.

    The CpG ratio is the observed/expected value
    ``count("CG") * len(sequence) / (count("C") * count("G"))`` and the GC
    content is ``(count("C") + count("G")) / len(sequence)``. Only upper-case
    characters are counted.

    Args:
        sequence: Nucleotide string.
        min_cpg_ratio: The CpG ratio must be strictly greater than this
            value (default 0.6).
        min_gc_content: The GC content must be strictly greater than this
            value (default 0.5).

    Returns:
        True when both thresholds are exceeded, otherwise False. Sequences
        without at least one "C" and one "G" (including the empty sequence)
        always return False.
    """
    c_count = sequence.count("C")
    g_count = sequence.count("G")

    # The observed/expected ratio is undefined when either base is absent.
    # Checking this up front replaces the former ZeroDivisionError handler
    # and gives the same result (False) for those sequences.
    if c_count == 0 or g_count == 0:
        return False

    total_count = len(sequence)
    cg_count = sequence.count("CG")

    cpg_ratio = (cg_count * total_count) / (c_count * g_count)
    gc_content = (c_count + g_count) / total_count

    return cpg_ratio > min_cpg_ratio and gc_content > min_gc_content

# Evaluate the CpG-island flag for each sequence, one element at a time.
df_merged["CpG_island"] = df_merged["sequence"].map(calculate_cpg_island)

In [7]:
def calculate_intron_length(row):
    """Return ``row.end - row.start`` for a record exposing both attributes."""
    end_coord, start_coord = row.end, row.start
    length = end_coord - start_coord
    return length

# Intron length is end - start for every row. Subtracting the two columns
# directly gives the same values as applying calculate_intron_length
# row by row, without a Python-level call per row.
df_merged["intron_length"] = df_merged["end"] - df_merged["start"]
df_merged

Unnamed: 0,chr,start,end,strand,class,maxentscan_sequence,start_ss,end_ss,sequence,GC_content,CpG_island,intron_length
0,chr1,12227,12612,+,1,ccaGTAAGTAGTGCTTGTGCTCATCTCCTTGGCTGTGATACGTGGC...,GT,AG,GTAAGTAGTGCTTGTGCTCATCTCCTTGGCTGTGATACGTGGCCGG...,0.638961,False,385
1,chr1,12721,13220,+,1,gagGTGAGAGGAGAGTAGACAGTGAGTGGGAGTGGCGTCGCCCCTA...,GT,AG,GTGAGAGGAGAGTAGACAGTGAGTGGGAGTGGCGTCGCCCCTAGGG...,0.597194,False,499
2,chr1,12057,12178,+,1,gagCACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAG...,CA,GA,CACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAGTGG...,0.512397,False,121
3,chr1,12697,12974,+,1,cttGTGAGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAG...,GT,AG,GTGAGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAGTGA...,0.595668,False,277
4,chr1,13052,13220,+,1,tagGCAAGCCTGGCTGCCTCCAGCTGGGTCGACAGACAGGGGCTGG...,GC,AG,GCAAGCCTGGCTGCCTCCAGCTGGGTCGACAGACAGGGGCTGGAGA...,0.595238,False,168
...,...,...,...,...,...,...,...,...,...,...,...,...
519029,chrY,25464577,25465486,+,0,cacGTTGAGGCGCCCAGTGGCGGCCTCACGGGGCAGGGCGAGGGCG...,GT,AG,GTTGAGGCGCCCAGTGGCGGCCTCACGGGGCAGGGCGAGGGCGGAG...,0.628163,True,909
519030,chrY,25513173,25513588,-,0,aggGTAAGATCAGTGCTATTGTCAGAGGAAAAACTCCTGGCCATCA...,GT,AG,GTAAGATCAGTGCTATTGTCAGAGGAAAAACTCCTGGCCATCACAG...,0.325301,False,415
519031,chrY,25513745,25516715,-,0,gctGTAAGTTCCACATTGATTATCATAGGCTAACCATGGGCCAGGC...,GT,AG,GTAAGTTCCACATTGATTATCATAGGCTAACCATGGGCCAGGCACG...,0.383165,False,2970
519032,chrY,25525288,25527646,-,0,tgtGTGTATAAATATCTGGACTTTTTGGTTAAGTAATTATAGTTAA...,GT,AG,GTGTATAAATATCTGGACTTTTTGGTTAAGTAATTATAGTTAATAC...,0.384648,False,2358


In [8]:
# Write the frame with its added feature columns as a tab-separated file,
# leaving out the DataFrame index.
df_merged.to_csv(
    path_or_buf="data/11_trainset_basic_features.tsv",
    sep="\t",
    index=False,
)

In [10]:
# Shell escape: print the MD5 digest of the exported feature file.
!md5sum data/11_trainset_basic_features.tsv

d263a57c48e54cd57a7f4f2df515c9bb  data/11_trainset_basic_features.tsv
