In [1]:
import pandas as pd

# Load the full dataset from CSV; the path is relative to the notebook's
# working directory. The bare `data.shape` expression displays (rows, columns).
data = pd.read_csv('../../../Dataset/data.csv')
data.shape

(1859483, 10)

In [120]:
# Work on a random subsample of the full dataset.
# A fixed seed makes the subsample (and therefore every downstream output)
# reproducible across kernel restarts, and the size is capped so that a
# dataset smaller than SAMPLE_SIZE does not make `sample` raise.
SAMPLE_SIZE = 50000
RANDOM_SEED = 42
df = data.sample(n=min(SAMPLE_SIZE, len(data)), random_state=RANDOM_SEED).reset_index(drop=True)

In [121]:
def gc_content(seq):
    """Return the fraction of G and C bases in ``seq``.

    The comparison is case-insensitive. An empty (or ``None``) sequence has no
    bases to count, so it yields ``0.0`` instead of raising ZeroDivisionError.

    Args:
        seq: Nucleotide sequence as a string.

    Returns:
        float in [0, 1]: (count of 'G' + count of 'C') / len(seq).
    """
    if not seq:
        return 0.0
    bases = seq.upper()
    return (bases.count('G') + bases.count('C')) / len(bases)

In [122]:
# Per-row GC fraction of the sequence column, stored as a new feature column.
df['gc_content'] = df['sequence'].map(gc_content)

In [123]:
def at_content(seq):
    """Return the fraction of A and T bases in ``seq``.

    The comparison is case-insensitive. An empty (or ``None``) sequence has no
    bases to count, so it yields ``0.0`` instead of raising ZeroDivisionError.

    Args:
        seq: Nucleotide sequence as a string.

    Returns:
        float in [0, 1]: (count of 'A' + count of 'T') / len(seq).
    """
    if not seq:
        return 0.0
    bases = seq.upper()
    return (bases.count('A') + bases.count('T')) / len(bases)

In [124]:
# Per-row AT fraction of the sequence column, stored as a new feature column.
df['at_content'] = df['sequence'].map(at_content)

In [125]:
def is_cpg_site(row):
    """Return 1 if the mutated base is part of a CpG dinucleotide, else 0.

    The base at ``mutation_pos`` counts as a CpG site when it is the 'C' of a
    following "CG" or the 'G' of a preceding "CG" (case-insensitive).

    Positions outside ``[0, len(sequence))`` (including NaN) return 0. Without
    this guard a position at or past the end could raise IndexError, and a
    negative position would wrap around to the end of the string and could
    report a false CpG site.

    Args:
        row: Mapping with a 'sequence' string and an integer 'mutation_pos'
            (0-based index into the sequence).

    Returns:
        int: 1 for a CpG site, 0 otherwise.
    """
    seq = row['sequence'].upper()
    pos = row['mutation_pos']

    # Chained comparison is False for NaN as well, so missing positions are 0.
    if not (0 <= pos < len(seq)):
        return 0

    base = seq[pos]

    # Mutated base is the C of "CG".
    if base == 'C' and pos + 1 < len(seq) and seq[pos + 1] == 'G':
        return 1

    # Mutated base is the G of "CG".
    if base == 'G' and pos > 0 and seq[pos - 1] == 'C':
        return 1

    return 0

In [126]:
# Row-wise (axis=1) because is_cpg_site needs both the sequence and the
# mutation position of each row.
df['cpg_flag'] = df.apply(is_cpg_site, axis=1)

In [127]:
from collections import Counter
import numpy as np

def sequence_entropy(seq):
    """Return the Shannon entropy (in bits) of the symbol distribution of ``seq``.

    Symbols are compared case-insensitively, consistent with the other
    sequence features in this notebook (gc_content, at_content, is_cpg_site),
    so 'a' and 'A' count as the same base rather than as two symbols.

    Args:
        seq: Nucleotide sequence as a string.

    Returns:
        Entropy ``-sum(p * log2(p))`` over the distinct symbols; 0.0 for an
        empty sequence.
    """
    counts = Counter(seq.upper())
    total = sum(counts.values())
    if total == 0:
        return 0.0
    # Negating each term (instead of the final sum) gives the same value and
    # avoids returning -0.0 for single-symbol sequences.
    return sum(-(count / total) * np.log2(count / total) for count in counts.values())

In [128]:
# Per-row Shannon entropy of the sequence column, stored as a new feature column.
df['sequence_entropy'] = df['sequence'].map(sequence_entropy)

In [129]:
def is_transition(ref, alt):
    """Return 1 if ``ref`` -> ``alt`` is an A<->G or C<->T substitution, else 0.

    The lookup is an exact, case-sensitive match on the (ref, alt) pair.
    """
    transition_pairs = {
        ('A', 'G'),
        ('G', 'A'),
        ('C', 'T'),
        ('T', 'C'),
    }
    if (ref, alt) in transition_pairs:
        return 1
    return 0

In [130]:
# Pair each row's reference and alternate allele and flag transitions (1)
# versus everything else (0).
ref_alleles = df['ref'].tolist()
alt_alleles = df['alt'].tolist()
transition = [
    is_transition(ref_allele, alt_allele)
    for ref_allele, alt_allele in zip(ref_alleles, alt_alleles)
]

df['transition'] = transition

In [131]:
# Human genome chromosome lengths in base pairs (GRCh38).
# Includes the sex chromosomes and the mitochondrial genome so that variants
# on them are normalised like the autosomes.
chrom_lengths = {
    'chr1': 248956422,
    'chr2': 242193529,
    'chr3': 198295559,
    'chr4': 190214555,
    'chr5': 181538259,
    'chr6': 170805979,
    'chr7': 159345973,
    'chr8': 145138636,
    'chr9': 138394717,
    'chr10': 133797422,
    'chr11': 135086622,
    'chr12': 133275309,
    'chr13': 114364328,
    'chr14': 107043718,
    'chr15': 101991189,
    'chr16': 90338345,
    'chr17': 83257441,
    'chr18': 80373285,
    'chr19': 58617616,
    'chr20': 64444167,
    'chr21': 46709983,
    'chr22': 50818468,
    'chrX': 156040895,
    'chrY': 57227415,
    'chrM': 16569,
}

# Function to normalize genomic position
def normalized_genomic_pos(row):
    """Return ``genomic_pos`` as a fraction of its chromosome's length.

    The chromosome name is looked up in ``chrom_lengths`` as given and, if
    that fails, with a 'chr' prefix added (so '1' or 1 resolve to 'chr1').

    Args:
        row: Mapping with a 'chrom' name and a numeric 'genomic_pos'.

    Returns:
        ``genomic_pos / chromosome_length``, or NaN when the chromosome is not
        in ``chrom_lengths``. NaN marks the value as missing; dividing by a
        placeholder length of 1 would instead return the raw coordinate,
        which is not on the same scale as the normalised values.
    """
    chrom = row['chrom']
    chrom_length = chrom_lengths.get(chrom)
    if chrom_length is None:
        # Accept names without the 'chr' prefix.
        chrom_length = chrom_lengths.get('chr' + str(chrom))
    if chrom_length is None:
        return float('nan')
    return row['genomic_pos'] / chrom_length

# Apply normalization row by row (axis=1): each row supplies both the
# chromosome name and the position that normalized_genomic_pos needs.
df['genomic_pos_norm'] = df.apply(normalized_genomic_pos, axis=1)


In [132]:
# One-hot encode 'mutation_type'; the original column is replaced by indicator columns.
# NOTE(review): this rebinds `df`, so re-running the cell on the already-encoded
# frame presumably fails because 'mutation_type' is gone — confirm before re-running.
df = pd.get_dummies(df, columns=['mutation_type'])

In [133]:
# One-hot encode 'chrom'; the original column is replaced by indicator columns.
# NOTE(review): this rebinds `df`, so re-running the cell on the already-encoded
# frame presumably fails because 'chrom' is gone — confirm before re-running.
df = pd.get_dummies(df, columns=['chrom'])

In [None]:
# Pull the individual model inputs out of the engineered frame.
# NOTE(review): gc_content, at_content and sequence_entropy rebind the names of
# the feature functions defined in earlier cells; those cells cannot be re-run
# after this one without redefining the functions first.
x = df['sequence']
y = df['label']
gc_content = df['gc_content'] # float
at_content = df['at_content'] # float
cpg_flag = df['cpg_flag']
sequence_entropy = df['sequence_entropy'] #float
transition = df['transition']
genomic_pos_norm = df['genomic_pos_norm'] # float
# Select the one-hot blocks by the '<column>_' prefix that get_dummies gives
# its indicator columns instead of by hard-coded positions: the number of
# indicator columns depends on which categories occur in the random sample,
# so fixed slices can silently pick up the wrong columns.
mut_type_df = df.loc[:, df.columns.str.startswith('mutation_type_')]
chrom_df = df.loc[:, df.columns.str.startswith('chrom_')]

In [144]:
# Display the normalised-position Series as a quick visual check.
genomic_pos_norm

0        0.576960
1        0.976161
2        0.430030
3        0.626473
4        0.656758
           ...   
49995    0.592575
49996    0.376829
49997    0.676352
49998    0.776440
49999    0.006988
Name: genomic_pos_norm, Length: 50000, dtype: float64

In [154]:
def get_codon(seq, k=3):
    """Return every overlapping length-``k`` window of ``seq`` (stride 1).

    A sequence shorter than ``k`` yields an empty list.
    """
    windows = []
    window_count = len(seq) - k + 1
    for start in range(window_count):
        windows.append(seq[start:start + k])
    return windows

# Build the k-mer vocabulary from the lower-cased sequences. Each new k-mer
# gets the next free id in order of first appearance; ids start at 2, so
# 0 and 1 are never assigned to a k-mer.
vocab = {}

for seq in df['sequence']:
    for kmer in get_codon(seq.lower()):
        # setdefault only inserts when the k-mer has not been seen yet.
        vocab.setdefault(kmer, len(vocab) + 2)

def get_tensor(text):
    """Return the list of vocabulary ids for the overlapping k-mers of ``text``.

    Each k-mer is lower-cased before the lookup; a k-mer that is not in
    ``vocab`` raises KeyError. Despite the name, the result is a plain list.
    """
    token_ids = []
    for kmer in get_codon(text):
        token_ids.append(vocab[kmer.lower()])
    return token_ids

In [156]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, random_split

class CustomDataset(Dataset):
    """Dataset yielding ``(seq_x, features_x, y)`` for one variant per index.

    * ``seq_x``: long tensor made of the k-mer ids of the sequence (via
      ``get_tensor``), followed by the CpG flag, the transition flag, the
      mutation-type indicator row and the chromosome indicator row.
    * ``features_x``: tensor of the four continuous features
      (GC content, AT content, sequence entropy, normalised position).
    * ``y``: long tensor holding the label.

    Items are addressed by position (0 .. len-1), which is what a DataLoader
    or ``random_split`` supplies. Label-based lookups would fail or pick the
    wrong row whenever the inputs do not carry a default 0..n-1 index
    (for example after filtering without ``reset_index``).

    Args:
        x: Sequences (pandas Series or any positionally indexable sequence).
        y: Labels, aligned with ``x``.
        gc_content, at_content, sequence_entropy, genomic_pos_norm:
            Continuous per-row features, aligned with ``x``.
        cpg_flag, transition: Per-row 0/1 flags, aligned with ``x``.
        mut_type_df, chrom_df: DataFrames of indicator columns, one row per
            element of ``x``.
        feature_dtype: dtype of ``features_x``. Defaults to ``torch.float16``
            to keep existing behaviour; float16 keeps only about three
            significant decimal digits, so pass ``torch.float32`` when more
            precision is needed.

    Raises:
        ValueError: if any input has a different length than ``x``.
    """

    def __init__(
        self,
        x,
        y,
        gc_content,
        at_content,
        cpg_flag,
        sequence_entropy,
        transition,
        genomic_pos_norm,
        mut_type_df,
        chrom_df,
        feature_dtype=torch.float16,
    ):
        self.x_frame = x
        self.y_frame = y
        self.gc_content_frame = gc_content
        self.at_content_frame = at_content
        self.cpg_flag_frame = cpg_flag
        self.sequence_entropy_frame = sequence_entropy
        self.transition_frame = transition
        self.genomic_pos_norm_frame = genomic_pos_norm
        self.mut_type_df_frame = mut_type_df
        self.chrom_df_frame = chrom_df
        self.feature_dtype = feature_dtype

        # Fail early on misaligned inputs instead of at some later index.
        expected = len(x)
        aligned_inputs = {
            'y': y,
            'gc_content': gc_content,
            'at_content': at_content,
            'cpg_flag': cpg_flag,
            'sequence_entropy': sequence_entropy,
            'transition': transition,
            'genomic_pos_norm': genomic_pos_norm,
            'mut_type_df': mut_type_df,
            'chrom_df': chrom_df,
        }
        for name, column in aligned_inputs.items():
            if len(column) != expected:
                raise ValueError(
                    f"{name} has {len(column)} rows but x has {expected}"
                )

    def __len__(self):
        return len(self.x_frame)

    @staticmethod
    def _at(column, index):
        """Return the element (or row) at position ``index``."""
        # pandas objects expose positional access through .iloc; plain
        # sequences (lists, arrays) are already positional.
        if hasattr(column, 'iloc'):
            return column.iloc[index]
        return column[index]

    def __getitem__(self, index):
        token_ids = get_tensor(self._at(self.x_frame, index))
        flags = [
            self._at(self.cpg_flag_frame, index),
            self._at(self.transition_frame, index),
        ]
        mut_type_row = self._at(self.mut_type_df_frame, index).astype(float).tolist()
        chrom_row = self._at(self.chrom_df_frame, index).astype(float).tolist()

        # Token ids, flags and indicator values share one long tensor.
        seq_x = torch.tensor(
            token_ids + flags + mut_type_row + chrom_row,
            dtype=torch.long,
        )

        features = [
            self._at(self.gc_content_frame, index),
            self._at(self.at_content_frame, index),
            self._at(self.sequence_entropy_frame, index),
            self._at(self.genomic_pos_norm_frame, index),
        ]
        features_x = torch.tensor(features, dtype=self.feature_dtype)

        y = torch.tensor(self._at(self.y_frame, index), dtype=torch.long)

        return seq_x, features_x, y

In [157]:
# Assemble the dataset; keyword arguments make the pairing between each
# feature and its constructor parameter explicit.
dataset = CustomDataset(
    x=x,
    y=y,
    gc_content=gc_content,
    at_content=at_content,
    cpg_flag=cpg_flag,
    sequence_entropy=sequence_entropy,
    transition=transition,
    genomic_pos_norm=genomic_pos_norm,
    mut_type_df=mut_type_df,
    chrom_df=chrom_df,
)

In [158]:
# Inspect one item to check the (seq_x, features_x, y) tuple that __getitem__ returns.
dataset[10]

(tensor([31, 32, 19, 16, 52, 55, 48, 52, 21, 15, 44, 39, 32, 61, 47, 40, 55, 48,
         17, 39, 10, 11, 54, 48, 17, 39, 32, 14, 33, 65, 57, 52, 55, 38, 45, 58,
         42, 65, 57, 17, 39, 36, 10, 58, 13, 19, 44, 18,  3, 22, 30, 31, 32, 19,
         26, 50,  4,  5,  2, 14,  9, 10, 58, 24, 37, 48, 57, 17, 18, 19, 20, 55,
         48, 17, 18, 19, 16, 57, 57, 57, 49, 35,  9, 32, 19, 20, 56, 24, 37, 48,
         57, 57, 17, 45,  7, 15, 16, 57, 57, 17, 45, 58, 31, 36, 36, 10, 37, 38,
         39, 32, 19, 16, 17, 60, 34, 35,  9, 36, 36, 59, 65, 49, 50,  5,  2, 14,
          2, 19, 16, 57, 52, 55, 48, 57, 57, 17, 39, 59, 64, 45, 37, 48, 57, 57,
         17, 45, 37, 48, 17, 39, 10, 37, 40, 55, 38, 45, 58, 24, 37, 38, 60, 64,
         39, 32, 19, 16, 52, 55, 48, 17, 18, 61, 47, 40, 56, 24,  7, 15, 26, 63,
         26, 63, 20, 56, 13, 19, 20, 55, 40, 21, 15, 16, 52, 55, 48, 17, 45, 37,
         40,  0,  1,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  1,  0,  0,