In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from concurrent.futures import ProcessPoolExecutor, as_completed
from torch.utils.data import DataLoader
from datasets import load_dataset
from torch.utils.data import Dataset
from typing import Dict, Sequence
from dataclasses import dataclass
from torch.nn import Softmax
from Bio import SeqIO
from torch import nn
import transformers
import numpy as np
import threading
import argparse
import torch
import csv
import re
import os

In [43]:
input_seq_file = "C:/Users/anjan/Downloads/EGFR_datasets/ncbi_dataset/data/gene.fna"
mut_file = "C:/Users/anjan/variant_data_egfr.csv"

In [44]:
mutations = []

In [50]:
for record in SeqIO.parse(input_seq_file, "fasta"):
    sequence = record.seq
    header = record.description
    des = header.split(' ')
    position = des[0].split(':')
    start, end = position[1].split('-')
print(len(sequence))
print(start, end)
start = int(start)
end = int(end)

193120
55178937 55372056


In [35]:
def get_alter_of_dna_sequence(sequence: str):
    MAP = {"A": "T", "T": "A", "C": "G", "G": "C"}
    # return "".join([MAP[c] for c in reversed(sequence)])
    return "".join([MAP[c] for c in sequence])

In [36]:
sequence = get_alter_of_dna_sequence(sequence)

In [51]:
import vcf
variant_data = "C:/Users/anjan/Downloads/ClinVar variants with precise endpoints (6).VCF"
vcf_reader = vcf.Reader(open(variant_data, 'r'))
variant = []

# Iterate over each variant in the VCF file
for record in vcf_reader:
    # Access variant information using record attributes 
    if record.POS >= start and record.POS <= end:
        variant.append({
            "variant_id" : record.ID,
            "pos" : record.POS,
            "ref" : record.REF,
            "alt" : record.ALT[0],
            "variant": record.INFO["CLNVC"][0]
        })
        
fields = ['variant_id', 'pos', 'ref', 'alt', 'variant']

with open("variant_data_egfr.csv", 'w') as csvfile:
    # creating a csv dict writer object
    writer = csv.DictWriter(csvfile, fieldnames=fields, lineterminator = '\n')
    # writing headers (field names)
    writer.writeheader()
    writer.writerows(variant)

In [52]:
# Read mutations from CSV file
with open(mut_file , 'r') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Skip header if present
    for row in reader:
        if row:
            start_position = int(row[1])-start
            end_position = start_position
            consequence = row[4]
            ref = row[2]
            alt = row[3]
            mutations.append((start_position, ref, alt, consequence))

In [53]:
snv = "single_nucleotide_variant"
dup = "Duplication"
ins = "Insertion"
delt = "Deletion"
last_index = 0
def embed_mutations(sequence, mutations):
    alt_length = 0
    embedded_sequence = list(sequence)
    seq = list(sequence)
    mutation_embed = []
    for mutation in mutations:
        start_position, ref, alt, consequence = mutation
        start_position = start_position + alt_length
        if consequence == snv:
            if embedded_sequence[start_position] == ref:
                embedded_sequence[start_position] = alt
                mutation_embed.append([start_position, start_position, consequence])
        elif consequence == dup or consequence == ins:
            if embedded_sequence[start_position] == ref:
                embedded_sequence[start_position:start_position] = alt
                mutation_embed.append([start_position, start_position+len(alt), consequence])
                alt_length += len(alt)
        elif consequence == delt:
            if len(ref) > len(alt):
                if ''.join(embedded_sequence[start_position:(start_position+len(ref))]) == ref:
                    del embedded_sequence[start_position:start_position+len(ref)]
                    mutation_embed.append([start_position, start_position+len(ref), consequence])
                    alt_length -= len(ref)
            else:
                if ''.join(embedded_sequence[start_position:(start_position+len(alt))]) == ref:
                    del embedded_sequence[start_position:start_position+len(alt)]
                    mutation_embed.append([start_position, start_position+len(alt), consequence])
                    alt_length -= len(alt)
        
    return ''.join(embedded_sequence), mutation_embed
reference_sequence = sequence
embedded_sequence, mutation_embed = embed_mutations(reference_sequence, mutations)
print(len(embedded_sequence))
print(len(reference_sequence))

193149
193120


In [54]:
def split_sequence_with_mutations(sequence, mutations, subsequence_length=512):
    subsequences = []
    for i in range(0, len(sequence), subsequence_length):
        subsequence = sequence[i:i+subsequence_length]
        mutation_vector = [0, 0, 0, 0]  # Initialize vector for SNV, Duplication, Deletion
        for start_position, end_position, consequence in mutations:
            if start_position < i + subsequence_length and start_position >= i:
                if consequence == snv:
                    mutation_vector[0] = 1
                elif consequence == ins:
                    mutation_vector[1] = 1
                elif consequence == dup:
                    mutation_vector[2] = 1
                elif consequence == delt:
                    mutation_vector[3] = 1
        subsequences.append((subsequence, mutation_vector))
    return subsequences
subsequences_with_mutations = split_sequence_with_mutations(embedded_sequence, mutation_embed)
subsequences_without_mutation = split_sequence_with_mutations(sequence, [])
print(len(subsequences_without_mutation))


378


In [56]:
def write_sequences_to_csv(subsequences_without_mutation, subsequences_with_mutations, output_file):
    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Sequence', 'Mutation_Vector'])  # Write header for reference sequence
#         for mutated_subsequence, mutation_vector in subsequences_without_mutation:
#             writer.writerow([mutated_subsequence, mutation_vector])
        
        # Write mutated subsequences with mutation vectors
        for mutated_subsequence, mutation_vector in subsequences_with_mutations:
            writer.writerow([mutated_subsequence, mutation_vector])
output_file = "egfr.csv"
write_sequences_to_csv(subsequences_without_mutation, subsequences_with_mutations, output_file)