425 Project

In [3]:
import csv
import sys
import torch
from sklearn import metrics
from sklearn.model_selection import train_test_split
import pandas as pd
import pyranges as pr
import sqlite3

import numpy as np
from Bio import SeqIO
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import os 
import warnings
#warnings.filterwarnings("ignore")

In [4]:
# Paths to the five TAIR10 chromosome FASTA files.
# Forward slashes are used instead of backslashes: inside a normal (non-raw)
# string literal, sequences such as "\C" and "\A" are invalid escapes, and
# backslash separators only resolve on Windows. Forward slashes work on every
# platform and match the other paths used in this notebook.
Chromosome_1_ = 'Final_bed_files/Chromosomes/Arabidopsis_thaliana.TAIR10.dna.chromosome.1.fa'
Chromosome_2_ = 'Final_bed_files/Chromosomes/Arabidopsis_thaliana.TAIR10.dna.chromosome.2.fa'
Chromosome_3_ = 'Final_bed_files/Chromosomes/Arabidopsis_thaliana.TAIR10.dna.chromosome.3.fa'
Chromosome_4_ = 'Final_bed_files/Chromosomes/Arabidopsis_thaliana.TAIR10.dna.chromosome.4.fa'
Chromosome_5_ = 'Final_bed_files/Chromosomes/Arabidopsis_thaliana.TAIR10.dna.chromosome.5.fa'


In [5]:
# Load every chromosome FASTA into seqs_List as a plain string.
# The list order follows file_paths (chromosome 1 first); add_seqs later
# indexes this list with "chromosome number - 1", so the order matters.
seqs_List = []

file_paths = [
    Chromosome_1_,
    Chromosome_2_,
    Chromosome_3_,
    Chromosome_4_,
    Chromosome_5_,
]

for file_path in file_paths:
    with open(file_path, "r") as handle:
        # Every FASTA record in the file contributes one entry to seqs_List.
        for record in SeqIO.parse(handle, "fasta"):
            # id and description are read but not used further in this cell.
            sequence_id, description = record.id, record.description
            sequence = record.seq
            seqs_List.append(str(sequence))
            # Preview only the first 50 bases of each record.
            print("Sequence:", str(sequence[:50]), "...")

Sequence: CCCTAAACCCTAAACCCTAAACCCTAAACCTCTGAATCCTTAATCCCTAA ...
Sequence: NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN ...
Sequence: NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN ...
Sequence: NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN ...
Sequence: TATACCATGTACCCTCAACCTTAAAACCCTAAAACCTATACTATAAATCT ...


In [6]:
def process_dataframe(filename, half_window=400):
    """Read a headerless, tab-separated BED-like file and centre a fixed-width
    window on the midpoint of every interval.

    Only the first three columns (chromosome, start, end) are read, so the
    file may carry any number of additional columns.

    Parameters
    ----------
    filename : str or path-like
        Path to the tab-separated file (no header row).
    half_window : int, default 400
        Number of positions taken on each side of the midpoint; the window
        therefore spans ``2 * half_window`` positions (800 by default).

    Returns
    -------
    pandas.DataFrame
        One row per input interval with the columns ``chromosome``, ``start``,
        ``end``, ``midpoint``, ``dif`` (``end - start``), ``new_start``,
        ``new_end`` and ``position_difference_new`` (``new_end - new_start``).
        ``new_start`` is not clipped and can be negative for intervals whose
        midpoint lies closer than ``half_window`` to position 0.
    """
    # Read just the three columns that are actually used.
    df = pd.read_csv(filename, sep='\t', header=None, usecols=[0, 1, 2])
    df.columns = ['chromosome', 'start', 'end']

    # Midpoint uses floor division, so odd-length intervals round down.
    dif = df['end'] - df['start']
    midpoint = df['start'] + dif // 2

    # Window extends half_window positions to each side of the midpoint.
    new_start = midpoint - half_window
    new_end = midpoint + half_window

    return pd.DataFrame({
        'chromosome': df['chromosome'],
        'start': df['start'],
        'end': df['end'],
        'midpoint': midpoint,
        'dif': dif,
        'new_start': new_start,
        'new_end': new_end,
        'position_difference_new': new_end - new_start,
    })

In [7]:
testFile = "Final_bed_files/SRP034156_BedFiles/SRX391990.target.all.bed"
directory = "Final_bed_files//test_bedFiles"

# One windowed DataFrame per .bed file found in `directory`
dataframes = []

# os.listdir() returns names in arbitrary order; sorting makes the position of
# each file in `dataframes` (and the "DataFrame N" numbering printed by later
# cells) reproducible across runs and machines.
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".bed"):
        file_path = os.path.join(directory, filename)
        df = process_dataframe(file_path)
        dataframes.append(df)


In [8]:
def add_seqs(df, data_list, target_len=800):
    """Extract the sequence window of every row from its chromosome string.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``chromosome`` (labels such as ``'Chr1'``),
        ``new_start`` and ``new_end``.
    data_list : sequence of str
        Chromosome sequences; the entry for ``'ChrK'`` is ``data_list[K - 1]``.
    target_len : int, default 800
        Minimum length of each returned sequence. Windows that come out
        shorter because they run past either end of the chromosome are padded
        with ``'N'`` up to this length.

    Returns
    -------
    pandas.DataFrame
        Columns ``chromosome``, ``start`` (the row's ``new_start``), ``end``
        (the row's ``new_end``), ``size`` (length of the stored sequence) and
        ``data`` (the sequence). An empty input yields an empty frame with
        these columns.

    Raises
    ------
    ValueError
        If a chromosome label is not ``'Chr'`` followed by an integer.
    """
    columns = ['chromosome', 'start', 'end', 'size', 'data']
    records = []

    for chromosome_label, new_start, new_end in zip(
            df['chromosome'], df['new_start'], df['new_end']):
        # Turn e.g. 'Chr3' into 3 to pick the matching chromosome string.
        chromosome_number = int(chromosome_label.replace('Chr', ''))
        data = data_list[chromosome_number - 1]

        start = int(new_start)
        end = int(new_end)

        # A negative start would be read by Python slicing as "count from the
        # end of the string" and return an empty/wrong window, so clamp it to 0
        # and remember how many positions fell off the left edge.
        missing_left = max(0, -start)
        extracted_data = data[max(start, 0):max(end, 0)]  # gets the peak sequence
        size = len(extracted_data)

        if size < target_len:
            Add_num_N = target_len - size
            print("N to add: ", Add_num_N)
            # Pad on the side(s) where the window left the chromosome so the
            # midpoint stays centred: left for a clipped start, right otherwise.
            pad_left = min(missing_left, Add_num_N)
            pad_right = Add_num_N - pad_left
            extracted_data = 'N' * pad_left + extracted_data + 'N' * pad_right
            size = len(extracted_data)

        records.append({
            'chromosome': chromosome_label,
            'start': new_start,
            'end': new_end,
            'size': size,
            'data': extracted_data,
        })

    # Build the frame once instead of concatenating one-row frames; this also
    # handles an empty input, where pd.concat([]) would raise.
    result_df = pd.DataFrame(records, columns=columns)
    print("done")
    return result_df

In [9]:
# Sanity check: report the row/column count of every windowed BED DataFrame
for i, df in enumerate(dataframes):
    num_rows = df.shape[0]
    num_columns = df.shape[1]
    print(f"DataFrame {i+1}: {num_rows} rows x {num_columns} columns")

DataFrame 1: 24019 rows x 8 columns
DataFrame 2: 20457 rows x 8 columns
DataFrame 3: 19464 rows x 8 columns
DataFrame 4: 22100 rows x 8 columns
DataFrame 5: 16752 rows x 8 columns
DataFrame 6: 22932 rows x 8 columns
DataFrame 7: 29238 rows x 8 columns
DataFrame 8: 13905 rows x 8 columns
DataFrame 9: 13879 rows x 8 columns
DataFrame 10: 11987 rows x 8 columns


In [10]:
# One sequence DataFrame per windowed BED DataFrame, in the same order
Sequences_DataFrame = [add_seqs(k, seqs_List) for k in dataframes]

done
done
N to add:  37
done
done
done
N to add:  44
done
N to add:  246
done
done
done
done


In [11]:
# Sanity check: report the row/column count of every sequence DataFrame
for i, df in enumerate(Sequences_DataFrame):
    num_rows, num_columns = len(df.index), len(df.columns)
    print(f"DataFrame {i+1}: {num_rows} rows x {num_columns} columns")

DataFrame 1: 24019 rows x 5 columns
DataFrame 2: 20457 rows x 5 columns
DataFrame 3: 19464 rows x 5 columns
DataFrame 4: 22100 rows x 5 columns
DataFrame 5: 16752 rows x 5 columns
DataFrame 6: 22932 rows x 5 columns
DataFrame 7: 29238 rows x 5 columns
DataFrame 8: 13905 rows x 5 columns
DataFrame 9: 13879 rows x 5 columns
DataFrame 10: 11987 rows x 5 columns


In [12]:
# Verify that every window in every DataFrame is exactly 800 positions wide
for i, df in enumerate(dataframes):

    # Rows whose window width (position_difference_new) differs from 800
    filtered_df = df.loc[df['position_difference_new'] != 800]

    if filtered_df.empty:
        print(f"DataFrame {i+1} - No rows found with position_difference_new not equal to 800.")
    else:
        # List the offending widths so they can be inspected
        print(f"DataFrame {i+1} - Values of position_difference_new not equal to 800:")
        print(filtered_df['position_difference_new'])

DataFrame 1 - No rows found with position_difference_new not equal to 800.
DataFrame 2 - No rows found with position_difference_new not equal to 800.
DataFrame 3 - No rows found with position_difference_new not equal to 800.
DataFrame 4 - No rows found with position_difference_new not equal to 800.
DataFrame 5 - No rows found with position_difference_new not equal to 800.
DataFrame 6 - No rows found with position_difference_new not equal to 800.
DataFrame 7 - No rows found with position_difference_new not equal to 800.
DataFrame 8 - No rows found with position_difference_new not equal to 800.
DataFrame 9 - No rows found with position_difference_new not equal to 800.
DataFrame 10 - No rows found with position_difference_new not equal to 800.


In [13]:
# Convert a nucleotide sequence into a (len(seq), 5) one-hot matrix
def one_hot_encode(seq):
    """Return an int8 array of shape (len(seq), 5) with columns A, C, G, T, N.

    Each position gets a single 1 in the column of its nucleotide. Any
    character that is not exactly one of 'A', 'C', 'G', 'T', 'N' (lowercase
    letters included) produces an all-zero row.
    """
    mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4}
    width = len(mapping)

    # Row k of the table is the one-hot vector for column k; the extra last
    # row stays all-zero and is used for characters missing from the mapping.
    table = np.zeros((width + 1, width), dtype=np.int8)
    for column in mapping.values():
        table[column, column] = 1

    # Translate each character to its table row, then gather all rows at once.
    row_ids = np.fromiter(
        (mapping.get(nucleotide, width) for nucleotide in seq),
        dtype=np.intp,
    )
    return table[row_ids]

# Add a one-hot encoded version of the 'data' column to a DataFrame
def encode_sequences(df):
    """Store one_hot_encode(sequence) for every row in a new 'one_hot_data' column.

    The column is written onto the DataFrame that was passed in (no copy is
    made), and that same DataFrame is returned.
    """
    encoded_column = df['data'].apply(one_hot_encode)
    df['one_hot_data'] = encoded_column
    return df

# Encode every sequence DataFrame and report the shape of its first encoded row
for i in range(len(Sequences_DataFrame)):
    df = Sequences_DataFrame[i]
    Sequences_DataFrame[i] = encode_sequences(df)
    print(f"DataFrame {i+1} encoded. First encoded sequence shape: {Sequences_DataFrame[i]['one_hot_data'].iloc[0].shape}")


DataFrame 1 encoded. First encoded sequence shape: (800, 5)
DataFrame 2 encoded. First encoded sequence shape: (800, 5)
DataFrame 3 encoded. First encoded sequence shape: (800, 5)
DataFrame 4 encoded. First encoded sequence shape: (800, 5)
DataFrame 5 encoded. First encoded sequence shape: (800, 5)
DataFrame 6 encoded. First encoded sequence shape: (800, 5)
DataFrame 7 encoded. First encoded sequence shape: (800, 5)
DataFrame 8 encoded. First encoded sequence shape: (800, 5)
DataFrame 9 encoded. First encoded sequence shape: (800, 5)
DataFrame 10 encoded. First encoded sequence shape: (800, 5)


In [18]:
# Function to print an entry from the dataframe, just used to verify one hot function worked
def print_one_hot_entry(df, index):
    """Print the coordinates, raw sequence and one-hot matrix of one row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``chromosome``, ``start``, ``end``, ``data``
        and ``one_hot_data``.
    index : int
        Positional row index. Negative values count from the end, as with
        ``DataFrame.iloc``.

    Prints ``"Index out of range."`` instead of raising when ``index`` does
    not address an existing row, including negative indices below
    ``-len(df)``.
    """
    n_rows = len(df)
    # Guard both ends: the upper bound alone lets e.g. index = -len(df) - 1
    # reach iloc, which would raise IndexError.
    if not -n_rows <= index < n_rows:
        print("Index out of range.")
        return

    # Extracting data for the specified index
    entry = df.iloc[index]
    chromosome = entry['chromosome']
    start = entry['start']
    end = entry['end']
    sequence = entry['data']
    one_hot_sequence = entry['one_hot_data']

    # Printing the details
    print(f"Chromosome: {chromosome}, Start: {start}, End: {end}")
    print(f"DNA Sequence: {sequence}")
    print("One-hot Encoded Sequence:")
    print(one_hot_sequence)

# Spot-check the encoding: row 0 of the first DataFrame in Sequences_DataFrame
print_one_hot_entry(Sequences_DataFrame[0], 0)  # Print the first entry of the first DataFrame

Chromosome: Chr1, Start: 2493, End: 3293
DNA Sequence: TCACAAGCATACCAACGACCATGATAAATCCAAAAAGTAGAAACAATCTATTATCTAAACCCCCAAAAGACAAAAGAAAAAAGTAGAAAGAAAAGGTAGGCAGAGATATAATGCTGGTTTTATTTGTTTGTTAAAAGATATTGCTATTTCTGCCAATATTAAAACTTCACTTAGGAAGACTTGAACCTACCACACGTTAGTGACTAATGAGAGCCACTAGATAATTGCATGCATCCCACACTAGTACTAATTTTCTAGGGATATTAGAGTTTTCTAATCACCTACTTCCTACTATGTGTATGTTATCTACTGGCGTGGATGCTTTTAAAGATGTTACGTTATTATTTTGTTCGGTTTGGAAAACGGCTCAATCGTTATGAGTTCGTAAGACACATACATTGTTCCATGATAAAATGCAACCCCACGAACCATTTGCGACAAGCAAAACAACATGGTCAAAATTAAAAGCTAACAATTAGCCAGCGATTCAAAAAGTCAACCTTCTAGATGGATTTAACAACATATCGATAGGATTCAAGATTAAAAATAAGCACACTCTTATTAATGTTAAAAAACGAATGAGATGAAAATATTTGGCGTGTTCACACACATAATCTAGAAGACAGATTCGAGTTGCTCTCCTTTGTTTTGCTTTGGGAGGGACCCATTATTACCGCCCAGCAGCTTCCCAGCCTTCCTTTATAAGGCTTAATTTATATTTATTTAAATTTTATATGTTCTTCTATTATAATACTAAAAGGGGAATACAAATTTCTACAGAGGATGATATTCAATCCACG
One-hot Encoded Sequence:
[[0 0 0 1 0]
 [0 1 0 0 0]
 [1 0 0 0 0]
 ...
 [1 0 0 0 0]
 [0 1 0 0 0]
 [0 0 1 0 0]]
