In [17]:
import pandas as pd
import random
import numpy as np
from Bio import SeqIO
import os 
import pickle

In [18]:
# Paths to the per-chromosome Arabidopsis thaliana TAIR10 FASTA files (chromosomes 1-5).
(
    Chromosome_1_,
    Chromosome_2_,
    Chromosome_3_,
    Chromosome_4_,
    Chromosome_5_,
) = (
    'Final_bed_files/Chromosomes/Arabidopsis_thaliana.TAIR10.dna.chromosome.1.fa',
    'Final_bed_files/Chromosomes/Arabidopsis_thaliana.TAIR10.dna.chromosome.2.fa',
    'Final_bed_files/Chromosomes/Arabidopsis_thaliana.TAIR10.dna.chromosome.3.fa',
    'Final_bed_files/Chromosomes/Arabidopsis_thaliana.TAIR10.dna.chromosome.4.fa',
    'Final_bed_files/Chromosomes/Arabidopsis_thaliana.TAIR10.dna.chromosome.5.fa',
)


In [19]:
# Load every chromosome FASTA into memory as a plain Python string.
# Later cells look sequences up with seqs_List[chromosome_number - 1], so the
# order of file_paths (chromosome 1 .. 5) must be preserved.
seqs_List = []

file_paths = [Chromosome_1_, Chromosome_2_, Chromosome_3_, Chromosome_4_, Chromosome_5_]

for file_path in file_paths:
    with open(file_path, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            sequence = record.seq
            seqs_List.append(str(sequence))
            # Preview only: print the first 50 characters of the sequence
            print("Sequence:", str(sequence[:50]), "...")

# The positional lookup above only works if each file contributed exactly one
# record; fail loudly instead of silently indexing the wrong chromosome.
assert len(seqs_List) == len(file_paths), (
    f"expected one FASTA record per file, got {len(seqs_List)} records "
    f"from {len(file_paths)} files"
)

Sequence: CCCTAAACCCTAAACCCTAAACCCTAAACCTCTGAATCCTTAATCCCTAA ...
Sequence: NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN ...
Sequence: NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN ...
Sequence: NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN ...
Sequence: TATACCATGTACCCTCAACCTTAAAACCCTAAAACCTATACTATAAATCT ...


In [20]:
def process_dataframe(filename, half_window=400):
    """Read a BED peak file and compute a fixed-width window around each peak midpoint.

    Only the first three tab-separated columns (chromosome, start, end) are
    read; any further columns are ignored, so the file may carry any number of
    extra fields.

    Parameters
    ----------
    filename : str or path-like
        Tab-separated file without a header row.
    half_window : int, default 400
        Number of positions to extend on each side of the midpoint. The
        resulting window spans ``2 * half_window`` positions.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns ``chromosome``, ``start``,
        ``end``, ``midpoint``, ``dif`` (``end - start``), ``new_start``,
        ``new_end`` and ``position_difference_new`` (``new_end - new_start``).

    Notes
    -----
    ``new_start`` is not clipped: it is negative for peaks whose midpoint lies
    closer than ``half_window`` to position 0.
    """
    df = pd.read_csv(
        filename,
        sep='\t',
        header=None,
        usecols=[0, 1, 2],
        names=['chromosome', 'start', 'end'],
    )

    # Midpoint uses floor division, so odd-length peaks round towards 'start'.
    dif = df['end'] - df['start']
    midpoint = df['start'] + dif // 2

    # The new window extends half_window positions each way from the midpoint.
    new_start = midpoint - half_window
    new_end = midpoint + half_window

    return pd.DataFrame({
        'chromosome': df['chromosome'],
        'start': df['start'],
        'end': df['end'],
        'midpoint': midpoint,
        'dif': dif,
        'new_start': new_start,
        'new_end': new_end,
        'position_difference_new': new_end - new_start,
    })

In [21]:
# NOTE: testFile is not used in this cell; only the files in `directory` are read.
testFile = "Final_bed_files/SRP034156_BedFiles/SRX391990.target.all.bed"
directory = "Final_bed_files//test_bedFiles"

# List to store dataframes
dataframes = []

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# `dataframes` (and every "DataFrame N" label printed later) stable across runs.
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".bed"):
        file_path = os.path.join(directory, filename)
        df = process_dataframe(file_path)
        dataframes.append(df)


In [22]:
def add_seqs(df, data_list, seq_length=800):
    """Extract the window sequence for every row of ``df`` and pad it with 'N'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``chromosome`` (labels such as ``'Chr1'``),
        ``new_start`` and ``new_end``.
    data_list : sequence of str
        Chromosome sequences; the sequence for ``'Chr<k>'`` is ``data_list[k - 1]``.
    seq_length : int, default 800
        Minimum length of the returned sequences. Shorter extractions are
        padded with 'N'.

    Returns
    -------
    pandas.DataFrame
        Columns ``chromosome``, ``start`` (the row's ``new_start``), ``end``
        (the row's ``new_end``), ``size`` (length of ``data``) and ``data``.
        Empty, with the same columns, when ``df`` has no rows.

    Raises
    ------
    ValueError
        If a chromosome label is not ``'Chr'`` followed by an integer.

    Notes
    -----
    Positions before the start of the chromosome are padded on the left and
    positions past its end on the right, so the peak midpoint stays centred.
    Prints ``N to add:`` with the total number of padded positions for every
    padded row, and ``done`` once at the end.
    """
    columns = ['chromosome', 'start', 'end', 'size', 'data']
    records = []

    for chromosome_label, new_start, new_end in zip(df['chromosome'], df['new_start'], df['new_end']):
        # Map the 'Chr<k>' label to the index of that chromosome in data_list.
        chromosome_number = int(chromosome_label.replace('Chr', ''))
        data = data_list[chromosome_number - 1]

        window_start = int(new_start)
        window_end = int(new_end)

        # A negative slice bound would be interpreted by Python as an offset
        # from the END of the chromosome, so clamp to 0 and account for the
        # clipped positions with explicit left padding instead.
        left_pad = max(0, min(window_end, 0) - window_start)
        extracted_data = data[max(window_start, 0):max(window_end, 0)]  # gets the peak sequence
        right_pad = max(0, seq_length - left_pad - len(extracted_data))

        if left_pad or right_pad:
            print("N to add: ", left_pad + right_pad)
            extracted_data = 'N' * left_pad + extracted_data + 'N' * right_pad

        records.append({
            'chromosome': chromosome_label,
            'start': new_start,
            'end': new_end,
            'size': len(extracted_data),
            'data': extracted_data,
        })

    # Build the frame once from all records; passing columns keeps the schema
    # even when there are no rows.
    result_df = pd.DataFrame(records, columns=columns)
    print("done")
    return result_df

In [23]:
# Sanity check: report the dimensions and column names of every DataFrame in the list
for i, df in enumerate(dataframes):
    num_rows = len(df.index)
    num_columns = len(df.columns)
    print(f"DataFrame {i+1}: {num_rows} rows x {num_columns} columns")
    print(f"Columns: {df.columns.tolist()}")

DataFrame 1: 29238 rows x 8 columns
Columns: ['chromosome', 'start', 'end', 'midpoint', 'dif', 'new_start', 'new_end', 'position_difference_new']
DataFrame 2: 13905 rows x 8 columns
Columns: ['chromosome', 'start', 'end', 'midpoint', 'dif', 'new_start', 'new_end', 'position_difference_new']
DataFrame 3: 16752 rows x 8 columns
Columns: ['chromosome', 'start', 'end', 'midpoint', 'dif', 'new_start', 'new_end', 'position_difference_new']
DataFrame 4: 19464 rows x 8 columns
Columns: ['chromosome', 'start', 'end', 'midpoint', 'dif', 'new_start', 'new_end', 'position_difference_new']
DataFrame 5: 20457 rows x 8 columns
Columns: ['chromosome', 'start', 'end', 'midpoint', 'dif', 'new_start', 'new_end', 'position_difference_new']
DataFrame 6: 24019 rows x 8 columns
Columns: ['chromosome', 'start', 'end', 'midpoint', 'dif', 'new_start', 'new_end', 'position_difference_new']
DataFrame 7: 22932 rows x 8 columns
Columns: ['chromosome', 'start', 'end', 'midpoint', 'dif', 'new_start', 'new_end', 'posi

In [24]:
# Attach the extracted (and N-padded) window sequence to every peak table.
Sequences_DataFrame = []

for peaks_df in dataframes:
    processed_df = add_seqs(peaks_df, seqs_List)
    # Show the columns produced for this table, then keep the result
    print("Columns in processed DataFrame:", processed_df.columns.tolist())
    Sequences_DataFrame.append(processed_df)


N to add:  246
done
Columns in processed DataFrame: ['chromosome', 'start', 'end', 'size', 'data']
done
Columns in processed DataFrame: ['chromosome', 'start', 'end', 'size', 'data']
done
Columns in processed DataFrame: ['chromosome', 'start', 'end', 'size', 'data']
N to add:  37
done
Columns in processed DataFrame: ['chromosome', 'start', 'end', 'size', 'data']
done
Columns in processed DataFrame: ['chromosome', 'start', 'end', 'size', 'data']
done
Columns in processed DataFrame: ['chromosome', 'start', 'end', 'size', 'data']
N to add:  44
done
Columns in processed DataFrame: ['chromosome', 'start', 'end', 'size', 'data']
done
Columns in processed DataFrame: ['chromosome', 'start', 'end', 'size', 'data']
done
Columns in processed DataFrame: ['chromosome', 'start', 'end', 'size', 'data']
done
Columns in processed DataFrame: ['chromosome', 'start', 'end', 'size', 'data']


In [25]:
# Sanity check: report the dimensions and column names of every sequence DataFrame
for i, df in enumerate(Sequences_DataFrame):
    num_rows, num_columns = len(df.index), len(df.columns)
    print(f"DataFrame {i+1}: {num_rows} rows x {num_columns} columns")
    print(f"Columns: {df.columns.tolist()}")


DataFrame 1: 29238 rows x 5 columns
Columns: ['chromosome', 'start', 'end', 'size', 'data']
DataFrame 2: 13905 rows x 5 columns
Columns: ['chromosome', 'start', 'end', 'size', 'data']
DataFrame 3: 16752 rows x 5 columns
Columns: ['chromosome', 'start', 'end', 'size', 'data']
DataFrame 4: 19464 rows x 5 columns
Columns: ['chromosome', 'start', 'end', 'size', 'data']
DataFrame 5: 20457 rows x 5 columns
Columns: ['chromosome', 'start', 'end', 'size', 'data']
DataFrame 6: 24019 rows x 5 columns
Columns: ['chromosome', 'start', 'end', 'size', 'data']
DataFrame 7: 22932 rows x 5 columns
Columns: ['chromosome', 'start', 'end', 'size', 'data']
DataFrame 8: 11987 rows x 5 columns
Columns: ['chromosome', 'start', 'end', 'size', 'data']
DataFrame 9: 22100 rows x 5 columns
Columns: ['chromosome', 'start', 'end', 'size', 'data']
DataFrame 10: 13879 rows x 5 columns
Columns: ['chromosome', 'start', 'end', 'size', 'data']


In [26]:
def generate_labels_and_split_all(dataframes_list, data_list, seq_length=800, seed=None):
    """Build a labeled dataset of positive and shuffled negative DNA sequences.

    For every row, the sequence ``data_list[k - 1][start:end]`` is extracted,
    where ``k`` is the integer in the row's ``'Chr<k>'`` label. Rows whose
    extracted sequence is not exactly ``seq_length`` long are skipped. Each
    kept sequence yields two records: the sequence itself with label 1 and a
    random permutation of its characters with label 0.

    Despite the name, no train/test split is performed here.

    Parameters
    ----------
    dataframes_list : iterable of pandas.DataFrame
        Frames providing the columns ``chromosome``, ``start`` and ``end``.
    data_list : sequence of str
        Chromosome sequences, indexed by chromosome number minus one.
    seq_length : int, default 800
        Required length of an extracted sequence.
    seed : int or None, default None
        When given, shuffling uses a private ``random.Random(seed)`` so the
        negatives are reproducible. When None, the module-level ``random``
        state is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``chromosome``, ``sequence`` and ``label``; empty (with these
        columns) when no sequence qualifies.

    Raises
    ------
    ValueError
        If a chromosome label is not ``'Chr'`` followed by an integer.
    """
    rng = random if seed is None else random.Random(seed)
    columns = ['chromosome', 'sequence', 'label']
    records = []

    for df in dataframes_list:
        for chromosome_label, start, end in zip(df['chromosome'], df['start'], df['end']):
            chromosome_number = int(chromosome_label.replace('Chr', ''))

            # Negative bounds would be read as offsets from the chromosome end
            # and return a sequence from the wrong location.
            if start < 0 or end < 0:
                continue

            sequence = data_list[chromosome_number - 1][start:end]

            # Keep only windows that lie completely inside the chromosome.
            if len(sequence) != seq_length:
                continue

            # Positive example
            records.append({'chromosome': chromosome_label, 'sequence': sequence, 'label': 1})

            # Negative example: same characters in a random order
            negative_sequence = ''.join(rng.sample(sequence, len(sequence)))
            records.append({'chromosome': chromosome_label, 'sequence': negative_sequence, 'label': 0})

    # Passing columns keeps the schema when there are no records at all.
    return pd.DataFrame(records, columns=columns)

# Seed Python's RNG first: the negative examples are produced by shuffling with
# the `random` module, so without a seed they differ on every run.
random.seed(42)

# Generate the labeled DataFrame
labeled_df = generate_labels_and_split_all(Sequences_DataFrame, seqs_List)


In [27]:
# Report the dimensions, column names and first record of the labeled DataFrame
num_rows = len(labeled_df.index)
num_columns = len(labeled_df.columns)
print(f"Labeled DataFrame: {num_rows} rows x {num_columns} columns")
print(f"Columns: {labeled_df.columns.tolist()}")
print(labeled_df.iloc[0])


Labeled DataFrame: 389460 rows x 3 columns
Columns: ['chromosome', 'sequence', 'label']
chromosome                                                 Chr1
sequence      TAAGAAATCCATACATCAACATATCGCTTTCGTTACCTTAAATTTT...
label                                                         1
Name: 0, dtype: object


In [28]:
def one_hot_encode(seq):
    """Convert a nucleotide sequence to a one-hot encoded matrix.

    Parameters
    ----------
    seq : str
        Sequence of nucleotide characters. Matching is case-insensitive.

    Returns
    -------
    numpy.ndarray
        ``int8`` array of shape ``(len(seq), 5)``. The columns correspond to
        A, C, G, T and N in that order. A character outside this alphabet
        leaves its row all zeros.
    """
    # Mapping of nucleotides to column indices
    mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4}
    one_hot = np.zeros((len(seq), len(mapping)), dtype=np.int8)

    for i, nucleotide in enumerate(seq):
        # Upper-case per character (not the whole string) so the position
        # index always lines up with the original sequence.
        column = mapping.get(nucleotide.upper())
        if column is not None:
            one_hot[i, column] = 1
    return one_hot

def encode_sequences(df):
    """Add a 'one_hot_data' column holding the one-hot encoding of each 'sequence'.

    The column is written onto ``df`` itself (the input frame is modified in
    place) and the same frame is returned.
    """
    sequence_column = df['sequence']
    encoded_column = sequence_column.apply(one_hot_encode)
    df['one_hot_data'] = encoded_column
    return df

# One-hot encode the labeled DataFrame and report the shape of the first encoded sequence
encoded_df = encode_sequences(labeled_df)
first_encoded_shape = encoded_df['one_hot_data'].iloc[0].shape
print("DataFrame encoded. First encoded sequence shape:", first_encoded_shape)


DataFrame encoded. First encoded sequence shape: (800, 5)


In [29]:
def print_one_hot_entry(df, index):
    """Print one row of an encoded DataFrame; used to verify the one-hot encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``chromosome``, ``sequence``, ``label`` and
        ``one_hot_data``.
    index : int
        Positional row index. Negative values count from the end, as with
        ``DataFrame.iloc``.

    Prints the chromosome, label, sequence and every row of the one-hot
    matrix, or ``Index out of range.`` when ``index`` does not address a row.
    """
    # Check both bounds: a negative index below -len(df) is just as invalid as
    # one at or above len(df) and would otherwise raise inside .iloc.
    if not -len(df) <= index < len(df):
        print("Index out of range.")
        return

    # Extracting data for the specified index
    entry = df.iloc[index]
    chromosome = entry['chromosome']
    sequence = entry['sequence']
    label = entry['label']
    one_hot_sequence = entry['one_hot_data']

    # Printing the details
    print(f"Chromosome: {chromosome}")
    print(f"Label: {label}")
    print(f"DNA Sequence: {sequence}")
    print("One-hot Encoded Sequence:")
    for row in one_hot_sequence:  # one line per position of the sequence
        print(row)

# Spot-check the row at position 1 (the second row) of the labeled and encoded DataFrame
print_one_hot_entry(df=encoded_df, index=1)


Chromosome: Chr1
Label: 0
DNA Sequence: GAAGTACTAAACCACAATCTCTGTATCTGTACGTCAATACAACGACTCATCGACATTCGGTCAAAATGGATATGTGTAACTGGATGAAAATCGCAACTAAAGTGAAAAGAACTTACGCATGCCTAAGTATTCCTAAAGTGATAGTCACATTTAAAATGAATCATGCTGTCCACCGTCGTATAATGAATGAGTGTATATAATGCAGTTAATGAATATCACTGTATATGTCACTATAAGCGACCGTGGCAAACACAGTTTATTTTAAACGACGGTTTACGATGCGAACACCAATCACACAGACTTGTCCATAGAAAATCTTAGCGCCAATCCCGTAAAGAATTATTCTCTGAAGAGATTGAAAAGCTACCACCTAGTTTATACAATCCTGTAAAGCTAGTAATTAATGTCAGGTAAAATTGTTATATGAATCTAATCAGTTTGGCGGAGTAACTCAGCGATAAGCACCGACATTTAACACTCTAGGGTAGTCGTGCTAGCTTTTGATTATAAATATATAAAGCTATTTATACGGAGTTATATATTTATAAACCAGAATCAGAGAAGACGTGGATTTATCATCACTTAAGCTATGACATCAGATGCTATAGTTTTCGATAAACCACTGTTTGACAACATAACCGACATGCCATTTGCCAGGCATATATCTTATTCCACAATATCTACACCAAAAAATCCTAACGTTAGATCGTATGTACGTTTCAGTTACACTAGATTCAAATGGATGAGTTCTGCCTACTTCAATACCTATGACCTTAACATGTCAAATTAACCCGATTTTT
One-hot Encoded Sequence:
[0 0 1 0 0]
[1 0 0 0 0]
[1 0 0 0 0]
[0 0 1 0 0]
[0 0 0 1 0]
[1 0 0 0 0]
[0 1 0 0 0]
[0 0 0 1 0]
[1 0 0 0 0]
[1 0 0 0 0]
[1 0 0 0 0]
[

In [30]:
def save_encoded_df(df, filename):
    """Pickle ``df`` to ``filename`` (opened in binary write mode) and print a confirmation."""
    with open(filename, 'wb') as out_file:
        pickler = pickle.Pickler(out_file)
        pickler.dump(df)
    print(f"DataFrame saved to {filename}")

# Persist the encoded DataFrame as a pickle file in the working directory
save_encoded_df(df=encoded_df, filename='encoded_df.pkl')


DataFrame saved to encoded_df.pkl
