In [7]:
import pandas as pd
from tqdm import tqdm
 
# Function to read chromosome sequences from FASTA files
def read_chromosome_sequence(filepath):
    with open(filepath, 'r') as f:
        sequence = ''.join(line.strip() for line in f.readlines()[1:])
    return sequence
 
# File paths for chromosomes
Chromosome_files = {
    'Chromosome_1': 'Final_bed_files/Chromosomes/Arabidopsis_thaliana.TAIR10.dna.chromosome.1.fa',
    'Chromosome_2': 'Final_bed_files/Chromosomes/Arabidopsis_thaliana.TAIR10.dna.chromosome.2.fa',
    'Chromosome_3': 'Final_bed_files/Chromosomes/Arabidopsis_thaliana.TAIR10.dna.chromosome.3.fa',
    'Chromosome_4': 'Final_bed_files/Chromosomes/Arabidopsis_thaliana.TAIR10.dna.chromosome.4.fa',
    'Chromosome_5': 'Final_bed_files/Chromosomes/Arabidopsis_thaliana.TAIR10.dna.chromosome.5.fa'
}
 
# Read chromosome sequences
chromosome_sequences = {}
for chromosome, filepath in Chromosome_files.items():
    chromosome_sequences[chromosome.split('_')[1]] = read_chromosome_sequence(filepath)
 
def one_hot_encode_dna(sequence):
    mapping = {'A': [1, 0, 0, 0], 'C': [0, 1, 0, 0], 'G': [0, 0, 1, 0], 'T': [0, 0, 0, 1], 'N': [0, 0, 0, 0]}
    one_hot_encoded = [mapping.get(nucleotide, [0, 0, 0, 0]) for nucleotide in sequence]
    return one_hot_encoded
 
def extract_peak_sequences(bed_file, chromosome_sequences):
    peaks = []
    with open(bed_file, 'r') as f:
        lines = f.readlines()
 
    for line in tqdm(lines, desc=f"Extracting peaks from {bed_file}"):
        parts = line.strip().split('\t')
        chromosome = parts[0].split('Chr')[1]
        start = int(parts[1])
        end = int(parts[2])
        peak_name = parts[3].split('_')[0]
        peak_sequence = chromosome_sequences[chromosome][start-400:end+400]
 
        # Padding logic: extend the sequence to 800 if it is shorter
        target_length = 800
        if len(peak_sequence) < target_length:
            peak_sequence += 'N' * (target_length - len(peak_sequence))
        elif len(peak_sequence) > target_length:
            peak_sequence = peak_sequence[:target_length]
 
        one_hot_sequence = one_hot_encode_dna(peak_sequence)
        peaks.append({
            'Chromosome': chromosome,
            'Start': start,
            'End': end,
            'Peak Name': peak_name,
            'Sequence': peak_sequence,
            'One_Hot_Sequence': one_hot_sequence
        })
    return peaks
 
SRX_bed_files = [
    'Final_bed_files/test_bedFiles/SRX391990.target.all.bed',
    'Final_bed_files/test_bedFiles/SRX391991.target.all.bed',
    'Final_bed_files/test_bedFiles/SRX391992.target.all.bed',
    'Final_bed_files/test_bedFiles/SRX391993.target.all.bed',
    'Final_bed_files/test_bedFiles/SRX391994.target.all.bed',
    'Final_bed_files/test_bedFiles/SRX391995.target.all.bed',
    'Final_bed_files/test_bedFiles/SRX391996.target.all.bed',
    'Final_bed_files/test_bedFiles/SRX391997.target.all.bed',
    'Final_bed_files/test_bedFiles/SRX1096548.target.all.bed',
    'Final_bed_files/test_bedFiles/SRX1096549.target.all.bed'
]
 
all_peaks = []
for bed_file in SRX_bed_files:
    peaks = extract_peak_sequences(bed_file, chromosome_sequences)
    all_peaks.extend(peaks)
 
df = pd.DataFrame(all_peaks)
print(df.head())
print("Shape of DataFrame:", df.shape)


Extracting peaks from Final_bed_files/test_bedFiles/SRX391990.target.all.bed: 100%|██████████| 19464/19464 [00:02<00:00, 7385.33it/s] 
Extracting peaks from Final_bed_files/test_bedFiles/SRX391991.target.all.bed: 100%|██████████| 22100/22100 [00:01<00:00, 11960.51it/s]
Extracting peaks from Final_bed_files/test_bedFiles/SRX391992.target.all.bed: 100%|██████████| 16752/16752 [00:01<00:00, 11372.38it/s]
Extracting peaks from Final_bed_files/test_bedFiles/SRX391993.target.all.bed: 100%|██████████| 22932/22932 [00:05<00:00, 3830.51it/s]
Extracting peaks from Final_bed_files/test_bedFiles/SRX391994.target.all.bed: 100%|██████████| 29238/29238 [00:05<00:00, 5826.96it/s]
Extracting peaks from Final_bed_files/test_bedFiles/SRX391995.target.all.bed: 100%|██████████| 13905/13905 [00:04<00:00, 3102.38it/s]
Extracting peaks from Final_bed_files/test_bedFiles/SRX391996.target.all.bed: 100%|██████████| 13879/13879 [00:10<00:00, 1359.30it/s]
Extracting peaks from Final_bed_files/test_bedFiles/SRX3919

  Chromosome  Start    End  Peak Name  \
0          1   2374   3625  SRX391990   
1          1   8607   8909  SRX391990   
2          1   9512   9807  SRX391990   
3          1  14639  15688  SRX391990   
4          1  20587  21661  SRX391990   

                                            Sequence  \
0  ATATTAACCATGTATTCATAGTAAAATGTTTCATGTGATATCAAAC...   
1  AAGAACTTGAATTGAAATAGTTTTTTACCTGAATATTGACAGTTGC...   
2  TTTATAAGAAAATAAATTATTTATTACAATTCAACAGTGAAGAAAT...   
3  GCCATGATTACTTAAAACTTTGTATGCATATACCATGTACATATGA...   
4  AAATTATGTTTTCATAGTCAAGTAACTAGTTTGTGTTATTTCCATT...   

                                    One_Hot_Sequence  
0  [[1, 0, 0, 0], [0, 0, 0, 1], [1, 0, 0, 0], [0,...  
1  [[1, 0, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [1,...  
2  [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1,...  
3  [[0, 0, 1, 0], [0, 1, 0, 0], [0, 1, 0, 0], [1,...  
4  [[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [0,...  
Shape of DataFrame: (194733, 6)


In [8]:
def label_peaks(df, bed_files, overlap_threshold=300):
    # Extract the SRX identifier from each bed file name
    srx_identifiers = [file.split('/')[-1].split('.')[0] for file in bed_files]
    
    # Create a dictionary to map SRX identifiers to their index positions
    srx_index_map = {srx: idx for idx, srx in enumerate(srx_identifiers)}
    
    # Create a dictionary to store peak intervals by SRX identifier
    peak_intervals = {srx: [] for srx in srx_identifiers}
    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing rows"):
        srx_id = row['Peak Name'].split('_')[0]
        if srx_id in peak_intervals:
            # Store tuple of (start, end, index_in_df)
            peak_intervals[srx_id].append((row['Start'], row['End'], index))
    
    # Initialize labels for all rows
    df['Label'] = [[0] * len(srx_identifiers) for _ in range(len(df))]
    
    # Function to set labels considering overlaps
    def set_labels():
        for srx, intervals in tqdm(peak_intervals.items(), desc="Setting labels"):
            for start, end, idx in intervals:
                # Check for overlap with all other intervals in the same group
                for other_srx, other_intervals in peak_intervals.items():
                    if srx == other_srx:  # Skip same SRX to avoid self-comparison
                        continue
                    for other_start, other_end, other_idx in other_intervals:
                        # Check if the intervals overlap by at least 300
                        if (min(end, other_end) - max(start, other_start) >= overlap_threshold):
                            df.at[idx, 'Label'][srx_index_map[other_srx]] = 1
                # Ensure the current SRX is always marked as 1
                df.at[idx, 'Label'][srx_index_map[srx]] = 1
    
    # Apply labels setting function
    set_labels()
    
    # Drop columns not required and retain only 'Chromosome', 'Label', and 'One_Hot_Sequence'
    final_df = df[['Chromosome', 'Label', 'One_Hot_Sequence']]
    
    return final_df


In [12]:
# Example usage:
fraction = 0.1  # Adjust this value to the desired fraction
sampled_df = df.sample(frac=fraction, random_state=42)

labeled_df = label_peaks(sampled_df, SRX_bed_files)
print(labeled_df.head)

Processing rows:   0%|          | 0/19473 [00:00<?, ?it/s]

Processing rows: 100%|██████████| 19473/19473 [00:00<00:00, 26319.49it/s]
Setting labels: 100%|██████████| 10/10 [01:37<00:00,  9.75s/it]

<bound method NDFrame.head of        Chromosome                           Label  \
194149          5  [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]   
137333          5  [0, 0, 0, 1, 0, 0, 1, 0, 1, 0]   
118008          3  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]   
16540           5  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]   
167015          4  [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]   
...           ...                             ...   
58903           1  [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]   
126620          1  [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]   
163656          3  [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]   
44612           1  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]   
79524           5  [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]   

                                         One_Hot_Sequence  
194149  [[0, 0, 0, 1], [0, 0, 0, 1], [1, 0, 0, 0], [0,...  
137333  [[1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0,...  
118008  [[1, 0, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0,...  
16540   [[0, 0, 1, 0], [0, 0, 0, 1], [1, 0, 0, 0], [0,...  
167015  [[1, 0, 0, 0], [0, 0, 0, 




In [13]:
# Print the shape of the DataFrame
print("Shape of labeled_df:", labeled_df.shape)

Shape of labeled_df: (19473, 3)


I wanted to check that my code worked correctly, and there was entries with more than one 1 label.

In [14]:
# Filter rows where the sum of 1s in the 'Label' vector is greater than 1
entries_with_multiple_ones = labeled_df[labeled_df['Label'].apply(lambda x: sum(x) > 1)]

# Print these entries
print(entries_with_multiple_ones)

# If you want to print a specific entry, such as the first one if it exists:
if not entries_with_multiple_ones.empty:
    print("First entry with multiple 1s in 'Label':")
    print(entries_with_multiple_ones.iloc[0])
else:
    print("No entries with multiple 1s found.")

       Chromosome                           Label  \
137333          5  [0, 0, 0, 1, 0, 0, 1, 0, 1, 0]   
143658          3  [0, 1, 0, 0, 0, 0, 0, 1, 0, 0]   
188381          4  [0, 1, 0, 1, 0, 0, 0, 0, 0, 1]   
17902           5  [1, 0, 0, 0, 1, 0, 0, 0, 1, 0]   
159362          2  [1, 0, 0, 0, 0, 0, 0, 0, 1, 0]   
...           ...                             ...   
185910          3  [1, 0, 0, 0, 1, 0, 0, 0, 1, 1]   
97311           3  [0, 0, 0, 1, 1, 0, 0, 0, 0, 0]   
104009          5  [0, 0, 0, 0, 1, 0, 1, 0, 1, 0]   
134104          4  [0, 0, 0, 0, 0, 0, 1, 0, 1, 0]   
56616           5  [0, 1, 1, 0, 0, 1, 0, 0, 0, 0]   

                                         One_Hot_Sequence  
137333  [[1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0,...  
143658  [[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [0,...  
188381  [[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [0,...  
17902   [[0, 0, 1, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0,...  
159362  [[0, 0, 0, 1], [0, 0, 0, 1], [1, 0, 0, 0], [0,...  
...

In [15]:
# Save the labeled DataFrame to a pickle file
labeled_df.to_pickle("labeled_df.pkl")

# Output a message indicating the file has been saved
print("The DataFrame has been saved to 'labeled_df.pkl'")

The DataFrame has been saved to 'labeled_df.pkl'
