In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [17]:
# Read the four raw source tables in one pass; the unpacking order matches the
# order of the paths below.
fb, b168, collecttf, mg1655 = map(
    pd.read_csv,
    (
        '../data_source/Factorbook-ChIP-seq.csv',
        '../data_source/B.-subtilis-subtilis-168_cleaned.csv',
        '../data_source/CollectTF_cleaned.csv',
        '../data_source/E.-coli-K-12-substr.-MG1655_cleaned_v2.csv',
    ),
)

In [18]:
# Align the B. subtilis column name with the 'TF sequence' spelling used for the
# combined table, then show the resulting schema.
b168 = b168.rename(columns={'TF Sequence': 'TF sequence'})
b168.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   species                107 non-null    object
 1   TF name                107 non-null    object
 2   TF sequence            107 non-null    object
 3   binding site sequence  107 non-null    object
dtypes: object(4)
memory usage: 3.5+ KB


In [19]:
# Stack the three curated sources row-wise under a fresh 0..n-1 index.
# Rows containing nulls are still present at this point; they are removed
# later by the dropna step.
combined_df = pd.concat(
    (b168, collecttf, mg1655),
    ignore_index=True,
)
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14121 entries, 0 to 14120
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   species                11940 non-null  object
 1   TF name                11940 non-null  object
 2   TF sequence            11940 non-null  object
 3   binding site sequence  11940 non-null  object
dtypes: object(4)
memory usage: 441.4+ KB


In [20]:
# Work on an independent copy of the Factorbook table so that the ambiguity
# column added in the next step never modifies `fb` itself.
additional_df = fb.copy(deep=True)

In [21]:
# IUPAC ambiguity codes: every nucleotide letter other than A, C, G, T/U.
ambiguous_nucleotides = set('RYWSKMBDHVN')

# Function to measure how ambiguous a sequence is
def count_ambiguous_nucleotides(sequence):
    """Return the fraction of ambiguous nucleotides in ``sequence``.

    Despite the historical name, the result is a ratio in ``[0.0, 1.0]``
    (ambiguous letters divided by sequence length), not a raw count.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence. Matching is case-insensitive, so ``'n'`` is
        treated the same as ``'N'``.

    Returns
    -------
    float
        Fraction of characters that are in ``ambiguous_nucleotides``.
        ``nan`` is returned when the fraction is undefined: for an empty
        string or for a non-string value such as a missing CSV cell. Because
        ``nan <= threshold`` is always False, such rows are excluded by a
        ``<=`` filter instead of crashing the whole ``apply``.
    """
    # Missing cells arrive as float NaN; an empty string would divide by zero.
    if not isinstance(sequence, str) or not sequence:
        return float('nan')

    ambiguous_hits = sum(
        1 for nucleotide in sequence.upper() if nucleotide in ambiguous_nucleotides
    )
    return ambiguous_hits / len(sequence)

# Score every binding site element-wise and store the result in a new column.
additional_df['ambiguous_count'] = (
    additional_df['binding site sequence'].map(count_ambiguous_nucleotides)
)

In [22]:
# Largest 'ambiguous_count' value (a per-sequence fraction) that a row may have
# and still be kept by the `<= threshold` filter; 0.0 keeps only sequences
# with no ambiguous nucleotides at all.
threshold = 0.0

In [23]:
# Keep only the rows whose ambiguity score does not exceed the threshold.
zero_ambiguity_df = additional_df.loc[
    additional_df['ambiguous_count'].le(threshold)
]
zero_ambiguity_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1021 entries, 27 to 14269
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   species                1021 non-null   object 
 1   TF name                1021 non-null   object 
 2   TF sequence            1021 non-null   object 
 3   binding site sequence  1021 non-null   object 
 4   ambiguous_count        1021 non-null   float64
dtypes: float64(1), object(4)
memory usage: 47.9+ KB


In [24]:
# Append the filtered Factorbook rows to the curated sources. The extra
# 'ambiguous_count' column is NaN for rows that did not come from Factorbook.
combined_df = pd.concat(
    (combined_df, zero_ambiguity_df),
    ignore_index=True,
)
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15142 entries, 0 to 15141
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   species                12961 non-null  object 
 1   TF name                12961 non-null  object 
 2   TF sequence            12961 non-null  object 
 3   binding site sequence  12961 non-null  object 
 4   ambiguous_count        1021 non-null   float64
dtypes: float64(1), object(4)
memory usage: 591.6+ KB


In [25]:
# Remove the helper column first (it is NaN for most rows and would otherwise
# make dropna discard them), then discard every row that still has a null.
combined_df = combined_df.drop(columns=['ambiguous_count']).dropna()

In [26]:
# Sanity check: inspect the column list and non-null counts of the cleaned
# positive set before it is written to disk.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12961 entries, 0 to 15141
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   species                12961 non-null  object
 1   TF name                12961 non-null  object
 2   TF sequence            12961 non-null  object
 3   binding site sequence  12961 non-null  object
dtypes: object(4)
memory usage: 506.3+ KB


In [27]:
# Persist the positive set; index=False keeps the pandas row index out of the
# CSV file.
combined_df.to_csv('../dataset/positive_set_v3.csv', index=False)

### Negative set

In each individual data source, the listed DNA–protein pairs form the truth (positive) set. Hence, pairs that are not listed in a dataset can be treated as negatives. We can shuffle the pairings within each dataset (or even across different datasets) to create "fake" pairings, which serve as the negative set.

In [16]:
import random

# Negative set: TF sequences and binding sites are re-paired across species.
# Seed Python's global RNG first so the sampling below is repeatable.
random.seed(42)

# Distinct species labels present in the positive set, in order of appearance
species_list = pd.unique(combined_df['species'])

# Function to generate fake pairs that don't exist in the given frame
def generate_negative_samples(df, n_samples=10000):
    """Create negative TF / binding-site pairs by re-pairing across species.

    Each attempt picks two different species, takes a random TF (name and
    sequence) from the first and a random binding site from the second, and
    keeps the pair unless it already occurs in ``df`` or was already emitted.

    Randomness comes from the global ``random`` module state, so call
    ``random.seed(...)`` beforehand for reproducible output.

    Parameters
    ----------
    df : pandas.DataFrame
        Positive set with the columns 'species', 'TF name', 'TF sequence'
        and 'binding site sequence'. The species to sample from are taken
        from this frame.
    n_samples : int, default 10000
        Number of negative pairs requested.

    Returns
    -------
    pandas.DataFrame
        Columns 'species' (always the string 'fake'), 'TF name',
        'TF sequence', 'binding site sequence' and 'label' (always 0).
        May hold fewer than ``n_samples`` rows: sampling stops after
        ``n_samples * 10`` attempts. The frame is empty when ``df`` has
        fewer than two distinct species, because no cross-species pair can
        be formed.
    """
    columns = ['species', 'TF name', 'TF sequence', 'binding site sequence', 'label']

    # Use the species of the frame being sampled rather than a module-level
    # variable, so the function does not depend on hidden notebook state.
    species_pool = list(df['species'].unique())
    if len(species_pool) < 2:
        return pd.DataFrame(columns=columns)

    # Loop-invariant work hoisted out of the sampling loop: split the frame
    # by species once instead of re-filtering all rows twice per attempt.
    rows_by_species = {}
    for species in species_pool:
        subset = df[df['species'] == species]
        rows_by_species[species] = (
            subset['TF name'].tolist(),
            subset['TF sequence'].tolist(),
            subset['binding site sequence'].tolist(),
        )

    # Create a set of existing pairs for quick lookup
    existing_pairs = set(zip(df['TF sequence'], df['binding site sequence']))

    negative_samples = []

    # Track how many attempts we make to avoid infinite loops
    attempts = 0
    max_attempts = n_samples * 10

    while len(negative_samples) < n_samples and attempts < max_attempts:
        attempts += 1

        # Pick two different species
        species1, species2 = random.sample(species_pool, 2)

        # Get TF from species1. A NaN species label matches no rows, so its
        # group is empty and the attempt is skipped.
        tf_names, tf_sequences, _ = rows_by_species[species1]
        if len(tf_sequences) == 0:
            continue
        tf_idx = random.randint(0, len(tf_sequences) - 1)
        tf_name = tf_names[tf_idx]
        tf_sequence = tf_sequences[tf_idx]

        # Get binding site from species2
        _, _, binding_sites = rows_by_species[species2]
        if len(binding_sites) == 0:
            continue
        bs_idx = random.randint(0, len(binding_sites) - 1)
        binding_site = binding_sites[bs_idx]

        # Skip pairs that are known positives or were already generated
        if (tf_sequence, binding_site) in existing_pairs:
            continue

        negative_samples.append({
            'species': 'fake',  # fake species
            'TF name': tf_name,
            'TF sequence': tf_sequence,
            'binding site sequence': binding_site,
            'label': 0,  # 0 for negative samples
        })

        # Add to existing pairs to avoid duplicates in negative set
        existing_pairs.add((tf_sequence, binding_site))

    return pd.DataFrame(negative_samples, columns=columns)

# Positive samples: a labelled copy of the cleaned pairs (1 = positive)
combined_df_labeled = combined_df.assign(label=1)

# Negative samples: request twice as many fake pairs as there are positives
negative_df = generate_negative_samples(combined_df, n_samples=2 * len(combined_df))

# Stack positives and negatives, then shuffle the rows with a fixed seed
final_dataset = (
    pd.concat([combined_df_labeled, negative_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"Positive samples: {len(combined_df_labeled)}")
print(f"Negative samples: {len(negative_df)}")
print(f"Total dataset size: {len(final_dataset)}")

# Write the labelled dataset to disk
final_dataset.to_csv('../dataset/combined_dataset_with_negatives_v3.csv', index=False)

Positive samples: 12961
Negative samples: 25922
Total dataset size: 38883


In [28]:
# Inspect the schema of the labelled dataset (positives plus generated
# negatives) before splitting it.
final_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38883 entries, 0 to 38882
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   species                38883 non-null  object
 1   TF name                38883 non-null  object
 2   TF sequence            38883 non-null  object
 3   binding site sequence  38883 non-null  object
 4   label                  38883 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.5+ MB


## Division into training set and test set

In [30]:
# Import required libraries
from sklearn.model_selection import train_test_split

# Get the list of unique species from the final dataset
species_list = final_dataset['species'].unique()

# Collect the per-species pieces in lists and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop copies all accumulated
# rows on every iteration and starts from an empty frame.
train_parts = []
test_parts = []

# For each species, split the data into train and test sets.
# NOTE(review): generated negatives all carry the species label 'fake', so
# each species group appears to contain a single label and `stratify` does
# not balance positives against negatives inside a group. Confirm this is
# the intended grouping.
for species in species_list:
    species_data = final_dataset[final_dataset['species'] == species]

    # If there are fewer than 10 samples for a species, add all to training
    if len(species_data) < 10:
        train_parts.append(species_data)
        continue

    # Stratified split on the label column within this species group
    species_train, species_test = train_test_split(
        species_data,
        test_size=0.1,  # 10% for testing
        random_state=42,
        stratify=species_data['label'],  # Stratify by label
    )

    train_parts.append(species_train)
    test_parts.append(species_test)

# Concatenate once with a fresh 0..n-1 index; fall back to an empty frame
# when no piece was collected (pd.concat raises on an empty list).
train_df = pd.concat(train_parts, ignore_index=True) if train_parts else pd.DataFrame()
test_df = pd.concat(test_parts, ignore_index=True) if test_parts else pd.DataFrame()

# Print statistics
print(f"Training set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")
print(f"Training set positive examples: {len(train_df[train_df['label'] == 1])}")
print(f"Training set negative examples: {len(train_df[train_df['label'] == 0])}")
print(f"Test set positive examples: {len(test_df[test_df['label'] == 1])}")
print(f"Test set negative examples: {len(test_df[test_df['label'] == 0])}")

# Save the datasets
train_df.to_csv('../dataset/train_set_v3.csv', index=False)
test_df.to_csv('../dataset/test_set_v3.csv', index=False)

Training set size: 34993
Test set size: 3890
Training set positive examples: 11664
Training set negative examples: 23329
Test set positive examples: 1297
Test set negative examples: 2593
