In [1]:
import random
import pandas as pd
# Create a negative set by shuffling TF sequences and binding sites across different species
random.seed(42)  # for reproducibility

train_df = pd.read_csv('../dataset/train_positive_set_v4.csv')
test_df = pd.read_csv('../dataset/test_positive_set_v4.csv')


In [5]:
# Get unique species in the dataset
species_list = train_df['species'].unique()

# Function to generate fake pairs that don't exist in combined_df
def generate_negative_samples(df, n_samples=20000):
    # Create a set of existing pairs for quick lookup
    existing_pairs = set(zip(df['TF sequence'], df['binding site sequence']))
    
    negative_samples = []
    
    # Track how many attempts we make to avoid infinite loops
    attempts = 0
    max_attempts = n_samples * 10
    
    while len(negative_samples) < n_samples and attempts < max_attempts:
        attempts += 1
        
        # Pick two different species
        species1, species2 = random.sample(list(species_list), 2)
        
        # Get TF from species1
        tf_rows = df[df['species'] == species1]
        if len(tf_rows) == 0:
            continue
        tf_idx = random.randint(0, len(tf_rows) - 1)
        tf_row = tf_rows.iloc[tf_idx]
        tf_name = tf_row['TF name']
        tf_sequence = tf_row['TF sequence']
        
        # Get binding site from species2
        bs_rows = df[df['species'] == species2]
        if len(bs_rows) == 0:
            continue
        bs_idx = random.randint(0, len(bs_rows) - 1)
        binding_site = bs_rows.iloc[bs_idx]['binding site sequence']
        
        # Check if this pair already exists in the positive set
        if (tf_sequence, binding_site) not in existing_pairs:
            negative_samples.append({
                'species': 'fake', # fake species
                'TF name': tf_name,
                'TF sequence': tf_sequence,
                'binding site sequence': binding_site,
                'label': 0  # 0 for negative samples
            })
            
            # Add to existing pairs to avoid duplicates in negative set
            existing_pairs.add((tf_sequence, binding_site))
    
    return pd.DataFrame(negative_samples)

# Generate negative samples
negative_df = generate_negative_samples(train_df, n_samples=len(train_df)*2.05)

negative_df_test = negative_df.sample(frac=0.05, random_state=42)
negative_df = negative_df.drop(negative_df_test.index)

# Add label column to combined_df (positive samples)
train_df_labeled = train_df.copy()
train_df_labeled['label'] = 1  # 1 for positive samples

# Combine positive and negative datasets
final_dataset = pd.concat([train_df_labeled, negative_df], ignore_index=True)

# Shuffle the final dataset
final_dataset = final_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Positive samples: {len(train_df_labeled)}")
print(f"Negative samples: {len(negative_df)}")
print(f"Total dataset size: {len(final_dataset)}")

# Save the final dataset
final_dataset.to_csv('../dataset/training_dataset_with_negatives_v4.csv', index=False)

Positive samples: 12117
Negative samples: 23598
Total dataset size: 35715


In [6]:
len(negative_df_test)

1242

In [8]:
final_dataset.value_counts('label')

label
0    23598
1    12117
Name: count, dtype: int64

In [None]:
negative_df_generated_by_test = generate_negative_samples(test_df, n_samples=len(test_df))
test_df_labeled = test_df.copy()
test_df_labeled['label'] = 1
final_negative_test = pd.concat([negative_df_generated_by_test, negative_df_test], ignore_index=True)
finall_test_dataset = pd.concat([test_df_labeled, final_negative_test], ignore_index=True)
finall_test_dataset = finall_test_dataset.sample(frac=1, random_state=42).reset_index(drop=True)
finall_test_dataset.to_csv('../dataset/test_dataset_with_negatives_v4.csv', index=False)

1331

In [10]:
finall_test_dataset.value_counts('label')

label
0    1331
1     844
Name: count, dtype: int64