In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold


In [3]:
# Load the BioGrid human table, normalise the column names used by the rest of
# the notebook, and discard the precomputed length column in one chained step.
column_aliases = {'uniProtKBId': 'Protein', 'sequence': 'seq', 'Label': 'label'}
balanced_df = (
    pd.read_csv('Dataset/BioGrid_Human_SIP.csv')
    .rename(columns=column_aliases)
    .drop(columns='sequence length')
)
print(balanced_df.head())

       Protein                                                seq  label
0  TM129_HUMAN  MDSPEVTFTLAYLVFAVCFVFTPNEFHAAGLTVQNLLSGWLGSEDA...      1
1  MED19_HUMAN  MENFTALFGAQADPPPPPTALGFGPGKPPPPPPPPAGGGPGTAPPP...      1
2  BLT3B_HUMAN  MAGIIKKQILKHLSRFTKNLSPDKINLSTLKGEGELKNLELDEEVL...      0
3  SC5AA_HUMAN  MAANSTSDLHTPGTQLSVADIIVITVYFALNVAVGIWSSCRASRNT...      0
4  T120B_HUMAN  MSGQLERCEREWHELEGEFQELQETHRIYKQKLEELAALQTLCSSS...      0


In [3]:
# Helper columns that exist only to drive the stratification; they are removed
# again from every fold before it is stored.
helper_cols = ['seq_len', 'len_bin', 'stratify_col']

# Sequence length, then its quantile bin (up to 5 bins; duplicate edges are dropped).
balanced_df['seq_len'] = balanced_df['seq'].str.len()
balanced_df['len_bin'] = pd.qcut(balanced_df['seq_len'], q=5, labels=False, duplicates='drop')

# Stratification key "<label>_<length bin>" so folds are balanced on both.
label_text = balanced_df['label'].astype(str)
bin_text = balanced_df['len_bin'].astype(str)
balanced_df['stratify_col'] = label_text + "_" + bin_text

# 11 shuffled, seeded folds.
skf = StratifiedKFold(n_splits=11, shuffle=True, random_state=42)

# Keep only the held-out part of each split: these are the 11 disjoint folds.
split_dfs = [
    balanced_df.iloc[held_out_idx].drop(columns=helper_cols)
    for _, held_out_idx in skf.split(balanced_df, balanced_df['stratify_col'])
]

# Report the size and class balance of every fold.
for fold_no, fold in enumerate(split_dfs, start=1):
    print(f"Fold {fold_no}: {fold.shape}")
    counts = fold['label'].value_counts()
    print("Count of Label 0:", counts.get(0, 0))
    print("Count of Label 1:", counts.get(1, 0))


Fold 1: (740, 3)
Count of Label 0: 440
Count of Label 1: 300
Fold 2: (740, 3)
Count of Label 0: 441
Count of Label 1: 299
Fold 3: (740, 3)
Count of Label 0: 441
Count of Label 1: 299
Fold 4: (740, 3)
Count of Label 0: 441
Count of Label 1: 299
Fold 5: (740, 3)
Count of Label 0: 441
Count of Label 1: 299
Fold 6: (740, 3)
Count of Label 0: 441
Count of Label 1: 299
Fold 7: (739, 3)
Count of Label 0: 441
Count of Label 1: 298
Fold 8: (739, 3)
Count of Label 0: 441
Count of Label 1: 298
Fold 9: (739, 3)
Count of Label 0: 441
Count of Label 1: 298
Fold 10: (739, 3)
Count of Label 0: 439
Count of Label 1: 300
Fold 11: (739, 3)
Count of Label 0: 440
Count of Label 1: 299


In [4]:
def split_sequence(sequence, max_length=254, chunk_size=254, stride=64, keep_tail=False):
    """Split a long sequence into overlapping fixed-size windows.

    Parameters
    ----------
    sequence : str
        Sequence to split.
    max_length : int
        Sequences of at most this length are returned unchanged.
    chunk_size : int
        Length of every window produced for longer sequences. Must be >= 1.
    stride : int
        Offset between the starts of consecutive windows. Must be >= 1.
    keep_tail : bool
        When False (default) only windows starting at multiples of ``stride``
        are produced, so residues after the last full window are not covered.
        When True, one extra window anchored at the end of the sequence is
        appended whenever the regular windows do not already reach the end.

    Returns
    -------
    list of str
        ``[sequence]`` when the sequence is not longer than ``max_length`` or
        is too short to hold a single full window; otherwise the list of
        full-length windows in left-to-right order.

    Raises
    ------
    ValueError
        If ``chunk_size`` or ``stride`` is smaller than 1.
    """
    if chunk_size < 1 or stride < 1:
        raise ValueError("chunk_size and stride must be positive integers")

    if len(sequence) <= max_length:
        return [sequence]

    # Start index of the right-most window that still fits completely.
    last_start = len(sequence) - chunk_size
    if last_start < 0:
        # Longer than max_length but shorter than one window (only possible
        # when max_length < chunk_size): keep the sequence instead of
        # returning an empty list and silently losing it.
        return [sequence]

    # Only full windows are emitted; partial windows at the end are skipped.
    chunks = [sequence[start:start + chunk_size]
              for start in range(0, last_start + 1, stride)]

    # The regular grid reaches the end only if last_start is a multiple of stride.
    if keep_tail and last_start % stride != 0:
        chunks.append(sequence[last_start:])
    return chunks

In [None]:
# Expand every fold into chunk-level rows and write one CSV per fold.
# Each row of a fold is chunked with split_sequence, and all chunks derived
# from that row are written to that fold's file only.
split_dir = 'Dataset/biogrid_human'
os.makedirs(split_dir, exist_ok=True)  # to_csv does not create missing directories

for i, split_df in enumerate(split_dfs):
    split_rows = []  # One dict per chunk; turned into a DataFrame once at the end
    for _, row in split_df.iterrows():
        for seq in split_sequence(row['seq']):
            # Every chunk inherits the identifier and label of its source row
            split_rows.append({
                'Protein': row['Protein'],
                'seq': seq,
                'label': row['label']
            })

    # Explicit columns keep the schema (and the 'label' lookup below) valid
    # even if a fold produced no rows.
    new_split_df = pd.DataFrame(split_rows, columns=['Protein', 'seq', 'label'])
    new_split_df.to_csv(f'{split_dir}/split_{i+1}.csv', index=False)
    print(f"Split {i+1} saved.\n")  # "\n" (newline), not the literal "/n"
    label_counts = new_split_df['label'].value_counts()
    print("Count of Label 0:", label_counts.get(0, 0))
    print("Count of Label 1:", label_counts.get(1, 0))

Split 1 saved.

Count of Label 0: 2337
Count of Label 1: 1874
Split 2 saved.

Count of Label 0: 2270
Count of Label 1: 1957
Split 3 saved.

Count of Label 0: 2221
Count of Label 1: 1867
Split 4 saved.

Count of Label 0: 2266
Count of Label 1: 1777
Split 5 saved.

Count of Label 0: 2290
Count of Label 1: 1906
Split 6 saved.

Count of Label 0: 2444
Count of Label 1: 1950
Split 7 saved.

Count of Label 0: 2278
Count of Label 1: 2117
Split 8 saved.

Count of Label 0: 2379
Count of Label 1: 2114
Split 9 saved.

Count of Label 0: 2324
Count of Label 1: 2030
Split 10 saved.

Count of Label 0: 2317
Count of Label 1: 1902
Split 11 saved.

Count of Label 0: 2279
Count of Label 1: 1964


In [None]:
# Assemble the final dataset files from the per-fold CSVs:
# split 11 becomes the test set, and splits 1-10 each serve once as the
# validation set while the other nine are concatenated into the training set.
split_dir = 'Dataset/biogrid_human'
out_dir = f'{split_dir}/biogrid_human_dataset'
os.makedirs(out_dir, exist_ok=True)  # to_csv does not create missing directories

test_df = pd.read_csv(f'{split_dir}/split_11.csv')
test_path = f'{out_dir}/Biogrid_human.test.csv'
test_df.to_csv(test_path, index=False)
print(f"Test set saved as {test_path}")

# Read every cross-validation split once instead of re-reading it for each fold.
cv_folds = {k: pd.read_csv(f'{split_dir}/split_{k}.csv') for k in range(1, 11)}

# Loop through 10 folds, using each as validation once
for i in range(1, 11):
    valid_df = cv_folds[i]

    # Merge the remaining 9 folds into train_df
    train_dfs = [cv_folds[j] for j in range(1, 11) if j != i]
    train_df = pd.concat(train_dfs, ignore_index=True)

    # Save train and validation sets
    train_path = f'{out_dir}/Biogrid_human_{i}.train.csv'
    valid_path = f'{out_dir}/Biogrid_human_{i}.valid.csv'
    train_df.to_csv(train_path, index=False)
    valid_df.to_csv(valid_path, index=False)

    # Report the paths that were actually written
    print(f"Train and validation sets saved for fold {i}:")
    print(f"  Train -> {train_path} ({train_df.shape[0]} samples)")
    print(f"  Valid -> {valid_path} ({valid_df.shape[0]} samples)")

Test set saved as test.csv
Train and validation sets saved for fold 1:
  Train -> train_fold1.csv (38409 samples)
  Valid -> valid_fold1.csv (4211 samples)
Train and validation sets saved for fold 2:
  Train -> train_fold2.csv (38393 samples)
  Valid -> valid_fold2.csv (4227 samples)
Train and validation sets saved for fold 3:
  Train -> train_fold3.csv (38532 samples)
  Valid -> valid_fold3.csv (4088 samples)
Train and validation sets saved for fold 4:
  Train -> train_fold4.csv (38577 samples)
  Valid -> valid_fold4.csv (4043 samples)
Train and validation sets saved for fold 5:
  Train -> train_fold5.csv (38424 samples)
  Valid -> valid_fold5.csv (4196 samples)
Train and validation sets saved for fold 6:
  Train -> train_fold6.csv (38226 samples)
  Valid -> valid_fold6.csv (4394 samples)
Train and validation sets saved for fold 7:
  Train -> train_fold7.csv (38225 samples)
  Valid -> valid_fold7.csv (4395 samples)
Train and validation sets saved for fold 8:
  Train -> train_fold8.csv