In [3]:
import numpy as np
import pandas as pd
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from imblearn.over_sampling import ADASYN
# Load the training table and discard the 'Unnamed: 0' column in one step.
# NOTE(review): the preview output further down also lists an 'Unnamed: 0.1'
# column, which this drop does not remove - confirm whether it should stay.
data = pd.read_csv("Data/Train_data_tgt.csv").drop(columns=["Unnamed: 0"])


In [2]:
# Preview the first rows of the loaded training table.
data.head()

Unnamed: 0.1,Unnamed: 0,gene_id,Transcript_ID,Position,label,Base_seq,PC1,PC2,PC3,PC4,PC5,PC6
0,0,ENSG00000004059,ENST00000000233,244,0,AAGACCA,3.873912,-0.048746,0.091896,-0.033665,0.678716,0.374987
1,1,ENSG00000004059,ENST00000000233,261,0,CAAACTG,-1.306044,-0.849555,0.785109,-0.889804,0.136465,1.249942
2,2,ENSG00000004059,ENST00000000233,316,0,GAAACAG,-2.335634,0.196457,-0.352186,-1.326693,-0.490834,-0.612081
3,3,ENSG00000004059,ENST00000000233,332,0,AGAACAT,-1.071298,2.60222,1.207862,2.100595,0.161306,0.859797
4,4,ENSG00000004059,ENST00000000233,368,0,AGGACAA,2.403689,3.145058,0.492082,-1.602638,-0.330433,0.704648


# Random Undersampling 

In [94]:
def random_undersample(train_data, random_state=45):
    """Balance the two classes by randomly undersampling the larger one.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training table. Must contain a binary "label" column (0 = negative,
        1 = positive) and the identifier columns "gene_id", "Transcript_ID"
        and "Base_seq".
    random_state : int, default 45
        Seed passed to ``DataFrame.sample`` so the selection is reproducible.

    Returns
    -------
    pandas.DataFrame
        The identifier columns removed, negatives first and positives second,
        with the same number of rows in each class and a fresh 0..n-1 index.
        Empty if either class has no rows.
    """
    # Only the identifier columns are removed. "label" has to stay because the
    # class split below filters on it (dropping it here raised a KeyError).
    features = train_data.drop(columns=["gene_id", "Transcript_ID", "Base_seq"])

    negative_class = features[features["label"] == 0]
    positive_class = features[features["label"] == 1]

    # Both classes are cut to the size of the smaller one, so the function
    # also works when positives happen to outnumber negatives.
    n_per_class = min(len(negative_class), len(positive_class))
    sampled_negative = negative_class.sample(n=n_per_class, random_state=random_state)
    if len(positive_class) > n_per_class:
        positive_class = positive_class.sample(n=n_per_class, random_state=random_state)

    return pd.concat([sampled_negative, positive_class], axis=0, ignore_index=True)

# GAN sampling

In [95]:
def GANsampling(train_data, no_of_samples=10, epochs=50):
    """Augment the training features with CTGAN-generated positive rows.

    A CTGAN synthesizer is fitted on the feature columns plus "label"
    (identifier columns removed). ``no_of_samples`` synthetic rows are then
    drawn, and the ones whose "label" equals 1 are appended to the real rows.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training table. Must contain "label" and the identifier columns
        "gene_id", "Transcript_ID" and "Base_seq".
    no_of_samples : int, default 10
        Number of rows drawn from the fitted synthesizer. Rows sampled with a
        label other than 1 are discarded, so the number of positives actually
        added can be smaller than this value.
    epochs : int, default 50
        Number of training epochs passed to ``CTGANSynthesizer``.

    Returns
    -------
    pandas.DataFrame
        The real rows (without identifier columns) followed by the synthetic
        positive rows, with a fresh 0..n-1 index.
    """
    # "label" must stay in the frame: the metadata update, the synthesizer
    # training and the positive-row filter below all refer to it. Only the
    # identifier columns are removed.
    features = train_data.drop(columns=["gene_id", "Transcript_ID", "Base_seq"])

    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data=features)
    # Treat the 0/1 label as a category instead of whatever type was detected.
    metadata.update_column("label", sdtype="categorical")

    synthesizer = CTGANSynthesizer(
        metadata,
        enforce_rounding=False,
        epochs=epochs,
        verbose=False,
    )
    synthesizer.fit(features)

    synthetic_rows = synthesizer.sample(num_rows=no_of_samples)
    new_positive = synthetic_rows[synthetic_rows["label"] == 1]

    # Concatenate with the feature frame, not the raw input: the synthetic
    # rows have no identifier columns, so mixing them with the raw table
    # would leave those columns filled with NaN.
    return pd.concat([features, new_positive], axis=0, ignore_index=True)

# ADASYN oversampling

In [9]:
def adasyn_sample(train):
    """Oversample the uploaded PC training table with ADASYN.

    The identifier columns and "label" are stripped to form the feature
    matrix, "label" is used as the target, and both are resampled by an
    ADASYN model (sampling_strategy='auto', random_state=42).

    Returns the resampled features with the resampled label joined on as the
    last column.
    """
    non_feature_columns = ["label", "gene_id", "Transcript_ID", "Base_seq"]
    features = train.drop(columns=non_feature_columns)
    target = train["label"]

    resampler = ADASYN(sampling_strategy="auto", random_state=42)
    resampled_features, resampled_target = resampler.fit_resample(features, target)

    # Side-by-side join: feature columns first, label column after them.
    return pd.concat([resampled_features, resampled_target], axis=1)