In [91]:
import numpy as np
import pandas as pd
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from imblearn.over_sampling import ADASYN
# Load the preprocessed table and discard the "Unnamed: 0" column
# (presumably a row index written out by an earlier to_csv -- TODO confirm).
data = (
    pd.read_csv("Processed_data.csv")
    .drop(columns=["Unnamed: 0"])
)


# Train/test split

In [92]:
# Distinct gene identifiers in the dataset; built from a set, so the order is arbitrary.
gene_ids = list(set(data["gene_id"]))
#gene_details = data.groupby("gene_id").label.sum().sort_values(ascending = False)
#gene_details = gene_details.to_frame()
#gene_details = gene_details.reset_index()

In [93]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

# Random Undersampling 

In [94]:
def random_undersample(train_data, random_state=45):
    """Balance a binary-labelled frame by randomly undersampling the larger class.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training rows with a ``label`` column whose values are 0 (negative)
        or 1 (positive).
    random_state : int, default 45
        Seed forwarded to ``DataFrame.sample`` so the selection is repeatable.

    Returns
    -------
    pandas.DataFrame
        The sampled negative rows followed by the positive rows, with an equal
        number of each. Rows keep their original index labels.
    """
    negative_class = train_data[train_data["label"] == 0]
    positive_class = train_data[train_data["label"] == 1]

    # Size both classes to the smaller one. Previously the negatives were
    # assumed to be the majority, and sample() raised ValueError otherwise.
    target_size = min(len(negative_class), len(positive_class))

    # Negatives always go through sample() so that the usual
    # negatives-are-the-majority case yields exactly the same rows as before.
    sampled_negative = negative_class.sample(n=target_size, random_state=random_state)
    if len(positive_class) > target_size:
        positive_class = positive_class.sample(n=target_size, random_state=random_state)

    return pd.concat([sampled_negative, positive_class])

# GAN sampling

In [95]:
def GANsampling(train_data, no_of_samples=10):
    """Oversample the positive class with rows generated by a CTGAN synthesizer.

    A CTGAN model is fitted on the whole of ``train_data``, ``no_of_samples``
    synthetic rows are drawn from it, and only the synthetic rows whose
    ``label`` equals 1 are appended to the training data. Because of that
    filter, the number of rows actually added can be smaller than
    ``no_of_samples``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training rows including a ``label`` column.
    no_of_samples : int, default 10
        Number of synthetic rows to draw before filtering for positives.

    Returns
    -------
    pandas.DataFrame
        ``train_data`` followed by the synthetic positive rows.
    """
    table_metadata = SingleTableMetadata()
    table_metadata.detect_from_dataframe(data=train_data)
    # Override the auto-detected type so the label is treated as a category.
    table_metadata.update_column("label", sdtype="categorical")

    synthesizer = CTGANSynthesizer(
        table_metadata,
        enforce_rounding=False,
        epochs=20,
        verbose=False,
    )
    synthesizer.fit(train_data)

    synthetic_rows = synthesizer.sample(num_rows=no_of_samples)
    synthetic_positives = synthetic_rows[synthetic_rows["label"] == 1]
    return pd.concat([train_data, synthetic_positives])

# ADASYN sampling

In [98]:
def adasyn_sample(train_data):
    """Oversample ``train_data`` with ADASYN and return the resampled frame.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training rows including a ``label`` column; every other column is
        passed to ADASYN as a feature.

    Returns
    -------
    pandas.DataFrame
        The resampled feature columns with the resampled ``label`` column
        appended as the last column.
    """
    # Read from the argument, not the notebook-level `train` frame: the
    # previous body ignored `train_data` and depended on hidden kernel state.
    features = train_data.drop(columns=["label"])
    labels = train_data["label"]

    adasyn_model = ADASYN(sampling_strategy="auto", random_state=42)
    features_resampled, labels_resampled = adasyn_model.fit_resample(features, labels)

    return pd.concat([features_resampled, labels_resampled], axis=1)


In [100]:
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.metrics import accuracy_score
#balanced_sample = adasyn_sample(train)

#X_train = train_data[["Position","transcript_position",'mean1', 'mean2', 'mean3', 'mean4', 'mean5', 'mean6', 'mean7', 'mean8', 'mean9', 'sd1', 'sd2', 'sd3', 'sd4', 'sd5', 'sd6', 'sd7', 'sd8', 'sd9']]
##Y_train = train_data['label']
#X_test = test_data[['mean1', 'mean2', 'mean3', 'mean4', 'mean5', 'mean6', 'mean7', 'mean8', 'mean9', 'sd1', 'sd2', 'sd3', 'sd4', 'sd5', 'sd6', 'sd7', 'sd8', 'sd9']]
#Y_test = test_data['label']