In [297]:
# import numpy as np
# import pandas as pd

# # Set the seed for reproducibility
# np.random.seed(42)

# # Generate 1000 rows of random data for 30 columns
# random_data = np.random.rand(100000, 30)

# # Convert to a DataFrame for better visualization
# random_df = pd.DataFrame(random_data, columns=[f'Feature_{i+1}' for i in range(30)])

# # Show the first few rows of the generated data



In [298]:
# np.random.exponential(scale=1, size=(100000, 30))

In [299]:
# !pip install fetch_openml

In [300]:
# import numpy as np
# import pandas as pd
# from sklearn.datasets import load_breast_cancer
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler


# #Random generations for Adult
# # Generate data from a Poisson distribution with lambda=3
# # random_data = np.random.poisson(lam=3, size=(1000, 30))
# # random_data = np.random.exponential(scale=1, size=(1000, 30))
# random_data = np.abs(np.random.beta(a=2, b=5, size=(50000, 30)))
# # random_data = np.random.gamma(shape=2, scale=1, size=(1000, 30))
# # random_data = np.random.lognormal(mean=0, sigma=1, size=(1000, 30))

# scaler = StandardScaler()
# random_data = scaler.fit_transform(random_data)


# # Convert to DataFrame for better visualization
# random_df = pd.DataFrame(random_data, columns=[f'Feature_{i+1}' for i in range(30)])

# # Show the first few rows of the generated data
# print(random_df.head())

In [301]:
# random_df.columns

In [302]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler, MaxAbsScaler, QuantileTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import shuffle

# Set random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Hyperparameters
LEARNING_RATE = 0.001
EPOCHS = 100
BATCH_SIZE = 32
SYNTHETIC_SAMPLES = 1000000  # Total synthetic samples
CHUNK_SIZE = 100000
CLASS_PROPORTIONS = {0: 0.5, 1: 0.5}  # Equal distribution

# class VictimModel(nn.Module):
#     def __init__(self, input_dim, output_dim):
#         super(VictimModel, self).__init__()
#         self.model = nn.Sequential(
#             nn.Linear(input_dim, 128),
#             nn.ReLU(),
#             nn.Linear(128, 128),
#             nn.ReLU(),
#             nn.Linear(128, 128),
#             nn.ReLU(),
#             nn.Linear(128, 128),
#             nn.ReLU(),
#             nn.Linear(128, 128),
#             nn.ReLU(),
#             nn.Linear(128, 128),
#             nn.ReLU(),
#             nn.Linear(128, 128),
#             nn.ReLU(),
#             nn.Linear(128, output_dim)
#         )

#     def forward(self, x):
#         return self.model(x)

class VictimModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(VictimModel, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),      # Add Batch Normalization
            nn.ReLU(),
            nn.Dropout(0.3),          # Add Dropout

            nn.Linear(128, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(128, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(128, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),




            nn.Linear(128, output_dim)  # No Sigmoid here, as using BCEWithLogitsLoss
        )

    def forward(self, x):
        return self.model(x)






def load_and_preprocess_data():

    y = diabetes['Outcome']
    X = diabetes.drop(columns=['Outcome'], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=RANDOM_SEED)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)




    X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

    return X_train_tensor, y_train_tensor, X_test_tensor, y_test, scaler

def train_model(model, criterion, optimizer, X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE):
    for epoch in range(epochs):
        model.train()
        for i in range(0, len(X_train), batch_size):
            batch_X = X_train[i:i+batch_size]
            batch_y = y_train[i:i+batch_size]

            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

def evaluate_model(model, X_test, y_test):
    model.eval()
    with torch.no_grad():
        y_pred = torch.sigmoid(model(X_test)).numpy().flatten()
        y_pred_binary = (y_pred > 0.5).astype(int)

    accuracy = accuracy_score(y_test, y_pred_binary)
    precision = precision_score(y_test, y_pred_binary)
    recall = recall_score(y_test, y_pred_binary)
    f1 = f1_score(y_test, y_pred_binary)

    # print(f"Accuracy: {accuracy:.4f}")
    # print(f"Precision: {precision:.4f}")
    # print(f"Recall: {recall:.4f}")
    # print(f"F1 Score: {f1:.4f}")
    return y_pred_binary

def query_teacher_model(model, data):
    # st_scaler = StandardScaler()
    # data_scaled = st_scaler.fit_transform(data)
    data_scaled = data

    data_tensor = torch.tensor(data_scaled, dtype=torch.float32)
    with torch.no_grad():
        outputs = torch.sigmoid(model(data_tensor)).numpy().flatten()
        outputs = (outputs >= 0.97).astype(int)
    return outputs

In [303]:
model = VictimModel(8, 1)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [304]:
# from sklearn.datasets import fetch_openml
# from sklearn.preprocessing import LabelEncoder
# # Load the Adult dataset using fetch_openml
# adult_data = fetch_openml(name='adult', version=2, as_frame=True)

# # Extract features and target
# # X = adult_data.data
# # y = adult_data.target

# def encode_categorical_data(X, y=None):
#     # Create a copy of the dataframe to avoid modifying the original data
#     X_encoded = X.copy()

#     # Identify categorical columns
#     categorical_cols = X_encoded.select_dtypes(include=['category', 'object']).columns

#     # Apply LabelEncoder to each categorical column
#     for col in categorical_cols:
#         le = LabelEncoder()
#         X_encoded[col] = le.fit_transform(X_encoded[col])

#     # If target `y` is provided, encode it as well
#     y_encoded = None
#     if y is not None:
#         le_target = LabelEncoder()
#         y_encoded = le_target.fit_transform(y)

#     return X_encoded, y_encoded

# # Example usage:
# # adult, cancer, diabetes, arrhythmia = load_datasets()

# # Apply encoding to the features and target:
# adult_X_encoded, adult_y_encoded = encode_categorical_data(adult_data.data, adult_data.target)







In [305]:
diabetes = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/diabetes.csv')

y = diabetes['Outcome']
X = diabetes.drop(columns=['Outcome'], axis=1)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# # Standardize the features
# scaler = StandardScaler()
# X = scaler.fit_transform(X)
# # Convert the data to PyTorch tensors
# X = torch.tensor(X, dtype=torch.float32).to(device)
# y = torch.tensor(y, dtype=torch.float32).view(-1, 1).to(device)

In [306]:
X_train_tensor, y_train_tensor, X_test_tensor, y_test, scaler = load_and_preprocess_data()

# Pass the correct tensor to the train_model function (X_train_tensor)
train_model(model, criterion, optimizer, X_train_tensor, y_train_tensor, epochs=EPOCHS, batch_size=BATCH_SIZE)

Epoch [10/100], Loss: 0.1785
Epoch [20/100], Loss: 0.0748
Epoch [30/100], Loss: 0.1279
Epoch [40/100], Loss: 0.0685
Epoch [50/100], Loss: 0.0646
Epoch [60/100], Loss: 0.0272
Epoch [70/100], Loss: 0.0300
Epoch [80/100], Loss: 0.0448
Epoch [90/100], Loss: 0.0309
Epoch [100/100], Loss: 0.0123


In [307]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


#Random generations for Adult
# Generate data from a Poisson distribution with lambda=3
# random_data = np.random.poisson(lam=3, size=(50000, 14))
# random_data = np.random.exponential(scale=1, size=(50000, 14))
random_data = np.abs(np.random.beta(a=2, b=5, size=(5000, 8)))
# random_data = np.random.gamma(shape=2, scale=1, size=(50000, 14))
# random_data = np.random.lognormal(mean=0, sigma=1, size=(50000, 14) )

# scaler = StandardScaler()
# random_data = scaler.fit_transform(random_data)


# Convert to DataFrame for better visualization
random_df = pd.DataFrame(random_data, columns=[f'Feature_{i+1}' for i in range(8)])

# Show the first few rows of the generated data
print(random_df.head())

   Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  \
0   0.353677   0.248558   0.415959   0.159968   0.550283   0.110945   
1   0.198290   0.376237   0.543467   0.084764   0.357929   0.273321   
2   0.132790   0.326434   0.165810   0.207734   0.263408   0.390014   
3   0.203314   0.183877   0.367824   0.202097   0.186340   0.283520   
4   0.309638   0.044386   0.124579   0.211003   0.358189   0.652242   

   Feature_7  Feature_8  
0   0.509897   0.177270  
1   0.134376   0.353255  
2   0.153645   0.305502  
3   0.100399   0.207729  
4   0.162266   0.069424  


In [308]:
# random_df['output'] = list(query_teacher_model(model, random_df.values))  # Use all columns
random_df['output'] = list(query_teacher_model(model, random_df.values))

In [309]:
  # random_df.to_csv('/content/drive/MyDrive/Colab Notebooks/my_dataframe.csv', index=False)

In [310]:

features = list(X.columns)

features.append('output')


print (len(features))


random_df.columns = features

9


In [311]:
# Separate features (X) and target variable (y)
y = random_df['output']
X = random_df.drop('output', axis=1)


# Split the data into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)


In [312]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [313]:
rfr = RandomForestClassifier()
rfr.fit(X_train,y_train)

In [314]:
# data = load_breast_cancer()
# X = adult_X_encoded
# y = adult_y_encoded

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)




# X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
# y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
# X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

y = diabetes['Outcome']
X = diabetes.drop(columns=['Outcome'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Convert the data to PyTorch tensors
X = torch.tensor(X, dtype=torch.float32).to(device)
y = torch.tensor(y, dtype=torch.float32).view(-1, 1).to(device)

In [315]:
y_pred =rfr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.44805194805194803


In [316]:
# Evaluate fidelity
fidelity_outputs = evaluate_model(model, X_test_tensor, y_test)
print("teacher Accuracy:", accuracy_score(fidelity_outputs, y_test))
print(" surrogate Accuracy:",accuracy_score(y_pred, y_test))
print("Fidelity Accuracy:", accuracy_score(fidelity_outputs, y_pred))


teacher Accuracy: 0.7142857142857143
 surrogate Accuracy: 0.44805194805194803
Fidelity Accuracy: 0.461038961038961


In [317]:
evaluate_model(model, X_test_tensor, y_test)

array([0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [318]:
len(X.columns)

AttributeError: 'Tensor' object has no attribute 'columns'