# Week 12 Assignment: Deep Learning with Synthetic Data
The notebook replicates the R-based synthetic data generation approach using logistic regression coefficients from the Pima Indians Diabetes dataset. It then trains deep learning models with different architectures and evaluates their performance in terms of training error, validation error, and execution time.

## Step 1: Import Libraries

In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import statsmodels.api as sm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scipy.special import expit as sigmoid

## Step 2: Load and Fit Logistic Regression Model on Pima Data

In [3]:
df = pd.read_csv('pima-indians-diabetes.data.csv', header=None)
df.columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
              'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
df = df.dropna()
X_real = df.drop(columns='Outcome')
y_real = df['Outcome']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_real)
X_scaled_df = pd.DataFrame(X_scaled, columns=X_real.columns)
X_scaled_df['Intercept'] = 1
X_scaled_df = X_scaled_df[['Intercept'] + list(X_real.columns)]
model = sm.Logit(y_real, X_scaled_df).fit(disp=0)
coeffs = model.params

## Step 3: Generate Synthetic Datasets Using the Logistic Model

In [4]:
def generate_synthetic_dataset(size):
    sampled_X = X_real.sample(n=size, replace=True, random_state=42).reset_index(drop=True)
    X_scaled = scaler.transform(sampled_X)
    X_df = pd.DataFrame(X_scaled, columns=sampled_X.columns)
    X_df['Intercept'] = 1
    X_df = X_df[['Intercept'] + list(sampled_X.columns)]
    logits = np.dot(X_df, coeffs)
    probs = sigmoid(logits)
    y_syn = np.random.binomial(1, probs)
    return sampled_X, y_syn

## Step 4: Train Deep Learning Models and Record Results

In [5]:
# Define deep learning model with required configuration: 4 nodes per hidden layer
def train_deep_model(X, y, layers):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    model = Sequential()
    model.add(Dense(4, activation='relu', input_dim=X.shape[1]))
    if layers == 2:
        model.add(Dense(4, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy')
    start = time.time()
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
    end = time.time()
    train_preds = model.predict(X_train).flatten()
    val_preds = model.predict(X_val).flatten()
    return log_loss(y_train, train_preds), log_loss(y_val, val_preds), round(end - start, 2)

## Step 5: Run Experiments and Collect Results

In [6]:
results = []
for size in [1000, 10000, 100000]:
    X_syn, y_syn = generate_synthetic_dataset(size)
    for layer_count in [1, 2]:
        tr_err, val_err, exec_time = train_deep_model(X_syn, y_syn, layer_count)
        results.append({
            'Data Size': size,
            'Hidden Layers': layer_count,
            'Training Error': tr_err,
            'Validation Error': val_err,
            'Execution Time (s)': exec_time
        })
pd.DataFrame(results)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step  


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 979us/step
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


Unnamed: 0,Data Size,Hidden Layers,Training Error,Validation Error,Execution Time (s)
0,1000,1,3.868732,3.712242,2.72
1,1000,2,0.709113,0.783757,2.22
2,10000,1,0.652338,0.651322,6.16
3,10000,2,0.623352,0.631448,6.97
4,100000,1,0.480824,0.485211,42.65
5,100000,2,0.569406,0.572754,50.8


## Conclusion
The table above shows the model performance across different dataset sizes and deep learning configurations. These results can now be compared against XGBoost from Week 11 to determine the superior model for this task.