<a href="https://colab.research.google.com/github/Aaron25m/Aaron_Projects/blob/Sequential-Clinical-Trials/Sequential_Clinical_Trial_STPA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
# Read the diabetes dataset from the working directory and preview its first two rows
data = pd.read_csv("diabetes.csv")
data.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [2]:
# Show the column labels of the loaded frame
data.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [3]:
# Work on the first 768 rows of the loaded frame.
# NOTE: `model` is a DataFrame slice of `data`, not a fitted estimator.
model = data.iloc[:768]

In [4]:
# Row count of the full dataset
data.shape[0]

768

In [5]:
# Row count of the modelling subset
model.shape[0]

768

In [6]:
# Count missing values in each column
model.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Separate the label column from the predictor columns
y = model["Outcome"]
X = model.drop("Outcome", axis=1)

# Hold out 40% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Build the Gaussian Naive Bayes classifier and train it in one step
# (fit returns the estimator itself)
gnb = GaussianNB().fit(X_train, y_train)

# Predict the held-out rows and compare against their true labels
y_pred = gnb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)


Accuracy: 0.7597402597402597


Traditional Clinical Trial

In [8]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Split the data into training and test sets (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Create a Gaussian Naive Bayes classifier and fit it to the training data
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# The "traditional" trial scores every held-out observation in one pass
y_pred = gnb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-observation class probabilities.
# The column labels assume the fitted classes are 0 and 1, in that order.
y_prob = gnb.predict_proba(X_test)

df = pd.DataFrame(y_prob, columns=["Class 0 Probability", "Class 1 Probability"])
df["Obs"] = range(1, len(df) + 1)

# Assign each observation to the class with the higher predicted probability
# (ties go to class 0). This is done as one vectorized comparison instead of a
# row-by-row .loc loop; the cast to float keeps the 0.0 / 1.0 values the loop
# version produced.
# NOTE: despite its name, "Actual Class" holds the model's predicted class,
# not the true label from y_test.
df["Actual Class"] = (
    df["Class 0 Probability"] < df["Class 1 Probability"]
).astype(float)

df = df.set_index("Obs")
print(df)

# Share of each assigned class, scaled from a fraction to "out of 10" so it can
# be read against the average counts of a 10-observation stage
class_counts = (df["Actual Class"].value_counts(normalize=True) * 10).round(decimals=1)
print(class_counts)


Accuracy: 0.7597402597402597
     Class 0 Probability  Class 1 Probability  Actual Class
Obs                                                        
1               0.718211             0.281789           0.0
2               0.939441             0.060559           0.0
3               0.970721             0.029279           0.0
4               0.808641             0.191359           0.0
5               0.473029             0.526971           1.0
..                   ...                  ...           ...
304             0.992963             0.007037           0.0
305             0.975498             0.024502           0.0
306             0.995971             0.004029           0.0
307             0.853746             0.146254           0.0
308             0.879324             0.120676           0.0

[308 rows x 3 columns]
0.0    6.5
1.0    3.5
Name: Actual Class, dtype: float64


Sequential Clinical Trial

In [10]:
import numpy as np

# Number of stages to run and number of observations drawn per stage
num_stages = 10
stage_size = 10

# Seeded generator so the random stage draws are identical on every run
rng = np.random.default_rng(42)

# Fit the classifier once up front: the training data does not change between
# stages, so re-fitting inside the loop would only repeat the same work
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Class labels learned during fitting; every stage's counts are aligned to this
# index so a class that is absent from a stage is counted as 0 rather than
# turning into NaN when the per-stage counts are summed
class_labels = list(gnb.classes_)

# Initialize a list to store the actual class value counts for each stage
actual_class_counts = []

for stage in range(num_stages):
    # Randomly select the stage's observations from the test set.
    # Sampling is without replacement inside a stage, but stages are drawn
    # independently, so the same observation may appear in more than one stage.
    random_indices = rng.choice(X_test.index, size=stage_size, replace=False)
    X_stage = X_test.loc[random_indices]
    y_stage = y_test.loc[random_indices]

    # Predict one observation at a time, as if patients arrived sequentially
    y_pred_list = [
        gnb.predict(X_stage.iloc[[i]])[0]
        for i in range(len(X_stage))
    ]

    # Calculate the percentage of each class based on the predicted values
    count_0_pred = y_pred_list.count(0)
    count_1_pred = y_pred_list.count(1)
    percent_0_pred = count_0_pred / len(y_pred_list) * 100
    percent_1_pred = count_1_pred / len(y_pred_list) * 100

    # Print the percentage of each class based on the predicted values
    print(f"Stage {stage+1}: Class 0 = {percent_0_pred:.2f}%, Class 1 = {percent_1_pred:.2f}%")

    # Add the actual (true-label) class value counts for this stage,
    # with every known class present in the index
    actual_class_counts.append(
        y_stage.value_counts().reindex(class_labels, fill_value=0)
    )

# Calculate the average actual class value counts across all stages
avg_class_counts = sum(actual_class_counts) / num_stages

# Print the average actual class value counts
print(f"\nAverage Class Value Counts Across All Stages:\n{avg_class_counts}")


Stage 1: Class 0 = 70.00%, Class 1 = 30.00%
Stage 2: Class 0 = 70.00%, Class 1 = 30.00%
Stage 3: Class 0 = 70.00%, Class 1 = 30.00%
Stage 4: Class 0 = 90.00%, Class 1 = 10.00%
Stage 5: Class 0 = 70.00%, Class 1 = 30.00%
Stage 6: Class 0 = 40.00%, Class 1 = 60.00%
Stage 7: Class 0 = 80.00%, Class 1 = 20.00%
Stage 8: Class 0 = 50.00%, Class 1 = 50.00%
Stage 9: Class 0 = 50.00%, Class 1 = 50.00%
Stage 10: Class 0 = 70.00%, Class 1 = 30.00%

Average Class Value Counts Across All Stages:
0    6.8
1    3.2
Name: Outcome, dtype: float64


As we can observe, the class ratios from the sequential and traditional trials are almost the same, even though the traditional trial used 308 patients whereas the sequential trial used only 100 patients.