In [None]:
import h2o
# Initialise H2O for this session; nthreads=40 caps it at 40 CPUs.
h2o.init(
    nthreads=40,
)

In [21]:
from h2o.automl import H2OAutoML

In [78]:
import numpy as np
import pandas as pd
import os

In [None]:
# Switch the kernel's working directory; the CSV files below are referenced by
# relative path and resolve against this directory.
# NOTE(review): absolute, user-specific path — adjust when running elsewhere.
%cd '/opt/home/buckcenter.org/fwu/PICseq'

In [None]:
# Show the working directory that the relative data paths resolve against.
# A single print is enough: a line magic that is not the last expression of
# the cell has its value discarded, so it displays nothing.
print(os.getcwd())

In [None]:
# Read the combined training table into an H2OFrame.
count_matrix = h2o.import_file(path="TB_combined_data.csv")

In [None]:
# Preview the first rows of the imported frame (10 is the default row count).
count_matrix.head(rows=10)

In [33]:
# Remove the "C1" column before modelling.
# NOTE(review): presumably the unnamed row-identifier column of the CSV,
# auto-named "C1" on import — confirm against the file.
# The membership check makes the cell safe to re-run: once "C1" is gone a
# second drop would fail because the column no longer exists.
if "C1" in count_matrix.columns:
    count_matrix = count_matrix.drop("C1")

In [34]:
# Split the rows into a training part (ratio 0.8) and a held-out test part;
# the fixed seed keeps the split repeatable.
train, test = count_matrix.split_frame(
    ratios=[0.8],
    seed=42,
)

In [None]:
# Preview the first rows of the training split (10 is the default row count).
train.head(rows=10)

In [36]:
# Response column name.
y = "AD_label"

# Predictors: every column of the training frame except the response.
x = train.columns
x.remove(y)

In [37]:
# Classification needs a categorical (factor) response, so cast the label
# column in both splits.
test[y] = test[y].asfactor()
train[y] = train[y].asfactor()

In [38]:
# Sanity check on the conversion above: show the column type of the response
# in the test split.
test[y].types

{'AD_label': 'enum'}

In [40]:
# AutoML search configuration:
#   - at most 5 models, restricted to the listed algorithm families
#   - overall budget 72000 s (20 h), per-model budget 36000 s (10 h)
#   - models are exported to the project directory as checkpoints
# NOTE(review): absolute, user-specific export path — adjust when running elsewhere.
aml = H2OAutoML(
    max_models=5,
    seed=10,
    verbosity='info',
    max_runtime_secs=72000,
    max_runtime_secs_per_model=36000,
    include_algos=["XGBoost", "GLM", "DRF", "StackedEnsemble"],
    export_checkpoints_dir="/opt/home/buckcenter.org/fwu/PICseq",
)

In [None]:
# Run the AutoML search on the training split only; the test split stays
# untouched for the evaluation cells further down.
aml.train(
    x=x,
    y=y,
    training_frame=train,
)

In [None]:
# AutoML leaderboard, shown in full: the row count is passed explicitly
# because head() would otherwise stop at its default of 10 rows.
lb = aml.leaderboard
lb.head(lb.nrows)

In [None]:
# Leaderboard with every optional column added. "ALL" is the spelling used in
# the H2O documentation for this option.
lb_all = h2o.automl.get_leaderboard(aml, extra_columns="ALL")
lb_all

In [None]:
# Display the leader model held by the AutoML object.
aml.leader

In [None]:
# Produce the leader model's explanation output, evaluated on the held-out test split.
aml.leader.explain(frame=test)

In [55]:
# Reload the saved best model from an earlier training run; the file is named
# 'StackedEnsemble_AllModels_1_AutoML_2_20240408_152426'.
# NOTE(review): this rebinds `aml` from the H2OAutoML object to the loaded
# model, so every later use of `aml` refers to this saved model rather than
# to the search run above.
aml = h2o.load_model(
    "/opt/home/buckcenter.org/fwu/PICseq/StackedEnsemble_AllModels_1_AutoML_2_20240408_152426"
)

In [None]:
# Score the held-out test split with whatever `aml` currently refers to;
# the result feeds the accuracy checks below.
preds = aml.predict(test_data=test)

In [None]:
# Preview the first rows of the prediction frame (10 is the default row count).
preds.head(rows=10)

In [None]:
# Bring the predictions into pandas for local post-processing
# (h2o.as_list is the same conversion used for the other frames in this notebook).
preds_df = h2o.as_list(preds)

# Print the converted table.
print(preds_df)

In [58]:
# Save the raw test-set predictions; the row index is written as the first
# (unnamed) CSV column, which is pandas' default.
preds_df.to_csv("h2o_pred_test.csv", index=True)

In [None]:
# Step 1: confidence of each prediction = the largest class probability in its row.
# Column 0 ('predict') holds the predicted label, so it is skipped.
max_probs = preds_df.iloc[:, 1:].max(axis=1)

# Step 2: relabel predictions whose confidence is below the cut-off as "unknown".
threshold = 0.5

# Vectorised relabelling. The column is cast to object first so it can hold the
# "unknown" string next to the original labels without changing their type;
# a row-wise DataFrame.apply would hand back integer labels as floats, because
# each row is materialised as a single-dtype Series alongside the float
# probability columns.
preds_df['predict'] = (
    preds_df['predict']
    .astype(object)
    .mask(max_probs < threshold, "unknown")
)

# preds_df now carries the adjusted labels, with 'unknown' below the threshold.
print(preds_df[['predict']])

In [None]:
# True labels of the held-out rows, converted to pandas.
test_df = h2o.as_list(test[y])

# Give the labels the same index as the predictions so rows pair up by position.
test_df.index = preds_df.index

# Keep only the predictions that were not relabelled 'unknown', together with
# the true labels of exactly those rows.
filtered_preds_df = preds_df[preds_df['predict'] != 'unknown']
true_labels_filtered = test_df.loc[filtered_preds_df.index]

# Report how much of the test set the accuracy figure is based on.
print(f"Retained {len(filtered_preds_df)} of {len(preds_df)} predictions after thresholding")

# Number of retained predictions that match the true label.
correct_predictions = (filtered_preds_df['predict'] == true_labels_filtered[y]).sum()

if len(filtered_preds_df) == 0:
    # Nothing survived the threshold: accuracy is undefined, so say so rather
    # than dividing by zero.
    accuracy = float("nan")
    print("Accuracy after ignoring low-confidence predictions: undefined (no predictions retained)")
else:
    accuracy = correct_predictions / len(filtered_preds_df)
    print(f"Accuracy after ignoring low-confidence predictions: {accuracy:.4f}")


In [None]:
########## prediction for real PICs ##########
# Read the table of real PICs to be classified into an H2OFrame.
count_matrix_PICs = h2o.import_file(path="PICs_pred.csv")

In [None]:
# Preview the first rows of the PICs frame (10 is the default row count).
count_matrix_PICs.head(rows=10)

In [63]:
# Step 1: build the prediction input by removing the identifier column 'C1'
# (attached to the results as 'Cell_ID' further down); the original frame is
# kept so the identifiers stay available.
test_h2o = count_matrix_PICs.drop("C1", axis=1)

In [None]:
# Step 2: score the PICs with the model currently bound to `aml`.
preds_PICs = aml.predict(test_data=test_h2o)

In [None]:
# Step 3: bring the PIC predictions into pandas for local post-processing.
preds_PICs_df = preds_PICs.as_data_frame()

In [None]:
# Preview the first rows of the PIC predictions. The call parentheses matter:
# without them the cell shows the bound method instead of the preview.
preds_PICs_df.head()

In [None]:
# Pull the 'C1' identifier column out of the H2O frame into pandas.
# NOTE: despite the variable name, h2o.as_list returns a one-column DataFrame.
cell_id_series = h2o.as_list(count_matrix_PICs['C1'])

# Put identifiers and predictions side by side; both indexes are reset so the
# rows pair up by position.
final_df = pd.concat(
    [
        cell_id_series.reset_index(drop=True),
        preds_PICs_df.reset_index(drop=True),
    ],
    axis=1,
)

# Name the identifier column 'Cell_ID' and keep the prediction column names.
final_df.columns = ['Cell_ID', *preds_PICs_df.columns]

# final_df now pairs each 'Cell_ID' with its prediction row.
print(final_df.head())

In [None]:
# Predictions whose best class probability is below this cut-off become "unknown".
threshold = 0.5

# preds_PICs_df comes straight from the model output: its first column is the
# predicted label ('predict'), not 'Cell_ID', so iloc[:, 1:] keeps the
# remaining (probability) columns.
max_probs = preds_PICs_df.iloc[:, 1:].max(axis=1)

# 'pred' = name of the probability column with the largest value in each row,
# replaced by "unknown" where that largest value is under the threshold.
# Done as one vectorised mask rather than a per-row apply.
# NOTE(review): 'pred' holds column names of the probability columns; these may
# differ in spelling from the labels in the 'predict' column — confirm before
# comparing the two.
final_df['pred'] = (
    preds_PICs_df.iloc[:, 1:]
    .idxmax(axis=1)
    .mask(max_probs < threshold, "unknown")
)

# final_df now has the extra 'pred' column with the adjusted predictions.
print(final_df.head())

In [73]:
# Save the PIC predictions; the row index is written as the first (unnamed)
# CSV column, which is pandas' default.
final_df.to_csv("PICs_predict_v1.csv", index=True)

In [None]:
# save the whole environment
import pickle

def save_all_objects(filename="/data/array2/fwu/PICs/saved_session_022624.pkl", namespace=None):
    """Pickle every picklable object of a namespace into one file.

    Parameters
    ----------
    filename : str
        Destination of the pickle file. Missing parent directories are
        created so the save does not fail after all the work is done.
    namespace : dict, optional
        Mapping of name -> object to save. When omitted, a snapshot of this
        module's globals is used (the top-level variables of the notebook
        when the function is defined in a notebook cell).

    Returns
    -------
    None

    Notes
    -----
    Saving is best-effort: each object is trial-pickled first, and any object
    that raises is skipped and reported on stdout instead of aborting the
    whole save. The file written is a single pickled dict of the surviving
    name -> object pairs.
    """
    import os  # local import keeps the function usable on its own

    if namespace is None:
        candidates = globals().copy()  # snapshot, so iteration is not disturbed
    else:
        candidates = dict(namespace)

    serializable_objects = {}
    for name, obj in candidates.items():
        try:
            pickle.dumps(obj)
        except Exception as e:  # deliberately broad: any failure means "skip this one"
            print(f"Skipping {name}: {e}")
        else:
            serializable_objects[name] = obj

    # Create the target directory if needed (no-op for a bare file name).
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'wb') as file:
        pickle.dump(serializable_objects, file)

    # Reported only after the file has been closed.
    print(f"Saved session to {filename}")

# Persist the picklable notebook variables; the destination is spelled out
# here and is the same path as the function's default.
save_all_objects(filename="/data/array2/fwu/PICs/saved_session_022624.pkl")

In [None]:
# Stop the H2O cluster without asking for confirmation. The cluster-object
# method is used because the module-level h2o.shutdown() is deprecated.
h2o.cluster().shutdown(prompt=False)