In [1]:
# Step 1: Install libraries
# %pip (instead of !pip) installs into the environment of the running kernel,
# so the packages are importable from this notebook without guessing which
# interpreter the subshell resolved.
# TODO: pin versions (pkg==x.y.z) once a known-good set is established.
%pip install -q streamlit pandas numpy matplotlib seaborn scikit-learn xgboost shap scikit-optimize pyngrok

In [2]:
# Step 2: Upload the dataset (uses the Colab-only `google.colab.files` helper).
# The Streamlit app written below reads 'Monkeypox_Dataset.csv' from the
# working directory as its default dataset.
from google.colab import files
uploaded = files.upload()  # Upload Monkeypox_Dataset.csv

Saving Monkeypox_Dataset.csv to Monkeypox_Dataset.csv


In [3]:
%%writefile app.py
# Step 3: Write the Streamlit app (the %%writefile cell magic must be the first line of the cell)
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, RocCurveDisplay
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, StackingClassifier
from xgboost import XGBClassifier
import shap
from skopt import BayesSearchCV
from sklearn.utils import resample
from sklearn.inspection import PartialDependenceDisplay
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Preprocessing function
def preprocess_data(df):
    """Clean the raw Monkeypox dataset and return a model-ready copy.

    The caller's frame is never modified; all work happens on a copy.

    Steps:
      1. Derive the binary target ``MPOX_Result`` (1 = positive) from
         ``'MPOX PCR Result'`` using a case/whitespace-insensitive match.
      2. Encode the yes/no symptom columns that are present as 0/1 integers.
         Anything that is not an affirmative token (including missing values)
         becomes 0.
      3. Drop identifier / unused columns that are present.
      4. Coerce ``Age`` to numeric, treat values outside (0, 120] as missing
         and impute them with the median of the valid ages.
      5. Impute any remaining gaps: median for numeric columns, mode otherwise.

    Args:
        df: Raw DataFrame. Must contain 'MPOX PCR Result' and 'Age'.

    Returns:
        A new, cleaned pandas DataFrame.

    Raises:
        KeyError: if 'MPOX PCR Result' or 'Age' is missing.
        ValueError: if ages need imputing but no valid age exists.
    """
    missing_required = [c for c in ('MPOX PCR Result', 'Age') if c not in df.columns]
    if missing_required:
        raise KeyError(f"Missing required column(s): {missing_required}")

    df = df.copy()  # never mutate the caller's frame

    # Same normalisation as the symptom columns below, so ' positive ' still counts.
    labels = df['MPOX PCR Result'].astype(str).str.strip().str.upper()
    df['MPOX_Result'] = (labels == 'POSITIVE').astype(int)

    binary_cols = ['Rectal Pain', 'Sore Throat', 'Penile Oedema', 'Oral Lesions',
                   'Solitary Lesion', 'Swollen Tonsils', 'HIV Infection',
                   'Health Insurance', 'Sexually Transmitted Infection']
    # '1.0' covers 0/1 columns that pandas read as float because of missing cells.
    affirmative_tokens = {'YES', '1', '1.0'}
    for col in binary_cols:
        if col in df.columns:
            tokens = df[col].astype(str).str.strip().str.upper()
            df[col] = tokens.isin(affirmative_tokens).astype(int)

    columns_drop = ['Month of Birth', 'White blood cells count', 'Red blood cells count',
                    'MPOX PCR Result', 'Home ownership', 'Systemic Illness', 'Test ID']
    new_df = df.drop(columns=[col for col in columns_drop if col in df.columns])

    # Any non-positive or implausibly large age is treated as missing, not just
    # the specific bad values seen in one version of the dataset.
    age = pd.to_numeric(new_df['Age'], errors='coerce')
    age = age.where((age > 0) & (age <= 120))
    if age.isna().any():
        median_age = age.median(skipna=True)
        if pd.isna(median_age):
            raise ValueError("Column 'Age' contains no valid values in the range (0, 120].")
        age = age.fillna(median_age)
    new_df['Age'] = age.astype(int)

    for col in new_df.columns[new_df.isnull().any()]:
        if pd.api.types.is_numeric_dtype(new_df[col]):
            fill_value = new_df[col].median()
        else:
            modes = new_df[col].mode()
            if modes.empty:
                continue  # entirely-missing column: nothing sensible to impute
            fill_value = modes.iloc[0]
        new_df[col] = new_df[col].fillna(fill_value)
    return new_df

# EDA page
def perform_eda(new_df):
    """Render the exploratory-data-analysis page in Streamlit.

    Always shows the frame summary, descriptive statistics, per-column missing
    counts and the first rows. Optional sections (unique values, target
    distribution, age histogram, correlation heatmap) are drawn only when the
    matching checkbox is ticked.

    Args:
        new_df: Preprocessed DataFrame. The optional plots read the
            'MPOX_Result' and 'Age' columns.
    """
    import io  # local import: only needed to capture DataFrame.info() output

    st.subheader("Exploratory Data Analysis")

    st.write("Dataset Info:")
    # DataFrame.info() prints its report and returns None, so writing its
    # return value would display "None". Capture the text in a buffer instead.
    info_buffer = io.StringIO()
    new_df.info(buf=info_buffer)
    st.text(info_buffer.getvalue())

    st.write("Descriptive Statistics:")
    st.write(new_df.describe())
    st.write("Missing Values per Column:")
    st.write(new_df.isnull().sum())
    st.write("First Few Rows:")
    st.write(new_df.head())

    if st.checkbox("Show Unique Values After Cleaning"):
        st.write("Unique values after cleaning:")
        for column in new_df.columns:
            st.write(f"{column}: {new_df[column].unique()}")

    if st.checkbox("Show MPOX Result Distribution"):
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.countplot(x='MPOX_Result', data=new_df, palette='Set2', ax=ax)
        ax.set_title('Distribution of MPOX Results')
        st.pyplot(fig)
        plt.close(fig)  # Streamlit reruns the script often; release the figure

    if st.checkbox("Show Age Distribution"):
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.histplot(new_df['Age'], kde=True, color='skyblue', ax=ax)
        ax.set_title('Age Distribution')
        st.pyplot(fig)
        plt.close(fig)

    if st.checkbox("Show Correlation Heatmap"):
        # Restrict to numeric columns: corr() raises on text columns in recent pandas.
        numeric_df = new_df.select_dtypes(include='number')
        fig, ax = plt.subplots(figsize=(12, 8))
        sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
        ax.set_title('Correlation Heatmap')
        st.pyplot(fig)
        plt.close(fig)

# Training and evaluation page
def train_and_evaluate_models(X, y):
    """Train a suite of classifiers, report their test accuracy and return the XGBoost model.

    The data is split 80/20 (fixed seed) and min-max scaled. The scaler is
    fitted on the training rows only, so test rows never influence the
    scaling parameters. Each base model, a soft-voting ensemble, a bagging
    classifier and a stacking classifier are fitted and their accuracy is
    written to the Streamlit page, followed by a comparison table and an
    optional bar chart.

    Args:
        X: Feature DataFrame.
        y: Binary target aligned with ``X``.

    Returns:
        tuple: ``(xgboost_model, scaler, X_scaled, X_train, X_test, y_train, y_test)``
        where ``X_scaled`` is the full feature matrix transformed with the
        training-fitted scaler and ``X_train`` / ``X_test`` are scaled arrays.
    """
    # Split BEFORE scaling: fitting the scaler on all rows leaks test-set
    # min/max values into training. The same seed yields the same row split.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)
    X_scaled = scaler.transform(X)

    models = {
        'Logistic Regression': LogisticRegression(),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'Naive Bayes': GaussianNB(),
        'SVM': SVC(kernel='rbf', probability=True, random_state=42),
        'KNN': KNeighborsClassifier(n_neighbors=5),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
        'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
        # use_label_encoder is deprecated/removed in current XGBoost releases.
        'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42)
    }

    results = []
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results.append({'Model': name, 'Accuracy': accuracy})
        st.write(f"{name} Accuracy: {accuracy:.4f}")
        st.write(f"{name} Classification Report:")
        # st.text keeps the fixed-width table layout; st.write would render it as Markdown.
        st.text(classification_report(y_test, y_pred))
        if st.checkbox(f"Show ROC Curve for {name}"):
            fig, ax = plt.subplots()
            RocCurveDisplay.from_estimator(model, X_test, y_test, ax=ax)
            ax.set_title(f'ROC Curve - {name}')
            st.pyplot(fig)
            plt.close(fig)  # avoid accumulating figures across Streamlit reruns

    # VotingClassifier clones and refits its estimators, so reusing the fitted ones is safe.
    ensemble = VotingClassifier(estimators=[('dt', models['Decision Tree']), ('svm', models['SVM'])], voting='soft')
    ensemble.fit(X_train, y_train)
    ensemble_pred = ensemble.predict(X_test)
    ensemble_accuracy = accuracy_score(y_test, ensemble_pred)
    results.append({'Model': 'Voting Ensemble', 'Accuracy': ensemble_accuracy})
    st.write(f"Ensemble Accuracy: {ensemble_accuracy:.4f}")

    # Seed the base trees so repeated runs report the same numbers.
    bagging_model = BaggingClassifier(estimator=DecisionTreeClassifier(random_state=42), n_estimators=50, random_state=42)
    bagging_model.fit(X_train, y_train)
    bagging_accuracy = bagging_model.score(X_test, y_test)
    results.append({'Model': 'Bagging', 'Accuracy': bagging_accuracy})
    st.write(f"Bagging Classifier Accuracy: {bagging_accuracy:.4f}")

    stacking_model = StackingClassifier(
        estimators=[('dt', DecisionTreeClassifier(random_state=42)), ('knn', KNeighborsClassifier())],
        final_estimator=LogisticRegression())
    stacking_model.fit(X_train, y_train)
    stacking_accuracy = stacking_model.score(X_test, y_test)
    results.append({'Model': 'Stacking', 'Accuracy': stacking_accuracy})
    st.write(f"Stacking Model Accuracy: {stacking_accuracy:.4f}")

    results_df = pd.DataFrame(results)
    st.write("Model Comparison:")
    st.write(results_df.sort_values(by='Accuracy', ascending=False))
    if st.checkbox("Show Model Comparison Plot"):
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(x='Accuracy', y='Model', data=results_df, palette='coolwarm', ax=ax)
        ax.set_title('Model Comparison')
        st.pyplot(fig)
        plt.close(fig)
    return models['XGBoost'], scaler, X_scaled, X_train, X_test, y_train, y_test

# Advanced analysis page
def _compute_advanced_metrics(model, X_scaled, X_train, y_train, X_test, y_test, y):
    """Run the expensive searches, cross-validation and bootstrap once and return the results as a dict."""
    nb_search = GridSearchCV(GaussianNB(), {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]}, cv=5)
    nb_search.fit(X_train, y_train)

    gb_grid = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 4, 5]}
    gb_search = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_grid, cv=5, scoring='accuracy')
    gb_search.fit(X_train, y_train)
    gb_test_accuracy = accuracy_score(y_test, gb_search.best_estimator_.predict(X_test))

    xgb_space = {'n_estimators': (50, 300), 'learning_rate': (0.01, 0.2), 'max_depth': (3, 10), 'colsample_bytree': (0.3, 1.0)}
    # use_label_encoder is deprecated/removed in current XGBoost; seed the estimator for repeatability.
    bayes_search = BayesSearchCV(XGBClassifier(eval_metric='logloss', random_state=42),
                                 xgb_space, cv=5, n_iter=30, random_state=42)
    bayes_search.fit(X_train, y_train)

    cv_scores = cross_val_score(GradientBoostingClassifier(random_state=42), X_scaled, y, cv=10, scoring='accuracy')

    # Predict once, then bootstrap the per-row hit/miss vector. resample() draws
    # the same indices for a given seed, so each accuracy matches what
    # resampling (X_test, y_test) and re-predicting would give.
    hits = np.asarray(model.predict(X_test)) == np.asarray(y_test)
    bootstrap_accuracies = [resample(hits, replace=True, random_state=seed).mean() for seed in range(1000)]
    ci_lower, ci_upper = np.percentile(bootstrap_accuracies, [2.5, 97.5])

    return {
        'nb_best_params': nb_search.best_params_,
        'gb_best_params': gb_search.best_params_,
        'gb_test_accuracy': gb_test_accuracy,
        'bayes_best_params': dict(bayes_search.best_params_),
        'bayes_best_score': bayes_search.best_score_,
        'cv_scores': cv_scores,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper,
    }


def advanced_analysis(model, X_scaled, X_train, y_train, X_test, y_test, y, feature_cols):
    """Render hyperparameter-search results, CV statistics and model-explanation plots.

    The numeric results (grid/Bayesian searches, 10-fold CV, bootstrap
    confidence interval) are computed once per trained model and cached in
    ``st.session_state``. Streamlit reruns the script on every widget
    interaction, and repeating several hundred model fits for each checkbox
    toggle would make the page unusable.

    Args:
        model: Fitted tree-based classifier exposing ``feature_importances_``
            (the XGBoost model returned by training).
        X_scaled: Scaled full feature matrix.
        X_train, y_train: Scaled training split.
        X_test, y_test: Scaled test split.
        y: Full target vector aligned with ``X_scaled``.
        feature_cols: Feature names, in column order.
    """
    cached = st.session_state.get('advanced_metrics')
    # Identity check on the model invalidates the cache after a retrain.
    if cached is None or cached['model'] is not model:
        with st.spinner("Running hyperparameter searches, cross-validation and bootstrap..."):
            metrics = _compute_advanced_metrics(model, X_scaled, X_train, y_train, X_test, y_test, y)
        cached = {'model': model, 'metrics': metrics}
        st.session_state['advanced_metrics'] = cached
    metrics = cached['metrics']

    st.write("Best Parameters for Naïve Bayes:", metrics['nb_best_params'])
    st.write("Best Parameters for Gradient Boosting:", metrics['gb_best_params'])
    st.write("Best Gradient Boosting Accuracy:", metrics['gb_test_accuracy'])
    st.write("Best Parameters from Bayesian Search:", metrics['bayes_best_params'])
    st.write("Best Cross-Validation Score:", metrics['bayes_best_score'])
    cv_scores = metrics['cv_scores']
    st.write(f"Cross-Validation Scores: {cv_scores}")
    st.write(f"Mean CV Accuracy: {np.mean(cv_scores):.4f}")
    st.write(f"Std CV Accuracy: {np.std(cv_scores):.4f}")
    st.write(f"95% Confidence Interval for XGBoost Accuracy: [{metrics['ci_lower']:.4f}, {metrics['ci_upper']:.4f}]")

    if st.checkbox("Show SHAP Summary"):
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test)
        fig = plt.figure()  # summary_plot draws on the current figure
        shap.summary_plot(shap_values, X_test, feature_names=feature_cols, show=False)
        st.pyplot(fig)
        plt.close(fig)

    if st.checkbox("Show Feature Importance (XGBoost)"):
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(x=model.feature_importances_, y=feature_cols, palette="magma", ax=ax)
        ax.set_title('Feature Importance - XGBoost')
        st.pyplot(fig)
        plt.close(fig)

    if st.checkbox("Show Outlier Detection"):
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.boxplot(data=pd.DataFrame(X_scaled, columns=feature_cols), palette="Set2", ax=ax)
        ax.tick_params(axis='x', rotation=45)
        ax.set_title("Outlier Detection in Features")
        st.pyplot(fig)
        plt.close(fig)

    if st.checkbox("Show Feature Interactions"):
        grid = sns.pairplot(pd.DataFrame(X_scaled, columns=feature_cols), diag_kind='kde', corner=True)
        grid.figure.suptitle('Pairplot of Feature Interactions', y=1.02)
        # Pass the actual Figure, not the pyplot module.
        st.pyplot(grid.figure)
        plt.close(grid.figure)

    if st.checkbox("Show Partial Dependence Plots"):
        # Plot the three most important features so the title is accurate.
        top_features = [int(i) for i in np.argsort(model.feature_importances_)[::-1][:3]]
        pdp = PartialDependenceDisplay.from_estimator(model, X_test, features=top_features,
                                                      feature_names=feature_cols)
        pdp.figure_.suptitle("Partial Dependence Plots for Top Features", y=1.02)
        st.pyplot(pdp.figure_)
        plt.close(pdp.figure_)

# Prediction helper
def predict_mpox(model, scaler, features):
    """Classify one patient record and return the result as a label.

    Args:
        model: Fitted classifier exposing ``predict``.
        scaler: Fitted scaler exposing ``transform``.
        features: Flat sequence of feature values for a single record, in the
            column order the scaler was fitted with.

    Returns:
        "Positive" when the predicted class equals 1, otherwise "Negative".
    """
    single_row = [features]  # transform/predict expect a 2-D batch
    predicted_class = model.predict(scaler.transform(single_row))[0]
    if predicted_class == 1:
        return "Positive"
    return "Negative"

# Main entry point
def main():
    """Streamlit entry point: load the dataset, preprocess it and route to the selected page.

    Data comes from an optional user upload, falling back to
    'Monkeypox_Dataset.csv' in the working directory. Problems with the
    dataset (unreadable CSV, missing columns, preprocessing failures) are
    reported on the page instead of surfacing as a traceback. Trained
    artefacts are kept in ``st.session_state`` so that the "Advanced Analysis"
    and "Prediction" pages can use them after "Model Training" has run.
    """
    st.title("Monkeypox Prediction and Analysis App")
    st.write("Using pre-uploaded 'Monkeypox_Dataset.csv' by default. Optionally upload your own dataset.")
    uploaded_file = st.file_uploader("Upload your own Monkeypox dataset (optional, CSV format)", type="csv")
    if uploaded_file is not None:
        try:
            data_frame = pd.read_csv(uploaded_file)
        except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
            st.error(f"Could not read the uploaded CSV: {exc}")
            return
        st.success("Custom dataset uploaded successfully!")
    else:
        try:
            data_frame = pd.read_csv('Monkeypox_Dataset.csv')
            st.info("Using default dataset 'Monkeypox_Dataset.csv'.")
        except FileNotFoundError:
            st.error("Default dataset 'Monkeypox_Dataset.csv' not found. Please upload a dataset to proceed.")
            return

    feature_cols = ['Rectal Pain', 'Sore Throat', 'Penile Oedema', 'Oral Lesions', 'Solitary Lesion',
                    'Swollen Tonsils', 'HIV Infection', 'Age', 'Health Insurance', 'Sexually Transmitted Infection']

    # Fail with a readable message rather than a KeyError when a custom upload
    # does not follow the expected schema.
    required_cols = feature_cols + ['MPOX PCR Result']
    missing_cols = [col for col in required_cols if col not in data_frame.columns]
    if missing_cols:
        st.error(f"The dataset is missing required column(s): {', '.join(missing_cols)}")
        return

    try:
        new_data_frame = preprocess_data(data_frame)
    except (KeyError, ValueError) as exc:
        st.error(f"Could not preprocess the dataset: {exc}")
        return

    X = new_data_frame[feature_cols]
    y = new_data_frame['MPOX_Result']

    st.sidebar.title("Navigation")
    page = st.sidebar.radio("Go to", ["EDA", "Model Training", "Advanced Analysis", "Prediction"])

    if page == "EDA":
        perform_eda(new_data_frame)
    elif page == "Model Training":
        st.subheader("Model Training and Evaluation")
        model, scaler, X_scaled, X_train, X_test, y_train, y_test = train_and_evaluate_models(X, y)
        st.session_state['model'] = model
        st.session_state['scaler'] = scaler
        st.session_state['X_scaled'] = X_scaled
        st.session_state['X_train'] = X_train
        st.session_state['X_test'] = X_test
        st.session_state['y_train'] = y_train
        st.session_state['y_test'] = y_test
    elif page == "Advanced Analysis":
        if 'model' in st.session_state:
            advanced_analysis(st.session_state['model'], st.session_state['X_scaled'],
                              st.session_state['X_train'], st.session_state['y_train'],
                              st.session_state['X_test'], st.session_state['y_test'],
                              y, feature_cols)
        else:
            st.warning("Please train the model first in the 'Model Training' section.")
    elif page == "Prediction":
        if 'model' in st.session_state and 'scaler' in st.session_state:
            st.subheader("Predict Monkeypox Result")
            input_features = []
            for col in feature_cols:
                series = new_data_frame[col]
                if set(series.dropna().unique()) <= {0, 1}:
                    # Binary symptom flag: offer a Yes/No choice rather than a
                    # free-form number that would allow values such as 0.37.
                    choice = st.selectbox(f"{col}", options=["No", "Yes"])
                    value = 1 if choice == "Yes" else 0
                elif pd.api.types.is_integer_dtype(series):
                    # Works for every integer width (int32 on some platforms),
                    # unlike comparing the dtype against 'int64'.
                    value = st.number_input(f"{col}", min_value=int(series.min()),
                                            max_value=int(series.max()),
                                            value=int(series.median()), step=1)
                elif pd.api.types.is_numeric_dtype(series):
                    value = st.number_input(f"{col}", min_value=float(series.min()),
                                            max_value=float(series.max()),
                                            value=float(series.median()))
                else:
                    value = st.selectbox(f"{col}", options=series.unique())
                    value = 1 if value in ['YES', 1] else 0
                input_features.append(value)
            if st.button("Predict"):
                prediction = predict_mpox(st.session_state['model'], st.session_state['scaler'], input_features)
                st.write(f"Predicted MPOX Result: **{prediction}**")
        else:
            st.warning("Please train the model first in the 'Model Training' section.")

if __name__ == "__main__":
    main()


Overwriting app.py


In [4]:
import os
from getpass import getpass

from pyngrok import ngrok

STREAMLIT_PORT = 8501

# Never hard-code the ngrok auth token in the notebook: anything committed or
# shared leaks the credential. Read it from the environment, or prompt for it.
# NOTE: a token that was previously embedded in this cell should be revoked
# in the ngrok dashboard.
ngrok_token = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_token)

# Close any tunnel left over from a previous run of this cell so re-running
# does not stack tunnels.
ngrok.kill()

# Start ngrok tunnel for Streamlit
public_url = ngrok.connect(STREAMLIT_PORT)
print("Streamlit URL:", public_url)

# Start Streamlit in the background; its output goes to /content/logs.txt
!streamlit run app.py --server.port {STREAMLIT_PORT} &>/content/logs.txt &

Streamlit URL: NgrokTunnel: "https://5b09-34-168-160-215.ngrok-free.app" -> "http://localhost:8501"


In [5]:
# Step 5: Check logs
# /content/logs.txt is the file the background `streamlit run` command
# redirects its stdout/stderr into; look here first if the app URL does not load.
!cat /content/logs.txt