In [16]:
!pip install ipywidgets

Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi
Successfully installed jedi-0.19.1


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the dataset
file_path = '/content/diabetic_data.csv'  # Make sure the file is uploaded in Colab
data = pd.read_csv(file_path)

# Handle missing values.
# The '?' placeholder is turned into a real missing value, filled from the
# previous row, and then back-filled so that rows at the very top of the file
# (which have no previous row to copy from) are not left missing.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
data = data.replace('?', pd.NA).ffill().bfill()

# Keep a copy of the original data for display purposes
# (taken before encoding, so categorical columns still hold their labels).
original_data = data.copy()

# Encode categorical variables.
# One fitted encoder per object column is stored so later cells can apply the
# same label -> integer mapping to subsets of `original_data`.
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Split the data into features and target
X = data.drop(columns=['readmitted'])
y = data['readmitted']

# Split into training and testing sets (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [34]:
import ipywidgets as widgets
from IPython.display import display

def _sorted_options(column):
    """Return the sorted distinct values of ``original_data[column]``.

    Missing values are dropped first: they cannot be ordered against strings
    (``sorted`` would raise ``TypeError``) and could never match an equality
    filter anyway, so they are not useful as dropdown choices.
    """
    return sorted(original_data[column].dropna().unique())


# Get unique values from the original dataset
unique_ages = _sorted_options('age')
unique_races = _sorted_options('race')
unique_genders = _sorted_options('gender')
unique_admission_types = _sorted_options('admission_type_id')

# Create widgets for user input with valid options
age_widget = widgets.Dropdown(options=unique_ages, description='Age Group:')
race_widget = widgets.Dropdown(options=unique_races, description='Race:')
gender_widget = widgets.Dropdown(options=unique_genders, description='Gender:')
admission_type_widget = widgets.Dropdown(
    options=unique_admission_types,
    description='Admission Type:'
)

# Display widgets
display(age_widget, race_widget, gender_widget, admission_type_widget)


Dropdown(description='Age Group:', options=('[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)', '[50-60)', '…

Dropdown(description='Race:', options=('AfricanAmerican', 'Asian', 'Caucasian', 'Hispanic', 'Other'), value='A…

Dropdown(description='Gender:', options=('Female', 'Male', 'Unknown/Invalid'), value='Female')

Dropdown(description='Admission Type:', options=(1, 2, 3, 4, 5, 6, 7, 8), value=1)

In [35]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def display_data(age, race, gender, admission_type):
    """Show the rows matching the selected filters and fit a model on them.

    Rows of ``original_data`` whose ``age``, ``race``, ``gender`` and
    ``admission_type_id`` equal the given arguments are selected. If any rows
    match, the first few are displayed, the subset is label-encoded with the
    encoders stored in ``label_encoders``, a ``RandomForestClassifier`` is
    trained to predict ``readmitted``, and its accuracy, classification
    report and feature importances are displayed. With a single matching row
    the model is trained and evaluated on that same row.

    Note: a later cell in this notebook defines ``display_data`` again with
    additional plots; once that cell runs it replaces this definition.

    Returns:
        None. All results are emitted through ``display`` / matplotlib.
    """
    # Boolean mask combining the four equality filters
    selection = (
        (original_data['age'] == age)
        & (original_data['race'] == race)
        & (original_data['gender'] == gender)
        & (original_data['admission_type_id'] == admission_type)
    )
    subset = original_data[selection]

    # Nothing matched: report it and stop
    if subset.empty:
        display("No data available for the selected filters.")
        return

    display(subset.head())

    # Apply the already-fitted encoders to the categorical columns
    encoded = subset.copy()
    for name in encoded.select_dtypes(include=['object']).columns:
        encoded[name] = label_encoders[name].transform(encoded[name])

    X_filtered = encoded.drop(columns=['readmitted'])
    y_filtered = encoded['readmitted']

    # Choose what to fit on and what to score on
    if len(X_filtered) > 1:
        X_fit, X_eval, y_fit, y_eval = train_test_split(
            X_filtered, y_filtered, test_size=0.2, random_state=42
        )
    else:
        display("Not enough data to split, using all data for training and testing.")
        X_fit, X_eval = X_filtered, X_filtered
        y_fit, y_eval = y_filtered, y_filtered

    # Train and predict
    forest = RandomForestClassifier(random_state=42)
    forest.fit(X_fit, y_fit)
    predictions = forest.predict(X_eval)

    # Score the predictions
    accuracy = accuracy_score(y_eval, predictions)
    report = classification_report(y_eval, predictions, output_dict=True)

    display("Model Accuracy: {}".format(accuracy))
    display(pd.DataFrame(report).transpose())

    # Rank features by importance, most important first
    ranking = pd.DataFrame(
        forest.feature_importances_,
        index=X_fit.columns,
        columns=['importance'],
    ).sort_values('importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x=ranking.importance, y=ranking.index)
    plt.title('Feature Importance')
    plt.show()

# Interact with the display function.
# The trailing semicolon stops the notebook from echoing the function object
# that `interact` returns; the widget UI itself is still displayed.
widgets.interact(
    display_data,
    age=age_widget,
    race=race_widget,
    gender=gender_widget,
    admission_type=admission_type_widget,
);


interactive(children=(Dropdown(description='Age Group:', options=('[0-10)', '[10-20)', '[20-30)', '[30-40)', '…

In [37]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def display_data(age, race, gender, admission_type):
    """Show, model and visualise the rows matching the selected filters.

    Rows of ``original_data`` whose ``age``, ``race``, ``gender`` and
    ``admission_type_id`` equal the given arguments are selected. If any rows
    match:

    * the first few rows are displayed;
    * the subset is label-encoded with the encoders in ``label_encoders``;
    * a ``RandomForestClassifier`` is trained to predict ``readmitted`` and
      its accuracy, classification report and feature importances are shown
      (with a single matching row the model is trained and evaluated on that
      same row);
    * histograms, per-column count plots, a correlation heatmap and a box
      plot of the subset are drawn.

    Returns:
        None. All results are emitted through ``display`` / matplotlib.
    """
    # Boolean mask combining the four equality filters
    selection = (
        (original_data['age'] == age)
        & (original_data['race'] == race)
        & (original_data['gender'] == gender)
        & (original_data['admission_type_id'] == admission_type)
    )
    filtered_data = original_data[selection]

    # Nothing matched: report it and stop
    if filtered_data.empty:
        display("No data available for the selected filters.")
        return

    display(filtered_data.head())

    # Apply the already-fitted encoders to the categorical columns
    encoded_filtered_data = filtered_data.copy()
    for column in encoded_filtered_data.select_dtypes(include=['object']).columns:
        encoded_filtered_data[column] = label_encoders[column].transform(encoded_filtered_data[column])

    X_filtered = encoded_filtered_data.drop(columns=['readmitted'])
    y_filtered = encoded_filtered_data['readmitted']

    # Choose what to fit on and what to score on. Both cases share the
    # training / reporting code below instead of duplicating it per branch.
    if len(X_filtered) > 1:
        X_fit, X_eval, y_fit, y_eval = train_test_split(
            X_filtered, y_filtered, test_size=0.2, random_state=42
        )
    else:
        display("Not enough data to split, using all data for training and testing.")
        X_fit, X_eval = X_filtered, X_filtered
        y_fit, y_eval = y_filtered, y_filtered

    # Train and predict
    model = RandomForestClassifier(random_state=42)
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)

    # Evaluate the model. zero_division=0 yields the same numbers as the
    # default but without a warning for every class that is absent from the
    # (often tiny) evaluation set.
    accuracy = accuracy_score(y_eval, y_pred)
    classification_rep = classification_report(
        y_eval, y_pred, output_dict=True, zero_division=0
    )

    display("Model Accuracy: {}".format(accuracy))
    display(pd.DataFrame(classification_rep).transpose())

    # Feature importance, most important first
    feature_importances = pd.DataFrame(
        model.feature_importances_,
        index=X_fit.columns,
        columns=['importance'],
    ).sort_values('importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x=feature_importances.importance, y=feature_importances.index)
    plt.title('Feature Importance')
    plt.show()

    # Additional visualizations
    # Histograms of the encoded feature columns. DataFrame.hist creates its
    # own figure, so no plt.figure() call precedes it (one would only leave
    # an empty extra figure in the output).
    X_filtered.hist(bins=20, figsize=(20, 15))
    plt.suptitle('Histograms of Numerical Features')
    plt.show()

    # Count plot of categorical features (using the un-encoded labels)
    categorical_columns = original_data.select_dtypes(include=['object']).columns
    for column in categorical_columns:
        plt.figure(figsize=(10, 6))
        sns.countplot(data=filtered_data, x=column)
        plt.title(f'Count Plot of {column}')
        plt.xticks(rotation=90)
        plt.show()

    # Correlation heatmap
    plt.figure(figsize=(12, 10))
    correlation_matrix = X_filtered.corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()

    # Box plot of the encoded feature columns, drawn on an explicit axes so
    # it does not depend on which figure happens to be current.
    fig, ax = plt.subplots(figsize=(12, 6))
    X_filtered.boxplot(rot=90, ax=ax)
    ax.set_title('Box Plot of Numerical Features')
    plt.show()

# Interact with the display function.
# The trailing semicolon stops the notebook from echoing the function object
# that `interact` returns; the widget UI itself is still displayed.
widgets.interact(
    display_data,
    age=age_widget,
    race=race_widget,
    gender=gender_widget,
    admission_type=admission_type_widget,
);


interactive(children=(Dropdown(description='Age Group:', index=3, options=('[0-10)', '[10-20)', '[20-30)', '[3…