# Interactive Dashboard for Lung Cancer Survival Analysis
### DSA 2040A Group Project - Data Mining and Visualization
### Team Members: Calvin, Tanveer, Samantha, Patricia, Susan and Arlen

## Import Required Libraries

In [22]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import joblib

## Load Data and Model

In [23]:
# File locations, relative to the directory this notebook runs from
data_path = '../data/transformed_data.csv'
model_path = '../models/survival_predictor.pkl'

# Read the cleaned dataset into a DataFrame
df = pd.read_csv(data_path)

# Restore the trained model from disk.
# joblib.load unpickles the file, so it must come from a trusted source.
model = joblib.load(model_path)

# Confirm both artifacts loaded and report the dataset dimensions
print("Data and model loaded successfully!")
print(f"Dataset shape: {df.shape}")

Data and model loaded successfully!
Dataset shape: (8814, 27)


## 1. Survival Rate Analysis by Demographics

In [24]:
# Mean of `survived` for every (age_group, gender) combination,
# flattened back into ordinary columns for plotting
survival_by_age_gender = (
    df.groupby(['age_group', 'gender'])['survived']
    .mean()
    .reset_index()
)

# Grouped bars: one cluster per age group, one bar per gender
fig = px.bar(
    survival_by_age_gender,
    x='age_group',
    y='survived',
    color='gender',
    barmode='group',
    title='Survival Rate by Age Group and Gender',
    labels={'survived': 'Survival Rate', 'age_group': 'Age Group'},
)

fig.show()

## 2. Treatment Effectiveness Analysis

In [25]:
# For each (treatment_type, cancer_stage) pair compute the mean of `survived`
# and the number of rows, then give the two aggregates descriptive names
treatment_effectiveness = (
    df.groupby(['treatment_type', 'cancer_stage'])['survived']
    .agg(['mean', 'count'])
    .reset_index()
    .rename(columns={'mean': 'survival_rate', 'count': 'patient_count'})
)

# Bubble chart: marker area encodes how many patients back each data point
fig = px.scatter(
    treatment_effectiveness,
    x='cancer_stage',
    y='survival_rate',
    size='patient_count',
    color='treatment_type',
    title='Treatment Effectiveness by Cancer Stage',
    labels={'survival_rate': 'Survival Rate'},
    hover_data=['patient_count'],
)

fig.show()

## 3. Risk Factor Analysis

In [26]:
# Pairwise correlations between the numerical features and the outcome column
numerical_cols = ['treatment_duration', 'comorbidities_count']
correlation_matrix = df[numerical_cols + ['survived']].corr()

# Heatmap of the correlation matrix.
# RdBu is a diverging scale, so the colour range is pinned to the full
# correlation interval [-1, 1]. Without this the scale is stretched over the
# observed min/max, which moves zero correlation away from the neutral
# midpoint and lets the diagonal (always 1.0) dominate the colouring.
fig = px.imshow(correlation_matrix,
                title='Correlation Heatmap of Risk Factors',
                color_continuous_scale='RdBu',
                zmin=-1,
                zmax=1)

fig.show()

## 4. Survival Prediction Dashboard

In [None]:
# Input controls for every feature the user can set on the dashboard
from ipywidgets import widgets
from IPython.display import display


def _dropdown_for(column, label):
    """Return a dropdown whose options are the distinct values of ``df[column]``."""
    return widgets.Dropdown(options=df[column].unique(), description=label)


# One control per user-editable feature; insertion order is the display order
widgets_dict = {
    'age': widgets.IntSlider(min=0, max=100, description='Age:'),
    'gender': _dropdown_for('gender', 'Gender:'),
    'country': _dropdown_for('country', 'Country:'),
    'cancer_stage': _dropdown_for('cancer_stage', 'Stage:'),
    'smoking_status': _dropdown_for('smoking_status', 'Smoking:'),
    'bmi': widgets.FloatSlider(min=10, max=50, description='BMI:'),
    'cholesterol_level': widgets.IntSlider(min=100, max=300, description='Cholesterol:'),
    'treatment_type': _dropdown_for('treatment_type', 'Treatment:'),
    'treatment_duration': widgets.IntSlider(min=0, max=365, description='Duration (days):'),
    'comorbidities_count': widgets.IntSlider(min=0, max=4, description='Comorbidities:'),
}

# Render the controls one after another
for control in widgets_dict.values():
    display(control)

# Button that triggers a prediction, and the area its result is written to
button = widgets.Button(description='Predict Survival')
output = widgets.Output()

display(button, output)

def _build_model_input(values, diagnosis_date):
    """Assemble the single-row feature frame passed to ``model.predict_proba``.

    Parameters
    ----------
    values : dict
        Mapping of feature name to the value chosen in the matching widget.
        Must contain at least ``'age'``, ``'bmi'`` and ``'cholesterol_level'``,
        which the derived categorical columns are computed from.
    diagnosis_date : pandas.Timestamp
        Date used to fill the ``diagnosis_year`` / ``diagnosis_month`` /
        ``diagnosis_quarter`` columns.

    Returns
    -------
    pandas.DataFrame
        One row holding the widget values, placeholder columns, and the
        derived ``age_group``, ``bmi_category`` and ``cholesterol_category``.
    """
    # Widget-driven features first, in the order they were supplied
    features = {name: [value] for name, value in values.items()}

    features.update({
        # Columns with no widget: a dummy identifier and the diagnosis date parts
        'id': [1],
        'diagnosis_year': [diagnosis_date.year],
        'diagnosis_month': [diagnosis_date.month],
        'diagnosis_quarter': [diagnosis_date.quarter],
        # Binary features default to 0.
        # NOTE(review): these stay 0 even when comorbidities_count > 0, so the
        # row can be internally inconsistent - confirm this is acceptable.
        'hypertension': [0],
        'asthma': [0],
        'cirrhosis': [0],
        'other_cancer': [0],
        'family_history': [0],
    })
    sample_input = pd.DataFrame(features)

    # Derived categorical features.
    # include_lowest=True closes the first bin on the left so that age 0 (the
    # slider's minimum and default value) maps to '<18' instead of NaN.
    # NOTE(review): bins are right-closed, so boundary ages land in the lower
    # group (e.g. 18 -> '<18', 30 -> '18-29'), which does not match the label
    # text - verify against the binning used when the model was trained.
    sample_input['age_group'] = pd.cut(
        sample_input['age'],
        bins=[0, 18, 30, 45, 60, 75, 100],
        labels=['<18', '18-29', '30-44', '45-59', '60-74', '75+'],
        include_lowest=True,
    )

    sample_input['bmi_category'] = pd.cut(
        sample_input['bmi'],
        bins=[0, 18.5, 24.9, 29.9, 100],
        labels=['underweight', 'normal', 'overweight', 'obese'],
    )

    sample_input['cholesterol_category'] = pd.cut(
        sample_input['cholesterol_level'],
        bins=[0, 200, 239, 1000],
        labels=['Desirable', 'Borderline high', 'High'],
    )

    return sample_input


def on_button_clicked(b):
    """Button callback: predict from the current widget values and print the result.

    Reads every widget in ``widgets_dict``, builds a one-row feature frame,
    calls ``model.predict_proba`` on it and prints the second probability of
    the first row inside the ``output`` widget, replacing any earlier result.

    Parameters
    ----------
    b : object
        Argument supplied by the click event; not used.
    """
    with output:
        output.clear_output()

        # Snapshot the current widget values
        values = {name: widget.value for name, widget in widgets_dict.items()}

        # The current date stands in for the diagnosis date
        sample_input = _build_model_input(values, pd.Timestamp.now())

        # Make prediction
        prediction = model.predict_proba(sample_input)[0]

        # presumably index 1 is the "survived" class - verify against model.classes_
        print(f"Survival Probability: {prediction[1]:.2%}")

# Register on_button_clicked as the button's click handler
button.on_click(on_button_clicked)

IntSlider(value=0, description='Age:')

Dropdown(description='Gender:', options=('Female', 'Male'), value='Female')

Dropdown(description='Country:', options=('Hungary', 'Croatia', 'Latvia', 'Spain', 'Estonia', 'Austria', 'Neth…

Dropdown(description='Stage:', options=('Stage III', 'Stage IV', 'Stage I', 'Stage II'), value='Stage III')

Dropdown(description='Smoking:', options=('Passive Smoker', 'Never Smoked', 'Current Smoker', 'Former Smoker')…

FloatSlider(value=10.0, description='BMI:', max=50.0, min=10.0)

IntSlider(value=100, description='Cholesterol:', max=300, min=100)

Dropdown(description='Treatment:', options=('Combined', 'Radiation', 'Surgery', 'Chemotherapy'), value='Combin…

IntSlider(value=0, description='Duration (days):', max=365)

IntSlider(value=0, description='Comorbidities:', max=4)

Button(description='Predict Survival', style=ButtonStyle())

Output()