<a href="https://colab.research.google.com/github/DelMashiry-dev/DelMashiry-dev/blob/main/REALTIMEPREDICTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount the Drive Containing the folder with the dataset


In [None]:
# Colab-only: mount Google Drive at /content/drive so the dataset folder
# referenced by the cells below can be read like a local path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# List the project folder on Drive to confirm the dataset file is present.
!ls /content/drive/MyDrive/Medical_Resource_Prediction

In [None]:
import pandas as pd
from IPython.display import display

# Quick preview only: read the first 100,000 rows and just the columns of
# interest. The column list matches `relevant_columns` defined further down
# (including 'hospital_beds_per_thousand').
df = pd.read_csv(
    '/content/drive/MyDrive/Medical_Resource_Prediction/owid-covid-data.csv',
    nrows=100000,
    usecols=[
        'iso_code', 'continent', 'location', 'date',
        'total_cases', 'new_cases', 'total_deaths', 'new_deaths', 'reproduction_rate',
        'icu_patients', 'hosp_patients', 'weekly_icu_admissions', 'weekly_hosp_admissions',
        'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated', 'new_vaccinations',
        'total_tests', 'new_tests', 'positive_rate',
        'population', 'population_density', 'median_age', 'aged_65_older', 'aged_70_older',
        'hospital_beds_per_thousand', 'cardiovasc_death_rate', 'diabetes_prevalence',
    ],
)

# Zero-fill for the preview. Assigning the result (instead of inplace=True)
# keeps the cell idempotent on re-run.
df = df.fillna(0)

# Show a small sample rather than thousands of rows.
display(df.head())

# Identify Relevant Features



In [None]:
# Columns kept for the analysis, grouped by theme. Order matters: it becomes
# the column order of the filtered frame.
relevant_columns = [
    # identifiers
    'iso_code', 'continent', 'location', 'date',
    # disease spread
    'total_cases', 'new_cases', 'total_deaths', 'new_deaths', 'reproduction_rate',
    # hospital load
    'icu_patients', 'hosp_patients', 'weekly_icu_admissions', 'weekly_hosp_admissions',
    # vaccination
    'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated', 'new_vaccinations',
    # testing
    'total_tests', 'new_tests', 'positive_rate',
    # demographics
    'population', 'population_density', 'median_age', 'aged_65_older', 'aged_70_older',
    # healthcare capacity and comorbidity
    'hospital_beds_per_thousand', 'cardiovasc_death_rate', 'diabetes_prevalence',
]

# Data Cleaning
# Remove Irrelevant Columns
# Filter the dataset to keep only the selected columns:



In [None]:
import pandas as pd
from IPython.display import display

# Load the complete dataset and keep only the relevant columns, in the order
# given by `relevant_columns`.
df = pd.read_csv(
    '/content/drive/MyDrive/Medical_Resource_Prediction/owid-covid-data.csv'
).loc[:, relevant_columns]

display(df.head(5))

# Handle Missing Values



# Imputation Strategy



In [None]:
# Convert date to datetime; unparseable values become NaT and are dropped below
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Drop rows with missing iso_code, location, or date, then order every
# location chronologically so the forward-fill below follows time order
df = (
    df.dropna(subset=['iso_code', 'location', 'date'])
      .sort_values(['location', 'date'])
)

# Time-series imputation for disease and healthcare metrics
time_series_cols = [
    'total_cases', 'new_cases', 'total_deaths', 'new_deaths', 'reproduction_rate',
    'icu_patients', 'hosp_patients', 'weekly_icu_admissions', 'weekly_hosp_admissions',
    'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated', 'new_vaccinations',
    'total_tests', 'new_tests', 'positive_rate'
]
# Forward-fill strictly within each location. The previous version chained a
# column-wide .interpolate() after the grouped fill, which filled a location's
# leading gaps from the preceding location's values. Values before a
# location's first observation now stay missing instead of being invented.
df[time_series_cols] = df.groupby('location')[time_series_cols].ffill()

# Zero imputation for ICU/hospital metrics if still missing (early outbreak)
zero_impute_cols = ['icu_patients', 'hosp_patients', 'weekly_icu_admissions', 'weekly_hosp_admissions']
df[zero_impute_cols] = df[zero_impute_cols].fillna(0)

# Median imputation for demographic/healthcare capacity features
median_impute_cols = [
    'population_density', 'median_age', 'aged_65_older', 'aged_70_older',
    'hospital_beds_per_thousand', 'cardiovasc_death_rate', 'diabetes_prevalence'
]
# Only missing cells are filled with the continent median. Using fillna on the
# original columns (rather than assigning the transform result) keeps the
# existing values of rows whose continent is missing.
continent_median = df.groupby('continent')[median_impute_cols].transform('median')
df[median_impute_cols] = df[median_impute_cols].fillna(continent_median)

# Handle remaining missing values (if any) with global median
df[median_impute_cols] = df[median_impute_cols].fillna(df[median_impute_cols].median())

# Show a small sample once, after all imputation steps have run
display(df.head())

# Check for Outliers



In [None]:
# Cap extreme values at each location's 99th percentile.
# Cumulative series (total_* / people_*) are excluded: they only grow over
# time, so capping them at their own 99th percentile would flatten the most
# recent part of every series rather than remove outliers.
cumulative_prefixes = ('total_', 'people_')
clip_cols = [col for col in time_series_cols if not col.startswith(cumulative_prefixes)]

for col in clip_cols:
    df[col] = df.groupby('location')[col].transform(lambda x: x.clip(upper=x.quantile(0.99)))

# Feature Engineering



In [None]:
# shift() and rolling() work on row position, so rows must be in date order
# within each location before lags and moving averages are computed.
df = df.sort_values(['location', 'date'])

# Lag features: value of the same location 7 and 14 rows (days) earlier
for col in ['new_cases', 'icu_patients', 'hosp_patients']:
    df[f'{col}_lag7'] = df.groupby('location')[col].shift(7)
    df[f'{col}_lag14'] = df.groupby('location')[col].shift(14)

# Rolling averages over the last 7 rows of each location. transform() returns
# a result aligned to df's index, so no index reshuffling is needed.
for col in ['new_cases', 'new_deaths']:
    df[f'{col}_7d_avg'] = (
        df.groupby('location')[col]
          .transform(lambda s: s.rolling(7, min_periods=1).mean())
    )

# Proxy for ventilator demand (e.g., 50% of ICU patients)
VENTILATOR_SHARE_OF_ICU = 0.5  # assumed share of ICU patients needing a ventilator
df['ventilator_demand'] = df['icu_patients'] * VENTILATOR_SHARE_OF_ICU

# Final Dataset
# Ensure the dataset is sorted by location and date for time-series modeling:



In [None]:
# Order rows by location first, then chronologically within each location.
df = df.sort_values(by=['location', 'date'])

In [None]:
# Print the column dtypes and non-null counts of the prepared frame.
df.info()

# Time Series Plot Example (Plotly)



# This creates an interactive line plot where you can hover over points to see exact values, zoom in on specific time periods, and compare trends across locations. This is particularly useful for understanding temporal patterns in disease progression and resource demand.



In [None]:
import plotly.express as px

# Plot multiple metrics over time, colored by location.
# Passing a list to `y` makes Plotly Express reshape the frame and add a
# 'variable' column naming the metric. Because color is used for location,
# the metric is mapped to facet rows; otherwise the three metrics of one
# location would be drawn as a single line.
fig = px.line(
    df,
    x='date',
    y=['total_cases', 'new_cases', 'icu_patients'],
    color='location',
    facet_row='variable',
    title='Key Metrics Over Time',
)
# The metrics differ by orders of magnitude, so give each row its own y-axis.
fig.update_yaxes(matches=None)
fig.show()

# Scatter Plot Example (Seaborn)



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter of cumulative cases against ICU occupancy, one hue per location
scatter_ax = sns.scatterplot(x='total_cases', y='icu_patients', hue='location', data=df)
scatter_ax.set_title('Total Cases vs. ICU Patients')
plt.show()

# This scatter plot helps visualize the relationship between disease spread and ICU demand, which is a proxy for ventilator and medical personnel needs. You can extend this to plot total_vaccinations vs. new_cases to assess vaccination impact.



# Correlation Heatmap (Seaborn)



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlations are only defined for numeric columns
numeric_df = df.select_dtypes(include=['float64', 'int64'])
corr_matrix = numeric_df.corr()

# Large canvas so the annotated cells stay readable
heatmap_fig, heatmap_ax = plt.subplots(figsize=(40, 38))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

# This heatmap shows correlations between all numeric features, helping identify which factors (e.g., population_density, median_age) are most related to resource demand.



# Bar Chart (Plotly)



In [None]:
import plotly.express as px

# Get the latest data for each location: the last row per location after
# sorting by date. Filtering on the global maximum date would drop every
# location whose most recent record is older than that date.
latest_df = df.sort_values('date').groupby('location').tail(1)

# Create bar chart for latest total_cases
fig = px.bar(latest_df, x='location', y='total_cases', title='Latest Total Cases by Location')
fig.show()

# This bar chart compares the most recent total cases across locations, which can help identify regions with high demand for real-time monitoring.



In [None]:
import plotly.express as px

# Get the latest data for each location: the last row per location after
# sorting by date. Filtering on the global maximum date would drop every
# location whose most recent record is older than that date.
latest_df = df.sort_values('date').groupby('location').tail(1)

# Sort by total cases in descending order
latest_df = latest_df.sort_values(by='total_cases', ascending=False)

# Create a horizontal bar chart with color by continent
fig = px.bar(latest_df, x='total_cases', y='location', color='continent', orientation='h',
             title='Latest Total Cases by Location, Colored by Continent')
fig.show()

# Geographic Map (Plotly)



In [None]:
import pandas as pd
import plotly.express as px

# Prepare a frame for the animated map:
# - drop rows whose iso_code starts with 'OWID_' (not ISO-3 codes, so they
#   cannot be drawn; presumably aggregates such as World - verify), which
#   would otherwise still stretch the colour scale;
# - sort by date so animation frames appear in chronological order (frames
#   follow the order in which values first occur in the data);
# - use a plain 'YYYY-MM-DD' string as the frame key.
map_df = (
    df[~df['iso_code'].str.startswith('OWID_', na=False)]
    .sort_values('date')
    .assign(date=lambda d: pd.to_datetime(d['date']).dt.strftime('%Y-%m-%d'))
)

# Create choropleth map for total_cases by country, animated over time
# NOTE(review): one frame per day makes a very large figure; consider
# resampling to weekly or monthly frames if rendering is slow.
fig = px.choropleth(
    map_df,
    locations='iso_code',
    locationmode='ISO-3',
    color='total_cases',
    hover_name='location',
    animation_frame='date',
    title='Total Cases Over Time by Country',
)
fig.show()

# This animated map shows the spatial distribution of total cases over time, which is useful for understanding regional disparities and can be sourced from real-time APIs like OWID.



In [None]:
import pandas as pd
from IPython.display import display

# The cleaned DataFrame in this notebook is named 'df'.
# If yours has a different name, pass that DataFrame to the function instead.

def display_final_dataset(df, num_rows=5, all_columns=False):
    """
    Displays the final prepared dataset in a clean table format

    Parameters:
    - df: Your cleaned DataFrame
    - num_rows: Number of rows to display (default: 5)
    - all_columns: Whether to show all columns (default: False for truncated view)

    Returns:
    - None. A styled preview is displayed and a text summary is printed.

    Notes:
    - Display options are applied only while the table is rendered, so the
      notebook-wide pandas settings are left as they were.
    """

    # Create a styled table from the first rows only
    preview = df.head(num_rows)
    numeric_cols = preview.select_dtypes(include='number').columns
    styled_df = (preview
                 .style
                 .set_properties(**{'text-align': 'center'})
                 .set_table_styles([{
                     'selector': 'th',
                     'props': [('background-color', '#40466e'),
                              ('color', 'white'),
                              ('font-weight', 'bold')]
                 }])
                 .background_gradient(cmap='Blues', subset=numeric_cols)
                 .format(None, na_rep="NA"))

    # Display in notebook with temporary display options (restored on exit)
    with pd.option_context('display.max_columns', None if all_columns else 10,
                           'display.width', 1000,
                           'display.max_colwidth', 20):
        display(styled_df)

    # Show dataset info
    print("\n\033[1mDataset Summary:\033[0m")
    print(f"Total Rows: {len(df):,}")
    print(f"Total Columns: {len(df.columns)}")
    print("\n\033[1mColumn Types:\033[0m")
    print(df.dtypes.value_counts())

    # Show NA counts if any exist (count once, reuse the result)
    missing = df.isna().sum()
    missing = missing[missing > 0]
    if not missing.empty:
        print("\n\033[1mMissing Values:\033[0m")
        print(missing)
    else:
        print("\n\033[1mNo missing values found!\033[0m")

# Usage - call the function with your cleaned DataFrame
display_final_dataset(df, num_rows=5, all_columns=True)

In [None]:
import pandas as pd
import numpy as np
from IPython.display import display

def display_and_export_dataset(df, num_rows=5, export_name="cleaned_dataset"):
    """
    Displays the final dataset with enhanced formatting and export options

    Parameters:
    - df: Your cleaned DataFrame
    - num_rows: Number of rows to display
    - export_name: Base name for exported files ('.xlsx' and '.html' are appended)

    Returns:
    - None. Tables are displayed, summaries printed, and files written.

    Notes:
    - Export errors are caught and printed, not raised. The Excel and HTML
      exports are attempted independently, so a missing 'xlsxwriter' engine
      does not prevent the HTML file from being written.
    - Format rules are matched on column names but applied only to columns
      whose dtype accepts the format (numeric formats on numeric columns, the
      date format on datetime columns). A string 'date' column therefore no
      longer breaks rendering.
    - NOTE(review): '{:.1%}' multiplies the value by 100; it is applied to
      every numeric column whose name contains 'rate', which may not be a
      fraction for all of them - confirm per column.
    """

    # ==============================================
    # 1. SPECIAL COLUMN FORMATTING
    # ==============================================
    format_rules = {
        # Medical resource columns (integers with comma separators)
        r'(beds|patients|ventilators|staff|ppe)': '{:,.0f}',

        # Percentage columns (show as % with 1 decimal)
        r'(rate|ratio|percent|per_hundred)': '{:.1%}',

        # Date columns (standard date format)
        'date': '{:%Y-%m-%d}',

        # Small decimal numbers (3 decimal places)
        r'(growth|factor|index)': '{:.3f}'
    }

    # ==============================================
    # 2. CREATE STYLED TABLE
    # ==============================================
    styler = (df.head(num_rows)
              .style
              .set_properties(**{'text-align': 'center'})
              .set_table_styles([{
                  'selector': 'th',
                  'props': [
                      ('background-color', '#2a3f5f'),
                      ('color', 'white'),
                      ('font-weight', 'bold'),
                      ('position', 'sticky'),
                      ('top', '0')
                  ]
              }]))

    # Apply special formatting, restricted to dtype-compatible columns
    numeric_cols = set(df.select_dtypes(include=np.number).columns)
    datetime_cols = set(df.select_dtypes(include=['datetime', 'datetimetz']).columns)
    for regex, formatter in format_rules.items():
        allowed = datetime_cols if regex == 'date' else numeric_cols
        cols = [c for c in df.filter(regex=regex, axis=1).columns if c in allowed]
        if cols:
            styler.format(formatter, subset=cols)

    # Highlight important metrics (a colour gradient needs numeric data)
    medical_cols = [c for c in df.filter(regex='icu|hosp|ventilator|ppe').columns
                    if c in numeric_cols]
    if medical_cols:
        styler.background_gradient(
            cmap='YlOrRd',
            subset=medical_cols,
            vmin=0, vmax=df[medical_cols].max().max()
        )

    # ==============================================
    # 3. DISPLAY RESULTS
    # ==============================================
    print("="*80)
    print(f"\033[1m{'CLEANED DATASET PREVIEW':^80}\033[0m")
    print("="*80)
    display(styler)

    # ==============================================
    # 4. STATISTICAL SUMMARIES
    # ==============================================
    print("\n\033[1mSTATISTICAL SUMMARIES\033[0m")
    print("-"*80)

    # Numeric columns summary
    numeric_df = df.select_dtypes(include=np.number)
    if not numeric_df.empty:
        print("\n\033[4mNumeric Columns:\033[0m")
        display(numeric_df.describe().style.format("{:.2f}"))

    # Categorical columns summary
    categorical_df = df.select_dtypes(include='object')
    if not categorical_df.empty:
        print("\n\033[4mCategorical Columns:\033[0m")
        for col in categorical_df.columns:
            print(f"\n• {col}:")
            print(df[col].value_counts(dropna=False).head())

    # ==============================================
    # 5. EXPORT OPTIONS
    # ==============================================
    print("\n\033[1mEXPORT OPTIONS\033[0m")
    print("-"*80)

    # Excel Export (needs the optional 'xlsxwriter' engine)
    try:
        excel_file = f"{export_name}.xlsx"
        with pd.ExcelWriter(excel_file, engine='xlsxwriter') as writer:
            df.to_excel(writer, sheet_name='Data', index=False)

            # Add summary sheets (skipped when there is nothing to summarise)
            if not numeric_df.empty:
                numeric_df.describe().to_excel(writer, sheet_name='Numeric Summary')
            if not categorical_df.empty:
                pd.concat([
                    df[col].value_counts(dropna=False).rename(col)
                    for col in categorical_df.columns
                ], axis=1).to_excel(writer, sheet_name='Category Counts')

            # Get workbook objects
            workbook = writer.book
            worksheet = writer.sheets['Data']

            # Add Excel formatting
            header_format = workbook.add_format({
                'bold': True,
                'text_wrap': True,
                'valign': 'top',
                'fg_color': '#2a3f5f',
                'font_color': 'white',
                'border': 1
            })

            # Apply header format (rewrites row 0 with the styled header)
            for col_num, value in enumerate(df.columns.values):
                worksheet.write(0, col_num, value, header_format)

            # Auto-adjust column widths, capped at 50 characters
            for i, col in enumerate(df.columns):
                # An empty frame has no data length to measure
                data_len = df[col].astype(str).map(len).max() if len(df) else 0
                max_len = max(data_len, len(str(col))) + 2
                worksheet.set_column(i, i, min(max_len, 50))

        print(f"✓ Excel file saved as: {excel_file}")

    except Exception as e:
        print(f"Export failed: {str(e)}")

    # HTML Export, attempted even when the Excel export failed
    try:
        html_file = f"{export_name}.html"
        styler.to_html(html_file)
        print(f"✓ HTML file saved as: {html_file}")

    except Exception as e:
        print(f"Export failed: {str(e)}")

# Usage example:
# display_and_export_dataset(df_clean, num_rows=10, export_name="medical_resources")

In [None]:
# Preview the first 10 rows and export under the base name "my_cleaned_data"
display_and_export_dataset(df, num_rows=10, export_name="my_cleaned_data")

In [None]:
!pip install xlsxwriter


In [None]:
import pandas as pd
import numpy as np
from IPython.display import display

def display_and_export_dataset(df, num_rows=5, export_name="cleaned_dataset"):
    """
    Displays the final dataset with enhanced formatting and export options

    This definition replaces the earlier function of the same name in this
    notebook; it writes the HTML export first and treats Excel as optional.

    Parameters:
    - df: Your cleaned DataFrame
    - num_rows: Number of rows to display
    - export_name: Base name for exported files ('.html' and '.xlsx' are appended)

    Returns:
    - None. Tables are displayed, summaries printed, and files written.

    Notes:
    - Format rules are matched on column names but applied only to columns
      whose dtype accepts the format (numeric formats on numeric columns, the
      date format on datetime columns). A 'date' column holding strings, as
      after reloading from CSV, therefore no longer breaks rendering.
    - Excel export errors are caught and printed; HTML export errors propagate.
    - NOTE(review): '{:.1%}' multiplies the value by 100; it is applied to
      every numeric column whose name contains 'rate', which may not be a
      fraction for all of them - confirm per column.
    """

    # ==============================================
    # 1. SPECIAL COLUMN FORMATTING
    # ==============================================
    format_rules = {
        # Medical resource columns
        r'(beds|patients|ventilators|staff|ppe)': '{:,.0f}',
        # Percentage columns
        r'(rate|ratio|percent|per_hundred)': '{:.1%}',
        # Date columns
        'date': '{:%Y-%m-%d}',
        # Small decimal numbers
        r'(growth|factor|index)': '{:.3f}'
    }

    # ==============================================
    # 2. CREATE STYLED TABLE
    # ==============================================
    styler = (df.head(num_rows)
              .style
              .set_properties(**{'text-align': 'center'})
              .set_table_styles([{
                  'selector': 'th',
                  'props': [
                      ('background-color', '#2a3f5f'),
                      ('color', 'white'),
                      ('font-weight', 'bold')
                  ]
              }]))

    # Apply formatting, restricted to dtype-compatible columns
    numeric_cols = set(df.select_dtypes(include=np.number).columns)
    datetime_cols = set(df.select_dtypes(include=['datetime', 'datetimetz']).columns)
    for regex, formatter in format_rules.items():
        allowed = datetime_cols if regex == 'date' else numeric_cols
        cols = [c for c in df.filter(regex=regex, axis=1).columns if c in allowed]
        if cols:
            styler.format(formatter, subset=cols)

    # Highlight medical metrics (a colour gradient needs numeric data)
    medical_cols = [c for c in df.filter(regex='icu|hosp|ventilator|ppe').columns
                    if c in numeric_cols]
    if medical_cols:
        styler.background_gradient(
            cmap='YlOrRd',
            subset=medical_cols,
            vmin=0, vmax=df[medical_cols].max().max()
        )

    # ==============================================
    # 3. DISPLAY RESULTS
    # ==============================================
    print("="*80)
    print(f"\033[1m{'CLEANED DATASET PREVIEW':^80}\033[0m")
    print("="*80)
    display(styler)

    # ==============================================
    # 4. STATISTICAL SUMMARIES
    # ==============================================
    print("\n\033[1mSTATISTICAL SUMMARIES\033[0m")
    print("-"*80)

    # Numeric summary
    numeric_df = df.select_dtypes(include=np.number)
    if not numeric_df.empty:
        print("\n\033[4mNumeric Columns:\033[0m")
        display(numeric_df.describe().style.format("{:.2f}"))

    # Categorical summary
    categorical_df = df.select_dtypes(include='object')
    if not categorical_df.empty:
        print("\n\033[4mCategorical Columns:\033[0m")
        for col in categorical_df.columns:
            print(f"\n• {col}:")
            print(df[col].value_counts(dropna=False).head())

    # ==============================================
    # 5. EXPORT OPTIONS
    # ==============================================
    print("\n\033[1mEXPORT OPTIONS\033[0m")
    print("-"*80)

    # HTML Export (always available)
    html_file = f"{export_name}.html"
    styler.to_html(html_file)
    print(f"✓ HTML file saved as: {html_file}")

    # Excel Export (only if xlsxwriter is available)
    try:
        import xlsxwriter
        excel_file = f"{export_name}.xlsx"
        with pd.ExcelWriter(excel_file, engine='xlsxwriter') as writer:
            df.to_excel(writer, sheet_name='Data', index=False)

            # Add summary sheets (skipped when there is nothing to summarise)
            if not numeric_df.empty:
                numeric_df.describe().to_excel(writer, sheet_name='Numeric Summary')
            if not categorical_df.empty:
                pd.concat([
                    df[col].value_counts(dropna=False).rename(col)
                    for col in categorical_df.columns
                ], axis=1).to_excel(writer, sheet_name='Category Counts')

            # Formatting
            workbook = writer.book
            worksheet = writer.sheets['Data']
            header_format = workbook.add_format({
                'bold': True,
                'text_wrap': True,
                'fg_color': '#2a3f5f',
                'font_color': 'white',
                'border': 1
            })

            # Rewrite row 0 with the styled header
            for col_num, value in enumerate(df.columns.values):
                worksheet.write(0, col_num, value, header_format)

            # Auto-adjust columns, capped at 50 characters
            for i, col in enumerate(df.columns):
                # An empty frame has no data length to measure
                data_len = df[col].astype(str).map(len).max() if len(df) else 0
                max_len = max(data_len, len(str(col))) + 2
                worksheet.set_column(i, i, min(max_len, 50))

        print(f"✓ Excel file saved as: {excel_file}")

    except ImportError:
        print("ℹ️ Excel export requires xlsxwriter. Install with: !pip install xlsxwriter")
    except Exception as e:
        print(f"⚠️ Excel export failed: {str(e)}")

# Usage example
# display_and_export_dataset(df_clean, num_rows=10, export_name="medical_data")

In [None]:
# Preview the first 10 rows and export under the base name "my_medical_data"
display_and_export_dataset(df, num_rows=10, export_name="my_medical_data")

In [None]:
import pandas as pd

# Write the cleaned DataFrame `df` to 'cleaned_dataset.csv' in the current
# working directory.
df.to_csv('cleaned_dataset.csv', index=False)  # index=False avoids saving row numbers

# Importation of Libraries

In [None]:
# Core data manipulation and analysis
import pandas as pd  # For data handling, cleaning, and imputation
import numpy as np  # For numerical operations and handling missing values

# Data visualization (for exploring data)
import matplotlib.pyplot as plt  # For plotting distributions and trends
import seaborn as sns  # For advanced visualizations (e.g., correlation heatmaps)

# Feature engineering and preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler  # For normalization and standardization
from sklearn.impute import SimpleImputer  # For imputation of missing values

# Time-series handling
from datetime import datetime  # For parsing and manipulating dates
from pandas.tseries.offsets import Day, Week  # For creating lag and rolling features

# Statistical analysis (optional, for feature selection)
from scipy.stats import pearsonr  # For correlation analysis

# Machine learning (optional, for later modeling)
from sklearn.model_selection import train_test_split  # For splitting data
from sklearn.ensemble import RandomForestRegressor  # Example model for feature importance
from sklearn.metrics import mean_squared_error, r2_score  # For model evaluation

# Suppress warnings (optional, for cleaner output)
import warnings
warnings.filterwarnings('ignore')

# LOADING DATA

In [None]:
# Load the raw, unfiltered dataset for exploratory analysis
data_df = pd.read_csv('/content/drive/MyDrive/Medical_Resource_Prediction/owid-covid-data.csv')

# Fixed seed so the same 10 rows are shown on every run
data_df.sample(10, random_state=42)

# DATA ANALYSIS
#Checking for missing values & categorical variables

In [None]:
# Checking for missing values and categorical variables in the dataset:
# info() lists each column's non-null count and dtype.
data_df.info()

# Note:

# The 'tests_units' feature has dtype "object" and needs to be converted into a numerical variable (this will be done in data preprocessing) before the data is fed to the algorithms.

# Descriptive Statistics

In [None]:
# Univariate descriptive statistics, transposed so that each feature is one row
data_df.describe().transpose()

In [None]:
# The drop of the "Unnamed: 0" column is disabled (left commented out below);
# this cell only reports the (rows, columns) shape of the raw frame.
#data_df = data_df.drop(["Unnamed: 0"], axis=1)
data_df.shape

# MODEL BUILDING



# Defining Independent and Dependent Variables

# Based on the cleaned dataset, define:
# 1. Independent Variables (X): Features that influence medical resource demand.

# 2. Dependent Variables (y): Target variables representing resource demand (e.g., icu_patients, total_vaccinations, hosp_patients).



In [None]:
# Candidate model inputs. The engineered names (lags, rolling mean, calendar
# flags, continent_* dummies) are created in the model-building cell.
# The continent dummy is 'continent_Oceania': one-hot encoding the dataset's
# `continent` column cannot produce 'continent_Australia', because the
# dataset labels that continent 'Oceania'.
features = ['new_cases', 'total_cases', 'new_deaths', 'total_deaths', 'positive_rate', 'tests_per_case',
           'new_tests', 'hosp_patients', 'weekly_hosp_admissions', 'total_vaccinations', 'new_vaccinations',
           'people_vaccinated', 'people_fully_vaccinated', 'total_boosters', 'new_people_vaccinated_smoothed',
           'stringency_index', 'population', 'population_density', 'median_age', 'aged_65_older',
           'aged_70_older', 'gdp_per_capita', 'hospital_beds_per_thousand', 'life_expectancy',
           'human_development_index', 'new_cases_lag1', 'icu_patients_roll7', 'day_of_week', 'month',
           'is_weekend', 'continent_Africa', 'continent_Asia', 'continent_Oceania', 'continent_Europe',
           'continent_North America', 'continent_South America']



#  List of Pipelines and Dictionary



# List of pipelines and a dictionary for easy reference

In [None]:
#Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor



# Pipeline Creation



In [None]:
# Define pipelines: every model is preceded by the same standardisation step
def _scaled(estimator):
    """Return a two-step Pipeline: StandardScaler followed by `estimator`."""
    return Pipeline([
        ('scaler', StandardScaler()),
        ('model', estimator)
    ])

# Dictionary for reference (insertion order defines the list order below)
pipeline_dict = {
    'LinearRegression': _scaled(LinearRegression()),
    'RandomForest': _scaled(RandomForestRegressor(random_state=42, n_jobs=-1)),
    'GradientBoosting': _scaled(GradientBoostingRegressor(random_state=42)),
    'XGBoost': _scaled(XGBRegressor(random_state=42, n_jobs=-1)),
    'SVR': _scaled(SVR())
}

# Same pipeline objects, addressable by position
pipelines = list(pipeline_dict.values())

# Fit the Pipelines

# Load the cleaned dataset.

# Split the data into training and test sets.

# Fit each pipeline on the training data.

# Evaluate performance using RMSE on the test set.



In [None]:
import os

# List all files in /content
print("Files in /content:")
print(os.listdir('/content'))

# Alternative for Colab:
!ls /content

In [None]:
# Candidate locations, checked in order; the first existing file wins
possible_paths = [
    '/content/cleaned_dataset.csv',  # Default name from earlier
    '/content/drive/MyDrive/cleaned_data.csv',  # Common Google Drive path
    'cleaned_data.csv',  # Current working directory
    './cleaned_data.csv'  # Explicit current directory
]

found_path = next((candidate for candidate in possible_paths if os.path.exists(candidate)), None)

if found_path is None:
    print("File not found in common locations")
else:
    df_clean = pd.read_csv(found_path)
    print(f"Successfully loaded from {found_path}")

In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Try loading from Drive
drive_path = '/content/drive/MyDrive/cleaned_data.csv'  # Adjust folder if needed
if os.path.exists(drive_path):
    df_clean = pd.read_csv(drive_path)
elif 'df_clean' in globals():
    # Nothing on Drive yet: persist the frame that is currently in memory
    print(f"File not found at {drive_path}")
    print("Saving current data to Drive...")
    df_clean.to_csv(drive_path, index=False)
else:
    # Neither a file nor an in-memory frame exists; report it instead of
    # failing with a NameError on df_clean
    print(f"File not found at {drive_path}")
    print("No df_clean in memory to save - run the data preparation cells first.")

In [None]:
# Persist the cleaned DataFrame locally, then read it back from the same file
local_csv_path = '/content/cleaned_data.csv'

df_clean.to_csv(local_csv_path, index=False)
print("File saved successfully")

df_clean = pd.read_csv(local_csv_path)

In [None]:
import pandas as pd
import os

# 1. Create sample cleaned data ONLY if you don't have df_clean yet.
#    Without this guard the real cleaned data (and its CSV) would be replaced
#    by a three-row toy frame.
if 'df_clean' not in globals():
    data = {'patient_id': [1, 2, 3], 'cases': [100, 150, 200]}
    df_clean = pd.DataFrame(data)

# 2. Save to CSV
save_path = '/content/cleaned_data.csv'
df_clean.to_csv(save_path, index=False)
print(f"Saved to {save_path}")

# 3. Verify file exists
if os.path.exists(save_path):
    print("File verification:")
    print(pd.read_csv(save_path).head())
else:
    print("❗ File not found - check permissions")

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# Load cleaned dataset. This is the file written by the export cell
# (df.to_csv('cleaned_dataset.csv')); '/content/cleaned_data.csv' is written
# by the save/sample cells above and may hold only sample data.
df_clean = pd.read_csv('/content/cleaned_dataset.csv')

# Print available columns to diagnose missing features
print("Available columns in df_clean:\n", df_clean.columns.tolist())

# Define pipelines (as before)
# NOTE(review): kernel SVR training time grows faster than quadratically with
# the number of rows; on the full dataset this pipeline may be impractically
# slow - consider fitting it on a sample.
pipelines = [
    Pipeline([('scaler', StandardScaler()), ('model', LinearRegression())]),
    Pipeline([('scaler', StandardScaler()), ('model', RandomForestRegressor(random_state=42, n_jobs=-1))]),
    Pipeline([('scaler', StandardScaler()), ('model', GradientBoostingRegressor(random_state=42))]),
    Pipeline([('scaler', StandardScaler()), ('model', XGBRegressor(random_state=42, n_jobs=-1))]),
    Pipeline([('scaler', StandardScaler()), ('model', SVR())])
]
pipeline_dict = {
    'LinearRegression': pipelines[0],
    'RandomForest': pipelines[1],
    'GradientBoosting': pipelines[2],
    'XGBoost': pipelines[3],
    'SVR': pipelines[4]
}

# Feature engineering (to ensure missing features are created)
# Convert date to datetime (if not already done)
if 'date' in df_clean.columns:
    df_clean['date'] = pd.to_datetime(df_clean['date'])
    df_clean['day_of_week'] = df_clean['date'].dt.dayofweek
    df_clean['month'] = df_clean['date'].dt.month
    df_clean['is_weekend'] = df_clean['day_of_week'].isin([5, 6]).astype(int)

# Lags and rolling windows are positional: order rows by date per location
if 'location' in df_clean.columns and 'date' in df_clean.columns:
    df_clean = df_clean.sort_values(['location', 'date'])

# Create lag and rolling features
if 'new_cases' in df_clean.columns and 'location' in df_clean.columns:
    df_clean['new_cases_lag1'] = df_clean.groupby('location')['new_cases'].shift(1)
if 'icu_patients' in df_clean.columns and 'location' in df_clean.columns:
    # shift(1) first: the window then covers only PREVIOUS rows. Without it the
    # feature would contain the current row's icu_patients, i.e. the target.
    df_clean['icu_patients_roll7'] = (
        df_clean.groupby('location')['icu_patients']
        .transform(lambda s: s.shift(1).rolling(window=7, min_periods=1).mean())
    )

# One-hot encode continent (if not already done)
if 'continent' in df_clean.columns:
    df_clean = pd.get_dummies(df_clean, columns=['continent'], prefix='continent')

# Define features (adjusted to available columns).
# 'continent_Oceania' is the dummy produced for the dataset's 'Oceania'
# continent label ('continent_Australia' is never created).
available_features = [col for col in [
    'new_cases', 'total_cases', 'new_deaths', 'total_deaths', 'positive_rate', 'tests_per_case',
    'new_tests', 'hosp_patients', 'weekly_hosp_admissions', 'total_vaccinations', 'new_vaccinations',
    'people_vaccinated', 'people_fully_vaccinated', 'total_boosters', 'new_people_vaccinated_smoothed',
    'stringency_index', 'population', 'population_density', 'median_age', 'aged_65_older',
    'aged_70_older', 'gdp_per_capita', 'hospital_beds_per_thousand', 'life_expectancy',
    'human_development_index', 'new_cases_lag1', 'icu_patients_roll7', 'day_of_week', 'month',
    'is_weekend', 'continent_Africa', 'continent_Asia', 'continent_Oceania', 'continent_Europe',
    'continent_North America', 'continent_South America'
] if col in df_clean.columns]

target = 'icu_patients'

# Check if target exists
if target not in df_clean.columns:
    raise ValueError(f"Target column '{target}' not found in the dataset.")

# Split data
# NOTE(review): a random split mixes past and future rows of the same
# location; a date-based split may be more appropriate for forecasting.
X = df_clean[available_features]
y = df_clean[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle any remaining missing values, using training statistics only so that
# nothing from the test set leaks into the imputation
feature_fill = X_train.median()
X_train = X_train.fillna(feature_fill)
X_test = X_test.fillna(feature_fill)
target_fill = y_train.median()
y_train = y_train.fillna(target_fill)  # Impute target if needed
y_test = y_test.fillna(target_fill)

# Fit pipelines and evaluate
rmse_scores = {}
for name, pipeline in pipeline_dict.items():
    # Fit the pipeline
    pipeline.fit(X_train, y_train)

    # Predict on test set
    y_pred = pipeline.predict(X_test)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores[name] = rmse
    print(f'{name} RMSE: {rmse:.4f}')

# Find the best model (lowest RMSE)
best_model_name = min(rmse_scores, key=rmse_scores.get)
best_rmse = rmse_scores[best_model_name]
print(f'Best model: {best_model_name} with RMSE: {best_rmse:.4f}')

# Use the best pipeline for predictions
best_pipeline = pipeline_dict[best_model_name]
y_pred_best = best_pipeline.predict(X_test)

# Save predictions
predictions = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_best})
predictions.to_csv('icu_patients_predictions.csv', index=False)

# Feature importance (for tree-based models)
if best_model_name in ['RandomForest', 'GradientBoosting', 'XGBoost']:
    model = best_pipeline.named_steps['model']
    importances = pd.DataFrame({'Feature': available_features, 'Importance': model.feature_importances_})
    importances = importances.sort_values(by='Importance', ascending=False)
    print('Feature Importances:\n', importances)

# Save the best model
joblib.dump(best_pipeline, 'best_icu_patients_model.pkl')

In [None]:
import pandas as pd

# Try loading the cleaned dataset and list its columns.
# NOTE: on FileNotFoundError only a message is printed; df_clean is not
# (re)assigned in that case, so later cells will see whatever value it had
# before this cell ran (or no value at all on a fresh kernel).
try:
    df_clean = pd.read_csv('/content/cleaned_dataset.csv')  # Adjust path if needed
    print("Available columns in df_clean:\n", df_clean.columns.tolist())
except FileNotFoundError:
    print("File not found. Please check the file path or name.")

In [None]:
# Candidate predictor names; the list is filtered below so that only columns
# that actually exist in df_clean are used as features.
candidate_features = [
    'new_cases', 'total_cases', 'new_deaths', 'total_deaths', 'positive_rate', 'tests_per_case',
    'new_tests', 'hosp_patients', 'weekly_hosp_admissions', 'total_vaccinations', 'new_vaccinations',
    'people_vaccinated', 'people_fully_vaccinated', 'total_boosters', 'new_people_vaccinated_smoothed',
    'stringency_index', 'population', 'population_density', 'median_age', 'aged_65_older',
    'aged_70_older', 'gdp_per_capita', 'hospital_beds_per_thousand', 'life_expectancy',
    'human_development_index', 'new_cases_lag1', 'icu_patients_roll7', 'day_of_week', 'month',
    'is_weekend', 'continent_Africa', 'continent_Asia', 'continent_Australia', 'continent_Europe',
    'continent_North America', 'continent_South America'
]
available_features = [name for name in candidate_features if name in df_clean.columns]

# Column the models are trained to predict
target = 'icu_patients'

# Model building itself happens in the "Model Building" cell further down

In [None]:
# Load the cleaned dataset from the Colab working directory and list its columns
df_clean = pd.read_csv('/content/cleaned_dataset.csv')
print("Available columns in df_clean:\n", df_clean.columns.tolist())

In [None]:
# Print the DataFrame summary produced by pandas' DataFrame.info()
df_clean.info()

# Model Building



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import joblib

# Load cleaned dataset
#df_clean = pd.read_csv('/content/cleaned_medical_resource_dataset.csv')
df_clean = pd.read_csv('/content/cleaned_dataset.csv')

# Define features and target.
# Only candidate names that exist as columns in df_clean are kept, so a
# misspelled name is dropped silently -- keep the spelling in sync with the
# dataset ('new_vaccinations' was previously misspelled and therefore never
# used as a feature).
available_features = [col for col in [
    'new_cases', 'total_cases', 'new_deaths', 'total_deaths', 'positive_rate', 'tests_per_case',
    'new_tests', 'hosp_patients', 'weekly_hosp_admissions', 'total_vaccinations', 'new_vaccinations',
    'people_vaccinated', 'people_fully_vaccinated', 'total_boosters', 'new_people_vaccinated_smoothed',
    'stringency_index', 'population', 'population_density', 'median_age', 'aged_65_older',
    'aged_70_older', 'gdp_per_capita', 'hospital_beds_per_thousand', 'life_expectancy',
    'human_development_index', 'new_cases_lag1', 'icu_patients_roll7', 'day_of_week', 'month',
    'is_weekend', 'continent_Africa', 'continent_Asia', 'continent_Australia', 'continent_Europe',
    'continent_North America', 'continent_South America'
] if col in df_clean.columns]

# Column the models are trained to predict
target = 'icu_patients'

# Check if target exists; fail early with a clear message otherwise
if target not in df_clean.columns:
    raise ValueError(f"Target column '{target}' not found in the dataset.")

# Split data: 20% of the rows are held out for testing, with a fixed random_state.
# NOTE(review): the rows are presumably dated observations (features such as
# 'new_cases_lag1' and 'day_of_week' suggest so); train_test_split is called
# without shuffle=False here -- confirm a non-chronological split is intended.
X = df_clean[available_features]
y = df_clean[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values.
# X_train is filled with its own medians first; X_test is then filled with the
# medians of the (already filled) X_train, so no test statistics are used.
X_train = X_train.fillna(X_train.median())
X_test = X_test.fillna(X_train.median())
# NOTE(review): missing target values are imputed too, including in y_test, so
# the RMSE below may partly be measured against imputed rather than observed
# values -- consider dropping rows with a missing target instead.
y_train = y_train.fillna(y_train.median())
y_test = y_test.fillna(y_train.median())

# Define pipelines: every candidate model is preceded by a StandardScaler step
pipelines = [
    Pipeline([('scaler', StandardScaler()), ('model', LinearRegression())]),
    Pipeline([('scaler', StandardScaler()), ('model', RandomForestRegressor(random_state=42, n_jobs=-1))]),
    Pipeline([('scaler', StandardScaler()), ('model', GradientBoostingRegressor(random_state=42))]),
    Pipeline([('scaler', StandardScaler()), ('model', XGBRegressor(random_state=42, n_jobs=-1))]),
    Pipeline([('scaler', StandardScaler()), ('model', SVR())])
]
# Human-readable name -> pipeline; the names are used in the printed report
# and in the tree-model check further down
pipeline_dict = {
    'LinearRegression': pipelines[0],
    'RandomForest': pipelines[1],
    'GradientBoosting': pipelines[2],
    'XGBoost': pipelines[3],
    'SVR': pipelines[4]
}

# Fit pipelines and evaluate: one RMSE on the held-out test set per model
rmse_scores = {}
for name, pipeline in pipeline_dict.items():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores[name] = rmse
    print(f'{name} RMSE: {rmse:.4f}')

# Find the best model (lowest test RMSE)
best_model_name = min(rmse_scores, key=rmse_scores.get)
best_rmse = rmse_scores[best_model_name]
print(f'Best model: {best_model_name} with RMSE: {best_rmse:.4f}')

# Use the best pipeline for predictions (it was already fitted in the loop above)
best_pipeline = pipeline_dict[best_model_name]
y_pred_best = best_pipeline.predict(X_test)

# Save predictions next to the actual test values (the row index is not written)
predictions = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_best})
predictions.to_csv('icu_patients_predictions.csv', index=False)

# Feature importance (for tree-based models); the importances are paired with
# available_features, i.e. the column order of X
if best_model_name in ['RandomForest', 'GradientBoosting', 'XGBoost']:
    model = best_pipeline.named_steps['model']
    importances = pd.DataFrame({'Feature': available_features, 'Importance': model.feature_importances_})
    importances = importances.sort_values(by='Importance', ascending=False)
    print('Feature Importances:\n', importances)

# Save the best model (the whole pipeline, scaler included)
joblib.dump(best_pipeline, 'best_icu_patients_model.pkl')

In [None]:
# Mount Google Drive at /content/drive so the dataset paths used below resolve
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

# Load the cleaned dataset (CSV) from the mounted Drive folder
df = pd.read_csv("/content/drive/MyDrive/Medical_Resource_Prediction/cleaned_dataset.csv")

# Alternative for an Excel source (left disabled; the path is a placeholder)
#df = pd.read_excel("/content/drive/MyDrive/your_folder/data.xlsx")

# Random Forest Implementation

In [None]:
# Random Forest Implementation for Medical Resource Demand Prediction
# This code assumes pre-processed data is available in the format specified

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
from datetime import datetime, timedelta

# Seed NumPy's global random generator for reproducibility.
# The RandomForestRegressor instances below additionally pass random_state=42.
np.random.seed(42)

# -------------------------------------------------------------------------
# 1. Load the pre-processed data
# -------------------------------------------------------------------------
def load_preprocessed_data(file_path='/content/drive/MyDrive/Medical_Resource_Prediction/cleaned_dataset.csv'):
    """
    Load the pre-processed dataset from a CSV file

    Parameters:
    - file_path: Path of the CSV file to read

    Returns:
    - DataFrame with the file contents; when a 'date' column exists it is
      converted to datetime and the rows are sorted by it.
      None if reading or converting fails (the error is printed).
    """
    try:
        # Read from the path the caller asked for (not a hard-coded location)
        df = pd.read_csv(file_path)

        print(f"Dataset loaded with shape: {df.shape}")

        # Convert date column to datetime if it exists
        if 'date' in df.columns:
            df['date'] = pd.to_datetime(df['date'])
            df = df.sort_values('date')

        return df
    except Exception as e:
        # Best-effort loader: callers check for None instead of catching
        print(f"Error loading data: {e}")
        return None

# -------------------------------------------------------------------------
# 2. Prepare data for time series modeling
# -------------------------------------------------------------------------
def prepare_time_series_data(df, target_cols, feature_cols=None, test_size=0.2):
    """
    Prepare data for time series modeling by creating appropriate train/test splits

    The rows are split by position: the first (1 - test_size) share becomes the
    training set and the remainder the test set. The frame is NOT re-sorted
    here, so it must already be in chronological order.

    Parameters:
    - df: DataFrame containing the pre-processed data (must have a 'date' column)
    - target_cols: List of target columns (resource demands to predict)
    - feature_cols: List of feature columns to use (if None, all non-target columns except date)
    - test_size: Proportion of data to use for testing

    Returns:
    - X_train, X_test: feature DataFrames
    - y_train, y_test: dicts mapping each target name to its Series
    - dates_train, dates_test: the 'date' Series of each split
    - feature_cols: the list of feature columns actually used

    Raises:
    - ValueError: if df has no 'date' column
    """
    if 'date' not in df.columns:
        raise ValueError("DataFrame must contain a 'date' column for time series splitting")

    # If feature columns not specified, use all columns except targets and date
    if feature_cols is None:
        feature_cols = [col for col in df.columns if col not in target_cols and col != 'date']

    # Get the cutoff point for time-based split
    split_idx = int(len(df) * (1 - test_size))

    # Create train/test splits preserving row order
    train_df = df.iloc[:split_idx]
    test_df = df.iloc[split_idx:]

    # Extract features and targets
    X_train = train_df[feature_cols]
    X_test = test_df[feature_cols]

    # For multiple targets, create a dictionary of y values
    y_train = {target: train_df[target] for target in target_cols}
    y_test = {target: test_df[target] for target in target_cols}

    # Save dates for plotting ('date' is guaranteed to exist by the check above)
    dates_train = train_df['date']
    dates_test = test_df['date']

    print(f"Training data shape: {X_train.shape}")
    print(f"Testing data shape: {X_test.shape}")

    return X_train, X_test, y_train, y_test, dates_train, dates_test, feature_cols

# -------------------------------------------------------------------------
# 3. Implement Random Forest model
# -------------------------------------------------------------------------
def build_random_forest_model(X_train, y_train, target_name, optimize=True):
    """
    Build and train a Random Forest model with optional hyperparameter optimization

    Parameters:
    - X_train: Training features
    - y_train: Training target values (single target)
    - target_name: Name of the target variable (for logging)
    - optimize: Whether to perform hyperparameter optimization (grid search
      with TimeSeriesSplit cross-validation); if False a fixed configuration
      is fitted directly

    Returns:
    - Trained model (the grid search's best estimator when optimize is True)
    """
    print(f"\nBuilding Random Forest model for {target_name}...")

    if optimize:
        # Define parameter grid for hyperparameter tuning.
        # max_features=1.0 (use all features) replaces the former 'auto'
        # option, which newer scikit-learn releases no longer accept.
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': [1.0, 'sqrt']
        }

        # Use TimeSeriesSplit for cross-validation to respect temporal order
        tscv = TimeSeriesSplit(n_splits=5)

        # Initialize Random Forest model
        rf = RandomForestRegressor(random_state=42)

        # Perform grid search with time series cross-validation
        grid_search = GridSearchCV(
            estimator=rf,
            param_grid=param_grid,
            cv=tscv,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=1
        )

        # Fit the grid search to find optimal parameters
        grid_search.fit(X_train, y_train)

        # Get the best model
        best_model = grid_search.best_estimator_

        print(f"Best parameters for {target_name}: {grid_search.best_params_}")
        # The scorer is negated MSE, so flip the sign for reporting
        print(f"Best CV score: {-grid_search.best_score_:.4f} MSE")

        return best_model
    else:
        # Use fixed parameters (max_features=1.0 considers every feature at each split)
        rf = RandomForestRegressor(
            n_estimators=200,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            max_features=1.0,
            random_state=42
        )

        # Train the model
        rf.fit(X_train, y_train)
        return rf

# -------------------------------------------------------------------------
# 4. Evaluate model performance
# -------------------------------------------------------------------------
def evaluate_model(model, X_test, y_test, target_name):
    """
    Score a fitted model on the test set and print the results

    Parameters:
    - model: Trained model exposing predict()
    - X_test: Test features
    - y_test: Test target values
    - target_name: Name of the target variable (used in the printout and the result)

    Returns:
    - (metrics, y_pred): metrics is a dict with keys 'target', 'mae', 'rmse',
      'r2' and 'mape'; y_pred holds the model's predictions for X_test
    """
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    # Percentage error with the denominator floored at 1, which avoids
    # division by zero for targets equal to 0
    denominator = np.maximum(np.abs(y_test), 1)
    mape = np.mean(np.abs((y_test - y_pred) / denominator)) * 100

    report_lines = (
        f"\nPerformance metrics for {target_name}:",
        f"MAE: {mae:.4f}",
        f"RMSE: {rmse:.4f}",
        f"R²: {r2:.4f}",
        f"MAPE: {mape:.2f}%",
    )
    for line in report_lines:
        print(line)

    metrics = dict(target=target_name, mae=mae, rmse=rmse, r2=r2, mape=mape)
    return metrics, y_pred

# -------------------------------------------------------------------------
# 5. Feature importance analysis
# -------------------------------------------------------------------------
def analyze_feature_importance(model, feature_names, target_name):
    """
    Rank the model's feature importances, plot the top 20 and print the top 10

    Parameters:
    - model: Trained model exposing feature_importances_
    - feature_names: List of feature names, in the order the model was trained on
    - target_name: Name of the target variable (used in the title and file name)

    Returns:
    - DataFrame with columns 'Feature' and 'Importance', most important first

    The bar chart is saved as feature_importance_<target_name>.png and the
    figure is closed afterwards.
    """
    importances = model.feature_importances_

    # Positions of the features from most to least important
    ranking = np.argsort(importances)[::-1]

    importance_df = pd.DataFrame({
        'Feature': [feature_names[pos] for pos in ranking],
        'Importance': [importances[pos] for pos in ranking]
    })

    # Bar chart of the 20 most important features
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df.head(20))
    plt.title(f'Top 20 Feature Importances for {target_name}')
    plt.tight_layout()
    plt.savefig(f'feature_importance_{target_name}.png')
    plt.close()

    print(f"\nTop 10 important features for {target_name}:")
    top_ten = importance_df.head(10)
    for rank, (feature, score) in enumerate(zip(top_ten['Feature'], top_ten['Importance']), start=1):
        print(f"{rank}. {feature} ({score:.4f})")

    return importance_df

# -------------------------------------------------------------------------
# 6. Visualize predictions vs actual values
# -------------------------------------------------------------------------
def visualize_predictions(y_test, y_pred, dates_test, target_name):
    """
    Save two figures: actual vs predicted values, and the residuals

    Parameters:
    - y_test: Actual values
    - y_pred: Predicted values
    - dates_test: Dates corresponding to test data, or None to plot against
      the index / row position instead
    - target_name: Name of the target variable (used in labels and file names)

    Output files: predictions_vs_actual_<target_name>.png and
    residuals_<target_name>.png. Both figures are closed after saving.
    """
    has_dates = dates_test is not None
    x_label = 'Date' if has_dates else 'Index'

    # --- Figure 1: actual vs predicted -----------------------------------
    plt.figure(figsize=(15, 8))
    line_x = dates_test if has_dates else y_test.index
    plt.plot(line_x, y_test, label='Actual', marker='o', linestyle='-', alpha=0.7)
    plt.plot(line_x, y_pred, label='Predicted', marker='x', linestyle='--', alpha=0.7)
    plt.xlabel(x_label)
    plt.ylabel(target_name)
    plt.title(f'Random Forest Predictions vs Actual Values: {target_name}')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'predictions_vs_actual_{target_name}.png')
    plt.close()

    # --- Figure 2: residuals ----------------------------------------------
    residuals = y_test - y_pred
    plt.figure(figsize=(15, 8))
    # Without dates the residuals are plotted against their row position
    scatter_x = dates_test if has_dates else range(len(residuals))
    plt.scatter(scatter_x, residuals, alpha=0.7)
    plt.axhline(y=0, color='r', linestyle='-')
    plt.xlabel(x_label)
    plt.ylabel('Residuals')
    plt.title(f'Residual Analysis: {target_name}')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'residuals_{target_name}.png')
    plt.close()

# -------------------------------------------------------------------------
# 7. Save the trained model
# -------------------------------------------------------------------------
def save_model(model, target_name):
    """
    Persist a trained model with joblib

    Parameters:
    - model: Trained model
    - target_name: Name of the target variable; lower-cased, with spaces
      replaced by underscores, it becomes part of the file name
    """
    slug = target_name.replace(' ', '_').lower()
    model_filename = f"random_forest_model_{slug}.pkl"
    joblib.dump(model, model_filename)
    print(f"\nModel saved as {model_filename}")

# -------------------------------------------------------------------------
# 8. Create forecast for future periods
# -------------------------------------------------------------------------
def create_forecast(model, X_test, feature_cols, dates_test, days_to_forecast=30,
                    y_test=None, target_name='target'):
    """
    Create a forecast for future periods

    Starting from the last row of X_test, the model is called once per
    forecast day. After each prediction, features named 'lag_1_<x>' are set to
    the new prediction (and 'lag_2_<x>', if present, receives the previous
    lag-1 value). All other features stay at their last observed values, so
    this is a simplified roll-forward that must be adapted to the actual
    feature engineering.

    Parameters:
    - model: Trained model
    - X_test: Latest features
    - feature_cols: List of feature names
    - dates_test: Latest dates
    - days_to_forecast: Number of days to forecast
    - y_test: Optional actual values matching dates_test; when given they are
      drawn as the historical line in the plot
    - target_name: Name of the target variable (used in labels and the file name)

    Returns:
    - DataFrame with forecasted values (columns 'date' and 'forecasted_value')

    The plot is saved as forecast_<target_name>.png.
    """
    # Get the last date in the test set
    last_date = dates_test.iloc[-1]

    # Dates of the forecast horizon and the values predicted for them
    forecast_dates = [last_date + timedelta(days=i+1) for i in range(days_to_forecast)]
    forecasted_values = []

    # Get the last row of actual data to start forecasting from
    latest_data = X_test.iloc[-1:].copy()

    # Create forecast for each day
    for _ in range(days_to_forecast):
        # Predict for current data point
        pred = model.predict(latest_data)[0]
        forecasted_values.append(pred)

        # Update features for next prediction (this is simplified and would need
        # to be adapted based on your actual feature engineering process)
        latest_data = latest_data.copy()

        # Example: Update lag features if they exist
        for col in feature_cols:
            if 'lag_1_' in col:
                original_col = col.replace('lag_1_', '')
                lag_2_col = f'lag_2_{original_col}'

                # Shift the old lag-1 value into lag-2 before overwriting it
                if lag_2_col in feature_cols:
                    latest_data[lag_2_col] = latest_data[col]
                latest_data[col] = pred

            # Update other time-based features as needed
            # This is highly dependent on your feature engineering process

    # Create DataFrame with forecasted values
    forecast_df = pd.DataFrame({
        'date': forecast_dates,
        'forecasted_value': forecasted_values
    })

    # Plot forecast
    plt.figure(figsize=(15, 8))

    # Plot historical data (only when the caller supplied the actual values)
    if y_test is not None:
        plt.plot(dates_test, y_test, label='Historical', marker='o', linestyle='-', alpha=0.7)

    # Plot forecast
    plt.plot(forecast_df['date'], forecast_df['forecasted_value'],
             label='Forecast', marker='x', linestyle='--', color='red', alpha=0.7)

    plt.xlabel('Date')
    plt.ylabel(target_name)
    plt.title(f'Random Forest Forecast: {target_name}')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'forecast_{target_name}.png')
    plt.close()

    return forecast_df

# -------------------------------------------------------------------------
# 9. Main function to run the full pipeline
# -------------------------------------------------------------------------
def run_random_forest_pipeline(data_path, target_cols, optimize=True):
    """
    Run the complete Random Forest pipeline: load, split, then for every
    target train, evaluate, analyze importances, plot and save the model

    Parameters:
    - data_path: Path to pre-processed data
    - target_cols: List of target columns (resource demands to predict)
    - optimize: Whether to perform hyperparameter optimization

    Returns:
    - Dictionary keyed by target name; each value holds 'model', 'metrics',
      'importance' and 'predictions'. None if the data could not be loaded.
    """
    data = load_preprocessed_data(data_path)
    if data is None:
        return None

    # One chronological split shared by all targets
    (X_train, X_test, y_train_by_target, y_test_by_target,
     dates_train, dates_test, feature_cols) = prepare_time_series_data(
        data, target_cols=target_cols
    )

    results = {}
    for target in target_cols:
        actual = y_test_by_target[target]

        fitted_model = build_random_forest_model(
            X_train, y_train_by_target[target], target, optimize=optimize
        )
        metrics, y_pred = evaluate_model(fitted_model, X_test, actual, target)
        importance_df = analyze_feature_importance(fitted_model, feature_cols, target)
        visualize_predictions(actual, y_pred, dates_test, target)
        save_model(fitted_model, target)

        # Forecasting is optional and currently disabled:
        # forecast_df = create_forecast(fitted_model, X_test, feature_cols, dates_test)

        results[target] = {
            'model': fitted_model,
            'metrics': metrics,
            'importance': importance_df,
            'predictions': y_pred
        }

    return results

# -------------------------------------------------------------------------
# Example usage
# -------------------------------------------------------------------------
if __name__ == "__main__":
    # Define target variables (medical resources to predict)
    # NOTE(review): these names do not appear in the column lists used by the
    # earlier cells of this notebook (which target 'icu_patients') -- confirm
    # they exist in the file being loaded.
    target_cols = [
        'hospital_beds_required',
        'icu_beds_required',
        'ventilators_required',
        'ppe_daily_consumption',
        'staff_hours_required'
    ]

    # Run the pipeline
    # NOTE(review): this relative path differs from the Drive path used by the
    # other cells -- verify that the file exists in the working directory.
    # run_random_forest_pipeline returns None when the data cannot be loaded,
    # in which case the summary below is skipped.
    results = run_random_forest_pipeline(
        data_path='preprocessed_medical_data.csv',
        target_cols=target_cols,
        optimize=True  # Set to False for faster execution with default parameters
    )

    # Summary of results
    if results:
        print("\n=== SUMMARY OF RANDOM FOREST MODEL PERFORMANCE ===")
        summary_metrics = []

        # Collect the per-target metric dicts (one row per target)
        for target, result in results.items():
            metrics = result['metrics']
            summary_metrics.append(metrics)

        # Create summary DataFrame
        summary_df = pd.DataFrame(summary_metrics)
        print(summary_df)

        # Save summary to CSV (without the row index)
        summary_df.to_csv('random_forest_performance_summary.csv', index=False)
        print("Summary saved to random_forest_performance_summary.csv")

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib
import warnings
# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

# Set display options: no limit on the number of columns / rows pandas prints
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Set random seed for reproducibility
np.random.seed(42)

# Load the pre-processed data
df_clean = pd.read_csv("/content/drive/MyDrive/Medical_Resource_Prediction/cleaned_dataset.csv")

# Display basic information about the dataset
print("Dataset shape:", df_clean.shape)
print("\nDataset columns:", df_clean.columns.tolist())
print("\nData types:")
print(df_clean.dtypes)
print("\nSummary statistics:")
print(df_clean.describe())

# Check for missing values
print("\nMissing values per column:")
print(df_clean.isnull().sum())

# Convert date column to datetime if it exists, and put the rows in
# chronological order (the positional train/test split below relies on row order)
if 'date' in df_clean.columns:
    df_clean['date'] = pd.to_datetime(df_clean['date'])
    df_clean = df_clean.sort_values('date')
    print("\nDate range:", df_clean['date'].min(), "to", df_clean['date'].max())

# Define target variables (adjust based on your specific dataset)
target_cols = [
    'hospital_beds_required',
    'icu_beds_required',
    'ventilators_required',
    'ppe_daily_consumption',
    'medical_staff_required'
]

# Keep only the target columns that exist in the dataset.
# If none exist, stop here with a clear message: continuing would only fail
# later with a KeyError when the targets are selected from the DataFrame.
available_targets = [col for col in target_cols if col in df_clean.columns]
if not available_targets:
    raise ValueError(
        "None of the specified target columns found in the dataset! "
        "Edit target_cols to name columns that exist. "
        f"Available columns: {df_clean.columns.tolist()}"
    )

print("\nTarget variables found:", available_targets)
target_cols = available_targets

# Define feature columns (all columns except targets and non-feature columns)
exclude_cols = (target_cols + ['date']) if 'date' in df_clean.columns else target_cols
feature_cols = [col for col in df_clean.columns if col not in exclude_cols]

print("\nFeature columns:", len(feature_cols))
print("Target columns:", len(target_cols))

# Function to prepare data for time series modeling
def prepare_time_series_data(df, target_cols, feature_cols, test_size=0.2):
    """
    Split a DataFrame into train and test parts by row position

    When df has a 'date' column the rows are first sorted by it; otherwise the
    existing row order is used. The first (1 - test_size) share of the rows is
    the training set, the rest the test set.

    Returns:
    - X_train, X_test: feature DataFrames
    - y_train, y_test: dicts mapping each target name to its Series
    - dates_train, dates_test: the 'date' Series of each split, or None when
      df has no 'date' column
    """
    has_date = 'date' in df.columns
    if has_date:
        # Sort by date to ensure chronological order
        df = df.sort_values('date')

    cutoff = int(len(df) * (1 - test_size))
    train_df, test_df = df.iloc[:cutoff], df.iloc[cutoff:]

    dates_train = train_df['date'] if has_date else None
    dates_test = test_df['date'] if has_date else None

    # Features
    X_train, X_test = train_df[feature_cols], test_df[feature_cols]

    # Targets: one Series per target name
    y_train, y_test = {}, {}
    for target in target_cols:
        y_train[target] = train_df[target]
    for target in target_cols:
        y_test[target] = test_df[target]

    print(f"Training data shape: {X_train.shape}")
    print(f"Testing data shape: {X_test.shape}")

    return X_train, X_test, y_train, y_test, dates_train, dates_test

# Prepare the data: one split shared by all targets.
# y_train / y_test are dicts keyed by target name.
X_train, X_test, y_train, y_test, dates_train, dates_test = prepare_time_series_data(
    df_clean, target_cols, feature_cols
)

# Function to build and train a Random Forest model
def build_random_forest_model(X_train, y_train, target_name, n_estimators=200, max_depth=None):
    """
    Build and train a Random Forest model

    Parameters:
    - X_train: Training features
    - y_train: Training target values (single target)
    - target_name: Name of the target variable (for logging)
    - n_estimators: Number of trees in the forest
    - max_depth: Maximum tree depth (None leaves the depth unrestricted)

    Returns:
    - The fitted RandomForestRegressor
    """
    print(f"\nBuilding Random Forest model for {target_name}...")

    # Initialize Random Forest model with specified parameters.
    # max_features=1.0 (use all features) replaces the former 'auto' option,
    # which newer scikit-learn releases no longer accept.
    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features=1.0,
        random_state=42,
        n_jobs=-1  # Use all available cores
    )

    # Train the model
    rf.fit(X_train, y_train)
    print(f"Model for {target_name} trained successfully.")

    return rf

# Function to evaluate model performance
def evaluate_model(model, X_test, y_test, target_name):
    """
    Score a fitted model on the test set and print the results

    Returns:
    - (metrics, y_pred): metrics is a dict with keys 'target', 'mae', 'rmse',
      'r2' and 'mape'; y_pred holds the model's predictions for X_test
    """
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # MAPE: a tiny epsilon in the denominator prevents division by zero.
    # Targets equal to 0 therefore contribute extremely large terms.
    epsilon = 1e-10
    relative_error = (y_test - y_pred) / (np.abs(y_test) + epsilon)
    mape = np.mean(np.abs(relative_error)) * 100

    report_lines = (
        f"\nPerformance metrics for {target_name}:",
        f"MAE: {mae:.4f}",
        f"RMSE: {rmse:.4f}",
        f"R²: {r2:.4f}",
        f"MAPE: {mape:.2f}%",
    )
    for line in report_lines:
        print(line)

    metrics = dict(target=target_name, mae=mae, rmse=rmse, r2=r2, mape=mape)
    return metrics, y_pred

# Function to analyze feature importance
def analyze_feature_importance(model, feature_cols, target_name):
    """
    Rank the model's feature importances, plot the top 20 and print the top 10

    Returns:
    - DataFrame with columns 'Feature' and 'Importance', most important first

    The bar chart is saved as feature_importance_<target_name>.png; the figure
    is left open.
    """
    importances = model.feature_importances_

    # Positions of the features from most to least important
    ranking = np.argsort(importances)[::-1]

    importance_df = pd.DataFrame({
        'Feature': [feature_cols[pos] for pos in ranking],
        'Importance': [importances[pos] for pos in ranking]
    })

    # Bar chart of the 20 most important features
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df.head(20))
    plt.title(f'Top 20 Feature Importances for {target_name}')
    plt.tight_layout()
    plt.savefig(f'feature_importance_{target_name}.png')

    print(f"\nTop 10 important features for {target_name}:")
    top_ten = importance_df.head(10)
    for rank, (feature, score) in enumerate(zip(top_ten['Feature'], top_ten['Importance']), start=1):
        print(f"{rank}. {feature} ({score:.4f})")

    return importance_df

# Function to visualize predictions vs actual values
def visualize_predictions(y_test, y_pred, dates_test, target_name):
    """
    Save two figures: actual vs predicted values, and the residuals

    dates_test may be None, in which case the first figure is plotted against
    y_test's index and the residuals against their row position.

    Output files: predictions_vs_actual_<target_name>.png and
    residuals_<target_name>.png. The figures are left open.
    """
    has_dates = dates_test is not None
    x_label = 'Date' if has_dates else 'Index'
    actual = y_test.values

    # --- Figure 1: actual vs predicted -----------------------------------
    plt.figure(figsize=(15, 8))
    line_x = dates_test if has_dates else y_test.index
    plt.plot(line_x, actual, label='Actual', marker='o', linestyle='-', alpha=0.7)
    plt.plot(line_x, y_pred, label='Predicted', marker='x', linestyle='--', alpha=0.7)
    plt.xlabel(x_label)
    plt.ylabel(target_name)
    plt.title(f'Random Forest Predictions vs Actual Values: {target_name}')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'predictions_vs_actual_{target_name}.png')

    # --- Figure 2: residuals ----------------------------------------------
    residuals = actual - y_pred
    plt.figure(figsize=(15, 8))
    scatter_x = dates_test if has_dates else range(len(residuals))
    plt.scatter(scatter_x, residuals, alpha=0.7)
    plt.axhline(y=0, color='r', linestyle='-')
    plt.xlabel(x_label)
    plt.ylabel('Residuals')
    plt.title(f'Residual Analysis: {target_name}')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'residuals_{target_name}.png')

# Dictionary to store results, keyed by target name
results = {}

# Train and evaluate models for each target variable.
# All targets share the same X_train / X_test split prepared above.
for target in target_cols:
    # Build and train model
    model = build_random_forest_model(
        X_train,
        y_train[target],
        target,
        n_estimators=200,  # You can adjust these parameters
        max_depth=None
    )

    # Evaluate model
    metrics, y_pred = evaluate_model(model, X_test, y_test[target], target)

    # Analyze feature importance
    importance_df = analyze_feature_importance(model, feature_cols, target)

    # Visualize predictions
    visualize_predictions(y_test[target], y_pred, dates_test, target)

    # Save model (file name: lower-cased target name with spaces replaced by underscores)
    model_filename = f"random_forest_model_{target.replace(' ', '_').lower()}.pkl"
    joblib.dump(model, model_filename)
    print(f"\nModel saved as {model_filename}")

    # Store results
    results[target] = {
        'model': model,
        'metrics': metrics,
        'importance': importance_df,
        'predictions': y_pred
    }

# Create summary of results
print("\n=== SUMMARY OF RANDOM FOREST MODEL PERFORMANCE ===")
summary_metrics = []

# Collect the per-target metric dicts (one row per target)
for target, result in results.items():
    metrics = result['metrics']
    summary_metrics.append(metrics)

# Create summary DataFrame
summary_df = pd.DataFrame(summary_metrics)
print(summary_df)

# Save summary to CSV (without the row index)
summary_df.to_csv('random_forest_performance_summary.csv', index=False)
print("Summary saved to random_forest_performance_summary.csv")

In [None]:
# Random Forest Implementation for Medical Resource Demand Prediction

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
from datetime import datetime, timedelta

# Seed NumPy's global random generator for reproducibility.
# The RandomForestRegressor created below additionally passes random_state=42.
np.random.seed(42)

# 1. Load the pre-processed data
def load_preprocessed_data(file_path='/content/drive/MyDrive/Medical_Resource_Prediction/cleaned_dataset.csv'):
    """Read the pre-processed CSV and return it ordered by 'date'.

    Args:
        file_path: Location of the CSV file.

    Returns:
        The loaded DataFrame. If it has a 'date' column, that column is parsed
        to datetime and the rows are sorted by it. Returns None when reading or
        parsing raises; the error is printed rather than propagated.
    """
    try:
        df = pd.read_csv(file_path)
        print(f"Dataset loaded with shape: {df.shape}")
        if 'date' not in df.columns:
            return df
        # Parse before sorting so the order is chronological, not lexicographic
        df['date'] = pd.to_datetime(df['date'])
        return df.sort_values('date')
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# 2. Prepare data for time series modeling
def prepare_time_series_data(df, target_cols, feature_cols=None, test_size=0.2):
    """Split a frame positionally into leading train rows and trailing test rows.

    The last ``int(test_size * len(df))`` rows become the test set and all rows
    before them the training set. Rows are not re-sorted here, so the frame
    must already be in the desired (chronological) order.

    Args:
        df: DataFrame with a 'date' column plus the feature and target columns.
        target_cols: List of target column names.
        feature_cols: List of feature column names. Defaults to every column
            that is neither a target nor 'date'.
        test_size: Fraction of rows held out for testing.

    Returns:
        ``(X_train, X_test, y_train, y_test, dates_train, dates_test)`` as
        NumPy arrays.

    Raises:
        ValueError: If ``df`` has no 'date' column.
    """
    if 'date' not in df.columns:
        raise ValueError("DataFrame must contain a 'date' column for time series splitting")
    # If feature columns not specified, use all except target and date
    if feature_cols is None:
        excluded = target_cols + ['date']
        feature_cols = [col for col in df.columns if col not in excluded]
    n_samples = df.shape[0]
    # Keep the held-out row count within [0, n_samples]
    n_test = min(max(int(test_size * n_samples), 0), n_samples)
    # Slice at an explicit boundary: df[:-n_test] / df[-n_test:] inverts the
    # split when n_test is 0 (since -0 == 0, train is empty and test is everything)
    split_at = n_samples - n_test
    df_train = df.iloc[:split_at]
    df_test = df.iloc[split_at:]
    X_train = df_train[feature_cols].values
    X_test = df_test[feature_cols].values
    y_train = df_train[target_cols].values
    y_test = df_test[target_cols].values
    dates_train = df_train['date'].values
    dates_test = df_test['date'].values
    return X_train, X_test, y_train, y_test, dates_train, dates_test

# 3. Train and evaluate the Random Forest model
def train_and_evaluate_rf_model(X_train, y_train, X_test, y_test, target_cols, param_grid=None):
    """Fit a Random Forest regressor, print test-set metrics and return the model.

    Args:
        X_train, y_train: Training features and targets.
        X_test, y_test: Held-out features and targets used for the printed metrics.
        target_cols: Target column names (accepted but not used by this function).
        param_grid: Optional hyperparameter grid. When given, the model is chosen
            by GridSearchCV over a 5-fold TimeSeriesSplit, scored by negative MSE.
            Otherwise a forest with default hyperparameters is fitted.

    Returns:
        The fitted RandomForestRegressor (the best estimator when a grid was searched).
    """
    # Create a Random Forest Regressor
    rf_model = RandomForestRegressor(random_state=42)
    # Use TimeSeriesSplit for cross-validation
    tscv = TimeSeriesSplit(n_splits=5)
    # Perform GridSearchCV if param_grid is provided
    if param_grid:
        grid_search = GridSearchCV(rf_model, param_grid, cv=tscv, scoring='neg_mean_squared_error')
        grid_search.fit(X_train, y_train)
        rf_model = grid_search.best_estimator_
        print(f"Best hyperparameters: {grid_search.best_params_}")
    else:
        rf_model.fit(X_train, y_train)
    # Make predictions on the test set
    y_pred = rf_model.predict(X_test)
    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    # The source was cut off in the middle of this statement; completed here
    print(f"R-squared (R2): {r2:.4f}")
    return rf_model

In [None]:
# Random Forest Implementation for Medical Resource Demand Prediction

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
from datetime import datetime, timedelta

# Set random seed for reproducibility
np.random.seed(42)

# 1. Load the pre-processed data
def load_preprocessed_data(file_path='/content/drive/MyDrive/Medical_Resource_Prediction/cleaned_dataset.csv'):
    """Load the pre-processed CSV, chronologically ordered when it has a 'date' column.

    Args:
        file_path: Location of the CSV file.

    Returns:
        The DataFrame, or None if loading or date parsing raised (the error is
        printed instead of being re-raised).
    """
    try:
        df = pd.read_csv(file_path)
        print(f"Dataset loaded with shape: {df.shape}")
        has_date = 'date' in df.columns
        if has_date:
            # Convert to datetime first so the sort is by time, not by text
            df['date'] = pd.to_datetime(df['date'])
        return df.sort_values('date') if has_date else df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# 2. Prepare data for time series modeling
def prepare_time_series_data(df, target_cols, feature_cols=None, test_size=0.2):
    """Split a frame positionally into leading train rows and trailing test rows.

    The last ``int(test_size * len(df))`` rows become the test set and all rows
    before them the training set. Rows are not re-sorted here, so the frame
    must already be in the desired (chronological) order.

    Args:
        df: DataFrame with a 'date' column plus the feature and target columns.
        target_cols: List of target column names.
        feature_cols: List of feature column names. Defaults to every column
            that is neither a target nor 'date'.
        test_size: Fraction of rows held out for testing.

    Returns:
        ``(X_train, X_test, y_train, y_test, dates_train, dates_test)`` as
        NumPy arrays.

    Raises:
        ValueError: If ``df`` has no 'date' column.
    """
    if 'date' not in df.columns:
        raise ValueError("DataFrame must contain a 'date' column for time series splitting")
    # If feature columns not specified, use all except target and date
    if feature_cols is None:
        excluded = target_cols + ['date']
        feature_cols = [col for col in df.columns if col not in excluded]
    n_samples = df.shape[0]
    # Keep the held-out row count within [0, n_samples]
    n_test = min(max(int(test_size * n_samples), 0), n_samples)
    # Slice at an explicit boundary: df[:-n_test] / df[-n_test:] inverts the
    # split when n_test is 0 (since -0 == 0, train is empty and test is everything)
    split_at = n_samples - n_test
    df_train = df.iloc[:split_at]
    df_test = df.iloc[split_at:]
    X_train = df_train[feature_cols].values
    X_test = df_test[feature_cols].values
    y_train = df_train[target_cols].values
    y_test = df_test[target_cols].values
    dates_train = df_train['date'].values
    dates_test = df_test['date'].values
    return X_train, X_test, y_train, y_test, dates_train, dates_test

# 3. Train and evaluate the Random Forest model
def train_and_evaluate_rf_model(X_train, y_train, X_test, y_test, target_cols, param_grid=None):
    """Fit a Random Forest regressor, report test-set error and hand back the model.

    Args:
        X_train, y_train: Training features and targets.
        X_test, y_test: Held-out features and targets for the printed metrics.
        target_cols: Target column names (accepted but not used here).
        param_grid: Optional hyperparameter grid; when truthy the model is
            selected by GridSearchCV over a 5-fold TimeSeriesSplit using
            negative MSE, otherwise a default forest is fitted directly.

    Returns:
        The fitted RandomForestRegressor.
    """
    rf_model = RandomForestRegressor(random_state=42)
    if not param_grid:
        # No grid supplied: a single fit with default hyperparameters
        rf_model.fit(X_train, y_train)
    else:
        # Forward-chaining folds so validation rows always come after the training rows
        grid_search = GridSearchCV(
            rf_model,
            param_grid,
            cv=TimeSeriesSplit(n_splits=5),
            scoring='neg_mean_squared_error',
        )
        grid_search.fit(X_train, y_train)
        rf_model = grid_search.best_estimator_
        print(f"Best hyperparameters: {grid_search.best_params_}")

    # Score the chosen model on the held-out rows
    y_pred = rf_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"R-squared (R2): {r2:.4f}")
    return rf_model

# 4. Save the trained model
def save_trained_model(model, file_path='trained_rf_model.pkl'):
    """Serialize the trained model to ``file_path`` with joblib.

    Failures are printed rather than raised, so a failed save does not stop the run.

    Args:
        model: The fitted model object to persist.
        file_path: Destination file.
    """
    try:
        joblib.dump(model, file_path)
    except Exception as e:
        print(f"Error saving model: {e}")
    else:
        print(f"Model saved to: {file_path}")

# Example usage
if __name__ == "__main__":
    data = load_preprocessed_data()
    # The loader returns None when the file could not be read
    if data is not None:
        # Target column(s) to predict; replace with your desired target columns
        target_cols = ['icu_patients']

        # Hold out the trailing 20% of rows
        (
            X_train, X_test,
            y_train, y_test,
            dates_train, dates_test,
        ) = prepare_time_series_data(data, target_cols, test_size=0.2)

        # Fit, print the test metrics, then persist the model
        rf_model = train_and_evaluate_rf_model(
            X_train, y_train, X_test, y_test, target_cols
        )
        save_trained_model(rf_model)

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import joblib

# Set random seed for reproducibility
np.random.seed(42)

# 1. Enhanced Data Loading with Categorical Handling
def load_preprocessed_data(file_path='/content/drive/MyDrive/Medical_Resource_Prediction/cleaned_dataset.csv'):
    """Read the pre-processed CSV and order it by 'date' when that column exists.

    Args:
        file_path: Location of the CSV file.

    Returns:
        The DataFrame, or None if reading or date parsing raised (the error is
        printed instead of being re-raised).
    """
    try:
        df = pd.read_csv(file_path)
        print(f"Dataset loaded with shape: {df.shape}")

        if 'date' not in df.columns:
            return df

        # Parse to datetime, then sort chronologically
        df['date'] = pd.to_datetime(df['date'])
        return df.sort_values('date')
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# 2. Data Preparation with Feature Engineering
def prepare_time_series_data(df, target_cols, categorical_cols=None, test_size=0.2):
    """Label-encode the categorical columns and split rows positionally.

    The last ``int(test_size * len(df))`` rows become the test set. Rows are not
    re-sorted here, so the frame must already be in chronological order. The
    caller's DataFrame is left untouched; encoding happens on a copy.

    Args:
        df: DataFrame with a 'date' column plus feature and target columns.
        target_cols: List of target column names.
        categorical_cols: Optional list of columns to label-encode. Every other
            column that is not a target or 'date' is used as a feature as-is.
        test_size: Fraction of rows held out for testing.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays. Feature columns
        are ordered as the non-categorical features followed by the categorical ones.

    Raises:
        ValueError: If ``df`` has no 'date' column.
    """
    if 'date' not in df.columns:
        raise ValueError("DataFrame must contain a 'date' column")

    categorical_cols = categorical_cols or []

    # Everything that is not a target, the date, or a declared categorical column
    excluded = target_cols + ['date'] + categorical_cols
    numerical_cols = [col for col in df.columns if col not in excluded]

    if categorical_cols:
        # Encode on a copy so the caller's frame keeps its original labels
        # (encoding in place also made a second call re-encode integers)
        df = df.copy()
        for col in categorical_cols:
            df[col] = LabelEncoder().fit_transform(df[col])

    # Slice at an explicit boundary: df[:-n_test] / df[-n_test:] inverts the
    # split when n_test is 0 (since -0 == 0, train is empty and test is everything)
    n_rows = len(df)
    n_test = min(max(int(test_size * n_rows), 0), n_rows)
    split_at = n_rows - n_test
    df_train = df.iloc[:split_at]
    df_test = df.iloc[split_at:]

    # Separate features and target
    feature_cols = numerical_cols + categorical_cols
    X_train = df_train[feature_cols].values
    X_test = df_test[feature_cols].values
    y_train = df_train[target_cols].values
    y_test = df_test[target_cols].values

    return X_train, X_test, y_train, y_test

# 3. Model Training with Enhanced Configuration
def train_and_evaluate_rf_model(X_train, y_train, X_test, y_test):
    """Grid-search a Random Forest regressor and print its test-set metrics.

    The search uses a 3-fold TimeSeriesSplit scored by negative MSE. ``y_train``
    is flattened with ``ravel()`` before fitting.

    Args:
        X_train, y_train: Training features and targets.
        X_test, y_test: Held-out features and targets for the printed metrics.

    Returns:
        The best estimator found by the grid search.
    """
    search_space = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'max_features': ['sqrt', 'log2']
    }

    grid_search = GridSearchCV(
        RandomForestRegressor(random_state=42, n_jobs=-1),
        search_space,
        cv=TimeSeriesSplit(n_splits=3),
        scoring='neg_mean_squared_error',
        verbose=2,
    )
    grid_search.fit(X_train, y_train.ravel())

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Assemble the report in the order it is printed
    metrics = {}
    metrics['MAE'] = mean_absolute_error(y_test, y_pred)
    metrics['RMSE'] = np.sqrt(mean_squared_error(y_test, y_pred))
    metrics['R2'] = r2_score(y_test, y_pred)
    metrics['Best Params'] = grid_search.best_params_

    print("\nModel Evaluation:")
    for k, v in metrics.items():
        # Floats get four decimals; anything else (the params dict) is printed verbatim
        if isinstance(v, float):
            print(f"{k}: {v:.4f}")
        else:
            print(f"{k}: {v}")

    return best_model

# 4. Main Execution
if __name__ == "__main__":
    data = load_preprocessed_data()

    # The loader returns None when the file could not be read
    if data is not None:
        # Configure based on your data
        target_cols = ['icu_patients']
        # Add all categorical columns here
        categorical_cols = ['country']

        # Encode + positional train/test split (default test_size)
        (
            X_train, X_test,
            y_train, y_test,
        ) = prepare_time_series_data(data, target_cols, categorical_cols)

        # Grid-search, evaluate, then persist the best estimator
        model = train_and_evaluate_rf_model(X_train, y_train, X_test, y_test)
        joblib.dump(model, 'rf_medical_demand_model.pkl')
        print("Model saved successfully.")

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib

# Set random seed for reproducibility
np.random.seed(42)

# 1. Enhanced Data Loading with Automatic Categorical Detection
def load_preprocessed_data(file_path):
    """Read the CSV at ``file_path``, print its shape and columns, and order it by 'date'.

    Args:
        file_path: Location of the CSV file.

    Returns:
        The DataFrame (date-parsed and sorted when a 'date' column exists), or
        None if reading or parsing raised; the error is printed, not re-raised.
    """
    try:
        df = pd.read_csv(file_path)
        print(f"Dataset loaded with shape: {df.shape}")
        print("Columns in dataset:", df.columns.tolist())

        if 'date' not in df.columns:
            return df

        # Parse to datetime, then sort chronologically
        df['date'] = pd.to_datetime(df['date'])
        return df.sort_values('date')
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# 2. Smart Data Preparation
def prepare_time_series_data(df, target_cols, test_size=0.2):
    """Auto-detect feature columns, label-encode the text ones, and split positionally.

    Columns with dtype ``object`` (other than targets and 'date') are treated as
    categorical and label-encoded after casting to ``str``; numeric columns are
    used as-is. The last ``int(test_size * len(df))`` rows form the test set.
    Rows are not re-sorted here. The caller's DataFrame is not modified.

    Args:
        df: DataFrame with a 'date' column plus feature and target columns.
        target_cols: List of target column names.
        test_size: Fraction of rows held out for testing.

    Returns:
        ``(X_train, X_test, y_train, y_test, label_encoders)`` where the first
        four are NumPy arrays (feature order: numeric columns, then categorical
        columns) and ``label_encoders`` maps each categorical column name to its
        fitted LabelEncoder.

    Raises:
        ValueError: If ``df`` has no 'date' column.
    """
    if 'date' not in df.columns:
        raise ValueError("DataFrame must contain a 'date' column")

    excluded = target_cols + ['date']

    # Auto-detect categorical columns (object-dtype columns that aren't dates/targets)
    categorical_cols = [col for col in df.columns
                        if df[col].dtype == 'object' and col not in excluded]

    # Auto-detect numerical columns
    numerical_cols = [col for col in df.columns
                      if col not in excluded + categorical_cols
                      and pd.api.types.is_numeric_dtype(df[col])]

    print(f"Detected categorical columns: {categorical_cols}")
    print(f"Using numerical columns: {numerical_cols}")

    # Encode on a copy: writing the codes back into the caller's frame would
    # silently replace its text columns with integers
    label_encoders = {}
    if categorical_cols:
        df = df.copy()
    for col in categorical_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

    # Slice at an explicit boundary: df[:-n_test] / df[-n_test:] inverts the
    # split when n_test is 0 (since -0 == 0, train is empty and test is everything)
    n_rows = len(df)
    n_test = min(max(int(test_size * n_rows), 0), n_rows)
    split_at = n_rows - n_test
    df_train = df.iloc[:split_at]
    df_test = df.iloc[split_at:]

    # Prepare features and targets
    feature_cols = numerical_cols + categorical_cols
    X_train = df_train[feature_cols].values
    X_test = df_test[feature_cols].values
    y_train = df_train[target_cols].values
    y_test = df_test[target_cols].values

    return X_train, X_test, y_train, y_test, label_encoders

# 3. Model Training with Feature Importance
def train_and_evaluate_rf_model(X_train, y_train, X_test, y_test, feature_names=None):
    """Grid-search a Random Forest regressor, print metrics and feature importances.

    The search uses a 3-fold TimeSeriesSplit scored by negative MSE. ``y_train``
    is flattened with ``ravel()`` before fitting.

    Args:
        X_train, y_train: Training features and targets.
        X_test, y_test: Held-out features and targets for the printed metrics.
        feature_names: Optional column names for ``X_train``, in column order,
            used to label the importance table. When omitted, generic names
            ``feature_0``, ``feature_1``, ... are used.

    Returns:
        The best estimator found by the grid search.
    """
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5]
    }

    model = RandomForestRegressor(random_state=42, n_jobs=-1)
    tscv = TimeSeriesSplit(n_splits=3)

    grid_search = GridSearchCV(model, param_grid, cv=tscv,
                             scoring='neg_mean_squared_error',
                             verbose=1)
    grid_search.fit(X_train, y_train.ravel())

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Evaluation metrics
    metrics = {
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R2': r2_score(y_test, y_pred)
    }

    print("\nModel Evaluation:")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")

    # Feature importance. The column names live in prepare_time_series_data's
    # local scope, so they are not visible here; referring to them raised
    # NameError. Names now arrive through feature_names, with a generic fallback.
    importances = best_model.feature_importances_
    if feature_names is None:
        feature_names = [f"feature_{i}" for i in range(len(importances))]
    feature_importance = pd.DataFrame({
        'Feature': list(feature_names),
        'Importance': importances
    }).sort_values('Importance', ascending=False)

    print("\nFeature Importance:")
    print(feature_importance.head(10))

    return best_model

# 4. Main Execution
if __name__ == "__main__":
    # Update this path to your dataset
    DATA_PATH = '/content/drive/MyDrive/Medical_Resource_Prediction/cleaned_dataset.csv'

    data = load_preprocessed_data(DATA_PATH)

    # The loader returns None when the file could not be read
    if data is not None:
        # Change this to your target column name
        target_cols = ['icu_patients']

        # Encode + positional train/test split (default test_size)
        (
            X_train, X_test,
            y_train, y_test,
            label_encoders,
        ) = prepare_time_series_data(data, target_cols)

        model = train_and_evaluate_rf_model(X_train, y_train, X_test, y_test)

        # Bundle the encoders with the model so inference can repeat the encoding
        joblib.dump(
            dict(model=model, label_encoders=label_encoders),
            'medical_demand_rf_model.pkl',
        )
        print("Model and encoders saved successfully.")

In [None]:
# Load saved model
artifacts = joblib.load('xgboost_medical_demand_model.pkl')
model = artifacts['model']
label_encoders = artifacts['label_encoders']
# Column list the model expects. Previously `features` was used below without
# being defined. NOTE(review): assumes the artifact was saved with a
# 'features' entry - confirm against the cell that writes this file.
features = artifacts['features']

# Prepare new data (example)
new_data = pd.DataFrame({
    'date': ['2023-12-01'],
    'region': ['North'],
    'cases': [1250],
    # ... other features (a bare `...` inside the dict literal is a SyntaxError)
})

# Apply same preprocessing; skip encoders whose column is absent from new_data
for col, le in label_encoders.items():
    if col in new_data.columns:
        new_data[col] = le.transform(new_data[col].astype(str))

# Generate time features
new_data['date'] = pd.to_datetime(new_data['date'])
new_data['day_of_week'] = new_data['date'].dt.dayofweek
# ... add other features like in training

# Predict
prediction = model.predict(new_data[features])
print(f"Predicted ICU patients: {prediction[0]:.0f}")

In [None]:
# Load the saved bundle and pull out the model, the fitted encoders and the
# list of input columns
artifacts = joblib.load('xgboost_medical_demand_model.pkl')
model, label_encoders, features = (
    artifacts[key] for key in ('model', 'label_encoders', 'features')
)

# One-row example input
new_data = pd.DataFrame({
    'date': ['2023-12-01'],
    'region': ['North'],
    'cases': [1250],
    # ... other features
})

# Re-apply the stored label encoding to the columns present in new_data
for col, le in label_encoders.items():
    if col not in new_data.columns:
        continue
    new_data[col] = le.transform(new_data[col].astype(str))

# Derive calendar features from the parsed date
new_data['date'] = pd.to_datetime(new_data['date'])
new_data['day_of_week'] = new_data['date'].dt.dayofweek
# ... add other features like in training

# Predict using the stored feature columns
prediction = model.predict(new_data[features])
print(f"Predicted ICU patients: {prediction[0]:.0f}")

In [None]:
import pandas as pd

# 1. Convert 'date' column to DatetimeIndex and set as index.
#    Guarded so the cell can be re-run: after the first run 'date' is the
#    index rather than a column, and an unguarded df['date'] raises KeyError.
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'])
    df = df.set_index('date')

# 2. Resample to weekly frequency and sum cases and deaths
weekly_trends = df[['new_cases', 'new_deaths']].resample('W').sum()

# 3. Display or further process the 'weekly_trends' DataFrame
print(weekly_trends.head())  # Example: Print the first few rows

# Data Splitting

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load pre-processed dataset (assumed to be cleaned and encoded)
# (The path line previously had a stray leading space, an IndentationError,
# and read_csv was given an undefined name instead of DATA_PATH.)
DATA_PATH = '/content/drive/MyDrive/Medical_Resource_Prediction/cleaned_dataset.csv'
data = pd.read_csv(DATA_PATH)

# Define features (X) and target (y)
X = data.drop(columns=['target'])  # Replace 'target' with your target column (e.g., hospital_beds)
y = data['target']

# Split data into training (70%), validation (15%), and test (15%) sets:
# 30% is set aside first, then halved into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Scale numerical features (important for LSTM and other models).
# The scaler is fitted on the training split only and reused for the others.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Save scaled data for models requiring DataFrame format (e.g., Prophet)
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_val_scaled_df = pd.DataFrame(X_val_scaled, columns=X_val.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Print shapes to verify splits
print(f"Training set: {X_train.shape}, Validation set: {X_val.shape}, Test set: {X_test.shape}")

In [None]:
# Make the Drive folder that holds the dataset available under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Location of the cleaned dataset on the mounted Drive
DATA_PATH = '/content/drive/MyDrive/Medical_Resource_Prediction/cleaned_dataset.csv'

# Libraries used by the modelling cells
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from prophet import Prophet
import matplotlib.pyplot as plt

# Seed NumPy's global RNG for reproducibility
np.random.seed(42)

# Read the pre-processed dataset
data = pd.read_csv(DATA_PATH)

# Features are every column except the target and the date; adjust the
# column names below to match your dataset
y = data['hospital_beds_needed']
X = data.drop(columns=['hospital_beds_needed', 'date'])

# Quick check of what was loaded
print("Dataset shape:", data.shape)
print("Features:", list(X.columns))

In [None]:
from google.colab import drive
import pandas as pd

# Mount Drive so the dataset path below resolves
drive.mount('/content/drive')

# Read the cleaned dataset
DATA_PATH = '/content/drive/MyDrive/Medical_Resource_Prediction/cleaned_dataset.csv'
data = pd.read_csv(DATA_PATH)

# Show the column names and a preview, to pick the target/date column names
print("Columns in dataset:", list(data.columns))
print("\nFirst few rows:\n", data.head())

# LOAD THE CLEANED DATASET FOR SPLITTING

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from prophet import Prophet
import matplotlib.pyplot as plt

# Seed NumPy's global RNG for reproducibility
np.random.seed(42)

# Location of the cleaned dataset
DATA_PATH = '/content/drive/MyDrive/Medical_Resource_Prediction/cleaned_dataset.csv'

# Read the pre-processed dataset
data = pd.read_csv(DATA_PATH)

# Features (X) and target (y).
# REPLACE 'target' and 'date' with the actual column names from your dataset
# (e.g. 'hospital_beds' for the target).
y = data['target']
X = data.drop(columns=['target', 'date'])

# 70% train / 15% validation / 15% test: 30% is set aside first, then halved.
# shuffle=False keeps the rows in file order.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, shuffle=False
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, shuffle=False
)

# Fit the scaler on the training split only, then apply it to all three splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled, X_test_scaled = (scaler.transform(part) for part in (X_val, X_test))

# DataFrame versions of the scaled arrays (column names restored, fresh 0..n-1 index)
X_train_scaled_df, X_val_scaled_df, X_test_scaled_df = (
    pd.DataFrame(scaled, columns=raw.columns)
    for scaled, raw in (
        (X_train_scaled, X_train),
        (X_val_scaled, X_val),
        (X_test_scaled, X_test),
    )
)

# Print shapes to verify splits
print(f"Training set: {X_train.shape}, Validation set: {X_val.shape}, Test set: {X_test.shape}")

In [None]:
from google.colab import drive
import pandas as pd

# Mount Google Drive so the dataset path below resolves
drive.mount('/content/drive')

# Read the cleaned dataset
DATA_PATH = '/content/drive/MyDrive/Medical_Resource_Prediction/cleaned_dataset.csv'
data = pd.read_csv(DATA_PATH)

# Show the column names and a preview, to pick the target/date column names
print("Columns in dataset:", list(data.columns))
print("\nFirst few rows:\n", data.head())

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from prophet import Prophet
import matplotlib.pyplot as plt

# Seed NumPy's global RNG for reproducibility
np.random.seed(42)

# Location of the cleaned dataset
DATA_PATH = '/content/drive/MyDrive/Medical_Resource_Prediction/cleaned_dataset.csv'

# Read the pre-processed dataset; report a missing file, then re-raise
try:
    data = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: File not found at {DATA_PATH}. Please check the path and try again.")
    raise

# Features (X) and target (y).
# REPLACE 'hospital_beds' and 'date_column' with the actual column names from
# your dataset (inspect data.columns.tolist() to find them).
try:
    y = data['hospital_beds']
    X = data.drop(columns=['hospital_beds', 'date_column'])
except KeyError as e:
    print(f"Error: {e}. Please check column names using data.columns.tolist() and update the code.")
    raise

# 70% train / 15% validation / 15% test: 30% is set aside first, then halved.
# shuffle=False keeps the rows in file order.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, shuffle=False
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, shuffle=False
)

# Fit the scaler on the training split only, then apply it to all three splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled, X_test_scaled = (scaler.transform(part) for part in (X_val, X_test))

# DataFrame versions of the scaled arrays (column names restored, fresh 0..n-1 index)
X_train_scaled_df, X_val_scaled_df, X_test_scaled_df = (
    pd.DataFrame(scaled, columns=raw.columns)
    for scaled, raw in (
        (X_train_scaled, X_train),
        (X_val_scaled, X_val),
        (X_test_scaled, X_test),
    )
)

# Print shapes to verify splits
print(f"Training set: {X_train.shape}, Validation set: {X_val.shape}, Test set: {X_test.shape}")

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from prophet import Prophet
import matplotlib.pyplot as plt

# Set random seed for reproducibility
np.random.seed(42)

# Define dataset path
DATA_PATH = '/content/drive/MyDrive/Medical_Resource_Prediction/cleaned_dataset.csv'

# Load pre-processed dataset; report a missing file, then re-raise
try:
    data = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: File not found at {DATA_PATH}. Please check the path and try again.")
    raise

# Inspect columns
print("Columns in dataset:", data.columns.tolist())
# print("\nFirst few rows:\n", data.head())

# Define features (X) and target (y)
# REPLACE 'hosp_patients' and 'date' with actual column names from your dataset
# Run the inspection code above to find the correct names (e.g., 'beds_needed', 'date')
# (This cell previously held a dangling `try:` followed by a second pasted copy
# of the whole cell, which made it a SyntaxError; only one copy is kept.)
try:
    # Update with actual column names
    X = data.drop(columns=['hosp_patients', 'date'])  # Example: hosp_patients might be the target for hospital bed demand
    y = data['hosp_patients']  # Example: hosp_patients could represent hospital bed demand
except KeyError as e:
    print(f"Error: Column {e} not found in the dataset. Please check the column names and try again.")
    raise

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from prophet import Prophet
import matplotlib.pyplot as plt

# Seed NumPy's global RNG for reproducibility
np.random.seed(42)

# Location of the cleaned dataset
DATA_PATH = '/content/drive/MyDrive/Medical_Resource_Prediction/cleaned_dataset.csv'

# Read the pre-processed dataset; report a missing file, then re-raise
try:
    data = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: File not found at {DATA_PATH}. Please check the path and try again.")
    raise

# Show the available columns so the names used below can be checked
print("Columns in dataset:", data.columns.tolist())

# Features (X) and target (y).
# REPLACE 'hosp_patients' and 'date' with the actual column names from your
# dataset if they differ; 'hosp_patients' is used here as the example target.
try:
    y = data['hosp_patients']
    X = data.drop(columns=['hosp_patients', 'date'])
except KeyError as e:
    print(f"Error: {e}. Please check column names using data.columns.tolist() and update the code.")
    raise

# 70% train / 15% validation / 15% test: 30% is set aside first, then halved.
# shuffle=False keeps the rows in file order.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, shuffle=False
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, shuffle=False
)

# Fit the scaler on the training split only, then apply it to all three splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled, X_test_scaled = (scaler.transform(part) for part in (X_val, X_test))

# DataFrame versions of the scaled arrays (column names restored, fresh 0..n-1 index)
X_train_scaled_df, X_val_scaled_df, X_test_scaled_df = (
    pd.DataFrame(scaled, columns=raw.columns)
    for scaled, raw in (
        (X_train_scaled, X_train),
        (X_val_scaled, X_val),
        (X_test_scaled, X_test),
    )
)

# Print shapes to verify splits
print(f"Training set: {X_train.shape}, Validation set: {X_val.shape}, Test set: {X_test.shape}")

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from prophet import Prophet
import matplotlib.pyplot as plt

# Seed NumPy's global RNG (this does not seed TensorFlow or other libraries)
np.random.seed(42)

# Location of the pre-processed dataset on the mounted Drive
DATA_PATH = '/content/drive/MyDrive/Medical_Resource_Prediction/cleaned_dataset.csv'

# Load the pre-processed dataset; on a missing file, print the path and re-raise
try:
    data = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: File not found at {DATA_PATH}. Please check the path and try again.")
    raise

# Print the column names so the feature/target names used below can be verified
print("Columns in dataset:", data.columns.tolist())
# print("\nFirst few rows:\n", data.head())

# Define features (X) and target (y)
# 'hosp_patients' is used as the target and 'date' is excluded from the features.
# If the dataset uses different names, update both occurrences below.
try:
    # Everything except the target and the date column is a candidate feature
    X = data.drop(columns=['hosp_patients', 'date'])
    # Keep only numeric features, because StandardScaler cannot handle strings
    numeric_features = X.select_dtypes(include=np.number).columns.tolist()
    X = X[numeric_features]
    y = data['hosp_patients']
except KeyError as e:
    print(f"Error: {e}. Please check column names using data.columns.tolist() and update the code.")
    raise

# Split data into training (70%), validation (15%), and test (15%) sets.
# shuffle=False keeps the rows in file order, so the split is contiguous:
# first 70% train, next 15% validation, last 15% test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=False)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, shuffle=False)

# Scale numerical features; the scaler is fitted on the training split only
# and then applied unchanged to the validation and test splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays in DataFrames that carry the feature names
# (note: these get a fresh 0-based index, not the index of X_train / X_val / X_test)
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_val_scaled_df = pd.DataFrame(X_val_scaled, columns=X_val.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Print shapes to verify splits
print(f"Training set: {X_train.shape}, Validation set: {X_val.shape}, Test set: {X_test.shape}")

# Random Forest Model

In [None]:
# Initialize Random Forest model (n_jobs=-1 uses all available CPU cores)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Train model on scaled training data
rf_model.fit(X_train_scaled, y_train)

# Predict on validation set
y_val_pred_rf = rf_model.predict(X_val_scaled)

# Evaluate model.
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it raises TypeError on current installs.
rf_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred_rf))
rf_mae = mean_absolute_error(y_val, y_val_pred_rf)
rf_r2 = r2_score(y_val, y_val_pred_rf)

# Print results
print(f"Random Forest - RMSE: {rf_rmse:.4f}, MAE: {rf_mae:.4f}, R²: {rf_r2:.4f}")

# Feature importance (optional), labelled with the original feature names
importances = pd.Series(rf_model.feature_importances_, index=X_train.columns)
print("Feature Importances:\n", importances.sort_values(ascending=False))

# XGBoost Model



In [None]:
# Initialize XGBoost model.
# early_stopping_rounds is configured on the estimator: passing it to fit()
# was deprecated in XGBoost 1.6 and removed in 2.0.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    random_state=42,
    early_stopping_rounds=10,
)

# Train model; the validation split is the early-stopping evaluation set
xgb_model.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=False)

# Predict on validation set
y_val_pred_xgb = xgb_model.predict(X_val_scaled)

# Evaluate model (sqrt of MSE; `squared=False` no longer exists in scikit-learn >= 1.6)
xgb_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred_xgb))
xgb_mae = mean_absolute_error(y_val, y_val_pred_xgb)
xgb_r2 = r2_score(y_val, y_val_pred_xgb)

# Print results
print(f"XGBoost - RMSE: {xgb_rmse:.4f}, MAE: {xgb_mae:.4f}, R²: {xgb_r2:.4f}")

# Gradient Boosting

In [None]:
# Initialize Gradient Boosting model
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Train model
gb_model.fit(X_train_scaled, y_train)

# Predict on validation set
y_val_pred_gb = gb_model.predict(X_val_scaled)

# Evaluate model.
# RMSE is sqrt(MSE); the `squared=False` keyword was removed from
# mean_squared_error in scikit-learn 1.6.
gb_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred_gb))
gb_mae = mean_absolute_error(y_val, y_val_pred_gb)
gb_r2 = r2_score(y_val, y_val_pred_gb)

# Print results
print(f"Gradient Boosting - RMSE: {gb_rmse:.4f}, MAE: {gb_mae:.4f}, R²: {gb_r2:.4f}")

# Long Short-Term Memory (LSTM)



In [None]:
# Reshape data for LSTM: [samples, timesteps, features]
timesteps = 7  # Example: Use 7-day sequences


def create_sequences(X, y, timesteps):
    """Slice X into overlapping windows and pair each with the next target.

    Window ``k`` is ``X[k:k + timesteps]`` and its label is ``y[k + timesteps]``,
    so ``len(X) - timesteps`` pairs are produced (none when X is too short).

    Args:
        X: Sliceable sequence of feature rows.
        y: Sequence of targets, indexable by position.
        timesteps: Number of consecutive rows per window.

    Returns:
        Tuple ``(windows, labels)`` of NumPy arrays.
    """
    window_starts = range(len(X) - timesteps)
    windows = [X[start:start + timesteps] for start in window_starts]
    labels = [y[start + timesteps] for start in window_starts]
    return np.array(windows), np.array(labels)

# Create sequences for training, validation, and test sets.
# Each split is windowed independently, so the first `timesteps` rows of a
# split have no prediction.
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train.values, timesteps)
X_val_seq, y_val_seq = create_sequences(X_val_scaled, y_val.values, timesteps)
X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test.values, timesteps)

# Build LSTM model: two stacked LSTM layers with dropout, one regression output
lstm_model = Sequential([
    LSTM(50, activation='relu', input_shape=(timesteps, X_train_scaled.shape[1]), return_sequences=True),
    Dropout(0.2),
    LSTM(50, activation='relu'),
    Dropout(0.2),
    Dense(1)
])

# Compile model
lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Train model
history = lstm_model.fit(X_train_seq, y_train_seq, validation_data=(X_val_seq, y_val_seq),
                         epochs=20, batch_size=32, verbose=1)

# Predict on validation set (one prediction per validation window)
y_val_pred_lstm = lstm_model.predict(X_val_seq)

# Evaluate model.
# RMSE is sqrt(MSE); the `squared=False` keyword was removed from
# mean_squared_error in scikit-learn 1.6.
lstm_rmse = np.sqrt(mean_squared_error(y_val_seq, y_val_pred_lstm))
lstm_mae = mean_absolute_error(y_val_seq, y_val_pred_lstm)
lstm_r2 = r2_score(y_val_seq, y_val_pred_lstm)

# Print results
print(f"LSTM - RMSE: {lstm_rmse:.4f}, MAE: {lstm_mae:.4f}, R²: {lstm_r2:.4f}")

# Prophet

In [None]:
# Prepare data for Prophet (requires 'ds' for date and 'y' for target).
# 'date' and 'hosp_patients' are the columns the feature/target cell uses, so
# Prophet is trained and scored on the same target as the other models.
try:
    prophet_data = data[['date', 'hosp_patients']].copy()
    prophet_data = prophet_data.rename(columns={'date': 'ds', 'hosp_patients': 'y'})
except KeyError as e:
    print(f"Error: {e}. Please check column names using data.columns.tolist() and update the code.")
    raise

# Convert date column to datetime
prophet_data['ds'] = pd.to_datetime(prophet_data['ds'])

# Add regressors (the numeric feature columns), taken unscaled from the original data
regressor_cols = X_train_scaled_df.columns.tolist()
for col in regressor_cols:
    prophet_data[col] = data[col]  # Align with original data

# Split Prophet data on the row labels of X_train / X_val so the rows match
# y_train / y_val exactly (independent int() rounding can differ by one row).
prophet_train = prophet_data.loc[X_train.index]
# Stable sort by date so the validation rows are in the order Prophet's forecast
# comes back in. NOTE(review): assumes Prophet orders its output by `ds` and
# that there is one row per date -- confirm for panel data.
prophet_val = prophet_data.loc[X_val.index].sort_values('ds', kind='mergesort')
train_size = len(prophet_train)
val_size = len(prophet_val)

# Initialize Prophet model
prophet_model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=True)

# Add regressors
for col in regressor_cols:
    prophet_model.add_regressor(col)

# Train model
prophet_model.fit(prophet_train)

# Predict on the validation rows themselves (their own dates and regressor
# values) instead of generating a daily calendar, merging, zero-filling and
# slicing by position.
future = prophet_val.drop(columns='y')
forecast = prophet_model.predict(future)
y_val_pred_prophet = forecast['yhat'].to_numpy()

# Evaluate model (sqrt of MSE; `squared=False` was removed in scikit-learn 1.6)
prophet_rmse = np.sqrt(mean_squared_error(prophet_val['y'], y_val_pred_prophet))
prophet_mae = mean_absolute_error(prophet_val['y'], y_val_pred_prophet)
prophet_r2 = r2_score(prophet_val['y'], y_val_pred_prophet)

# Print results
print(f"Prophet - RMSE: {prophet_rmse:.4f}, MAE: {prophet_mae:.4f}, R²: {prophet_r2:.4f}")

# Visualize predictions (optional)
plt.figure(figsize=(12, 6))
plt.plot(y_val.values, label='Actual', color='black')
plt.plot(y_val_pred_rf, label='Random Forest', alpha=0.7)
plt.plot(y_val_pred_xgb, label='XGBoost', alpha=0.7)
plt.plot(y_val_pred_gb, label='Gradient Boosting', alpha=0.7)
# The LSTM has no prediction for the first `timesteps` validation rows, so its
# curve is shifted right by that amount to line up with the actual values.
lstm_x = np.arange(timesteps, timesteps + len(y_val_pred_lstm))
plt.plot(lstm_x, y_val_pred_lstm.flatten(), label='LSTM', alpha=0.7)
# Prophet predictions are in date order (same as row order for chronological data)
plt.plot(y_val_pred_prophet, label='Prophet', alpha=0.7)
plt.legend()
plt.title('Model Predictions vs Actual (Validation Set)')
plt.xlabel('Validation Set Index')
plt.ylabel('Hospital Beds Needed')
plt.show()

In [None]:
"""
# Real-Time Machine Learning Model for Medical Resource Demand Prediction
# Author: Claude
# Date: May 8, 2025

This notebook implements a working prototype for real-time medical resource demand prediction:
1. Load and analyze preprocessed dataset
2. Build and train predictive models
3. Implement a dashboard for visualization
4. Set up framework for real-time data ingestion
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
from statsmodels.tsa.arima.model import ARIMA
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import warnings
from datetime import datetime, timedelta
import requests
import io
import time
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML
from google.colab import drive

warnings.filterwarnings('ignore')

# Mount Google Drive to access the dataset
drive.mount('/content/drive')

# Define the path to the dataset
DATA_PATH = '/content/drive/MyDrive/Medical_Resource_Prediction/cleaned_dataset.csv'

# 1. Data Loading and Exploration
print("Loading dataset from:", DATA_PATH)
df = pd.read_csv(DATA_PATH)

print("\n==== Dataset Overview ====")
print(f"Shape: {df.shape}")
print("\nFirst 5 rows:")
display(df.head())

print("\nData types:")
display(df.dtypes)

print("\nSummary statistics:")
display(df.describe())

print("\nMissing values:")
display(df.isnull().sum())

# Check if there's a datetime column, if not, create one.
# Downstream code (preprocess_data, RealTimePredictor, the dashboard) looks the
# column up by the literal name 'date', so the column is normalised to that name.
date_candidates = [col for col in df.columns if 'date' in col.lower() or 'time' in col.lower()]
if not date_candidates:
    print("\nNo explicit date/time column found. Creating a synthetic one for time-series analysis...")
    df['date'] = pd.date_range(start='2023-01-01', periods=len(df), freq='D')
else:
    # Prefer an exact 'date' column; otherwise use the first date/time-like column
    source_date_col = 'date' if 'date' in date_candidates else date_candidates[0]
    print(f"\nUsing existing date column: {source_date_col}")
    df[source_date_col] = pd.to_datetime(df[source_date_col])
    if source_date_col != 'date':
        df = df.rename(columns={source_date_col: 'date'})
# Defined on both branches so the plot below never hits an undefined name
date_col = 'date'

# Identify target columns (resources to predict)
# For now, let's assume columns with 'demand', 'usage', 'need', 'resource' in their name are targets
target_cols = [col for col in df.columns if any(x in col.lower() for x in ['demand', 'usage', 'need', 'resource'])]

if not target_cols:
    print("\nNo clear target columns identified. Please specify which columns represent resource demand.")
    # For demonstration, fall back to the second-to-last column (df.columns[-2])
    target_cols = [df.columns[-2]]

print(f"\nIdentified potential target columns: {target_cols}")

# Let's visualize the time series of the first identified target
plt.figure(figsize=(12, 6))
for target in target_cols[:3]:  # Show up to 3 targets
    plt.plot(df[date_col], df[target], label=target)
plt.title('Medical Resource Demand Over Time')
plt.xlabel('Date')
plt.ylabel('Demand')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 2. Feature Engineering and Preprocessing
def preprocess_data(dataframe, target_column):
    """Build a numeric feature matrix and aligned target for modeling.

    Steps:
      1. Copy the input so the caller's frame is not modified.
      2. If a 'date' column exists, derive day_of_week, month, day and
         is_weekend from it.
      3. Keep numeric columns only (the derived date features are numeric,
         so they are included exactly once).
      4. Add lagged copies of the target (lags 1, 3 and 7) as features.
      5. Drop every row that has a NaN in any feature and keep the matching
         target rows.

    Args:
        dataframe: Input DataFrame containing ``target_column``.
        target_column: Name of the numeric column to predict.

    Returns:
        Tuple ``(features, target)`` sharing the same index.

    Raises:
        KeyError: If ``target_column`` is missing or not numeric.
    """
    # Make a copy to avoid modifying the original dataframe
    df_copy = dataframe.copy()

    # Extract date features if date column exists
    if 'date' in df_copy.columns:
        dates = pd.to_datetime(df_copy['date'])
        df_copy['day_of_week'] = dates.dt.dayofweek
        df_copy['month'] = dates.dt.month
        df_copy['day'] = dates.dt.day
        df_copy['is_weekend'] = (df_copy['day_of_week'] >= 5).astype(int)

    # Keep numeric columns only. The derived date features are already numeric,
    # so they must not be appended a second time (that produced duplicate
    # columns, and a KeyError when the frame had no 'date' column).
    df_copy = df_copy.select_dtypes(include=[np.number])

    if target_column not in df_copy.columns:
        raise KeyError(f"Target column '{target_column}' is missing or not numeric")

    # Remove the target from features
    features = df_copy.drop(columns=[target_column])
    target = df_copy[target_column]

    # Create lagged features (important for time series)
    for lag in [1, 3, 7]:
        if len(df_copy) > lag:
            features[f'{target_column}_lag_{lag}'] = target.shift(lag)

    # Drop rows with NaN (from lag creation or the source data). A boolean mask
    # keeps features and target aligned regardless of the index type.
    complete_rows = features.notna().all(axis=1)
    features = features[complete_rows]
    target = target[complete_rows]

    return features, target

# 3. Model Building and Evaluation
def evaluate_model(y_true, y_pred, model_name):
    """Print and return regression metrics for one model.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, same length as ``y_true``.
        model_name: Label used in the printed report header.

    Returns:
        Dict with keys 'mse', 'rmse', 'mae' and 'r2'.
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)  # root of the MSE computed above
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Assemble the report first, then emit it in one call
    report_lines = [
        f"\n==== {model_name} Performance ====",
        f"MSE: {mse:.4f}",
        f"RMSE: {rmse:.4f}",
        f"MAE: {mae:.4f}",
        f"R²: {r2:.4f}",
    ]
    print("\n".join(report_lines))

    return dict(mse=mse, rmse=rmse, mae=mae, r2=r2)

def build_and_evaluate_models(features, target):
    """Train several regressors and return the one with the lowest test RMSE.

    The data is split chronologically (first 80% train, last 20% test). The
    features include lagged target values, so a shuffled split would put rows
    that neighbour the test rows into the training set and inflate the scores.
    Features are standardised with a scaler fitted on the training part only.

    Args:
        features: Feature DataFrame, rows in time order.
        target: Target Series aligned with ``features``.

    Returns:
        Tuple ``(best_model, scaler, results, best_model_name)`` where
        ``results`` maps each model name to its metrics dict from
        ``evaluate_model``.
    """
    # Split data into training and testing sets without shuffling (time series)
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, shuffle=False
    )

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define models
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
        'XGBoost': XGBRegressor(n_estimators=100, learning_rate=0.05, random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
    }

    results = {}
    best_model_name = None
    best_model_obj = None
    best_predictions = None
    best_score = float('inf')  # Lower is better for RMSE

    # Train and evaluate each model
    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        # Evaluate
        eval_results = evaluate_model(y_test, y_pred, name)
        results[name] = eval_results

        # Check if this is the best model so far
        if eval_results['rmse'] < best_score:
            best_score = eval_results['rmse']
            best_model_name = name
            best_model_obj = model
            best_predictions = y_pred  # reused for the plot below

    print(f"\nBest model: {best_model_name} with RMSE: {best_score:.4f}")

    # Plot actual vs predicted for the best model; the dashed red line is y = x
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, best_predictions, alpha=0.5)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title(f'Actual vs Predicted - {best_model_name}')
    plt.show()

    return best_model_obj, scaler, results, best_model_name

# Apply preprocessing and model building for the first target.
# The names assigned here (selected_target, best_model, scaler, ...) are
# module-level and are read again by the __main__ block at the end of the cell.
selected_target = target_cols[0]
print(f"\nBuilding model for target: {selected_target}")
features, target = preprocess_data(df, selected_target)
best_model, scaler, model_results, best_model_name = build_and_evaluate_models(features, target)

# 4. Real-time Prediction Framework
class RealTimePredictor:
    """Produce one-step predictions from a rolling window of historical data.

    The predictor keeps a private copy of the historical rows, obtains a new
    observation (mock generator or an external source), rebuilds the date and
    lag features for that observation, scales them and asks the model for a
    prediction.
    """

    def __init__(self, model, scaler, feature_columns, target_column, historical_data=None):
        """Store the fitted model/scaler and a copy of the historical data.

        Args:
            model: Fitted estimator exposing ``predict``.
            scaler: Fitted transformer exposing ``transform``.
            feature_columns: Fallback list of feature names, used when the
                scaler does not expose ``feature_names_in_``.
            target_column: Name of the column being predicted.
            historical_data: Optional DataFrame of past rows; copied.
        """
        self.model = model
        self.scaler = scaler
        self.feature_columns = feature_columns
        self.target_column = target_column

        # Store historical data for creating lag features
        self.historical_data = historical_data.copy() if historical_data is not None else pd.DataFrame()

        # Cache for API responses
        self.api_cache = {}

        # Timestamp of the last mock row; throttles the mock generator
        self.last_update = datetime.now()

    def prepare_features(self, new_data):
        """Build the scaled feature row for the last row of ``new_data``.

        ``new_data`` is appended to the stored history, so it must not already
        be part of ``self.historical_data``; otherwise the lag features would
        be computed from the new row itself.

        Args:
            new_data: DataFrame holding the newest observation(s).

        Returns:
            The output of ``self.scaler.transform`` for the newest row.

        Raises:
            ValueError: If the scaler exposes ``feature_names_in_`` and one of
                those features is missing or NaN for the newest row.
        """
        # Combine with historical to generate lag features
        combined_data = pd.concat([self.historical_data, new_data]).reset_index(drop=True)

        # Create date features
        if 'date' in combined_data.columns:
            dates = pd.to_datetime(combined_data['date'])
            combined_data['day_of_week'] = dates.dt.dayofweek
            combined_data['month'] = dates.dt.month
            combined_data['day'] = dates.dt.day
            combined_data['is_weekend'] = (combined_data['day_of_week'] >= 5).astype(int)

        # Create lag features
        for lag in [1, 3, 7]:
            if len(combined_data) > lag:
                combined_data[f'{self.target_column}_lag_{lag}'] = combined_data[self.target_column].shift(lag)

        latest_data = combined_data.iloc[-1:]

        # Prefer the exact names and order the scaler was fitted with, so the
        # transformed row always matches what the model was trained on.
        fitted_names = getattr(self.scaler, 'feature_names_in_', None)
        if fitted_names is not None:
            expected = list(fitted_names)
            unusable = [
                col for col in expected
                if col not in latest_data.columns or latest_data[col].isna().any()
            ]
            if unusable:
                raise ValueError(f"Cannot build feature row; missing or NaN features: {unusable}")
            X = latest_data[expected]
        else:
            # Fallback: the configured feature columns that have a value
            latest_data = latest_data.dropna(axis=1, how='any')
            available_features = [col for col in self.feature_columns if col in latest_data.columns]
            X = latest_data[available_features]

        # Scale features
        return self.scaler.transform(X)

    def fetch_real_time_data(self, source="mock"):
        """Fetch the newest observation from ``source``.

        Args:
            source: One of "mock", "owid", "cdc", "healthmap" or "ehr". Only
                "mock" returns data; the others are placeholders.

        Returns:
            A one-row DataFrame for "mock" when at least 5 seconds have passed
            since the last generated row and history is available; otherwise
            None.
        """
        current_time = datetime.now()

        if source == "mock":
            # Generate mock data for demonstration
            # In a real scenario, you would replace this with API calls

            # Only generate new data if enough time has passed. total_seconds()
            # is required: `.seconds` is just the seconds component and wraps
            # around every 24 hours.
            if (current_time - self.last_update).total_seconds() < 5:
                return None

            # Without a last known target value there is nothing to perturb
            if self.historical_data.empty or self.target_column not in self.historical_data.columns:
                return None

            self.last_update = current_time
            last_row = self.historical_data.iloc[-1]

            # Create synthetic data with some randomness
            new_row = {
                'date': pd.Timestamp(current_time),
                # Add random fluctuation to the last known value
                self.target_column: last_row[self.target_column] * (1 + np.random.normal(0, 0.05))
            }

            # Add other features that might be needed
            for col in self.historical_data.columns:
                if col in new_row:
                    continue
                if col.endswith(('_lag_1', '_lag_3', '_lag_7')):
                    continue  # Skip lag columns as they'll be computed
                if pd.api.types.is_numeric_dtype(self.historical_data[col]):
                    new_row[col] = last_row[col] * (1 + np.random.normal(0, 0.03))
                else:
                    # Non-numeric values (e.g. location names) cannot be
                    # scaled by a float; carry the last value forward.
                    new_row[col] = last_row[col]

            new_data = pd.DataFrame([new_row])
            return new_data

        elif source == "owid":
            # Our World in Data API integration
            # Example: Fetch latest COVID-19 data, cached for one hour
            cache_age = (current_time - self.api_cache.get("last_update", datetime.min)).total_seconds()
            if "owid" not in self.api_cache or cache_age > 3600:
                try:
                    url = "https://covid.ourworldindata.org/data/latest/owid-covid-latest.csv"
                    # A timeout stops a stalled connection from hanging the
                    # notebook; raise_for_status stops an HTTP error page from
                    # being parsed as CSV.
                    response = requests.get(url, timeout=30)
                    response.raise_for_status()
                    data = pd.read_csv(io.StringIO(response.text))
                    self.api_cache["owid"] = data
                    self.api_cache["last_update"] = current_time
                except Exception as e:
                    print(f"Error fetching OWID data: {e}")
                    return None

            # Process the data to match our format
            # This would need to be customized based on your specific requirements
            return None  # Return processed data

        elif source == "cdc":
            # CDC API integration would go here
            # This is a placeholder for demonstration
            return None

        elif source == "healthmap":
            # HealthMap API integration would go here
            return None

        elif source == "ehr":
            # Electronic Health Records integration would go here
            return None

        return None

    def update_historical_data(self, new_data):
        """Append ``new_data`` to the history, keeping at most 1000 rows.

        Args:
            new_data: DataFrame of new rows; None or empty frames are ignored.
        """
        if new_data is not None and not new_data.empty:
            self.historical_data = pd.concat([self.historical_data, new_data]).reset_index(drop=True)
            # Keep only the most recent 1000 rows to prevent memory issues
            if len(self.historical_data) > 1000:
                self.historical_data = self.historical_data.iloc[-1000:].reset_index(drop=True)

    def predict_demand(self):
        """Fetch a new observation and predict demand for it.

        Returns:
            Tuple ``(prediction, new_data)``, or ``(None, None)`` when no new
            observation is available.
        """
        # Fetch new data
        new_data = self.fetch_real_time_data()

        if new_data is None or new_data.empty:
            return None, None

        # Build features BEFORE storing the new row: prepare_features appends
        # new_data to the history itself, so storing it first would duplicate
        # the row and make lag_1 equal to the new row's own target value.
        X = self.prepare_features(new_data)

        # Update historical data with the new data
        self.update_historical_data(new_data)

        # Make prediction
        prediction = self.model.predict(X)[0]

        return prediction, new_data

# 5. Create Interactive Dashboard
def create_dashboard(predictor, target_column):
    """Build the Plotly dashboard figure and a closure that refreshes it.

    The figure has a full-width line chart on the first row and two number
    indicators on the second row. Traces are added in a fixed order that the
    refresh closure relies on:
      fig.data[0] - historical demand line
      fig.data[1] - predicted demand line (starts empty)
      fig.data[2] - "Current Demand" indicator
      fig.data[3] - "Predicted Demand" indicator

    Args:
        predictor: Object exposing ``historical_data`` (DataFrame with 'date'
            and ``target_column``) and ``predict_demand()``.
        target_column: Name of the column plotted and predicted.

    Returns:
        Tuple ``(fig, update_dashboard)``; calling ``update_dashboard()`` asks
        the predictor for a new prediction, updates the traces in place and
        returns the same figure.
    """
    # Create a figure for the dashboard
    fig = make_subplots(
        rows=2, cols=2,
        specs=[[{"colspan": 2}, None],
               [{"type": "indicator"}, {"type": "indicator"}]],
        subplot_titles=("Historical and Predicted Demand",
                         "Current Demand", "Predicted Demand")
    )

    # Get historical data for plotting
    historical_data = predictor.historical_data.copy()

    # Initial plot of historical data (becomes fig.data[0])
    fig.add_trace(
        go.Scatter(
            x=historical_data['date'],
            y=historical_data[target_column],
            mode='lines',
            name='Historical Demand',
            line=dict(color='blue')
        ),
        row=1, col=1
    )

    # Add trace for predictions (initially empty; becomes fig.data[1])
    fig.add_trace(
        go.Scatter(
            x=[],
            y=[],
            mode='lines+markers',
            name='Predicted Demand',
            line=dict(color='red')
        ),
        row=1, col=1
    )

    # Add indicator for current demand (fig.data[2]); the delta compares the
    # latest value with the one before it
    fig.add_trace(
        go.Indicator(
            mode="number+delta",
            value=historical_data[target_column].iloc[-1] if not historical_data.empty else 0,
            title={"text": "Current Demand"},
            delta={'reference': historical_data[target_column].iloc[-2] if len(historical_data) > 1 else 0,
                   'valueformat': '.2f'},
            number={'valueformat': '.2f'}
        ),
        row=2, col=1
    )

    # Add indicator for predicted demand (fig.data[3]; initially same as current)
    fig.add_trace(
        go.Indicator(
            mode="number+delta",
            value=historical_data[target_column].iloc[-1] if not historical_data.empty else 0,
            title={"text": "Predicted Demand"},
            delta={'reference': historical_data[target_column].iloc[-1] if not historical_data.empty else 0,
                   'valueformat': '.2f'},
            number={'valueformat': '.2f'}
        ),
        row=2, col=2
    )

    # Update layout
    fig.update_layout(
        height=600,
        title_text=f"Real-Time Medical Resource Demand Dashboard: {target_column}",
        showlegend=True
    )

    # State shared with the refresh closure: predictions and their timestamps
    prediction_history = []
    timestamp_history = []

    def update_dashboard():
        """Fetch one prediction and refresh all four traces; returns ``fig``.

        When the predictor returns no prediction the figure is returned
        unchanged.
        """
        nonlocal prediction_history, timestamp_history

        # Make prediction
        prediction, new_data = predictor.predict_demand()

        if prediction is not None and new_data is not None:
            # Store prediction and timestamp
            prediction_history.append(prediction)
            timestamp_history.append(new_data['date'].iloc[0])

            # Keep only the last 50 predictions
            if len(prediction_history) > 50:
                prediction_history = prediction_history[-50:]
                timestamp_history = timestamp_history[-50:]

            # Get the most recent historical data
            historical_data = predictor.historical_data.copy()

            # Update historical demand line
            fig.data[0].x = historical_data['date']
            fig.data[0].y = historical_data[target_column]

            # Update prediction line
            fig.data[1].x = timestamp_history
            fig.data[1].y = prediction_history

            # Update current demand indicator
            current_value = historical_data[target_column].iloc[-1]
            reference_value = historical_data[target_column].iloc[-2] if len(historical_data) > 1 else current_value
            fig.data[2].value = current_value
            fig.data[2].delta.reference = reference_value

            # Update predicted demand indicator (delta is relative to current demand)
            fig.data[3].value = prediction
            fig.data[3].delta.reference = current_value

            # Update layout with current time
            fig.update_layout(
                title_text=f"Real-Time Medical Resource Demand Dashboard: {target_column}<br>Last Updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
            )

        return fig

    return fig, update_dashboard

# 6. Run Real-Time Prediction System
def run_real_time_system(model, scaler, df, target_column):
    """Wire up the predictor, dashboard and Start/Stop buttons, then display them.

    Args:
        model: Fitted estimator used for predictions.
        scaler: Fitted scaler matching ``model``.
        df: Historical DataFrame containing 'date' and ``target_column``.
        target_column: Name of the column to predict.
    """
    # Use the feature names the scaler was fitted with when it exposes them:
    # they include the engineered date/lag features and exclude non-numeric
    # columns. Otherwise fall back to every non-target, non-date column.
    fitted_names = getattr(scaler, 'feature_names_in_', None)
    if fitted_names is not None:
        feature_columns = list(fitted_names)
    else:
        feature_columns = [col for col in df.columns if col != target_column and col != 'date']
    predictor = RealTimePredictor(model, scaler, feature_columns, target_column, historical_data=df)

    # Set up dashboard
    fig, update_dashboard = create_dashboard(predictor, target_column)

    # Create buttons for dashboard control
    start_button = widgets.Button(description='Start Real-Time Prediction')
    stop_button = widgets.Button(description='Stop')
    output = widgets.Output()

    # Define button behaviors
    running = False

    def start_clicked(b):
        """Start the refresh loop inside the output widget."""
        nonlocal running
        running = True
        with output:
            clear_output(wait=True)
            print(f"Starting real-time prediction for {target_column}...")
            display(fig)

            # NOTE(review): this loop runs inside the click callback, so the
            # kernel may not process the Stop click until the loop exits --
            # confirm. A kernel interrupt always ends it.
            try:
                while running:
                    # Must not be named `fig`: assigning `fig` here would make
                    # it local to start_clicked, and the display(fig) above
                    # would raise UnboundLocalError.
                    refreshed_fig = update_dashboard()
                    clear_output(wait=True)
                    print(f"Real-time prediction for {target_column} (Press 'Stop' to end)")
                    print(f"Last Updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
                    display(refreshed_fig)
                    time.sleep(2)  # Update every 2 seconds
            except KeyboardInterrupt:
                running = False
                print("Real-time prediction interrupted.")

    def stop_clicked(b):
        """Ask the refresh loop to stop and clear the output area."""
        nonlocal running
        running = False
        with output:
            clear_output(wait=True)
            print("Real-time prediction stopped.")

    start_button.on_click(start_clicked)
    stop_button.on_click(stop_clicked)

    # Display buttons and output
    display(widgets.HBox([start_button, stop_button]))
    display(output)

# Execute the main workflow
if __name__ == "__main__":
    print("\n==== Starting Medical Resource Demand Prediction System ====")

    if len(target_cols) <= 1:
        # Single target: launch with the model that was already trained above
        run_real_time_system(best_model, scaler, df, selected_target)
    else:
        # Several candidate targets: let the user choose one from a dropdown
        target_selector = widgets.Dropdown(
            description='Select Resource:',
            options=target_cols,
            value=target_cols[0],
        )

        def on_target_change(change):
            """Retrain for the newly selected target and restart the system."""
            clear_output(wait=True)
            print(f"Selected target: {change.new}")
            # Re-run preprocessing and model selection for the new target
            new_features, new_target = preprocess_data(df, change.new)
            new_model, new_scaler, _results, _name = build_and_evaluate_models(new_features, new_target)
            # Start the real-time system
            run_real_time_system(new_model, new_scaler, df, change.new)

        target_selector.observe(on_target_change, names='value')
        display(target_selector)

In [None]:
"""
# Real-Time Machine Learning Model for Medical Resource Demand Prediction
# Author: Claude
# Date: May 8, 2025

This notebook implements a working prototype for real-time medical resource demand prediction:
1. Load and analyze preprocessed dataset
2. Build and train predictive models
3. Implement a dashboard for visualization
4. Set up framework for real-time data ingestion
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from prophet import Prophet
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import warnings
from datetime import datetime, timedelta
import requests
import io
import time
import joblib
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML
from google.colab import drive

warnings.filterwarnings('ignore')

# Mount Google Drive to access the dataset
drive.mount('/content/drive')

# Define the path to the dataset
DATA_PATH = '/content/drive/MyDrive/Medical_Resource_Prediction/cleaned_dataset.csv'

# 1. Data Loading and Exploration
print("Loading dataset from:", DATA_PATH)
df = pd.read_csv(DATA_PATH)

print("\n==== Dataset Overview ====")
print(f"Shape: {df.shape}")
print("\nFirst 5 rows:")
display(df.head())

print("\nData types:")
display(df.dtypes)

print("\nSummary statistics:")
display(df.describe())

print("\nMissing values:")
display(df.isnull().sum())

# Check if there's a datetime column, if not, create one.
# Downstream code (preprocess_data and the real-time components) looks the
# column up by the literal name 'date', so the column is normalised to that name.
date_candidates = [col for col in df.columns if 'date' in col.lower() or 'time' in col.lower()]
if not date_candidates:
    print("\nNo explicit date/time column found. Creating a synthetic one for time-series analysis...")
    df['date'] = pd.date_range(start='2023-01-01', periods=len(df), freq='D')
else:
    # Prefer an exact 'date' column; otherwise use the first date/time-like column
    source_date_col = 'date' if 'date' in date_candidates else date_candidates[0]
    print(f"\nUsing existing date column: {source_date_col}")
    df[source_date_col] = pd.to_datetime(df[source_date_col])
    if source_date_col != 'date':
        df = df.rename(columns={source_date_col: 'date'})
# Defined on both branches so the plot below never hits an undefined name
date_col = 'date'

# Identify target columns (resources to predict)
# For now, let's assume columns with 'demand', 'usage', 'need', 'resource' in their name are targets
target_cols = [col for col in df.columns if any(x in col.lower() for x in ['demand', 'usage', 'need', 'resource'])]

if not target_cols:
    print("\nNo clear target columns identified. Please specify which columns represent resource demand.")
    # For demonstration, fall back to the second-to-last column (df.columns[-2])
    target_cols = [df.columns[-2]]

print(f"\nIdentified potential target columns: {target_cols}")

# Let's visualize the time series of the first identified target
plt.figure(figsize=(12, 6))
for target in target_cols[:3]:  # Show up to 3 targets
    plt.plot(df[date_col], df[target], label=target)
plt.title('Medical Resource Demand Over Time')
plt.xlabel('Date')
plt.ylabel('Demand')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 2. Feature Engineering and Preprocessing
def preprocess_data(dataframe, target_column):
    """Build the feature matrix and the aligned target series for modelling.

    Steps:
      1. If a 'date' column exists, derive ``day_of_week``, ``month``, ``day``
         and ``is_weekend`` from it.
      2. Keep numeric columns only (the derived date features are numeric, so
         they are retained exactly once).
      3. Split off ``target_column`` and add lagged copies of it (1, 3 and 7
         rows back) to the features.
      4. Drop every row that has a missing value in any feature and keep the
         matching rows of the target.

    Args:
        dataframe: Source DataFrame; it is not modified.
        target_column: Name of the numeric column to predict.

    Returns:
        Tuple ``(features, target)`` sharing the same index labels, which are a
        subset of ``dataframe``'s index.

    Raises:
        KeyError: If ``target_column`` is missing or is not numeric.
    """
    # Work on a copy so the caller's frame is left untouched
    df_copy = dataframe.copy()

    if 'date' in df_copy.columns:
        # to_datetime makes this work for string dates as well as datetime64
        dates = pd.to_datetime(df_copy['date'])
        df_copy['day_of_week'] = dates.dt.dayofweek
        df_copy['month'] = dates.dt.month
        df_copy['day'] = dates.dt.day
        df_copy['is_weekend'] = (df_copy['day_of_week'] >= 5).astype(int)

    # Numeric columns only. The date features created above are numeric, so
    # selecting by dtype keeps them once and still works when there is no
    # 'date' column at all.
    df_copy = df_copy.select_dtypes(include=[np.number])

    if target_column not in df_copy.columns:
        raise KeyError(f"Target column {target_column!r} is missing or not numeric")

    features = df_copy.drop(columns=[target_column])
    target = df_copy[target_column]

    # Lagged target values (important for time series)
    for lag in [1, 3, 7]:
        if len(df_copy) > lag:
            features[f'{target_column}_lag_{lag}'] = target.shift(lag)

    # Drop incomplete rows with one boolean mask applied to both objects, so
    # the alignment does not depend on the index being 0..n-1
    complete_rows = features.notna().all(axis=1)
    features = features[complete_rows]
    target = target[complete_rows]

    return features, target

# 3. Advanced Model Building and Evaluation
def evaluate_model(y_true, y_pred, model_name):
    """Print and return the regression metrics of one model.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values.
        model_name: Label used in the printed header.

    Returns:
        Dict with the keys ``'mse'``, ``'rmse'``, ``'mae'`` and ``'r2'``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    scores = {
        'mse': squared_error,
        'rmse': np.sqrt(squared_error),  # RMSE is derived from the MSE
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
    }

    print(f"\n==== {model_name} Performance ====")
    for label, key in (("MSE", 'mse'), ("RMSE", 'rmse'), ("MAE", 'mae'), ("R²", 'r2')):
        print(f"{label}: {scores[key]:.4f}")

    return scores

def prepare_lstm_data(X, y, time_steps=10):
    """Cut X into overlapping windows and pair each with the y value that follows it.

    Window ``i`` holds rows ``i`` to ``i + time_steps - 1`` of ``X`` and is
    paired with ``y[i + time_steps]``.

    Returns:
        Tuple ``(windows, next_values)`` as NumPy arrays. Both are empty when
        ``X`` has no more than ``time_steps`` rows.
    """
    window_starts = range(len(X) - time_steps)
    windows = [X[start:start + time_steps] for start in window_starts]
    next_values = [y[start + time_steps] for start in window_starts]
    return np.array(windows), np.array(next_values)

def build_lstm_model(input_shape):
    """Create and compile the two-layer LSTM regressor.

    Args:
        input_shape: Shape of one input sample, passed to the first LSTM layer.

    Returns:
        A compiled ``Sequential`` model (Adam optimiser, mean-squared-error loss)
        ending in a single-unit dense output.
    """
    # Layers are listed in stacking order; the first LSTM returns the full
    # sequence so the second LSTM can consume it.
    layer_stack = [
        LSTM(units=50, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        LSTM(units=50),
        Dropout(0.2),
        Dense(units=1),
    ]
    model = Sequential(layer_stack)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

def prepare_prophet_data(df, date_col, target_col):
    """Return a new two-column frame in the layout Prophet expects.

    The date column is exposed as ``'ds'`` and the target column as ``'y'``;
    the input frame is left unchanged.
    """
    # Selecting with a list yields a new frame; set_axis then relabels its columns
    date_and_target = df.loc[:, [date_col, target_col]]
    return date_and_target.set_axis(['ds', 'y'], axis=1)

def build_and_evaluate_models(features, target, df, date_col, target_col, time_steps=10):
    """Train several forecasting models on one chronological split and keep the best.

    The last fold of a 5-way ``TimeSeriesSplit`` is the train/test split.
    Random Forest, XGBoost and Gradient Boosting are fitted on standardised
    features. An LSTM is fitted on sliding windows of ``time_steps`` rows with a
    min-max scaled target. Prophet is fitted on the date/target pair taken from
    ``df``. Models are ranked by test RMSE; the winner is plotted and saved to
    the current working directory.

    Args:
        features: Feature DataFrame. Its index labels must exist in ``df`` so
            the Prophet data can be restricted to the same rows.
        target: Target Series aligned row-for-row with ``features``.
        df: Source DataFrame containing ``date_col`` and ``target_col``.
        date_col: Name of the date column in ``df``.
        target_col: Name of the target column in ``df``.
        time_steps: Window length of the LSTM inputs.

    Returns:
        Tuple ``(best_model, best_scaler, results, best_model_name,
        best_target_scaler)``. ``results`` maps model name to its metric dict.
        ``best_scaler`` is ``None`` when Prophet wins, and ``best_target_scaler``
        is only set when the LSTM wins.
    """
    # Chronological split: use the last fold so the test set is the most recent data
    tscv = TimeSeriesSplit(n_splits=5)
    train_index, test_index = list(tscv.split(features))[-1]
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

    # Scale features (fit on the training fold only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Scale the target to [0, 1] for the LSTM
    target_scaler = MinMaxScaler(feature_range=(0, 1))
    y_train_scaled = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
    y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

    # Sliding windows for the LSTM (the test targets are taken from y_test below)
    X_train_lstm, y_train_lstm = prepare_lstm_data(X_train_scaled, y_train_scaled, time_steps)
    X_test_lstm, _ = prepare_lstm_data(X_test_scaled, y_test_scaled, time_steps)

    # Prophet data: restrict the raw frame to the rows present in `features`
    # first, so the positional train/test indices select the same rows as for
    # the other models.
    prophet_df = prepare_prophet_data(df.loc[features.index], date_col, target_col)
    prophet_train = prophet_df.iloc[train_index]
    # Sorted by date so the forecast rows line up with the actual values even
    # if Prophet reorders its input chronologically
    prophet_test = prophet_df.iloc[test_index].sort_values('ds', kind='mergesort')

    # Define models
    models = {
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
        'XGBoost': XGBRegressor(n_estimators=100, learning_rate=0.05, random_state=42, n_jobs=-1),
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
    }

    results = {}
    best_model_name = None
    best_model_obj = None
    best_scaler = None
    best_target_scaler = None
    best_pred = None  # test predictions of the best feature-based model
    best_score = float('inf')  # Lower is better for RMSE

    # Train and evaluate traditional models
    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        eval_results = evaluate_model(y_test, y_pred, name)
        results[name] = eval_results

        if eval_results['rmse'] < best_score:
            best_score = eval_results['rmse']
            best_model_name = name
            best_model_obj = model
            best_scaler = scaler
            best_target_scaler = None  # Not used for traditional models
            best_pred = y_pred

    # Train and evaluate LSTM model
    lstm_pred = None
    y_test_lstm_actual = None
    if len(X_train_lstm) == 0 or len(X_test_lstm) == 0:
        # Not enough rows to form a single window of `time_steps` rows
        print(f"\nSkipping LSTM: each fold needs more than {time_steps} rows.")
    else:
        print("\nTraining LSTM...")
        lstm_model = build_lstm_model((X_train_lstm.shape[1], X_train_lstm.shape[2]))

        # Use early stopping to prevent overfitting
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

        lstm_model.fit(
            X_train_lstm, y_train_lstm,
            epochs=50,
            batch_size=32,
            validation_split=0.2,
            callbacks=[early_stopping],
            verbose=1
        )

        # Predictions come back in scaled units; convert to the original scale
        lstm_pred_scaled = lstm_model.predict(X_test_lstm)
        lstm_pred = target_scaler.inverse_transform(lstm_pred_scaled)

        # The first `time_steps` test rows only serve as window history
        y_test_lstm_actual = y_test.iloc[time_steps:]

        lstm_results = evaluate_model(y_test_lstm_actual, lstm_pred, 'LSTM')
        results['LSTM'] = lstm_results

        if lstm_results['rmse'] < best_score:
            best_score = lstm_results['rmse']
            best_model_name = 'LSTM'
            best_model_obj = lstm_model
            best_scaler = scaler
            best_target_scaler = target_scaler

    # Train and evaluate Prophet model
    print("\nTraining Prophet...")
    prophet_model = Prophet(
        interval_width=0.95,
        yearly_seasonality=True,
        weekly_seasonality=True,
        daily_seasonality=True
    )

    # Holiday effects are optional: report the reason and continue without them
    try:
        prophet_model.add_country_holidays(country_name='US')
    except Exception as exc:
        print(f"Could not add US holidays to Prophet model: {exc}")

    prophet_model.fit(prophet_train)

    # Forecast exactly the test dates instead of assuming they are the
    # consecutive days right after the training period
    forecast = prophet_model.predict(prophet_test[['ds']])
    prophet_pred = forecast['yhat'].values

    prophet_results = evaluate_model(prophet_test['y'].values, prophet_pred, 'Prophet')
    results['Prophet'] = prophet_results

    if prophet_results['rmse'] < best_score:
        best_score = prophet_results['rmse']
        best_model_name = 'Prophet'
        best_model_obj = prophet_model
        best_scaler = None  # Not used for Prophet
        best_target_scaler = None  # Not used for Prophet

    print(f"\nBest model: {best_model_name} with RMSE: {best_score:.4f}")

    # Plot actual vs predicted for the best model
    plt.figure(figsize=(12, 6))

    if best_model_name == 'LSTM':
        plt.plot(y_test_lstm_actual.values, label='Actual')
        plt.plot(lstm_pred, label='Predicted')
    elif best_model_name == 'Prophet':
        plt.plot(prophet_test['y'].values, label='Actual')
        plt.plot(prophet_pred, label='Predicted')
    else:
        # Use the winning model's own predictions, not those of whichever
        # model happened to be trained last
        plt.scatter(y_test, best_pred, alpha=0.5, label='Predictions')
        plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', label='Perfect prediction')

    plt.xlabel('Actual' if best_model_name not in ['LSTM', 'Prophet'] else 'Time')
    plt.ylabel('Value')
    plt.title(f'Actual vs Predicted - {best_model_name}')
    plt.legend()
    plt.show()

    # Feature importance for tree-based models
    if best_model_name in ['Random Forest', 'XGBoost', 'Gradient Boosting']:
        plt.figure(figsize=(12, 6))
        feature_importance = best_model_obj.feature_importances_
        sorted_idx = np.argsort(feature_importance)
        plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx])
        plt.yticks(range(len(sorted_idx)), np.array(X_train.columns)[sorted_idx])
        plt.title(f'Feature Importance - {best_model_name}')
        plt.tight_layout()
        plt.show()

    # Save the best model (plus the scalers needed to use it)
    if best_model_name not in ['LSTM', 'Prophet']:
        joblib.dump(best_model_obj, f'best_model_{best_model_name}.pkl')
        joblib.dump(best_scaler, f'scaler_{best_model_name}.pkl')
        print(f"\nModel saved as 'best_model_{best_model_name}.pkl'")
    elif best_model_name == 'LSTM':
        best_model_obj.save('best_model_LSTM.h5')
        joblib.dump(best_scaler, 'scaler_LSTM.pkl')
        joblib.dump(best_target_scaler, 'target_scaler_LSTM.pkl')
        print("\nModel saved as 'best_model_LSTM.h5'")
    elif best_model_name == 'Prophet':
        with open('best_model_Prophet.pkl', 'wb') as f:
            joblib.dump(prophet_model, f)
        print("\nModel saved as 'best_model_Prophet.pkl'")

    return best_model_obj, best_scaler, results, best_model_name, best_target_scaler

# Apply preprocessing and model building for the first target
selected_target = target_cols[0]
print(f"\nBuilding model for target: {selected_target}")
features, target = preprocess_data(df, selected_target)
# build_and_evaluate_models also needs the raw frame and the date/target column
# names (used for Prophet), and it returns five values. 'date' is the column
# name that preprocess_data and the real-time predictor look up as well.
best_model, scaler, model_results, best_model_name, target_scaler = build_and_evaluate_models(
    features, target, df, 'date', selected_target
)

# 4. Advanced Real-time Prediction Framework
class RealTimePredictor:
    """Produce demand predictions from a fitted model as new observations arrive.

    The predictor keeps a growing history frame. For every new row it rebuilds
    date, lag and rolling features over history plus the new row, selects the
    columns listed in ``feature_columns``, scales them and asks the model for a
    prediction.

    Args:
        model: Fitted model exposing ``predict``.
        scaler: Fitted feature scaler exposing ``transform`` (unused for Prophet).
        feature_columns: Names of the model inputs, in training order.
        target_column: Name of the predicted column.
        model_type: ``'LSTM'``, ``'Prophet'`` or any other label for a
            feature-based regressor. When omitted it is inferred from the
            model's class name.
        historical_data: Initial history; must contain ``target_column`` and,
            for the date features, a ``'date'`` column. It is copied.
        target_scaler: Optional scaler used to convert LSTM outputs back to the
            original target scale.
        time_steps: Number of most recent rows fed to an LSTM.
    """

    def __init__(self, model, scaler, feature_columns, target_column, model_type=None, historical_data=None, target_scaler=None, time_steps=10):
        self.model = model
        self.scaler = scaler
        self.target_scaler = target_scaler  # For LSTM models
        self.feature_columns = list(feature_columns)
        self.target_column = target_column
        # 'LSTM' and 'Prophet' get dedicated handling; any other value is
        # treated as a feature-based regressor
        self.model_type = model_type if model_type is not None else self._infer_model_type(model)
        self.time_steps = time_steps  # For LSTM models

        # Store historical data for creating lag features
        self.historical_data = historical_data.copy() if historical_data is not None else pd.DataFrame()

        # Latest LSTM input window
        if self.model_type == 'LSTM':
            self.prediction_buffer = np.zeros((1, self.time_steps, len(self.feature_columns)))
            self.initialized_buffer = False

        # Cache for API responses
        self.api_cache = {}

        # Running record of predictions, observed values and error metrics
        self.performance_metrics = {'predictions': [], 'actuals': [], 'rmse': [], 'mae': [], 'timestamp': []}

        # Time of the last mock observation (used to rate-limit the generator)
        self.last_update = datetime.now()

        # For Prophet predictions
        if self.model_type == 'Prophet':
            self.last_forecast = None
            self.forecast_dates = None

    @staticmethod
    def _infer_model_type(model):
        """Guess the model family from the class name of ``model``."""
        class_name = type(model).__name__
        if class_name == 'Prophet':
            return 'Prophet'
        if class_name in ('Sequential', 'Functional'):
            # NOTE(review): assumes any Keras model passed here is the LSTM - confirm
            return 'LSTM'
        return class_name

    def _scale_model_inputs(self, rows):
        """Select the trained feature columns from ``rows`` and scale them.

        Raises:
            ValueError: If a trained feature is absent or contains missing
                values, since the scaler and model need every trained column.
        """
        missing = [col for col in self.feature_columns if col not in rows.columns]
        if missing:
            raise ValueError(f"Cannot build model inputs; missing feature columns: {missing}")
        X = rows[self.feature_columns]
        incomplete = X.columns[X.isna().any()].tolist()
        if incomplete:
            raise ValueError(
                f"Cannot build model inputs; features with missing values "
                f"(not enough history?): {incomplete}"
            )
        return self.scaler.transform(X)

    def prepare_features(self, new_data):
        """Build the model input for the newest row(s).

        Args:
            new_data: DataFrame of new observations with the same columns as
                the history.

        Returns:
            * LSTM: array of shape ``(1, rows, features)`` built from the last
              ``time_steps`` rows.
            * Prophet: DataFrame with a single ``'ds'`` value (the latest date).
            * otherwise: scaled feature array for the last row.
        """
        # Combine with historical to generate lag features
        combined_data = pd.concat([self.historical_data, new_data]).reset_index(drop=True)

        # Create date features
        if 'date' in combined_data.columns:
            # Accept string dates as well as datetime64
            combined_data['date'] = pd.to_datetime(combined_data['date'])
            dates = combined_data['date'].dt
            combined_data['day_of_week'] = dates.dayofweek
            combined_data['month'] = dates.month
            combined_data['day'] = dates.day
            combined_data['is_weekend'] = (combined_data['day_of_week'] >= 5).astype(int)
            combined_data['hour'] = dates.hour
            combined_data['quarter'] = dates.quarter

            # Cyclical encodings of day, month and hour
            combined_data['sin_day'] = np.sin(2 * np.pi * combined_data['day'] / 31)
            combined_data['cos_day'] = np.cos(2 * np.pi * combined_data['day'] / 31)
            combined_data['sin_month'] = np.sin(2 * np.pi * combined_data['month'] / 12)
            combined_data['cos_month'] = np.cos(2 * np.pi * combined_data['month'] / 12)
            combined_data['sin_hour'] = np.sin(2 * np.pi * combined_data['hour'] / 24)
            combined_data['cos_hour'] = np.cos(2 * np.pi * combined_data['hour'] / 24)

        # Create lag features
        for lag in [1, 3, 7, 14, 30]:
            if len(combined_data) > lag:
                combined_data[f'{self.target_column}_lag_{lag}'] = combined_data[self.target_column].shift(lag)

        # Add rolling statistics
        for window in [7, 14, 30]:
            if len(combined_data) > window:
                rolling = combined_data[self.target_column].rolling(window=window)
                combined_data[f'{self.target_column}_rolling_mean_{window}'] = rolling.mean()
                combined_data[f'{self.target_column}_rolling_std_{window}'] = rolling.std()
                combined_data[f'{self.target_column}_rolling_min_{window}'] = rolling.min()
                combined_data[f'{self.target_column}_rolling_max_{window}'] = rolling.max()

        if self.model_type == 'LSTM':
            # For LSTM, we need the last time_steps rows
            X_scaled = self._scale_model_inputs(combined_data.iloc[-self.time_steps:])

            # Reshape for LSTM [samples, time_steps, features]
            X_lstm = np.reshape(X_scaled, (1, X_scaled.shape[0], X_scaled.shape[1]))

            # Remember the latest window
            self.prediction_buffer = X_lstm
            self.initialized_buffer = True

            return X_lstm
        elif self.model_type == 'Prophet':
            # For Prophet, prepare a future dataframe holding the latest date
            latest_date = combined_data['date'].max()
            return pd.DataFrame({'ds': [latest_date]})
        else:
            # Feature-based models predict from the newest row only
            return self._scale_model_inputs(combined_data.iloc[-1:])

    def fetch_real_time_data(self, source="mock"):
        """Fetch the next observation from ``source``.

        Only the ``"mock"`` source is implemented: it synthesises one row from
        the last historical row, at most once every five seconds.

        Returns:
            A one-row DataFrame, or ``None`` when no new observation is due yet.

        Raises:
            ValueError: For an unknown ``source`` or when the mock generator
                has no history to start from.
        """
        if source != "mock":
            raise ValueError(f"Unsupported data source: {source!r}")

        current_time = datetime.now()

        # Only generate new data if enough time has passed. total_seconds()
        # counts the whole interval; .seconds would wrap around every day.
        if (current_time - self.last_update).total_seconds() < 5:
            return None

        if self.historical_data.empty or self.target_column not in self.historical_data.columns:
            raise ValueError("Mock data generation needs historical data containing the target column")

        self.last_update = current_time

        # Create synthetic data with some randomness and seasonality patterns
        hour_of_day = current_time.hour
        day_of_week = current_time.weekday()
        month = current_time.month

        # NOTE(review): the factors below are applied to the previous value, so
        # they compound from one observation to the next - confirm this is the
        # intended mock behaviour.
        base_value = self.historical_data[self.target_column].iloc[-1]

        # Higher demand during business hours
        hour_factor = 1.2 if 8 <= hour_of_day <= 17 else 0.8
        # Lower demand on weekends
        day_factor = 0.7 if day_of_week >= 5 else 1.1
        # Seasonal variations
        month_factor = 1.1 if month in [1, 2, 12] else (1.2 if month in [6, 7, 8] else 1.0)
        # Slight upward trend
        trend_factor = 1.001
        # Random noise
        noise = np.random.normal(0, 0.03)

        new_value = base_value * hour_factor * day_factor * month_factor * trend_factor * (1 + noise)

        new_row = {
            'date': pd.Timestamp(current_time),
            self.target_column: new_value
        }

        # Fill the remaining columns from the last historical row
        for col in self.historical_data.columns:
            if col in new_row:
                continue
            col_name = str(col)
            if '_lag_' in col_name or '_rolling_' in col_name:
                continue  # Lag and rolling columns are recomputed in prepare_features
            last_value = self.historical_data[col].iloc[-1]
            if pd.api.types.is_numeric_dtype(self.historical_data[col]):
                new_row[col] = last_value * (1 + np.random.normal(0, 0.02))
            else:
                # Labels such as country codes are carried forward unchanged
                new_row[col] = last_value

        return pd.DataFrame([new_row])

    def predict_demand(self, source="mock"):
        """Fetch the next observation, predict its demand and extend the history.

        Returns:
            Tuple ``(prediction, new_data)``: the prediction as a float and the
            one-row DataFrame it was made for, or ``(None, None)`` when no new
            observation is available yet.
        """
        new_data = self.fetch_real_time_data(source)
        if new_data is None:
            return None, None

        model_input = self.prepare_features(new_data)

        if self.model_type == 'LSTM':
            raw_output = np.asarray(self.model.predict(model_input)).reshape(-1, 1)
            if self.target_scaler is not None:
                # Convert back to the original target scale
                raw_output = self.target_scaler.inverse_transform(raw_output)
            prediction = float(raw_output[0, 0])
        elif self.model_type == 'Prophet':
            forecast = self.model.predict(model_input)
            self.last_forecast = forecast
            self.forecast_dates = forecast['ds']
            prediction = float(forecast['yhat'].iloc[0])
        else:
            prediction = float(np.asarray(self.model.predict(model_input)).ravel()[0])

        # The new observation becomes part of the history for the next call
        self.historical_data = pd.concat([self.historical_data, new_data], ignore_index=True)

        # Record the prediction next to the observed value and refresh the
        # running error metrics
        metrics = self.performance_metrics
        metrics['predictions'].append(prediction)
        metrics['actuals'].append(float(new_data[self.target_column].iloc[0]))
        metrics['timestamp'].append(new_data['date'].iloc[0] if 'date' in new_data.columns else datetime.now())
        errors = np.asarray(metrics['predictions']) - np.asarray(metrics['actuals'])
        metrics['rmse'].append(float(np.sqrt(np.mean(errors ** 2))))
        metrics['mae'].append(float(np.mean(np.abs(errors))))

        return prediction, new_data

# 5. Create Interactive Dashboard
def create_dashboard(predictor, target_column):
    """Build the dashboard figure and a callback that refreshes it.

    The figure has a full-width line chart (historical and predicted demand) on
    top and two number indicators (current and predicted demand) below.

    Args:
        predictor: Object exposing ``historical_data`` (a DataFrame with a
            ``'date'`` column and ``target_column``) and ``predict_demand()``.
        target_column: Name of the demand column to display.

    Returns:
        Tuple ``(fig, update_dashboard)``. Calling ``update_dashboard()`` asks the
        predictor for a new prediction, updates ``fig`` in place when one is
        available and returns ``fig``.
    """
    fig = make_subplots(
        rows=2, cols=2,
        specs=[[{"colspan": 2}, None],
               [{"type": "indicator"}, {"type": "indicator"}]],
        subplot_titles=("Historical and Predicted Demand",
                         "Current Demand", "Predicted Demand")
    )

    history = predictor.historical_data.copy()

    def _demand_indicator(title, value, reference):
        # Number-plus-delta indicator with two-decimal formatting
        return go.Indicator(
            mode="number+delta",
            value=value,
            title={"text": title},
            delta={'reference': reference, 'valueformat': '.2f'},
            number={'valueformat': '.2f'}
        )

    # Trace 0: observed demand so far
    observed_trace = go.Scatter(
        x=history['date'],
        y=history[target_column],
        mode='lines',
        name='Historical Demand',
        line=dict(color='blue')
    )
    fig.add_trace(observed_trace, row=1, col=1)

    # Trace 1: predictions, filled in by update_dashboard
    predicted_trace = go.Scatter(
        x=[],
        y=[],
        mode='lines+markers',
        name='Predicted Demand',
        line=dict(color='red')
    )
    fig.add_trace(predicted_trace, row=1, col=1)

    # Traces 2 and 3: indicators; the predicted one starts at the current value
    latest_value = history[target_column].iloc[-1] if not history.empty else 0
    previous_value = history[target_column].iloc[-2] if len(history) > 1 else 0
    fig.add_trace(_demand_indicator("Current Demand", latest_value, previous_value), row=2, col=1)
    fig.add_trace(_demand_indicator("Predicted Demand", latest_value, latest_value), row=2, col=2)

    fig.update_layout(
        height=600,
        title_text=f"Real-Time Medical Resource Demand Dashboard: {target_column}",
        showlegend=True
    )

    # Rolling record of the most recent predictions and their timestamps
    prediction_history = []
    timestamp_history = []

    def update_dashboard():
        prediction, new_data = predictor.predict_demand()

        # Nothing new to show: hand back the figure unchanged
        if prediction is None or new_data is None:
            return fig

        prediction_history.append(prediction)
        timestamp_history.append(new_data['date'].iloc[0])

        # Keep only the last 50 predictions (no-op while there are fewer)
        del prediction_history[:-50]
        del timestamp_history[:-50]

        refreshed = predictor.historical_data.copy()
        observed, predicted, current_indicator, predicted_indicator = fig.data[:4]

        # Update historical demand line
        observed.x = refreshed['date']
        observed.y = refreshed[target_column]

        # Update prediction line
        predicted.x = timestamp_history
        predicted.y = prediction_history

        # Update current demand indicator
        current_value = refreshed[target_column].iloc[-1]
        if len(refreshed) > 1:
            reference_value = refreshed[target_column].iloc[-2]
        else:
            reference_value = current_value
        current_indicator.value = current_value
        current_indicator.delta.reference = reference_value

        # Update predicted demand indicator
        predicted_indicator.value = prediction
        predicted_indicator.delta.reference = current_value

        # Stamp the title with the refresh time
        fig.update_layout(
            title_text=f"Real-Time Medical Resource Demand Dashboard: {target_column}<br>Last Updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        )

        return fig

    return fig, update_dashboard

# 6. Run Real-Time Prediction System
def run_real_time_system(model, scaler, df, target_column, model_type=None, target_scaler=None):
    """Show Start/Stop controls that drive the real-time prediction dashboard.

    Args:
        model: Fitted model handed to ``RealTimePredictor``.
        scaler: Fitted feature scaler handed to ``RealTimePredictor``.
        df: Historical data used both as the predictor's history and to derive
            the feature column list.
        target_column: Name of the column to predict.
        model_type: ``'LSTM'``, ``'Prophet'`` or another label; inferred from
            the model's class name when omitted.
        target_scaler: Optional target scaler for LSTM outputs.
    """
    # The predictor must select exactly the columns the model was trained on,
    # i.e. the feature columns produced by preprocess_data (not the raw columns)
    training_features, _ = preprocess_data(df, target_column)
    feature_columns = list(training_features.columns)

    if model_type is None:
        model_class = type(model).__name__
        if model_class == 'Prophet':
            model_type = 'Prophet'
        elif model_class in ('Sequential', 'Functional'):
            # NOTE(review): assumes a Keras model here is the LSTM - confirm
            model_type = 'LSTM'
        else:
            model_type = model_class

    predictor = RealTimePredictor(
        model, scaler, feature_columns, target_column, model_type,
        historical_data=df, target_scaler=target_scaler
    )

    # Set up dashboard
    fig, update_dashboard = create_dashboard(predictor, target_column)

    # Create buttons for dashboard control
    start_button = widgets.Button(description='Start Real-Time Prediction')
    stop_button = widgets.Button(description='Stop')
    output = widgets.Output()

    # Define button behaviors
    running = False

    def start_clicked(b):
        nonlocal running
        running = True
        with output:
            clear_output(wait=True)
            print(f"Starting real-time prediction for {target_column}...")
            display(fig)

            # NOTE(review): this loop runs inside the button callback, so the
            # Stop click can presumably only be handled once the callback
            # yields - confirm in the target frontend.
            while running:
                # A separate name keeps `fig` referring to the enclosing
                # figure; assigning to `fig` here would make it local to this
                # function and break the display(fig) call above.
                refreshed_fig = update_dashboard()
                clear_output(wait=True)
                print(f"Real-time prediction for {target_column} (Press 'Stop' to end)")
                print(f"Last Updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
                display(refreshed_fig)
                time.sleep(2)  # Update every 2 seconds

    def stop_clicked(b):
        nonlocal running
        running = False
        with output:
            clear_output(wait=True)
            print("Real-time prediction stopped.")

    start_button.on_click(start_clicked)
    stop_button.on_click(stop_clicked)

    # Display buttons and output
    display(widgets.HBox([start_button, stop_button]))
    display(output)

# Execute the main workflow
if __name__ == "__main__":
    print("\n==== Starting Medical Resource Demand Prediction System ====")

    # Let the user select which target to predict
    if len(target_cols) > 1:
        target_selector = widgets.Dropdown(
            options=target_cols,
            value=target_cols[0],
            description='Select Resource:',
        )

        def on_target_change(change):
            """Retrain for the newly selected target and restart the dashboard."""
            clear_output(wait=True)
            print(f"Selected target: {change.new}")
            # Re-run the model building for the new target
            features, target = preprocess_data(df, change.new)
            # build_and_evaluate_models needs the raw frame and the date/target
            # column names too, and returns more than the two values used here
            best_model, scaler, *_ = build_and_evaluate_models(
                features, target, df, 'date', change.new
            )
            # Start the real-time system
            run_real_time_system(best_model, scaler, df, change.new)

        target_selector.observe(on_target_change, names='value')
        display(target_selector)
    else:
        # If only one target, start immediately with the model built above
        run_real_time_system(best_model, scaler, df, selected_target)