In [None]:
import os

# Work from the project folder on Google Drive so that the relative file names
# used when saving artifacts resolve there.
# Note: the trailing space is part of the folder name.
project_dir = '/content/drive/MyDrive/Recommendations '
os.chdir(project_dir)

# Echo the resulting directory as a sanity check.
print("Current Directory:", os.getcwd())


Current Directory: /content/drive/MyDrive/Recommendations 


In [None]:
# STEP 1: Install required packages (if not installed)
!pip install pandas scikit-learn

# STEP 2: Imports
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# STEP 3: Load dataset
df = pd.read_csv("/content/drive/MyDrive/Recommendations /Final stroke recommendations .csv")  # Replace with your file path

# STEP 4: Drop 'id' column if present (errors='ignore' makes this a no-op otherwise)
# STEP 5: Drop rows with any missing value
df = df.drop(columns=['id'], errors='ignore').dropna()

# STEP 6: Encode categorical columns
# Every text column (inputs and outputs alike) is replaced by integer codes;
# the fitted encoder is kept per column so codes can be decoded later.
label_encoders = {}
categorical_columns = df.select_dtypes(include=['object']).columns

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store for later decoding

# STEP 7: Define input (X) and output (Y) features
# The target list is written once so X and Y can never drift apart.
target_columns = ['stroke_stage', 'stroke_type', 'recommended_doctor', 'medication', 'recommended_duration']
X = df.drop(columns=target_columns)
Y = df[target_columns]

# STEP 8: Train/test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# 🔹 DEBUG: Print unique values in each output column before training
for col in Y_train.columns:
    print(f"Unique labels in {col}:", Y_train[col].unique())

# STEP 9: Train model using RandomForest (one forest is fitted per output column)
rf = RandomForestClassifier(n_estimators=200, random_state=42)
multi_model = MultiOutputClassifier(rf)
multi_model.fit(X_train, Y_train)

# Output file names are defined once so the confirmation message below always
# reports the files that were actually written.
model_file = 'final_stroke_recommendation_model.pkl'
encoders_file = 'final_label_encoders.pkl'
bundle_file = 'full_model.pkl'

# STEP 10: Save the trained model
with open(model_file, 'wb') as f:
    pickle.dump(multi_model, f)

# STEP 11: Save label encoders
with open(encoders_file, 'wb') as f:
    pickle.dump(label_encoders, f)

# STEP 12: Save EVERYTHING in a single file
with open(bundle_file, 'wb') as f:
    pickle.dump({"model": multi_model, "encoders": label_encoders}, f)

print(f"✅ Models saved: '{model_file}', '{encoders_file}', and '{bundle_file}'")



#updated code for recommendations and all with one model file

✅ Models saved: 'final_stroke_recommendation_model.pkl', 'final_label_encoders.pkl', and 'final_full_model.pkl'


In [None]:
from google.colab import files

# Download the three artifacts written by the training cell above, in order:
# the trained multi-output model, the label encoders, and the combined
# model + encoders bundle.
artifact_paths = (
    '/content/drive/MyDrive/Recommendations /final_stroke_recommendation_model.pkl',
    '/content/drive/MyDrive/Recommendations /final_label_encoders.pkl',
    '/content/drive/MyDrive/Recommendations /full_model.pkl',
)
for artifact_path in artifact_paths:
    files.download(artifact_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# STEP 1: Install required packages
!pip install pandas scikit-learn --quiet

# STEP 2: Imports
import pandas as pd
import pickle
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

# STEP 3: Load dataset
df = pd.read_csv("/content/drive/MyDrive/Recommendations /Final stroke recommendations .csv")

# STEP 4: Data Cleaning
# Drop the row identifier if present (as the first training cell does) so it is
# not picked up as a numeric feature, then drop rows with missing values.
df = df.drop(columns=['id'], errors='ignore').dropna()

# Convert duration to categorical bins if needed
df['recommended_duration'] = df['recommended_duration'].astype('category')

# STEP 5: Define outputs
output_cols = ['stroke_stage', 'stroke_type', 'recommended_doctor',
               'medication', 'recommended_duration']
Y = df[output_cols]
X = df.drop(columns=output_cols)

# STEP 6: Preprocessing setup
# Column types are read from X (inputs only). Reading them from the full frame
# would require removing the targets by name, which raises KeyError whenever a
# target is not of the expected dtype, and could leak a numeric target into
# the feature list.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=['number']).columns

# Numeric columns pass through unchanged; categorical columns are one-hot
# encoded, with categories unseen at fit time encoded as all zeros.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# STEP 7: Create pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', MultiOutputClassifier(
        RandomForestClassifier(n_estimators=200,
                              class_weight='balanced',
                              random_state=42))
    )
])

# STEP 8: Train model (on the full dataset; this cell holds nothing out)
pipeline.fit(X, Y)

# STEP 9: Save full pipeline with metadata
# NOTE: the pipeline is fitted on the original label values, so predict()
# already returns labels; 'category_mappings' is an index -> label lookup table.
model_package = {
    'model': pipeline,
    'input_columns': list(X.columns),
    'output_columns': output_cols,
    'categorical_cols': list(categorical_cols),
    'numerical_cols': list(numerical_cols),
    'category_mappings': {
        col: dict(enumerate(df[col].astype('category').cat.categories))
        for col in output_cols
    }
}

with open('stroke_recommendation_package.pkl', 'wb') as f:
    pickle.dump(model_package, f)

print("✅ Full model package saved as 'stroke_recommendation_package.pkl'")
print("Package contains:")
print("- Trained pipeline")
print("- Input/Output column names")
print("- Category mappings for decoding predictions")

✅ Full model package saved as 'stroke_recommendation_package.pkl'
Package contains:
- Trained pipeline
- Input/Output column names
- Category mappings for decoding predictions


In [None]:
from google.colab import files

# Download the pickled pipeline package (model + metadata) saved above.
package_path = '/content/drive/MyDrive/Recommendations /stroke_recommendation_package.pkl'
files.download(package_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# 1️⃣ **Load the Dataset**
df = pd.read_csv("/content/drive/MyDrive/Recommendations /Final stroke recommendations .csv")

# 2️⃣ **Define Input (X) & Output (Y) Columns**
input_columns = ['age', 'gender', 'hypertension', 'heart_disease', 'ever_married',
                 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
                 'smoking_status', 'trauma', 'medication_use', 'symptoms_duration']
output_columns = ['stroke_type', 'stroke_stage', 'medication', 'recommended_duration', 'recommended_doctor']

X = df[input_columns]
Y = df[output_columns]

# 3️⃣ **Identify Categorical & Numerical Columns**
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# 4️⃣ **Create a Preprocessing Pipeline**
# Text columns are one-hot encoded (unseen categories become all zeros);
# numeric columns are standardised.
preprocessor = ColumnTransformer([
    ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('scaler', StandardScaler(), numerical_cols)
])

# 5️⃣ **Train the MultiOutput RandomForestClassifier**
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")))
])

# Split the data for training & testing
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, Y_train)

# 6️⃣ **Save the Model**
# Index -> label lookup per output column. The model itself is trained on the
# raw label values, so its predictions are already labels.
category_mappings = {
    col: {i: cat for i, cat in enumerate(df[col].astype('category').cat.categories)}
    for col in output_columns
}

model_package = {
    'model': model,
    'input_columns': input_columns,
    'output_columns': output_columns,
    'categorical_cols': categorical_cols,
    'numerical_cols': numerical_cols,
    'category_mappings': category_mappings
}

# The file name is defined once so the confirmation message cannot disagree
# with the file that was actually written.
package_file = "final_stroke_recommendation_package.pkl"
joblib.dump(model_package, package_file)
print(f"✅ Model training complete! Saved as '{package_file}'.")


✅ Model training complete! Saved as 'Final stroke_recommendation_package.pkl'.


In [None]:
from google.colab import files

# Download the recommendation package (pipeline + metadata) saved above.
final_package_path = '/content/drive/MyDrive/Recommendations /final_stroke_recommendation_package.pkl'
files.download(final_package_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import joblib

RECOMMENDATION_MODEL_PATH = "/content/drive/MyDrive/Recommendations /final_stroke_recommendation_package.pkl"

# Sanity check: reload the saved package and list the keys it contains.
# Any failure while opening or loading is reported as a message, not raised.
try:
    with open(RECOMMENDATION_MODEL_PATH, "rb") as package_file:
        model_package = joblib.load(package_file)
    print(model_package.keys())
except Exception as load_error:
    print(f"Error loading file: {load_error}")


dict_keys(['model', 'input_columns', 'output_columns', 'categorical_cols', 'numerical_cols', 'category_mappings'])


In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# 1️⃣ **Load the Dataset**
df = pd.read_csv("/content/drive/MyDrive/Recommendations /Final stroke recommendations .csv")

# 2️⃣ **Define Input (X) & Output (Y) Columns**
input_columns = [
    'age', 'gender', 'hypertension', 'heart_disease', 'ever_married',
    'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
    'smoking_status', 'trauma', 'medication_use', 'symptoms_duration',
]
output_columns = [
    'stroke_type', 'stroke_stage', 'medication',
    'recommended_duration', 'recommended_doctor',
]

X, Y = df[input_columns], df[output_columns]

# 3️⃣ **Identify Categorical & Numerical Columns**
# Text (object) inputs are treated as categorical; int64/float64 inputs as numerical.
categorical_cols = list(X.select_dtypes(include=['object']).columns)
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

# ✅ Ensure categorical outputs are converted to numerical encoding
def encode_categorical_outputs(Y_df):
    """Encode every column of ``Y_df`` as integer category codes.

    Args:
        Y_df (pd.DataFrame): Target columns holding label values.

    Returns:
        tuple: ``(encoded, mappings)`` where ``encoded`` is a new DataFrame with
        the same index and columns holding integer codes, and ``mappings`` maps
        each column name to a ``{label: code}`` dict (codes follow the sorted
        category order).

    Notes:
        * The input frame is copied, not modified. Writing into the caller's
          frame triggered pandas' SettingWithCopy warnings when a slice of
          another DataFrame was passed in.
        * Codes are taken from ``.cat.codes``. Mapping a categorical column
          through a dict yields another *categorical* column, which
          scikit-learn rejects with "Unknown label type".
        * Missing values receive the code ``-1``; drop or impute them first if
          they should not be treated as a class.
    """
    encoded = Y_df.copy()
    mappings = {}
    for col in encoded.columns:
        as_category = encoded[col].astype('category')
        mappings[col] = {category: i for i, category in enumerate(as_category.cat.categories)}
        encoded[col] = as_category.cat.codes
    return encoded, mappings

Y, category_mappings = encode_categorical_outputs(Y)

# 4️⃣ **Create a Preprocessing Pipeline**
# One-hot encode the text inputs and standardise the numeric ones.
feature_steps = [
    ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('scaler', StandardScaler(), numerical_cols),
]
preprocessor = ColumnTransformer(feature_steps)

# 5️⃣ **Train the MultiOutput RandomForestClassifier**
forest = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', MultiOutputClassifier(forest)),
])

# Hold out 20% of the rows, then fit on the remaining 80%.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
model.fit(X_train, Y_train)

# 6️⃣ **Save the Model**
# Bundle the fitted pipeline with the column metadata and label mappings.
model_package = dict(
    model=model,
    input_columns=input_columns,
    output_columns=output_columns,
    categorical_cols=categorical_cols,
    numerical_cols=numerical_cols,
    category_mappings=category_mappings,
)

joblib.dump(model_package, "updated_final_stroke_recommendation_package.pkl")
print("✅ Model training complete! Saved as 'updated_final_stroke_recommendation_package.pkl'.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y_df[col] = Y_df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y_df[col] = Y_df[col].map(mappings[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y_df[col] = Y_df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

ValueError: Unknown label type: unknown. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# ✅ Load dataset
df = pd.read_csv("/content/drive/MyDrive/Recommendations /Final stroke recommendations .csv")

# ✅ Define input (X) & output (Y) columns
input_columns = ['age', 'gender', 'hypertension', 'heart_disease', 'ever_married',
                 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
                 'smoking_status', 'trauma', 'medication_use', 'symptoms_duration']

output_columns = ['stroke_type', 'stroke_stage', 'medication', 'recommended_duration', 'recommended_doctor']

# ✅ Handle missing values
# Rows with a missing output value are dropped from the whole frame *before*
# it is split into X and Y. Dropping them from Y alone would leave X with more
# rows than Y and break the later train/test split.
df = df.dropna(subset=output_columns)

# Explicit copies: X and Y are modified below, and modifying a slice of df
# triggers pandas' SettingWithCopy warnings.
X = df[input_columns].copy()
Y = df[output_columns].copy()

# Fill missing categorical values with an explicit "Unknown" category
text_feature_cols = X.select_dtypes(include=['object']).columns
if len(text_feature_cols) > 0:
    X[text_feature_cols] = X[text_feature_cols].fillna("Unknown")

# Fill missing numeric values with the column median. Writing "Unknown" into a
# numeric column would turn it into a mixed text/number column that the
# encoders cannot handle.
numeric_feature_cols = X.select_dtypes(include=['int64', 'float64']).columns
if len(numeric_feature_cols) > 0:
    X[numeric_feature_cols] = X[numeric_feature_cols].fillna(X[numeric_feature_cols].median())


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna("Unknown", inplace=True)  # Fill missing categorical values
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y.dropna(inplace=True)  # Drop rows with missing output values


In [None]:
# ✅ Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# ✅ Encode output variables (Y) properly
# Each target column becomes integer category codes; the label -> code table is
# kept per column so predictions can be decoded later.
category_mappings = {}
Y_encoded = Y.copy()

for col in output_columns:
    as_category = Y[col].astype("category")  # Ensure categorical type
    Y_encoded[col] = as_category.cat.codes  # Convert to integer category codes
    category_mappings[col] = {cat: i for i, cat in enumerate(as_category.cat.categories)}

# ✅ Check if encoding worked correctly
print(Y_encoded.dtypes)  # Should show "int" types for all target columns

# NOTE: the train/test split and model.fit() are intentionally NOT done here.
# This cell does not build a pipeline, so fitting `model` here would use
# whatever object an earlier cell left behind. The pipeline is built, the data
# split and the model fitted in the following cell.


stroke_type             int8
stroke_stage            int8
medication              int8
recommended_duration    int8
recommended_doctor      int8
dtype: object


In [None]:
# ✅ Create a preprocessing pipeline
# One-hot encode the text inputs (unseen categories are ignored) and
# standardise the numeric inputs.
column_steps = [
    ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('scaler', StandardScaler(), numerical_cols),
]
preprocessor = ColumnTransformer(column_steps)

# ✅ Train MultiOutput RandomForestClassifier
base_forest = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', MultiOutputClassifier(base_forest)),
])

# ✅ Split the data for training & testing (80% train / 20% test, fixed seed)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y_encoded, test_size=0.2, random_state=42
)

# ✅ Train the model
model.fit(X_train, Y_train)


In [None]:
# ✅ Save model & mappings
model_package = {
    'model': model,
    'input_columns': input_columns,
    'output_columns': output_columns,
    'categorical_cols': categorical_cols,
    'numerical_cols': numerical_cols,
    'category_mappings': category_mappings  # Save category mappings
}

# The file name is defined once so the confirmation message reports the file
# that was actually written.
updated_package_file = "updated_final_stroke_recommendation_package.pkl"
joblib.dump(model_package, updated_package_file)
print(f"✅ Model training complete! Saved as '{updated_package_file}'.")


✅ Model training complete! Saved as 'final_stroke_recommendation_package.pkl'.


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# ✅ Predict on the held-out inputs
# Computed here so the evaluation does not depend on a `Y_pred` variable left
# over from a cell that is no longer part of the notebook.
Y_pred = model.predict(X_test)

# category_mappings stores label -> code; decoding needs code -> label.
reverse_mappings = {
    col: {code: label for label, code in mapping.items()}
    for col, mapping in category_mappings.items()
}

# ✅ Ensure Y_test is converted to string categories
Y_test_decoded = pd.DataFrame(Y_test.copy())  # Create a copy to avoid modifying the original data

for col in Y_test_decoded.columns:
    if col in reverse_mappings:
        Y_test_decoded[col] = Y_test_decoded[col].map(reverse_mappings[col])

# ✅ Ensure Y_pred is fully decoded into categories
Y_pred_decoded = pd.DataFrame(Y_pred, columns=Y_test.columns)

for col in Y_pred_decoded.columns:
    if col in reverse_mappings:
        Y_pred_decoded[col] = Y_pred_decoded[col].round().astype(int)  # Convert float predictions to int
        Y_pred_decoded[col] = Y_pred_decoded[col].map(reverse_mappings[col])

# ✅ Check Model Performance
print("🔹 Model Performance:")
for col in Y_test.columns:
    print(f"\n🎯 {col}:")
    print(classification_report(Y_test_decoded[col], Y_pred_decoded[col]))

# ✅ Overall Accuracy
# Element-wise accuracy: every (row, target) cell counts as one prediction.
overall_accuracy = accuracy_score(Y_test_decoded.values.flatten(), Y_pred_decoded.values.flatten())
print(f"\n✅ Overall Accuracy: {overall_accuracy:.2f}")


🔹 Model Performance:

🎯 stroke_type:
              precision    recall  f1-score   support

 Hemorrhagic       1.00      1.00      1.00        40
    Ischemic       1.00      1.00      1.00       200
         TIA       1.00      1.00      1.00         5

    accuracy                           1.00       245
   macro avg       1.00      1.00      1.00       245
weighted avg       1.00      1.00      1.00       245


🎯 stroke_stage:
              precision    recall  f1-score   support

    Critical       1.00      0.94      0.97        36
       Early       0.99      1.00      0.99        76
    Moderate       0.99      0.99      0.99       133

    accuracy                           0.99       245
   macro avg       0.99      0.98      0.98       245
weighted avg       0.99      0.99      0.99       245


🎯 medication:
                                 precision    recall  f1-score   support

Alteplase + Intensive Care Meds       1.00      0.94      0.97        36
                      

In [None]:
from google.colab import files

# Download the updated recommendation package.
# The package was saved with a relative file name, i.e. into the current
# working directory, so it is downloaded by the same relative name rather than
# from a hard-coded '/content/' path that only matches when the working
# directory has not been changed.
files.download('updated_final_stroke_recommendation_package.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
import joblib

import os

# ✅ Load dataset
file_path = "/content/drive/MyDrive/Recommendations /Final stroke recommendations .csv"  # Update path if needed
df = pd.read_csv(file_path)

# ✅ Define input and output labels
output_labels = ["stroke_stage", "stroke_type", "recommended_doctor", "medication", "recommended_duration"]
input_labels = [col for col in df.columns if col not in output_labels]

# ✅ Separate inputs and outputs
X = df[input_labels].copy()
Y = df[output_labels].copy()

# ✅ Identify categorical and numerical features
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

# ✅ Encode categorical inputs (dense one-hot matrix)
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_categorical = encoder.fit_transform(X[categorical_features])

# ✅ Scale numerical inputs to [0, 1]
scaler = MinMaxScaler()
X_numerical = scaler.fit_transform(X[numerical_features])

# ✅ Ensure correct dimensions for stacking
X_numerical = X_numerical.reshape(-1, 1) if X_numerical.ndim == 1 else X_numerical
X = np.hstack([X_categorical, X_numerical])

# ✅ Encode output labels (one integer-coded array per output head)
label_encoders = {}
Y_encoded = []

for col in output_labels:
    le = LabelEncoder()
    Y_encoded.append(np.asarray(le.fit_transform(Y[col])))
    label_encoders[col] = le  # Store for decoding later

# ✅ Split dataset
# train_test_split treats each positional argument as ONE array with n_samples
# rows. Passing the list of per-head arrays directly makes it look like an
# array with len(output_labels) rows ("inconsistent numbers of samples").
# So the targets are stacked into an (n_samples, n_heads) matrix, split once
# together with X, and unpacked into per-head arrays afterwards.
Y_matrix = np.column_stack(Y_encoded)
X_train, X_test, Y_train_matrix, Y_test_matrix = train_test_split(
    X, Y_matrix, test_size=0.2, random_state=42
)
Y_train = [Y_train_matrix[:, i] for i in range(len(output_labels))]
Y_test = [Y_test_matrix[:, i] for i in range(len(output_labels))]

# ✅ Define Multi-Task MLP Model
input_layer = Input(shape=(X.shape[1],))

# Shared hidden layers
hidden1 = Dense(64, activation="relu")(input_layer)
hidden2 = Dense(32, activation="relu")(hidden1)

# Separate output layers: one softmax head per target, sized to its class count
outputs = []
for col in output_labels:
    outputs.append(Dense(len(label_encoders[col].classes_), activation="softmax", name=col)(hidden2))

# Compile model
# Losses and metrics are given per output, in output order. A single-entry
# metrics list is rejected for multi-output models by newer Keras versions.
model = Model(inputs=input_layer, outputs=outputs)
model.compile(
    optimizer="adam",
    loss=["sparse_categorical_crossentropy"] * len(output_labels),
    metrics=[["accuracy"] for _ in output_labels]
)

# ✅ Train Model
history = model.fit(
    X_train, Y_train,
    validation_data=(X_test, Y_test),
    epochs=50, batch_size=16
)

# ✅ Save Model & Encoders
# Create the output directory first so saving does not fail when it is missing.
os.makedirs("/mnt/data", exist_ok=True)
model.save("/mnt/data/stroke_recommendation_model.h5")
joblib.dump(encoder, "/mnt/data/input_encoder.pkl")
joblib.dump(scaler, "/mnt/data/input_scaler.pkl")
joblib.dump(label_encoders, "/mnt/data/output_label_encoders.pkl")

print("✅ Model & encoders saved successfully!")


ValueError: Found input variables with inconsistent numbers of samples: [1222, 5]

In [None]:
import numpy as np
import pandas as pd
import re

# ✅ Load the raw recommendations dataset from Google Drive
# (the spaces inside the path are part of the folder and file names)
file_path = "/content/drive/MyDrive/Recommendations /Final stroke recommendations .csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# ✅ Identify time-related columns (without affecting other fields)
time_units = ["day", "hour", "hr", "hrs", "h"]

# A value only counts as a duration when a digit is directly followed by one
# of the units (optionally plural) as a whole word, e.g. "3 days", "24 hours",
# "2h". A plain substring test is not enough: the unit "h" alone occurs in
# ordinary words such as "Hemorrhagic" or "children", which would mark
# non-time columns as time-related and get their text replaced by numbers.
_TIME_VALUE_RE = re.compile(
    r"\d\s*(?:"
    + "|".join(re.escape(unit) for unit in sorted(time_units, key=len, reverse=True))
    + r")s?\b",
    re.IGNORECASE,
)

def contains_time_units(value):
    """Return True if ``value`` is a string containing a number followed by a time unit.

    Args:
        value: Any cell value; non-strings are never considered time-related.

    Returns:
        bool: True for strings such as "3 days" or "Within 24 hours", False for
        everything else (including text that merely contains the letters of a
        unit, or numbers without a unit).
    """
    return isinstance(value, str) and _TIME_VALUE_RE.search(value) is not None

# A column is time-related if at least one of its values (viewed as text)
# is recognised by contains_time_units.
columns_with_time = list(filter(
    lambda name: df[name].astype(str).apply(contains_time_units).any(),
    df.columns,
))

# ✅ Extract only numbers from time-related fields (Keep other fields unchanged)
def extract_number(value):
    """Reduce a text value to the first whole number it contains.

    Args:
        value: Any cell value.

    Returns:
        The first run of digits in ``value`` as an ``int`` when ``value`` is a
        string containing digits; ``np.nan`` when it is a string without
        digits; ``value`` itself, untouched, when it is not a string.
    """
    if not isinstance(value, str):
        return value  # Non-string values pass through unchanged

    first_digits = re.search(r'\d+', value)
    if first_digits is None:
        return np.nan
    return int(first_digits.group())

# ✅ Clean each time-related column: keep only the numeric part of its values,
# then fill whatever is still missing with the column median.
# Columns not listed in columns_with_time are left untouched.
for col in columns_with_time:
    df[col] = df[col].apply(extract_number)
    if df[col].isnull().any():
        column_median = df[col].median()
        df[col] = df[col].fillna(column_median)

# ✅ Save cleaned dataset **without removing stroke type, doctor, or medication**
cleaned_path = "/content/drive/MyDrive/Recommendations /final_cleaned_stroke_data.csv"
df.to_csv(cleaned_path, index=False)
print(f"✅ Cleaned dataset saved at: {cleaned_path}")


✅ Cleaned dataset saved at: /content/drive/MyDrive/Recommendations /final_cleaned_stroke_data.csv


In [None]:
# --- Claude version of the stroke recommendation model (next cell) ---

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

class StrokeRecommendationModel:
    """Multi-output stroke recommendation model.

    Wraps a scikit-learn pipeline (scaling + one-hot encoding followed by one
    random forest per output column) together with the label encoders needed
    to turn predicted codes back into labels.
    """

    def __init__(self, input_columns, output_columns):
        """
        Initialize Stroke Recommendation Model

        Args:
            input_columns (list): Input feature columns
            output_columns (list): Target prediction columns
        """
        self.input_columns = input_columns
        self.output_columns = output_columns

        # Initialize label encoders for output columns
        self.label_encoders = {col: LabelEncoder() for col in output_columns}

        # Model and preprocessing components
        self.model = None
        self.preprocessor = None

        # Feature column groups; filled in by preprocess_data(). Initialised
        # here so save_model() never fails on a missing attribute.
        self.categorical_cols = []
        self.numerical_cols = []

        # Mapping dictionaries
        self.input_mappings = {}
        self.output_mappings = {}

    def preprocess_data(self, X, Y):
        """
        Preprocess input and output data

        Rows with a missing value in any output column are removed from BOTH
        X and Y so the two stay row-aligned. Missing text features become the
        category 'Unknown'; missing numeric features are filled with the
        column median (so numeric columns stay numeric).

        Args:
            X (pd.DataFrame): Input features
            Y (pd.DataFrame): Target variables (must contain the output columns
                and have the same number of rows as X)

        Returns:
            tuple: (X_processed, Y_encoded) with identical row order; Y_encoded
            holds integer codes, with columns ordered as ``output_columns``.
        """
        # Positional mask of rows whose targets are all present.
        complete_rows = Y[self.output_columns].notna().all(axis=1).to_numpy()

        X_processed = X.loc[complete_rows].copy()
        # Selecting by self.output_columns also fixes the column order, which
        # predict() relies on when decoding by position.
        Y_processed = Y.loc[complete_rows, self.output_columns].copy()

        # Separate categorical and numerical columns
        self.categorical_cols = X_processed.select_dtypes(include=['object']).columns.tolist()
        self.numerical_cols = X_processed.select_dtypes(include=['int64', 'float64']).columns.tolist()

        # Handle missing feature values per column type
        if self.categorical_cols:
            X_processed[self.categorical_cols] = (
                X_processed[self.categorical_cols].fillna('Unknown')
            )
        if self.numerical_cols:
            X_processed[self.numerical_cols] = X_processed[self.numerical_cols].fillna(
                X_processed[self.numerical_cols].median()
            )

        # Encode output columns
        Y_encoded = Y_processed.copy()
        for col in self.output_columns:
            # Fit label encoder and transform
            Y_encoded[col] = self.label_encoders[col].fit_transform(Y_processed[col])

            # Store mappings for reverse lookup (code -> label)
            self.output_mappings[col] = {
                idx: label for idx, label in enumerate(self.label_encoders[col].classes_)
            }

        return X_processed, Y_encoded

    def create_preprocessing_pipeline(self):
        """
        Create preprocessing pipeline for features

        Numeric columns are standardised; categorical columns are one-hot
        encoded, ignoring categories not seen during fitting.

        Returns:
            ColumnTransformer: Preprocessor for features
        """
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), self.numerical_cols),
                ('cat', OneHotEncoder(handle_unknown='ignore'), self.categorical_cols)
            ])

        return self.preprocessor

    def create_model_pipeline(self):
        """
        Create machine learning pipeline

        Returns:
            Pipeline: Complete model pipeline (preprocessor + multi-output
            random forest)
        """
        self.model = Pipeline([
            ('preprocessor', self.create_preprocessing_pipeline()),
            ('classifier', MultiOutputClassifier(
                RandomForestClassifier(
                    n_estimators=100,
                    random_state=42,
                    class_weight='balanced'
                )
            ))
        ])

        return self.model

    def train(self, X, Y, test_size=0.2, random_state=42):
        """
        Train the model and print a classification report per output column

        Args:
            X (pd.DataFrame): Input features
            Y (pd.DataFrame): Target variables
            test_size (float): Proportion of test data
            random_state (int): Random seed for reproducibility
        """
        # Preprocess data
        X_processed, Y_processed = self.preprocess_data(X, Y)

        # Split data
        X_train, X_test, Y_train, Y_test = train_test_split(
            X_processed, Y_processed,
            test_size=test_size,
            random_state=random_state
        )

        # Create and train model
        self.create_model_pipeline()
        self.model.fit(X_train, Y_train)

        # Evaluate model
        Y_pred = self.model.predict(X_test)

        # Print classification report for each output column.
        # Y_pred columns follow the order of Y_train, i.e. self.output_columns.
        for i, col in enumerate(self.output_columns):
            print(f"\nClassification Report for {col}:")
            print(classification_report(
                Y_test[col],
                Y_pred[:, i]
            ))

    def predict(self, X):
        """
        Make predictions for the FIRST row of X

        Args:
            X (pd.DataFrame): Input features; only the first row is decoded.
                Columns are re-ordered to the training columns, and columns
                absent from X are filled with 'Unknown'.

        Returns:
            dict: {output column: decoded label} for the first row. A code with
            no known label is reported as "Unknown_<column>_Category".

        Raises:
            RuntimeError: If no model has been trained or loaded yet.
        """
        if self.model is None:
            raise RuntimeError(
                "Model is not available; call train() or load_model() first."
            )

        # Ensure input matches training columns
        X_processed = X.reindex(columns=self.input_columns, fill_value='Unknown')

        # Make predictions
        predictions = self.model.predict(X_processed)
        first_row = predictions[0]

        # Decode predictions
        decoded_predictions = {}
        for i, col in enumerate(self.output_columns):
            decoded_predictions[col] = self.output_mappings[col].get(
                first_row[i],
                f"Unknown_{col}_Category"
            )

        return decoded_predictions

    def save_model(self, filename='claude_stroke_recommendation_model.pkl'):
        """
        Save trained model and associated metadata

        Args:
            filename (str): Path to save model
        """
        model_package = {
            'model': self.model,
            'input_columns': self.input_columns,
            'output_columns': self.output_columns,
            'categorical_cols': self.categorical_cols,
            'numerical_cols': self.numerical_cols,
            'output_mappings': self.output_mappings,
            'label_encoders': self.label_encoders
        }

        joblib.dump(model_package, filename)
        print(f"✅ Model saved successfully to {filename}")

    def load_model(self, filename='claude_stroke_recommendation_model.pkl'):
        """
        Load saved model and restore it into this instance

        Every entry written by save_model() that is present in the file is
        copied back onto the instance, so predict() can be used directly after
        loading.

        SECURITY: the file is unpickled by joblib; only load files you trust.

        Args:
            filename (str): Path to load model from

        Returns:
            dict: Loaded model package
        """
        model_package = joblib.load(filename)

        restorable = (
            'model', 'input_columns', 'output_columns', 'categorical_cols',
            'numerical_cols', 'output_mappings', 'label_encoders',
        )
        for attribute in restorable:
            if attribute in model_package:
                setattr(self, attribute, model_package[attribute])

        return model_package

# Example usage
def main():
    """Train, save and smoke-test a StrokeRecommendationModel end to end.

    Reads the modified recommendations CSV, trains on the fixed feature/target
    columns, saves the model under its default file name, and prints the
    decoded prediction for the first row of the dataset.
    """
    # Feature and target column names expected in the CSV
    feature_names = [
        'age', 'gender', 'hypertension', 'heart_disease', 'ever_married',
        'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
        'smoking_status', 'trauma', 'medication_use', 'symptoms_duration'
    ]
    target_names = [
        'stroke_type', 'stroke_stage', 'medication',
        'recommended_duration', 'recommended_doctor'
    ]

    dataset = pd.read_csv("/content/drive/MyDrive/Recommendations /final modified recommendations file.csv")
    features = dataset[feature_names]
    targets = dataset[target_names]

    # Train and persist the model
    recommender = StrokeRecommendationModel(feature_names, target_names)
    recommender.train(features, targets)
    recommender.save_model()

    # Smoke test: predict for the first row (kept as a one-row DataFrame)
    first_row_predictions = recommender.predict(features.iloc[[0]])
    print("Sample Predictions:", first_row_predictions)


if __name__ == "__main__":
    main()


Classification Report for stroke_type:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00       200
           2       1.00      1.00      1.00         5

    accuracy                           1.00       245
   macro avg       1.00      1.00      1.00       245
weighted avg       1.00      1.00      1.00       245


Classification Report for stroke_stage:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        36
           1       0.99      1.00      0.99        76
           2       0.99      0.99      0.99       133

    accuracy                           0.99       245
   macro avg       0.99      0.99      0.99       245
weighted avg       0.99      0.99      0.99       245


Classification Report for medication:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99        36
         

In [None]:
# --- Validation script (next cell) ---

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

class StrokeRecommendationModelValidator:
    """
    Cross-validate and inspect a random-forest classifier per recommendation target.

    Every output column is treated as its own single-target classification
    problem over the same preprocessed input features.
    """

    def __init__(self):
        """
        Initialize model validator
        """
        # Define input and output columns
        self.input_columns = [
            'age', 'gender', 'hypertension', 'heart_disease', 'ever_married',
            'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
            'smoking_status', 'trauma', 'medication_use', 'symptoms_duration'
        ]

        self.output_columns = [
            'stroke_type', 'stroke_stage', 'medication',
            'recommended_duration', 'recommended_doctor'
        ]

        # Define column types
        self.numerical_cols = [
            'age', 'avg_glucose_level', 'bmi', 'symptoms_duration'
        ]

        self.categorical_cols = [
            'gender', 'hypertension', 'heart_disease', 'ever_married',
            'work_type', 'Residence_type', 'smoking_status',
            'trauma', 'medication_use'
        ]

        # Label encoders are created lazily, one per object-typed target column
        self.label_encoders = {}

    def _create_preprocessor(self):
        """
        Create preprocessing pipeline

        Returns:
            ColumnTransformer: Scales the numerical columns and one-hot
            encodes the categorical columns (unseen categories are ignored)
        """
        return ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), self.numerical_cols),
                ('cat', OneHotEncoder(handle_unknown='ignore'), self.categorical_cols)
            ])

    def _prepare_features(self, X):
        """
        Align the input frame with the expected input columns

        Missing categorical columns are added and filled with 'Unknown'.
        Missing numerical columns are rejected: a string placeholder cannot
        be scaled, so back-filling them would only fail later inside the
        preprocessing step with a much less helpful error.

        Args:
            X (pd.DataFrame): Input features

        Returns:
            pd.DataFrame: Features restricted to, and ordered as, input_columns

        Raises:
            ValueError: If any numerical input column is absent from X
        """
        missing_numeric = [col for col in self.numerical_cols if col not in X.columns]
        if missing_numeric:
            raise ValueError(
                f"Missing required numerical input columns: {missing_numeric}"
            )

        # Only categorical columns can still be missing here, so the string
        # placeholder is safe for the one-hot encoder.
        return X.reindex(columns=self.input_columns, fill_value='Unknown')

    def _encode_targets(self, Y):
        """
        Encode target variables

        Only object-typed columns are label-encoded; other columns are passed
        through unchanged. An encoder is fitted the first time a column is
        seen and reused (transform only) on later calls.

        Args:
            Y (pd.DataFrame): Target variables

        Returns:
            np.ndarray: Encoded target variables, one column per output column
        """
        # Work on a copy so the caller's frame is left untouched
        Y_encoded = Y.copy()

        for col in self.output_columns:
            if Y[col].dtype != 'object':
                continue

            if col not in self.label_encoders:
                le = LabelEncoder()
                Y_encoded[col] = le.fit_transform(Y[col])
                self.label_encoders[col] = le
            else:
                le = self.label_encoders[col]
                Y_encoded[col] = le.transform(Y[col])

        # Convert to numpy array
        return Y_encoded[self.output_columns].values

    def validate_model(self, X, Y):
        """
        Perform 5-fold cross-validation for each output column and print the scores

        Args:
            X (pd.DataFrame): Input features
            Y (pd.DataFrame): Target variables

        Raises:
            ValueError: If a numerical input column is missing from X
        """
        # Ensure input matches expected columns
        X_processed = self._prepare_features(X)

        # Encode target variables
        Y_multi = self._encode_targets(Y)

        # Preprocessing lives inside the pipeline so it is re-fitted on every
        # training fold rather than on the full data set.
        model = Pipeline([
            ('preprocessor', self._create_preprocessor()),
            ('classifier', RandomForestClassifier(
                n_estimators=100,
                random_state=42,
                class_weight='balanced'
            ))
        ])

        # Use standard KFold
        kf = KFold(n_splits=5, shuffle=True, random_state=42)

        # Perform cross-validation for each output column
        print("\nCross-Validation Results:")
        for i, col in enumerate(self.output_columns):
            print(f"\nValidation for {col}:")

            cv_scores = cross_val_score(
                model,
                X_processed,
                Y_multi[:, i],
                cv=kf,
                scoring='accuracy'
            )

            print(f"Cross-validation scores: {cv_scores}")
            print(f"Mean CV Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    def analyze_feature_importance(self, X, Y):
        """
        Print the ten most important features for each target variable

        A separate random forest is fitted on the full data for every output
        column and its impurity-based feature importances are reported.

        Args:
            X (pd.DataFrame): Input features
            Y (pd.DataFrame): Target variables

        Raises:
            ValueError: If a numerical input column is missing from X
        """
        # Ensure input matches expected columns
        X_processed = self._prepare_features(X)

        # Encode target variables
        Y_multi = self._encode_targets(Y)

        # Create preprocessor and fit
        preprocessor = self._create_preprocessor()
        X_transformed = preprocessor.fit_transform(X_processed)

        # Feature names follow the transformer order: numerical first, then
        # the expanded one-hot columns.
        feature_names = (
            self.numerical_cols +
            list(preprocessor.named_transformers_['cat']
                 .get_feature_names_out(self.categorical_cols))
        )

        # Analyze feature importance for each output column
        for i, col in enumerate(self.output_columns):
            print(f"\nFeature Importance for {col}:")

            clf = RandomForestClassifier(
                n_estimators=100,
                random_state=42,
                class_weight='balanced'
            )
            clf.fit(X_transformed, Y_multi[:, i])

            importances = clf.feature_importances_

            # Create and sort feature importance DataFrame
            feature_importance = pd.DataFrame({
                'feature': feature_names,
                'importance': importances
            }).sort_values('importance', ascending=False)

            print(feature_importance.head(10))

def main():
    """Load the recommendations CSV, then cross-validate and print feature importances."""
    feature_cols = [
        'age', 'gender', 'hypertension', 'heart_disease', 'ever_married',
        'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
        'smoking_status', 'trauma', 'medication_use', 'symptoms_duration'
    ]
    target_cols = [
        'stroke_type', 'stroke_stage', 'medication',
        'recommended_duration', 'recommended_doctor'
    ]

    # Read the full data set from Drive
    dataset = pd.read_csv("/content/drive/MyDrive/Recommendations /final modified recommendations file.csv")

    # Split into model inputs and prediction targets
    features = dataset[feature_cols]
    targets = dataset[target_cols]

    validator = StrokeRecommendationModelValidator()

    # Per-target cross-validation first, feature importances second
    validator.validate_model(features, targets)
    validator.analyze_feature_importance(features, targets)

# Entry-point guard: call main() only when this code runs as the top-level module.
if __name__ == "__main__":
    main()


Cross-Validation Results:

Validation for stroke_type:
Cross-validation scores: [1.         0.99591837 1.         1.         1.        ]
Mean CV Score: 0.9992 (+/- 0.0033)

Validation for stroke_stage:
Cross-validation scores: [0.99591837 1.         0.99180328 0.99180328 0.9795082 ]
Mean CV Score: 0.9918 (+/- 0.0137)

Validation for medication:
Cross-validation scores: [0.99591837 1.         0.99180328 0.99180328 0.9795082 ]
Mean CV Score: 0.9918 (+/- 0.0137)

Validation for recommended_duration:
Cross-validation scores: [0.99591837 1.         0.99180328 0.98770492 0.9795082 ]
Mean CV Score: 0.9910 (+/- 0.0141)

Validation for recommended_doctor:
Cross-validation scores: [0.99591837 1.         0.99180328 0.98770492 0.9795082 ]
Mean CV Score: 0.9910 (+/- 0.0141)

Feature Importance for stroke_type:
                           feature  importance
18     smoking_status_never smoked    0.198635
21                      trauma_Yes    0.100859
7                   hypertension_1    0.097643
20

In [None]:
from google.colab import files

# Download the saved stroke recommendation model file (.pkl) from Drive
files.download('/content/drive/MyDrive/Recommendations /claude_stroke_recommendation_model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, make_scorer, accuracy_score
from sklearn.impute import SimpleImputer

class StrokeRecommendationModel:
    """Multi-output random-forest pipeline predicting all recommendation targets together.

    Wraps preprocessing (imputation, scaling, one-hot encoding) and a
    MultiOutputClassifier in a single scikit-learn Pipeline, and keeps the
    per-target label encoders needed to decode predictions.
    """

    def __init__(self, input_columns, output_columns):
        """Remember the column lists and start with empty, untrained state."""
        self.input_columns = input_columns
        self.output_columns = output_columns

        # One encoder per target column
        self.label_encoders = {}
        for target in output_columns:
            self.label_encoders[target] = LabelEncoder()

        # Populated by create_preprocessing_pipeline() / train()
        self.preprocessor = None
        self.model = None
        self.feature_names = None

        # Diagnostics filled in during training
        self.class_distributions = {}
        self.output_mappings = {}

    def _validate_inputs(self, X, Y):
        """Raise ValueError if X/Y row counts differ or a target has fewer than two classes."""
        feature_rows, target_rows = X.shape[0], Y.shape[0]
        if feature_rows != target_rows:
            raise ValueError("X and Y must have the same number of samples")

        for target in self.output_columns:
            distinct = Y[target].nunique()
            if distinct >= 2:
                continue
            raise ValueError(f"Output column {target} has insufficient unique values ({distinct})")

    def _analyze_class_distribution(self, Y):
        """Print and store the relative class frequencies of every target column."""
        for target in self.output_columns:
            proportions = Y[target].value_counts(normalize=True)
            self.class_distributions[target] = proportions
            print(f"\nClass distribution for {target}:")
            print(proportions)

    def create_preprocessing_pipeline(self, X):
        """Build the ColumnTransformer from the dtypes found in X, store it and return it.

        int64/float64 input columns (except 'trauma') are median-imputed and
        scaled; object input columns are mode-imputed and one-hot encoded.
        """
        def input_columns_of(dtypes):
            # Columns of X with one of the given dtypes that are also model inputs
            matching = X.select_dtypes(include=dtypes).columns
            return [name for name in matching if name in self.input_columns]

        # 'trauma' is deliberately kept out of the numerical branch
        numerical_cols = [name for name in input_columns_of(['int64', 'float64'])
                          if name != 'trauma']
        categorical_cols = input_columns_of(['object'])

        numeric_branch = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        categorical_branch = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='infrequent_if_exist', max_categories=20))
        ])

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_branch, numerical_cols),
                ('cat', categorical_branch, categorical_cols)
            ])

        return self.preprocessor

    def train(self, X, Y, test_size=0.2, random_state=42):
        """Cross-validate, then fit on a train split and print per-target test reports.

        The pipeline left in self.model is the one fitted on the training
        split only (not on the full data).
        """
        self._validate_inputs(X, Y)
        self._analyze_class_distribution(Y)

        # Integer-encode each target and keep an index -> original label mapping
        Y_encoded = Y.copy()
        for target in self.output_columns:
            encoder = self.label_encoders[target]
            Y_encoded[target] = encoder.fit_transform(Y[target])
            self.output_mappings[target] = dict(enumerate(encoder.classes_))

        self.create_preprocessing_pipeline(X)

        forest = RandomForestClassifier(
            n_estimators=150,
            class_weight='balanced',
            max_depth=5,
            random_state=random_state
        )
        self.model = Pipeline([
            ('preprocessor', self.preprocessor),
            ('classifier', MultiOutputClassifier(forest))
        ])

        # A sample counts as correct only when every target is predicted correctly
        def exact_match_ratio(y_true, y_pred):
            rows_all_correct = np.all(y_pred == y_true, axis=1)
            return np.mean(rows_all_correct)

        exact_match_scorer = make_scorer(exact_match_ratio)
        cv_scores = cross_val_score(self.model, X, Y_encoded, cv=5, scoring=exact_match_scorer)

        print("\nCross-validation results:")
        print(f"Validation accuracy: {np.mean(cv_scores):.3f} ± {np.std(cv_scores):.3f}")

        # Hold-out split for the final fit and the printed reports
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y_encoded, test_size=test_size, random_state=random_state
        )

        self.model.fit(X_train, Y_train)
        Y_pred = self.model.predict(X_test)

        # Names of the transformed features, as produced by the fitted preprocessor
        fitted_preprocessor = self.model.named_steps['preprocessor']
        self.feature_names = fitted_preprocessor.get_feature_names_out()

        print("\nTest set evaluation:")
        for position, target in enumerate(self.output_columns):
            print(f"\nClassification Report for {target}:")
            print(classification_report(Y_test.iloc[:, position], Y_pred[:, position]))

    def save_model(self, filename):
        """Dump the pipeline together with its encoders and column metadata via joblib."""
        model_package = dict(
            model=self.model,
            label_encoders=self.label_encoders,
            input_columns=self.input_columns,
            output_columns=self.output_columns,
            output_mappings=self.output_mappings,
            feature_names=self.feature_names,
        )
        joblib.dump(model_package, filename)
        print(f"Model saved to {filename}")

def prepare_feature_dtypes(df, numeric_cols, categorical_cols):
    """Return a copy of df with model inputs coerced to the dtypes the pipeline expects.

    Args:
        df (pd.DataFrame): Raw data set.
        numeric_cols (list[str]): Columns parsed as numbers; unparseable or
            missing entries become 0.
        categorical_cols (list[str]): Columns converted to strings; missing
            entries become 'Unknown'.

    Returns:
        pd.DataFrame: New frame; the input frame is not modified.
    """
    prepared = df.copy()

    for col in numeric_cols:
        prepared[col] = pd.to_numeric(prepared[col], errors='coerce').fillna(0)

    for col in categorical_cols:
        values = prepared[col]
        # Replace missing values BEFORE stringifying: astype(str) would turn
        # NaN into the literal text 'nan', after which fillna() has nothing
        # left to fill.
        prepared[col] = values.astype(object).where(values.notna(), 'Unknown').astype(str)

    return prepared


def main():
    """Train the stroke recommendation model, run one sample prediction, and save it.

    Each stage (loading, training, saving) reports its own failure message
    and stops the run instead of raising.
    """
    # Updated input columns (remove 'trauma' if not present in data)
    input_columns = [
        'age', 'gender', 'hypertension', 'heart_disease', 'ever_married',
        'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
        'smoking_status', 'medication_use', 'symptoms_duration'
    ]

    output_columns = [
        'stroke_type', 'stroke_stage', 'medication',
        'recommended_duration', 'recommended_doctor'
    ]

    try:
        df = pd.read_csv("/content/drive/MyDrive/Recommendations /final modified recommendations file.csv")
        print(f"Data loaded: {df.shape[0]} samples")

        # Ensure data types match Flask expectations
        numeric_cols = ['age', 'hypertension', 'heart_disease',
                       'avg_glucose_level', 'bmi', 'symptoms_duration']
        # Every remaining input column is treated as categorical
        categorical_cols = [col for col in input_columns if col not in numeric_cols]
        df = prepare_feature_dtypes(df, numeric_cols, categorical_cols)

    except Exception as e:
        print(f"Error loading data: {e}")
        return

    model = StrokeRecommendationModel(input_columns, output_columns)

    try:
        model.train(df[input_columns], df[output_columns])

        # Test prediction with proper output handling.
        # Category strings are matched case-sensitively by the one-hot
        # encoder, so they must be spelled as in the training data
        # (lowercase smoking labels such as 'never smoked').
        sample_input = pd.DataFrame([{
            'age': 45.0,
            'gender': 'Male',
            'hypertension': 1.0,
            'heart_disease': 0.0,
            'ever_married': 'Yes',
            'work_type': 'Private',
            'Residence_type': 'Urban',
            'avg_glucose_level': 120.5,
            'bmi': 25.5,
            'smoking_status': 'formerly smoked',
            'medication_use': 'No',
            'symptoms_duration': 30.0
        }])

        prediction = model.model.predict(sample_input)
        print("\nSample prediction test:")
        for i, col in enumerate(output_columns):
            # Map the encoded class index back to its original label
            decoded = model.label_encoders[col].inverse_transform([prediction[0][i]])[0]
            print(f"{col}: {decoded}")

    except Exception as e:
        print(f"Training failed: {e}")
        return

    try:
        model.save_model('/content/drive/MyDrive/Recommendations /claude2_stroke_recommendation_model.pkl')
    except Exception as e:
        print(f"Error saving model: {e}")

# Entry-point guard: call main() only when this code runs as the top-level module.
if __name__ == "__main__":
    main()

Data loaded: 1222 samples

Class distribution for stroke_type:
stroke_type
Ischemic       0.788871
Hemorrhagic    0.190671
TIA            0.020458
Name: proportion, dtype: float64

Class distribution for stroke_stage:
stroke_stage
Moderate    0.546645
Early       0.319967
Critical    0.133388
Name: proportion, dtype: float64

Class distribution for medication:
medication
Clopidogrel + Blood Thinners       0.546645
Aspirin                            0.319967
Alteplase + Intensive Care Meds    0.133388
Name: proportion, dtype: float64

Class distribution for recommended_duration:
recommended_duration
14    0.546645
7     0.319967
21    0.133388
Name: proportion, dtype: float64

Class distribution for recommended_doctor:
recommended_doctor
Neurologist          0.546645
General Physician    0.319967
Stroke Specialist    0.133388
Name: proportion, dtype: float64

Cross-validation results:
Validation accuracy: 0.928 ± 0.010

Test set evaluation:

Classification Report for stroke_type:
      

In [None]:
# Drop the 'trauma' column and save the result in the same Drive folder as the
# source file. The output path is absolute so it does not depend on the
# notebook's current working directory; the next cell reads this exact path.
df = pd.read_csv("/content/drive/MyDrive/Recommendations /final modified recommendations file.csv").drop(columns=['trauma'])
df.to_csv("/content/drive/MyDrive/Recommendations /Recommendations_clean_data.csv", index=False)

In [None]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('/content/drive/MyDrive/Recommendations /Recommendations_clean_data.csv')  # Replace with your input CSV filename

# Remove decimals by converting to integers.
# Note: age is truncated toward zero by astype(int), while the glucose level
# is rounded to the nearest integer before the cast.
df['age'] = df['age'].astype(int)
df['avg_glucose_level'] = df['avg_glucose_level'].round().astype(int)

# Save the modified DataFrame to a new CSV file. The path is absolute so it does
# not depend on the current working directory, and the file name is spelled
# exactly as the training cell that loads it expects.
df.to_csv('/content/drive/MyDrive/Recommendations /FINAL RECOMMENDATOINS.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, make_scorer, accuracy_score
from sklearn.impute import SimpleImputer

class StrokeRecommendationModel:
    """Multi-output random-forest pipeline predicting all recommendation targets together.

    Wraps preprocessing (imputation, scaling, one-hot encoding) and a
    MultiOutputClassifier in a single scikit-learn Pipeline, and keeps the
    per-target label encoders needed to decode predictions.
    """

    def __init__(self, input_columns, output_columns):
        """Remember the column lists and start with empty, untrained state."""
        self.input_columns = input_columns
        self.output_columns = output_columns

        # One encoder per target column
        self.label_encoders = {}
        for target in output_columns:
            self.label_encoders[target] = LabelEncoder()

        # Populated by create_preprocessing_pipeline() / train()
        self.preprocessor = None
        self.model = None
        self.feature_names = None

        # Diagnostics filled in during training
        self.class_distributions = {}
        self.output_mappings = {}

    def _validate_inputs(self, X, Y):
        """Raise ValueError if X/Y row counts differ or a target has fewer than two classes."""
        feature_rows, target_rows = X.shape[0], Y.shape[0]
        if feature_rows != target_rows:
            raise ValueError("X and Y must have the same number of samples")

        for target in self.output_columns:
            distinct = Y[target].nunique()
            if distinct >= 2:
                continue
            raise ValueError(f"Output column {target} has insufficient unique values ({distinct})")

    def _analyze_class_distribution(self, Y):
        """Print and store the relative class frequencies of every target column."""
        for target in self.output_columns:
            proportions = Y[target].value_counts(normalize=True)
            self.class_distributions[target] = proportions
            print(f"\nClass distribution for {target}:")
            print(proportions)

    def create_preprocessing_pipeline(self, X):
        """Build the ColumnTransformer from the dtypes found in X, store it and return it.

        int64/float64 input columns (except 'trauma') are median-imputed and
        scaled; object input columns are mode-imputed and one-hot encoded.
        """
        def input_columns_of(dtypes):
            # Columns of X with one of the given dtypes that are also model inputs
            matching = X.select_dtypes(include=dtypes).columns
            return [name for name in matching if name in self.input_columns]

        # 'trauma' is deliberately kept out of the numerical branch
        numerical_cols = [name for name in input_columns_of(['int64', 'float64'])
                          if name != 'trauma']
        categorical_cols = input_columns_of(['object'])

        numeric_branch = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        categorical_branch = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='infrequent_if_exist', max_categories=20))
        ])

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_branch, numerical_cols),
                ('cat', categorical_branch, categorical_cols)
            ])

        return self.preprocessor

    def train(self, X, Y, test_size=0.2, random_state=42):
        """Cross-validate, then fit on a train split and print per-target test reports.

        The pipeline left in self.model is the one fitted on the training
        split only (not on the full data).
        """
        self._validate_inputs(X, Y)
        self._analyze_class_distribution(Y)

        # Integer-encode each target and keep an index -> original label mapping
        Y_encoded = Y.copy()
        for target in self.output_columns:
            encoder = self.label_encoders[target]
            Y_encoded[target] = encoder.fit_transform(Y[target])
            self.output_mappings[target] = dict(enumerate(encoder.classes_))

        self.create_preprocessing_pipeline(X)

        forest = RandomForestClassifier(
            n_estimators=150,
            class_weight='balanced',
            max_depth=5,
            random_state=random_state
        )
        self.model = Pipeline([
            ('preprocessor', self.preprocessor),
            ('classifier', MultiOutputClassifier(forest))
        ])

        # A sample counts as correct only when every target is predicted correctly
        def exact_match_ratio(y_true, y_pred):
            rows_all_correct = np.all(y_pred == y_true, axis=1)
            return np.mean(rows_all_correct)

        exact_match_scorer = make_scorer(exact_match_ratio)
        cv_scores = cross_val_score(self.model, X, Y_encoded, cv=5, scoring=exact_match_scorer)

        print("\nCross-validation results:")
        print(f"Validation accuracy: {np.mean(cv_scores):.3f} ± {np.std(cv_scores):.3f}")

        # Hold-out split for the final fit and the printed reports
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y_encoded, test_size=test_size, random_state=random_state
        )

        self.model.fit(X_train, Y_train)
        Y_pred = self.model.predict(X_test)

        # Names of the transformed features, as produced by the fitted preprocessor
        fitted_preprocessor = self.model.named_steps['preprocessor']
        self.feature_names = fitted_preprocessor.get_feature_names_out()

        print("\nTest set evaluation:")
        for position, target in enumerate(self.output_columns):
            print(f"\nClassification Report for {target}:")
            print(classification_report(Y_test.iloc[:, position], Y_pred[:, position]))

    def save_model(self, filename):
        """Dump the pipeline together with its encoders and column metadata via joblib."""
        model_package = dict(
            model=self.model,
            label_encoders=self.label_encoders,
            input_columns=self.input_columns,
            output_columns=self.output_columns,
            output_mappings=self.output_mappings,
            feature_names=self.feature_names,
        )
        joblib.dump(model_package, filename)
        print(f"Model saved to {filename}")

def prepare_feature_dtypes(df, numeric_cols, categorical_cols):
    """Return a copy of df with model inputs coerced to the dtypes the pipeline expects.

    Args:
        df (pd.DataFrame): Raw data set.
        numeric_cols (list[str]): Columns parsed as numbers; unparseable or
            missing entries become 0.
        categorical_cols (list[str]): Columns converted to strings; missing
            entries become 'Unknown'.

    Returns:
        pd.DataFrame: New frame; the input frame is not modified.
    """
    prepared = df.copy()

    for col in numeric_cols:
        prepared[col] = pd.to_numeric(prepared[col], errors='coerce').fillna(0)

    for col in categorical_cols:
        values = prepared[col]
        # Replace missing values BEFORE stringifying: astype(str) would turn
        # NaN into the literal text 'nan', after which fillna() has nothing
        # left to fill.
        prepared[col] = values.astype(object).where(values.notna(), 'Unknown').astype(str)

    return prepared


def main():
    """Train the stroke recommendation model, run one sample prediction, and save it.

    Each stage (loading, training, saving) reports its own failure message
    and stops the run instead of raising.
    """
    # Updated input columns (remove 'trauma' if not present in data)
    input_columns = [
        'age', 'gender', 'hypertension', 'heart_disease', 'ever_married',
        'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
        'smoking_status', 'medication_use', 'symptoms_duration'
    ]

    output_columns = [
        'stroke_type', 'stroke_stage', 'medication',
        'recommended_duration', 'recommended_doctor'
    ]

    try:
        df = pd.read_csv("/content/drive/MyDrive/Recommendations /FINAL RECOMMENDATOINS.csv")
        print(f"Data loaded: {df.shape[0]} samples")

        # Ensure data types match Flask expectations
        numeric_cols = ['age', 'hypertension', 'heart_disease',
                       'avg_glucose_level', 'bmi', 'symptoms_duration']
        # Every remaining input column is treated as categorical
        categorical_cols = [col for col in input_columns if col not in numeric_cols]
        df = prepare_feature_dtypes(df, numeric_cols, categorical_cols)

    except Exception as e:
        print(f"Error loading data: {e}")
        return

    model = StrokeRecommendationModel(input_columns, output_columns)

    try:
        model.train(df[input_columns], df[output_columns])

        # Test prediction with proper output handling.
        # Category strings are matched case-sensitively by the one-hot
        # encoder, so they must be spelled as in the training data
        # (lowercase smoking labels such as 'never smoked').
        sample_input = pd.DataFrame([{
            'age': 45.0,
            'gender': 'Male',
            'hypertension': 1.0,
            'heart_disease': 0.0,
            'ever_married': 'Yes',
            'work_type': 'Private',
            'Residence_type': 'Urban',
            'avg_glucose_level': 120.5,
            'bmi': 25.5,
            'smoking_status': 'formerly smoked',
            'medication_use': 'No',
            'symptoms_duration': 30.0
        }])

        prediction = model.model.predict(sample_input)
        print("\nSample prediction test:")
        for i, col in enumerate(output_columns):
            # Map the encoded class index back to its original label
            decoded = model.label_encoders[col].inverse_transform([prediction[0][i]])[0]
            print(f"{col}: {decoded}")

    except Exception as e:
        print(f"Training failed: {e}")
        return

    try:
        model.save_model('/content/drive/MyDrive/Recommendations /CLAUDE_RECOMMENDATOIN_MODEL')
    except Exception as e:
        print(f"Error saving model: {e}")

# Entry-point guard: call main() only when this code runs as the top-level module.
if __name__ == "__main__":
    main()

Data loaded: 1222 samples

Class distribution for stroke_type:
stroke_type
Ischemic       0.788871
Hemorrhagic    0.190671
TIA            0.020458
Name: proportion, dtype: float64

Class distribution for stroke_stage:
stroke_stage
Moderate    0.546645
Early       0.319967
Critical    0.133388
Name: proportion, dtype: float64

Class distribution for medication:
medication
Clopidogrel + Blood Thinners       0.546645
Aspirin                            0.319967
Alteplase + Intensive Care Meds    0.133388
Name: proportion, dtype: float64

Class distribution for recommended_duration:
recommended_duration
14    0.546645
7     0.319967
21    0.133388
Name: proportion, dtype: float64

Class distribution for recommended_doctor:
recommended_doctor
Neurologist          0.546645
General Physician    0.319967
Stroke Specialist    0.133388
Name: proportion, dtype: float64

Cross-validation results:
Validation accuracy: 0.930 ± 0.014

Test set evaluation:

Classification Report for stroke_type:
      

In [None]:
from google.colab import files

# Download the saved stroke recommendation model bundle (joblib file written by save_model) from Drive
files.download('/content/drive/MyDrive/Recommendations /CLAUDE_RECOMMENDATOIN_MODEL')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>