In [None]:
# prompt: como programador necesito codigo para basado en la base de datos en excel "Copia de BASE_PACIENTES_TRIAGE_2024.xlsx", tomados de mi google drive con la informacion del modelo medico NEWS2 que incluyen: Edad_Paciente, SATURACION_SpO2, RITMO_CARDIACO_Lpm, TEMPERATURA_°C, PRESION_ARTERIAL_mmHg, Unnamed: 25 y Frec_Respira_Rpm, renombrar la columna "Unnamed: 25" por Diastolica, solo incluya las filas donde "Edad_Paciente" >= 50, emplear todos los metodo de aprendizaje ia no supervisados, generar resultados, imprimir y graficar generando un archivo html.  Luego efectuar analisis, imprimir y graficar resultados de  - Calculating silhouette score to evaluate clustering quality,  - Visualizing the clusters using scatter plots y  - Analyzing the characteristics of patients within each cluster, llevar todos los la informacion obtenida con un reporte detallado de los  resultados  en un archivo html y  descargarlo

# Install necessary libraries
!pip install pandas scikit-learn matplotlib seaborn

# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files

# Authenticate with Google Drive and mount it under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Load the Excel file from Google Drive
file_path = '/content/drive/My Drive/Copia de BASE_PACIENTES_TRIAGE_2024.xlsx'  # Replace with the correct path
try:
    df = pd.read_excel(file_path)
except FileNotFoundError:
    # Print the friendly message, then re-raise. Calling exit() inside a
    # notebook asks the kernel to shut down, which discards all state and hides
    # the real error; re-raising stops this cell with a normal traceback and
    # prevents later cells from running against an undefined `df`.
    print(f"Error: File not found at {file_path}")
    raise

# Rename column
df = df.rename(columns={"Unnamed: 25": "Diastolica"})

# Select features for clustering
features = ['Edad_Paciente', 'SATURACION_SpO2', 'RITMO_CARDIACO_Lpm', 'TEMPERATURA_°C', 'PRESION_ARTERIAL_mmHg', 'Diastolica', 'Frec_Respira_Rpm']

# Fail early, naming every column that is absent, instead of failing later on
# the first missing one (rename() silently does nothing when "Unnamed: 25" is
# not present in the workbook).
missing_features = [col for col in features if col not in df.columns]
if missing_features:
    raise KeyError(f"Missing expected columns in the workbook: {missing_features}")

# Filter by age. The age is coerced to numeric only for the comparison so that
# stray text in the column cannot break the filter; rows whose age cannot be
# parsed become NaN and are excluded. .copy() makes df_filtered an independent
# frame, so columns added to it later are not written to a slice of `df`.
age = pd.to_numeric(df['Edad_Paciente'], errors='coerce')
df_filtered = df[age >= 50].copy()
if df_filtered.empty:
    raise ValueError("No rows with Edad_Paciente >= 50; nothing to cluster.")

# Convert every feature to numeric (invalid values become NaN), then replace
# missing values with the column mean. Both steps return new objects: the
# previous `X[col].fillna(..., inplace=True)` was a chained in-place call on a
# temporary Series, which pandas deprecates and which has no effect under
# copy-on-write, leaving NaNs in X.
X = df_filtered[features].apply(pd.to_numeric, errors='coerce')
X = X.fillna(X.mean())

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Unsupervised learning methods
clustering_algorithms = {
    'KMeans': KMeans(n_clusters=2, random_state=42),
    'DBSCAN': DBSCAN(eps=2, min_samples=100),
    'AgglomerativeClustering': AgglomerativeClustering(n_clusters=2)
}

# Per-algorithm labels and silhouette score; only algorithms that produced
# more than one distinct label get an entry.
results = {}

# Own the frame before adding label columns, so the assignments below never
# write to a slice of `df` (harmless if df_filtered is already a copy).
df_filtered = df_filtered.copy()

# Columns summarised per cluster: every numeric column except the label
# columns this loop adds. Computed up front and filtered by name so that the
# labels of any algorithm (including the current group key) are never
# aggregated as if they were patient measurements, even when the cell is re-run.
label_columns = {f'{name}_Labels' for name in clustering_algorithms}
numeric_cols = [
    col for col in df_filtered.select_dtypes(include=np.number).columns
    if col not in label_columns
]

for name, algorithm in clustering_algorithms.items():
    label_col = f'{name}_Labels'
    labels = algorithm.fit_predict(X_scaled)

    # Store the labels exactly as produced. In particular a DBSCAN run that
    # marks every point as noise keeps its -1 label instead of being rewritten
    # to a fake cluster 0.
    df_filtered[label_col] = labels

    # Silhouette needs at least two distinct labels.
    # NOTE: DBSCAN's noise label (-1) counts as a label here, so noise points
    # are scored as if they formed a cluster of their own.
    n_clusters = len(np.unique(labels))
    if n_clusters > 1:
        results[name] = {
            'labels': labels,
            'silhouette_score': silhouette_score(X_scaled, labels)
        }

        # Visualizations
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.scatterplot(x='Edad_Paciente', y='RITMO_CARDIACO_Lpm', hue=label_col, data=df_filtered, palette='viridis', ax=ax)
        ax.set_title(f'{name} Clustering')
        plt.show()
    else:
        print(f"Warning: {name} resulted in only one cluster. Silhouette score cannot be calculated.")

    # Mean and standard deviation of each numeric column within each cluster
    # (printed once; the original printed the same table twice).
    cluster_stats = df_filtered.groupby(label_col)[numeric_cols].agg(['mean', 'std'])
    print(cluster_stats)

    if name in results:
        print(f"Silhouette Score for {name}: {results[name]['silhouette_score']}")
    else:
        print(f"Silhouette Score for {name}: Not available (only one cluster)")

# Create HTML report

# Numeric columns to summarise. The per-algorithm label columns are excluded so
# cluster ids are not averaged as if they were patient measurements. This list
# does not change between algorithms, so it is computed once.
label_columns = {f"{name}_Labels" for name in clustering_algorithms}
numeric_cols_report = [
    col for col in df_filtered.select_dtypes(include=np.number).columns
    if col not in label_columns
]

# The charset is declared because column names contain non-ASCII characters
# (e.g. the degree sign) and the file is written as UTF-8 below.
html_parts = ["""
<html>
<head><meta charset="utf-8"><title>Clustering Report</title></head>
<body>
<h1>Clustering Analysis Results</h1>"""]

# Iterate over every algorithm, not only those with a silhouette score, so an
# algorithm that produced a single cluster still appears in the report.
for name in clustering_algorithms:
    html_parts.append(f"<h2>{name}</h2>")
    if name in results:
        html_parts.append(f"<p>Silhouette Score: {results[name]['silhouette_score']}</p>")
    else:
        html_parts.append("<p>Silhouette Score: Not available (only one cluster)</p>")

    # Per-cluster mean and standard deviation of the numeric columns
    html_parts.append(
        df_filtered.groupby(f"{name}_Labels")[numeric_cols_report].agg(['mean', 'std']).to_html()
    )

html_parts.append("</body></html>")
html_report = "".join(html_parts)

# Explicit encoding so the output does not depend on the platform default
with open('clustering_report.html', 'w', encoding='utf-8') as f:
    f.write(html_report)

files.download('clustering_report.html')


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import base64 # Import base64 for encoding the image
from google.colab import files
import io # Import io for working with in-memory files

# Export, for every algorithm and every cluster it produced, an .xlsx file with
# the cluster's mean/std statistics and an .html page with the same table plus
# a scatter plot, and trigger a browser download for each file.

# Numeric columns to summarise; the per-algorithm label columns are excluded so
# cluster ids are not reported as patient measurements. Column dtypes are the
# same for every group, so the list is computed once.
label_columns = {f'{name}_Labels' for name in clustering_algorithms}
numeric_cols = [
    col for col in df_filtered.select_dtypes(include=np.number).columns
    if col not in label_columns
]

for name in clustering_algorithms:
    label_col = f'{name}_Labels'

    # The scatter plot shows the whole filtered dataset coloured by this
    # algorithm's labels, so it is identical for every cluster of the
    # algorithm: render and encode it once here instead of once per cluster.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(x='Edad_Paciente', y='RITMO_CARDIACO_Lpm', hue=label_col, data=df_filtered, palette='viridis', ax=ax)
    ax.set_title(f'{name} Clustering')

    # Save the plot to an in-memory PNG and embed it as base64
    image_stream = io.BytesIO()
    fig.savefig(image_stream, format='png')
    plt.close(fig)  # Close the figure so it is not displayed and memory is released
    encoded_image = base64.b64encode(image_stream.getvalue()).decode('utf-8')

    for cluster_label, cluster_df in df_filtered.groupby(label_col):
        # Calculate mean, std for numeric columns of this cluster
        cluster_stats = cluster_df[numeric_cols].agg(['mean', 'std'])

        # Save to xlsx file
        xlsx_file_name = f'{name}_cluster_{cluster_label}.xlsx'
        cluster_stats.to_excel(xlsx_file_name)

        # Download the file
        files.download(xlsx_file_name)

        # Create HTML file with the statistics table and the plot. UTF-8 is
        # set both on the file and in the document because column names contain
        # non-ASCII characters.
        html_file_name = f'{name}_cluster_{cluster_label}.html'
        with open(html_file_name, "w", encoding="utf-8") as file:
            file.write(f"""
<html>
<head>
  <meta charset="utf-8">
  <title>{name} Cluster {cluster_label}</title>
</head>
<body>
  <h1>{name} Cluster {cluster_label} Statistics</h1>
  {cluster_stats.to_html()}
  <h2>Scatter Plot</h2>
  <img src="data:image/png;base64,{encoded_image}" alt="Scatter Plot">
</body>
</html>
""")

        # Download html file
        files.download(html_file_name)
