In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage


data = pd.read_csv('/Users/aaron3j/Downloads/KMapp/heart_disease_uci.csv')
data.rename({'num': 'Heart Disease Stage'}, axis=1, inplace=True)
data.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/aaron3j/Downloads/KMapp/heart_disease_uci.csv'

In [None]:
for column in data:
    unique_vals = np.unique(data[column].astype(str).fillna('0'))
    nr_values = len(unique_vals)
    if nr_values <= 12:
        print('The number of values for feature {} :{} -- {}'.format(column, nr_values,unique_vals))
    else:
        print('The number of values for feature {} :{}'.format(column, nr_values))

In [None]:
data.isnull().sum()

In [None]:

numerical_cols = data.select_dtypes(include=[np.number]).columns
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].median())


categorical_cols = data.select_dtypes(include=['object']).columns
data[categorical_cols] = data[categorical_cols].fillna(data[categorical_cols].mode().iloc[0])

In [None]:
data.drop('id', axis=1, inplace=True)  # Drop ID column
categorical_cols = ['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal', 'Heart Disease Stage']
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=False)

In [None]:
data_encoded.head()

In [None]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data_encoded)

In [None]:
inertia = []
for k in range(1, 30):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(scaled_features)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(8, 4))
plt.plot(range(1, 30), inertia, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
linked = linkage(scaled_features, 'ward')
plt.figure(figsize=(10, 7))
dendrogram(linked, truncate_mode='lastp', p=10)
plt.title('Dendrogram')
plt.show()

In [None]:
optimal_clusters = 5 
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
data['Cluster'] = kmeans.fit_predict(scaled_features)
data.head()

In [None]:
pca = PCA()
pca_features = pca.fit_transform(scaled_features)
explained_variance = np.cumsum(pca.explained_variance_ratio_)

plt.figure(figsize=(8, 4))
plt.plot(range(1, len(explained_variance) + 1), explained_variance, marker='o')
plt.axhline(y=0.95, color='r', linestyle='--', label='95% Variance')
plt.title('Explained Variance by Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.legend()
plt.show()

In [None]:
pca_2d = pca_features[:, :2]
plt.figure(figsize=(8, 6))
sns.scatterplot(x=pca_2d[:, 0], y=pca_2d[:, 1], hue=data['Cluster'], palette='viridis')
plt.title('Clusters Visualized on First Two PCA Components')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()
plt.show()

In [None]:
data.head()

In [None]:
numeric_columns = data.select_dtypes(include=[np.number]).columns
categorical_columns = [col for col in data.columns if col not in numeric_columns and col != 'Cluster']


numeric_analysis = data.groupby('Cluster', as_index=False)[numeric_columns].mean()
numeric_analysis = numeric_analysis[['Cluster', 'age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca',
       'Heart Disease Stage']]

categorical_analysis = {}
for col in categorical_columns:
    counts = data.groupby('Cluster')[col].value_counts().unstack(fill_value=0)
    counts.columns = [f"{col}_{val}" for val in counts.columns]  # Rename columns to include category values
    categorical_analysis[col] = counts


categorical_analysis_df = pd.concat(categorical_analysis.values(), axis=1).reset_index()

cluster_analysis = pd.merge(numeric_analysis, categorical_analysis_df, on='Cluster', how='left')


cluster_analysis

In [None]:
from groq import Groq
client = Groq(api_key='')


data_json = cluster_analysis.to_json(orient='records')


prompt = f"Summarize the following data. Specifically, I want a summary per Cluster with the main characteristics and statistics that each cluster has. The cluster column already exists and has 5 clusters (0 to 4). Column 'Heart Disease Stage' is an important column as it has the heart disease stages [0=no heart disease; 1,2,3,4 = stages of heart disease]:\n{data_json}"


model_name = 'llama-3.1-8b-instant'  

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "you are a helpful assistant."
        },
        {
            "role": "user",
            "content": prompt,
        }
    ],
    model=model_name,
)

print(chat_completion.choices[0].message.content)

cluster_summaries = chat_completion.choices[0].message.content


In [None]:
import pickle
import json

with open('kmeans_model.pkl', 'wb') as model_file:
    pickle.dump(kmeans, model_file)

with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


with open('cluster_summaries.pkl', 'wb') as summary_file:
    pickle.dump(cluster_summaries, summary_file)


pca_2d_df = pd.DataFrame(pca_features[:, :2], columns=['PCA1', 'PCA2'])
pca_2d_df['Cluster'] = data['Cluster'] 
pca_2d_df.to_excel('pca_2d.xlsx', index=False)


cluster_analysis.to_excel('cluster_analysis.xlsx', index=False)
data.to_excel('main_data_with_pred.xlsx', index=False)
with open('cluster_summaries.txt', 'w') as file:
    file.write(json.dumps(cluster_summaries, indent=4)) 


In [None]:
cluster_summaries

In [None]:
numeric_features = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']
categorical_options = {
    'sex': ['Female', 'Male'],
    'dataset': ['Cleveland', 'Hungary', 'Switzerland', 'VA Long Beach'],
    'cp': ['asymptomatic', 'atypical angina', 'non-anginal', 'typical angina'],
    'fbs': ['False', 'True'],
    'restecg': ['lv hypertrophy', 'normal', 'st-t abnormality'],
    'exang': ['False', 'True'],
    'slope': ['downsloping', 'flat', 'upsloping'],
    'thal': ['fixed defect', 'normal', 'reversable defect'],
    'Heart Disease Stage': ['0', '1', '2', '3', '4']
}

user_inputs = {}
for feature in numeric_features:
    user_inputs[feature] = st.sidebar.number_input(feature, value=0.0)

for feature, options in categorical_options.items():
    selected_value = st.sidebar.selectbox(feature, options)
    for option in options:
        user_inputs[f"{feature}_{option}"] = 1 if selected_value == option else 0


input_df = pd.DataFrame([user_inputs])

input_df

In [None]:
import streamlit as st
import pandas as pd
import plotly.express as px
from PIL import Image
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [None]:
import streamlit as st
import pandas as pd
import plotly.express as px
from PIL import Image
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


st.set_page_config(layout="wide")
st.markdown(
    """
    <style>
    /* Remove default top margin and padding */
    .block-container {
        padding-top: 0rem;
    }
    .title {
        text-align: center;
        font-size: 32px;
        font-weight: bold;
        color: #333333;
    }
    .subheader {
        text-align: center;
        font-size: 24px;
        font-weight: bold;
        color: #555555;
    }
    .scrollable-summary {
        height: 420px;
        overflow-y: auto;
        border: 1px solid #ccc;
        padding: 10px;
        background-color: #f9f9f9;
    }
    </style>
    """,
    unsafe_allow_html=True,
)


with open('kmeans_model.pkl', 'rb') as model_file:
    kmeans = pickle.load(model_file)

with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)


pca_2d_df = pd.read_excel('pca_2d.xlsx')


cluster_analysis = pd.read_excel('cluster_analysis.xlsx')


try:
    with open('cluster_summaries.pkl', 'rb') as summary_file:
        cluster_summaries = pickle.load(summary_file)
except FileNotFoundError:
    cluster_summaries = None
    st.error("Cluster summaries file not found. Please generate it in the base code.")

sidebar_image = Image.open('/Users/aaron3j/Downloads/KMapp/Pic1.PNG')
main_image = Image.open('/Users/aaron3j/Downloads/KMapp/Pic2.PNG')


st.sidebar.image(sidebar_image, use_column_width=True)
st.sidebar.header("Cluster Visualization")


numeric_features = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']
categorical_options = {
    'sex': ['Female', 'Male'],
    'dataset': ['Cleveland', 'Hungary', 'Switzerland', 'VA Long Beach'],
    'cp': ['asymptomatic', 'atypical angina', 'non-anginal', 'typical angina'],
    'fbs': ['False', 'True'],
    'restecg': ['lv hypertrophy', 'normal', 'st-t abnormality'],
    'exang': ['False', 'True'],
    'slope': ['downsloping', 'flat', 'upsloping'],
    'thal': ['fixed defect', 'normal', 'reversable defect'],
    'Heart Disease Stage': ['0', '1', '2', '3', '4']
}
user_inputs = {}
for feature in numeric_features:
    user_inputs[feature] = st.sidebar.number_input(feature, value=0.0)

for feature, options in categorical_options.items():
    selected_value = st.sidebar.selectbox(feature, options)
    for option in options:
        user_inputs[f"{feature}_{option}"] = 1 if selected_value == option else 0

input_df = pd.DataFrame([user_inputs])
st.markdown('<h1 class="title">Cluster Analysis with PCA Visualization</h1>', unsafe_allow_html=True)
st.image(main_image, use_column_width=True)


left_col, right_col = st.columns(2)
with left_col:
    st.markdown('<h2 class="subheader">Cluster Visualization</h2>', unsafe_allow_html=True)
    fig = px.scatter(
        pca_2d_df, 
        x='PCA1', 
        y='PCA2', 
        color='Cluster', 
        title="Clusters Visualized with PCA", 
        labels={'PCA1': 'PCA Component 1', 'PCA2': 'PCA Component 2'},
        template='plotly'
    )
    st.plotly_chart(fig)

with right_col:
    st.markdown('<h2 class="subheader">Cluster Summaries</h2>', unsafe_allow_html=True)
    summary_content = '<div class="scrollable-summary">'
    if isinstance(cluster_summaries, str):
        summary_content += f"<p>{cluster_summaries}</p>"
    elif cluster_summaries:
        for key, value in cluster_summaries.items():
            summary_content += f"<p><strong>Cluster {key}</strong>: {value}</p>"
    else:
        summary_content += "<p>No cluster summaries available.</p>"
    summary_content += "</div>"
    st.markdown(summary_content, unsafe_allow_html=True)


second_left_col, second_right_col = st.columns(2)

with second_left_col:
    st.subheader("Determine Your Cluster")
    if st.button("Cluster Me"):
        # Predict the user's cluster
        cluster_id = kmeans.predict(scaler.transform(input_df))[0]
        st.success(f"You belong to Cluster {cluster_id}.")

with second_right_col:
    st.subheader("Cluster Analysis Table")
    # Display the cluster_analysis DataFrame with horizontal scrolling
    st.dataframe(cluster_analysis.head(), height=212)



