In [None]:
#%watermark -a "Chibuzor Enyioko" -d -t -v -p numpy,pandas,matplotlib

# Project 3: Unsupervised Learning

This project explores unsupervised learning methods through clustering. 

In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk

## Part 1: Cleaning the data sets

In [43]:
# Load the hospital data set from disk.
hospital_data = pd.read_csv('hospital.csv')

# Preprocessing building blocks
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Columns that are kept out of both feature lists below.
EXCLUDED_COLS = ('readmitted', 'encounter_id', 'patient_nbr')

# Split every column by dtype: non-numeric -> categorical, numeric -> numeric.
hospital_data_cat_cols = list(hospital_data.select_dtypes(exclude=[np.number]).columns)
hospital_data_num_cols = list(hospital_data.select_dtypes(include=[np.number]).columns)

categorical_cols = [col for col in hospital_data_cat_cols if col not in EXCLUDED_COLS]
numeric_cols = [col for col in hospital_data_num_cols if col not in EXCLUDED_COLS]

# Preprocessor: one-hot encode the categorical columns and standardize the
# numeric ones (StandardScaler, i.e. the numeric columns are NOT passed through
# unchanged).
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numeric_cols),
])


## Part 2: Questions
### Hospital Data set

1. Run k-Means on the dataset and identify the best number (between 2 and 8)
of clusters (hint – Silhouette Scores on the right side of the k-Means box). Report this
number and its silhouette score. For non-technical track students – you may see an error
regarding the max number of allowed rows for the Hospital dataset. Report the scores that
you see regardless.


In [None]:
# k-means clustering on the hospital data set: fit k = 2..8 and report the
# average silhouette score for each k, then the best k.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.cm as cm
from sklearn.decomposition import PCA

# Preprocessor for clustering. sparse_output=False makes the one-hot encoder
# return a dense array, so X_transformed below can be sliced like an ndarray.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols),
    ('num', StandardScaler(), numeric_cols)
])

# Feature frame used for every fit in this cell.
hospital_features = hospital_data[categorical_cols + numeric_cols]

# Transform once: reused for silhouette scoring here and for visualization in
# the plotting cell. PCA is seeded so the 2-D projection is reproducible.
X_transformed = preprocessor.fit_transform(hospital_features)
X_pca = PCA(n_components=2, random_state=42).fit_transform(X_transformed)

n_clusters = range(2, 9)
silhouette_by_k = {}
for n in n_clusters:
    # The pipeline has to be built inside the loop: n_clusters depends on `n`,
    # which does not exist before the loop starts.
    kmeans_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('clusterer', KMeans(n_clusters=n, random_state=42))
    ])
    cluster_labels = kmeans_pipeline.fit_predict(hospital_features)

    # The preprocessor is refit on the same rows each iteration, so its output
    # equals X_transformed; score against the cached matrix instead of
    # transforming the data again.
    silhouette_avg = silhouette_score(X_transformed, cluster_labels)
    silhouette_by_k[n] = silhouette_avg
    print(f'For n_clusters = {n}, the average silhouette_score is : {silhouette_avg}')

# Report the k with the highest average silhouette score.
best_n = max(silhouette_by_k, key=silhouette_by_k.get)
print(f'Best n_clusters = {best_n} (silhouette_score = {silhouette_by_k[best_n]})')



For n_clusters = 2, the average silhouette_score is : 0.08045423772354186
For n_clusters = 3, the average silhouette_score is : 0.052483554630527465
For n_clusters = 4, the average silhouette_score is : 0.0477048789064236


In [None]:
# Plot the k-means cluster assignments for each k on the 2-D PCA projection.

fig, axes = plt.subplots(2, 4, figsize=(15, 8))
axes = axes.flatten()

# Convert sparse matrix to dense array
if hasattr(X_transformed, 'toarray'):
    X_transformed = X_transformed.toarray()


for i, n in enumerate(n_clusters):
    # Cluster in the full transformed feature space; PCA is only used to draw.
    kmeans = KMeans(n_clusters=n, random_state=42)
    cluster_labels = kmeans.fit_predict(X_transformed)

    colors = cm.nipy_spectral(cluster_labels.astype(float) / n)
    # Both coordinates must come from the PCA projection (PC 1 vs PC 2);
    # mixing a raw feature column with a principal component is meaningless.
    axes[i].scatter(X_pca[:, 0], X_pca[:, 1],
                    marker='.', s=30, lw=0, alpha=0.7, c=colors, edgecolor='k')
    axes[i].set_title(f'KMeans Clustering with n_clusters = {n}')
    axes[i].set_xlabel('PC 1')
    axes[i].set_ylabel('PC 2')

# The grid has 8 panels but only len(n_clusters) are used; hide the leftovers.
for unused_ax in axes[len(n_clusters):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()



3. Pick the best 5 features by “Information Gain”, and attempt Q1 again. Use the
Rank widget.


In [None]:
from sklearn.feature_selection import mutual_info_classif

# Calculate mutual information between each transformed feature and the
# current cluster labels.
# NOTE(review): `cluster_labels` is whatever the most recently executed k-means
# fit produced (the last iteration of the plotting loop when cells are run in
# order). Confirm this is the intended target for "Information Gain"; a real
# class column may be what the question expects.
mi_scores = mutual_info_classif(X_transformed, cluster_labels, random_state=42)

# One name per column of X_transformed (one-hot columns are expanded), so the
# table pairs each score with the feature it belongs to.
mi_feature_names = preprocessor.get_feature_names_out()
mi_df = pd.DataFrame({'feature': mi_feature_names, 'mi_score': mi_scores})
mi_df = mi_df.sort_values(by='mi_score', ascending=False)
print(mi_df.head(5))

In [None]:
# Repeat the k-means / silhouette analysis using only the 5 transformed
# features with the highest mutual information scores.
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# selecting the top 5 features based on mutual information scores
# `mi_scores` is aligned with the columns of X_transformed, so ranking the
# scores gives column positions directly.
transformed_feature_names = preprocessor.get_feature_names_out()
top_positions = np.argsort(mi_scores)[::-1][:5]
top_features = [transformed_feature_names[pos] for pos in top_positions]

# Work on a dense matrix so it can be wrapped in a DataFrame.
X_dense = X_transformed.toarray() if hasattr(X_transformed, 'toarray') else X_transformed
hospital_data_subset = pd.DataFrame(X_dense, columns=transformed_feature_names)[top_features]

fig, axes = plt.subplots(2, 4, figsize=(15, 8))
axes = axes.flatten()

# kmeans clustering on hospital data subset
n_clusters = range(2, 9)
for i, n in enumerate(n_clusters):
    # Seeded so the printed score and the plotted clusters are reproducible;
    # one fit per k serves both the score and the plot.
    kmeans = KMeans(n_clusters=n, random_state=42)
    cluster_labels = kmeans.fit_predict(hospital_data_subset)
    silhouette_avg = silhouette_score(hospital_data_subset, cluster_labels)
    print(f'For n_clusters = {n}, the average silhouette_score is : {silhouette_avg}')

    # scatter plot for all clusters, drawn on the two top-ranked features
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n)
    axes[i].scatter(hospital_data_subset.iloc[:, 0], hospital_data_subset.iloc[:, 1], marker='.', s=30, lw=0, alpha=0.7, c=colors, edgecolor='k')
    axes[i].set_title(f'KMeans Clustering with n_clusters = {n}')
    axes[i].set_xlabel(top_features[0])
    axes[i].set_ylabel(top_features[1])

# The grid has 8 panels but only len(n_clusters) are used; hide the leftovers.
for unused_ax in axes[len(n_clusters):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


4. [Extra Credit: 10 points] Try to maximize performance using a supervised method.
Points will be awarded based on the relative rank of students who participate in the extra credit. Include the workbook in your submission.
