<a href="https://colab.research.google.com/github/BilalKhaliqWillis/BILAL-Assignment2/blob/main/BILAL_Final_Project_Unsupervised_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# BILAL-Final Project - Unsupervised Learning.ipynb
# Setup & Libraries
# Standard library
import os
import zipfile
from pathlib import Path

# Core libraries
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Clustering & ML
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

# Anomaly Detection
from sklearn.neighbors import KernelDensity

# Neural Networks
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

# Apply seaborn's "whitegrid" style globally so every later figure shares it.
sns.set(style="whitegrid")

In [2]:
# Uploading ZIP Files
# Colab-only helper: this import is unavailable outside Google Colab.
from google.colab import files

# Opens the browser upload widget. The returned mapping is keyed by file name
# (the unzip step iterates over `uploaded.keys()`).
uploaded = files.upload()

Saving car+evaluation.zip to car+evaluation.zip
Saving covertype.zip to covertype.zip
Saving gas+sensor+array+drift+dataset.zip to gas+sensor+array+drift+dataset.zip
Saving human+activity+recognition+using+smartphones.zip to human+activity+recognition+using+smartphones.zip
Saving wholesale+customers.zip to wholesale+customers.zip


In [3]:
# Unzipping Uploaded Files
# Each archive is extracted into a folder named after the archive without its
# extension, e.g. "car+evaluation.zip" -> "car+evaluation/".
for zip_file in uploaded.keys():
    # splitext removes only the trailing extension; str.replace would also
    # strip any ".zip" that happens to appear in the middle of the name.
    target_dir, extension = os.path.splitext(zip_file)
    if extension.lower() != ".zip":
        continue
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(target_dir)

In [None]:
# Clustering Natural Groups
# Dataset: Wholesale Customers
# Loading & Preprocessing the Data
wholesale_name = "Wholesale customers data.csv"
# The unzip step extracts every archive into its own sub-folder, so the CSV is
# not in the working directory itself; search for it recursively instead.
wholesale_path = next(Path(".").rglob(wholesale_name), None)
if wholesale_path is None:
    raise FileNotFoundError(
        f"{wholesale_name!r} was not found under {Path.cwd()}; run the unzip cell first."
    )
df_wholesale = pd.read_csv(wholesale_path)

# Drop categorical columns
X = df_wholesale.drop(columns=["Channel", "Region"])

# Standardize so that no single column dominates distance-based clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Agglomerative Clustering
# Build a 3-cluster hierarchical model on the standardized wholesale features.
agg = AgglomerativeClustering(n_clusters=3)
agg.fit(X_scaled)
labels_agg = agg.labels_

# Silhouette ranges over [-1, 1]; larger values mean tighter, better separated clusters.
sil_agg = silhouette_score(X_scaled, labels=labels_agg)
print("Agglomerative Silhouette Score:", sil_agg)

In [None]:
# KMeans Clustering & Visualization
# `labels_km` is not produced by any earlier cell, so KMeans is fitted here.
# n_clusters=3 mirrors the agglomerative model so the two silhouette scores are
# comparable; random_state and n_init are pinned so re-runs give the same labels.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
labels_km = kmeans.fit_predict(X_scaled)

sil_km = silhouette_score(X_scaled, labels_km)
print("KMeans Silhouette Score:", sil_km)

# Scatter of the first two standardized columns, coloured by KMeans label.
plt.figure(figsize=(8,5))
plt.scatter(X_scaled[:,0], X_scaled[:,1], c=labels_km, cmap="viridis")
plt.title("Wholesale Customers – KMeans Clustering")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.show()

"""Discussion

KMeans performs well on spherical clusters

Agglomerative clustering captures hierarchy

Silhouette score helps compare performance objectively"""

In [None]:
# Non-Linear & Hierarchical Clustering
# Dataset: Forest Cover Type
# Loading the Data
forest_name = "covtype.csv"
# NOTE(review): UCI's covertype.zip presumably ships "covtype.data.gz" without a
# header row — confirm that this file name and a "Cover_Type" column exist.
# The unzip step extracts every archive into its own sub-folder, so search
# recursively rather than assuming the file sits in the working directory.
forest_path = next(Path(".").rglob(forest_name), None)
if forest_path is None:
    raise FileNotFoundError(
        f"{forest_name!r} was not found under {Path.cwd()}; run the unzip cell first."
    )
df_forest = pd.read_csv(forest_path)

# Sample for speed; cap at the frame length so a small file does not raise.
df_forest = df_forest.sample(min(5000, len(df_forest)), random_state=42)

# Cover_Type is the class label, so it is excluded from the clustering features.
X_forest = df_forest.drop(columns=["Cover_Type"])
X_forest = StandardScaler().fit_transform(X_forest)

In [None]:
# DBSCAN - Non-Linear
# Density-based clustering on the standardized forest sample.
dbscan = DBSCAN(min_samples=10, eps=1.5)
dbscan.fit(X_forest)
labels_db = dbscan.labels_

# scikit-learn labels noise points as -1, so -1 may appear in this list.
print("Unique clusters (DBSCAN):", np.unique(labels_db))

In [None]:
# Hierarchical Clustering
# Five-cluster agglomerative model on the standardized forest sample.
agg_forest = AgglomerativeClustering(n_clusters=5)
agg_forest.fit(X_forest)
labels_hier = agg_forest.labels_

In [None]:
# Visualization
# First two standardized columns, coloured by hierarchical cluster label.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(
    X_forest[:, 0],
    X_forest[:, 1],
    c=labels_hier,
    cmap="tab10",
)
ax.set_title("Forest Cover – Hierarchical Clustering")
plt.show()

"""Discussion

DBSCAN detects arbitrary shapes and noise

Hierarchical clustering shows structured grouping

DBSCAN is useful when the cluster count is unknown; the hierarchical model above still needs n_clusters chosen in advance"""

In [None]:
# Soft Clustering
# Dataset: Car Evaluation
# Load & Encode Data
car_name = "car.data"
# The unzip step extracts every archive into its own sub-folder, so search
# recursively rather than assuming the file sits in the working directory.
car_path = next(Path(".").rglob(car_name), None)
if car_path is None:
    raise FileNotFoundError(
        f"{car_name!r} was not found under {Path.cwd()}; run the unzip cell first."
    )

# The file has no header row, so the column names are supplied here.
columns = ["buying","maint","doors","persons","lug_boot","safety","class"]
df_car = pd.read_csv(car_path, names=columns)

# Every column is categorical text; map each to integer codes.
# LabelEncoder assigns codes in sorted (alphabetical) order, so the codes do not
# follow any natural ordering of the categories. The same encoder object is
# refitted per column, so afterwards it only remembers the last column's mapping.
encoder = LabelEncoder()
for col in df_car.columns:
    df_car[col] = encoder.fit_transform(df_car[col])

# "class" is the target label and is left out of the clustering features.
X_car = StandardScaler().fit_transform(df_car.drop(columns=["class"]))

In [None]:
# Gaussian Mixture Model
# Four-component mixture; the seed pins the initialisation.
gmm = GaussianMixture(random_state=42, n_components=4)

# Hard assignment: the most likely component for each row.
gmm_labels = gmm.fit_predict(X_car)
# Soft assignment: one membership probability per row and component.
gmm_probs = gmm.predict_proba(X_car)

print("Soft probabilities shape:", np.shape(gmm_probs))

"""Discussion

GMM Advantages

Probabilistic cluster membership

Handles overlapping clusters

Limitations

Assumes Gaussian distributions

Sensitive to initialization

(Fuzzy C-Means is not implemented in this notebook; it is discussed only theoretically.)"""

In [None]:
# Anomaly Detection
# Dataset: Gas Sensor Array Drift
# Loading the Data
gas_name = "batch1.dat"
# The unzip step extracts every archive into its own sub-folder, so search
# recursively rather than assuming the file sits in the working directory.
gas_path = next(Path(".").rglob(gas_name), None)
if gas_path is None:
    raise FileNotFoundError(
        f"{gas_name!r} was not found under {Path.cwd()}; run the unzip cell first."
    )

# Read every token as text first: the UCI batch files use a LIBSVM-style layout,
# "<label> 1:<value> 2:<value> ...", which cannot be parsed as numbers directly.
# A whitespace regex also avoids a spurious empty column from trailing spaces.
gas_raw = pd.read_csv(gas_path, sep=r"\s+", header=None, dtype=str)

feature_tokens = gas_raw.iloc[:, 1:]
if feature_tokens.iloc[0].str.contains(":", na=False).any():
    # LIBSVM layout: column 0 is the label; keep only the number after each "index:".
    df_gas = feature_tokens.apply(lambda col: col.str.split(":").str[-1]).astype(float)
else:
    # Plain numeric table: keep every column, as the original cell did.
    df_gas = gas_raw.astype(float)

X_gas = StandardScaler().fit_transform(df_gas)

In [None]:
# Kernel Density Estimation
# Fit a Gaussian KDE on the standardized gas-sensor features.
kde = KernelDensity(bandwidth=1.0, kernel="gaussian").fit(X_gas)

# Log-density of each training sample under the fitted estimate.
log_density = kde.score_samples(X_gas)

# Samples below the 5th percentile of log-density are flagged as anomalies.
threshold = np.percentile(log_density, 5)
anomalies = log_density < threshold

print("Number of anomalies:", anomalies.sum())

"""Discussion

Density estimation flags rare patterns

KDE is flexible but computationally expensive

Threshold selection is critical"""

In [None]:
# Neural Network PCA
# Dataset: Human Activity Recognition
# Loading the Data
har_name = "train.txt"
# NOTE(review): the UCI HAR archive presumably names its feature matrix
# "X_train.txt" (under "UCI HAR Dataset/train/") — confirm this file name.
# The unzip step extracts every archive into its own sub-folder, so search
# recursively rather than assuming the file sits in the working directory.
har_path = next(Path(".").rglob(har_name), None)
if har_path is None:
    raise FileNotFoundError(
        f"{har_name!r} was not found under {Path.cwd()}; run the unzip cell first."
    )

# sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True.
X_har = pd.read_csv(har_path, sep=r"\s+", header=None)

# Standardize; X_har is a NumPy array from here on.
X_har = StandardScaler().fit_transform(X_har)

In [None]:
# Autoencoder - Neural PCA
# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# (and therefore the 2-D embedding) are reproducible across runs.
set_random_seed(42)

input_dim = X_har.shape[1]
encoding_dim = 2  # width of the bottleneck, i.e. number of "components"

# Single linear bottleneck: input -> 2 units -> reconstruction of the input.
# With linear activations only, the network can learn a linear projection only.
input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation="linear")(input_layer)
decoded = Dense(input_dim, activation="linear")(encoded)

# `autoencoder` is trained end to end; `encoder` shares its first layer and is
# used afterwards to obtain the 2-D codes.
autoencoder = Model(input_layer, decoded)
encoder = Model(input_layer, encoded)

# The inputs double as targets: the loss is the reconstruction error.
autoencoder.compile(optimizer=Adam(), loss="mse")
history = autoencoder.fit(X_har, X_har, epochs=20, batch_size=256, verbose=1)

In [None]:
# Visualization
# Project every sample through the trained bottleneck to get its 2-D code.
X_encoded = encoder.predict(X_har)

fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X_encoded[:, 0], X_encoded[:, 1], s=5)
ax.set_title("Neural Network PCA – HAR Dataset")
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
plt.show()

"""Discussion

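Neural PCA with linear activations learns a linear 2-D projection (the same subspace as PCA); non-linear activations would be needed to capture non-linear structure

Gives a 2-D visualization comparable to linear PCA, although its axes are not ordered by explained variance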

Useful for high-dimensional sensor data"""