# WK09 Prep

In [None]:
# Quietly (-q) download the data_utils.py helper module; the import cell below pulls names from it.
!wget -q https://github.com/DM-GY-9103-2024F-H/9103-utils/raw/main/src/data_utils.py

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.decomposition import PCA

from data_utils import StandardScaler
from data_utils import object_from_json_url

# Classification / Clustering

In [None]:
## 1. Load Dataset
WINE_FILE = "https://raw.githubusercontent.com/DM-GY-9103-2024F-H/9103-utils/main/datasets/json/wines.json"

# Fetch the JSON records and wrap them in a DataFrame
wines_data = object_from_json_url(WINE_FILE)
wines_df = pd.DataFrame.from_records(data=wines_data)

## 3. Normalize
# The scaler is fitted on the whole frame, so "quality" gets scaled as well
wine_scaler = StandardScaler()
wines_scaled = wine_scaler.fit_transform(wines_df)

# Every scaled column except the quality rating is used as a feature
features = wines_scaled.drop("quality", axis=1)

# Cell output: covariance of each scaled column with quality, ascending
wines_scaled.cov().loc[:, "quality"].sort_values()

In [None]:
# Fit a PCA on the scaled wine features and keep the projected coordinates
wine_pca = PCA()
wines_pcad = wine_pca.fit_transform(features.to_numpy())

In [None]:
# Palette of ten hex colours; the plots below pick an entry by integer label
colors = [
    "#1f77b4",
    "#ff7f0e",
    "#2ca02c",
    "#d62728",
    "#9467bd",
    "#8c564b",
    "#e377c2",
    "#7f7f7f",
    "#bcbd22",
    "#17becf",
]

In [None]:
# Scatter scaled alcohol against scaled density, tinting each wine by its quality.
x = features["alcohol"].values
y = features["density"].values

# Pick the colour from the raw rating in wines_df. The "quality" column of
# wines_scaled went through the scaler (presumably standardised - confirm in
# data_utils), so int() on it truncates toward zero, merges neighbouring
# ratings into one colour and sends negative values to the end of the palette.
# The modulo keeps any rating inside the palette instead of raising IndexError.
c = [colors[int(q) % len(colors)] for q in wines_df["quality"].values]

plt.scatter(x, y, color=c, marker='o', linestyle='', alpha=0.7)
plt.show()

In [None]:
# Scatter the first two PCA components, tinting each wine by its quality.
x = wines_pcad[:, 0]
y = wines_pcad[:, 1]

# Pick the colour from the raw rating in wines_df. The "quality" column of
# wines_scaled went through the scaler (presumably standardised - confirm in
# data_utils), so int() on it truncates toward zero, merges neighbouring
# ratings into one colour and sends negative values to the end of the palette.
# The modulo keeps any rating inside the palette instead of raising IndexError.
c = [colors[int(q) % len(colors)] for q in wines_df["quality"].values]

# Plot the PCAs
plt.scatter(x, y, color=c, marker='o', linestyle='', alpha=0.7)
plt.show()

In [None]:
from sklearn.cluster import KMeans

## 5. Create a KMeans object
# Four clusters, with n_init=10 initialisation runs
km_model = KMeans(n_init=10, n_clusters=4)

# Fit the model so that it groups wines by their scaled features
result = km_model.fit(features.to_numpy())

## 6. Run the model on the training data
# Assign every wine to one of the fitted clusters
predicted_scaled = km_model.predict(features.to_numpy())

In [None]:
# Scaled alcohol vs. density, tinted by the K-Means cluster assigned to each wine
c = [colors[label] for label in predicted_scaled]
x = features["alcohol"].to_numpy()
y = features["density"].to_numpy()

plt.scatter(x, y, color=c, marker='o', linestyle='', alpha=0.5)
# Fix both axes to the same window
plt.xlim(-2, 3)
plt.ylim(-2, 3)
plt.show()

In [None]:
# First two PCA components, tinted by the K-Means cluster assigned to each wine
x, y = wines_pcad[:, 0], wines_pcad[:, 1]
c = [colors[label] for label in predicted_scaled]

# Plot the PCAs
plt.scatter(x, y, color=c, marker='o', linestyle='', alpha=0.5)
# Fix both axes to the same window
plt.xlim(-4, 4)
plt.ylim(-4, 4)
plt.show()


# Iris

In [None]:
from sklearn import datasets

# Load the iris dataset and project its data matrix onto three PCA components
iris = datasets.load_iris()
iris_pca = PCA(n_components=3)
X_reduced = iris_pca.fit_transform(iris.data)

In [None]:
# First two columns of the raw iris data, tinted by target class
x, y = iris.data[:, 0], iris.data[:, 1]
c = [colors[int(label)] for label in iris.target]

plt.scatter(x, y, color=c, marker='o', linestyle='', alpha=0.5)
plt.show()

In [None]:
# Plot the iris samples on their first two PCA components, twice:
# once in a single colour, then tinted by target class.
x = X_reduced[:, 0]
y = X_reduced[:, 1]
c = [colors[int(i)] for i in iris.target]

# x and y are columns of the PCA projection, not the raw petal measurements,
# so the axes are labelled as components rather than "petal length/width".
for point_colors in (colors[0], c):
    plt.figure(figsize=(9, 6.75), dpi=150)
    plt.scatter(x, y, color=point_colors, marker='o', linestyle='', alpha=0.7)
    plt.xlabel("principal component 1")
    plt.ylabel("principal component 2")
    plt.show()