In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from pathlib import Path
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA


KeyboardInterrupt: 

In [None]:
# Load the myopia dataset from the notebook's working directory
df = pd.read_csv(filepath_or_buffer='myopia.csv')
# Preview the first rows as the cell output
df.head()

In [None]:
# Separate the feature columns from the 'MYOPIC' target column
X = df.drop(columns=['MYOPIC'])
y = df.loc[:, 'MYOPIC'].values


In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize the features using statistics learned from the training split only
scaler = StandardScaler()
scaler.fit(X_train)
X_scaled = scaler.transform(X_train)
# Show the first standardized row as the cell output
X_scaled[0]

# Apply Dimensionality Reduction

In [None]:
# Project the standardized features onto their first two principal components
pca = PCA(n_components=2)
myopia_pca = pca.fit_transform(X_scaled)
# Total share of variance captured by the two retained components
np.sum(pca.explained_variance_ratio_)

In [None]:
# Dimensions (rows, components) of the PCA-reduced feature matrix
np.shape(myopia_pca)

In [None]:
# Visually inspect the two PCA components for cluster structure.
# NOTE: no t-SNE is applied in this cell -- the scatter shows the PCA projection directly.
xs = myopia_pca[:, 0]
ys = myopia_pca[:, 1]
plt.scatter(xs, ys, alpha=0.5)
# Label the figure so it can be read on its own when the notebook is skimmed
plt.xlabel("Principal component 1")
plt.ylabel("Principal component 2")
plt.title("Myopia features projected onto two principal components")
plt.show()


In [None]:
# Create an elbow plot to identify the best number of clusters:
# 1. Run k-means clustering on the PCA-reduced data for k = 1..9.
# 2. For each k, record the sum of squared errors (SSE).
# 3. Plot the SSE values against k.
# 4. Identify the elbow in the plot.
sse = {}
for k in range(1, 10):
    # random_state makes the centroid initialisation repeatable across runs;
    # n_init is pinned explicitly because the library default changed between
    # scikit-learn releases, which would otherwise alter the curve (and emit a warning).
    kmeans = KMeans(n_clusters=k, n_init=10, max_iter=1000, random_state=42).fit(myopia_pca)
    # Inertia: sum of squared distances of samples to their closest cluster center
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()), marker="o")
plt.xticks(list(sse.keys()))  # one tick per tested k so the elbow is easy to read off
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.title("Elbow plot for k-means on PCA-reduced data")
plt.show()