In [15]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

### Prepare the data

In [16]:
# Load the raw myopia dataset and preview its first five rows
file_path = Path('Resources/Myopia.csv')
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY,MYOPIC
0,6,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1,1
1,6,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1,0
2,6,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0,0
3,6,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1,1
4,5,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0,0


In [17]:
# Remove the "MYOPIC" column from the dataset.
# errors='ignore' keeps this cell re-runnable: `df` is rebound here, so on a
# second run the column is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['MYOPIC'], errors='ignore')
# Save the cleaned dataframe as a new csv file for future analysis
# (index=False leaves the row index out of the file).
file_path = Path('Resources/cleaned_myopia.csv')
df.to_csv(file_path, index=False)

In [18]:
# Put every feature on a common scale with StandardScaler.
scaler = StandardScaler()

# Learn the scaling parameters from the data, then apply them to that same data.
X_scaled = scaler.fit(df).transform(df)
print(X_scaled[0])

[-0.42021911 -1.3639169  -0.89286146  0.48378402 -0.28144315 -1.0197916
  4.1506609   1.69744958 -0.68931054 -0.67299591  0.18405806  0.49830393
  0.98713773  1.0032415 ]


### Apply Dimensionality Reduction

In [22]:
# Reduce dimensionality with PCA, keeping just enough principal components
# to explain 90% of the variance.
pca = PCA(n_components=0.9)

# Fit the model and project the scaled data onto the retained components.
myopia_pca = pca.fit_transform(X_scaled)
myopia_pca.shape

(618, 10)

In [23]:
# Total share of the variance captured by the retained components
variance_ratios = pca.explained_variance_ratio_
variance_ratios.sum()

0.9187361702915189

### Perform a Cluster Analysis with K-means

In [None]:
# Identify the best number of clusters using an elbow curve
inertia = []
k = list(range(1, 11))

# Fit K-means for each candidate k and record the resulting inertia
for i in k:
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' (1.4), so leaving it unset makes the inertia values depend on the
    # installed version and triggers a FutureWarning on 1.2/1.3.
    km = KMeans(n_clusters=i, n_init=10, random_state=0)
    km.fit(myopia_pca)
    inertia.append(km.inertia_)

# Collect the results in a DataFrame; the next cell plots it with matplotlib
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.head()

In [None]:
# Plot the elbow curve to find the best candidate(s) for k
fig, ax = plt.subplots()
ax.plot(df_elbow['k'], df_elbow['inertia'])
# Take the ticks from the data so they follow whatever k range was evaluated
ax.set_xticks(df_elbow['k'])
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')
# The data being clustered is the myopia dataset, so the title says so
ax.set_title('Elbow curve for myopia data')
plt.show()