In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# Load the modified Iris dataset
iris_df = pd.read_csv('iris-modified.csv')

# Display the first few rows of the dataset to understand its structure
print(iris_df.head())

# Check for missing values
print(iris_df.isnull().sum())

# Assuming there are no missing values, proceed with scaling the features
# Separate features and target variable if applicable
X = iris_df.drop(columns=['Species'])  # Adjust this if 'Species' is not the target variable
y = iris_df['Species']  # Adjust this if 'Species' is the target variable

# Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X)

# Perform K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(scaled_features)

# Analyze cluster sizes
cluster_labels = kmeans.labels_
unique, counts = np.unique(cluster_labels, return_counts=True)
cluster_sizes = dict(zip(unique, counts))
print("Cluster Sizes:", cluster_sizes)

# Add cluster labels to the original dataframe
iris_df['Cluster'] = cluster_labels

# Visualize clusters using pair plots
sns.pairplot(iris_df, hue='Cluster', palette='viridis')
plt.show()


   Sepal.Length  Sepal.Width  Petal.Length  Petal.Width
0           5.1          3.5           1.4          0.2
1           4.9          3.0           1.4          0.2
2           4.7          3.2           1.3          0.2
3           4.6          3.1           1.5          0.2
4           5.0          3.6           1.4          0.2
Sepal.Length    0
Sepal.Width     0
Petal.Length    0
Petal.Width     0
dtype: int64


KeyError: "['Species'] not found in axis"

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the dataset
iris_modified = pd.read_csv('iris-modified.csv')

# Display the first few rows of the dataset to understand its structure
print(iris_modified.head())

# Check for missing values
print(iris_modified.isnull().sum())

# Assuming there are no missing values, proceed with scaling the features
X = iris_modified.drop(columns=['Species'])  # Features
y = iris_modified['Species']  # Target variable (if applicable)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_scaled)

# Analyze cluster sizes
cluster_labels = kmeans.labels_
cluster_sizes = pd.Series(cluster_labels).value_counts().sort_index()
print("Cluster Sizes:\n", cluster_sizes)

# Reduce dimensions for visualization using PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Create a scatter plot of the clusters
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis', s=50)
plt.title('Clusters in Iris Dataset (Modified)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster')
plt.show()


   Sepal.Length  Sepal.Width  Petal.Length  Petal.Width
0           5.1          3.5           1.4          0.2
1           4.9          3.0           1.4          0.2
2           4.7          3.2           1.3          0.2
3           4.6          3.1           1.5          0.2
4           5.0          3.6           1.4          0.2
Sepal.Length    0
Sepal.Width     0
Petal.Length    0
Petal.Width     0
dtype: int64


KeyError: "['Species'] not found in axis"