In [1]:
pip install kaggle

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import kaggle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import StandardScaler

# Dataset configuration. Both values are placeholders: replace them with a real
# Kaggle dataset reference ('username/dataset-name') and the CSV file it contains.
kaggle_dataset = "username/dataset-name"
dataset_filename = "your-dataset.csv"  # Update this to match the actual file

# Download the dataset only when the expected CSV is not already on disk, so that
# re-running the notebook does not call the Kaggle API again.
# NOTE: the Kaggle client requires credentials (a kaggle.json file in the user's
# .kaggle directory, or the environment method from the Kaggle API setup guide).
dataset_dir = "dataset"
dataset_path = os.path.join(dataset_dir, dataset_filename)
if not os.path.isfile(dataset_path):
    os.makedirs(dataset_dir, exist_ok=True)
    kaggle.api.dataset_download_files(kaggle_dataset, path=dataset_dir, unzip=True)

# Fail early with an actionable message when the archive did not contain the
# configured file, instead of surfacing an opaque error from pandas.
if not os.path.isfile(dataset_path):
    raise FileNotFoundError(
        f"{dataset_path!r} was not found after downloading {kaggle_dataset!r}. "
        f"Files available in {dataset_dir!r}: {sorted(os.listdir(dataset_dir))}. "
        "Update dataset_filename to match one of them."
    )

# Load dataset
df = pd.read_csv(dataset_path)

# Preprocess dataset: keep numeric columns only, then remove missing values.
# Columns that are entirely NaN are dropped first; otherwise the row-wise dropna()
# below would discard every row and leave nothing to cluster.
df = df.select_dtypes(include=[np.number]).dropna(axis=1, how="all").dropna()
if df.empty:
    # StandardScaler would raise a ValueError here anyway; raise the same
    # exception type with a message that points at the actual cause.
    raise ValueError(
        "No rows left after keeping numeric columns and dropping missing values; "
        "check that the dataset has numeric columns with complete rows."
    )

# Standardise every feature (zero mean, unit variance) so that no single column
# dominates the Euclidean distances used by KMeans.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# Determine optimal clusters using silhouette score.
# The silhouette score is only defined for 2 <= n_clusters <= n_samples - 1, so the
# candidate range (at most k = 2..9) is capped by the number of available samples
# instead of failing on small datasets.
n_samples = df_scaled.shape[0]
K_range = range(2, min(10, n_samples))
if len(K_range) == 0:
    raise ValueError(
        f"Need at least 3 samples to evaluate silhouette scores, got {n_samples}."
    )

silhouette_scores = []
for k in K_range:
    # Fixed random_state and explicit n_init keep the scores reproducible across runs.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(df_scaled)
    silhouette_scores.append(silhouette_score(df_scaled, labels))

# Line chart of the mean silhouette score obtained for each candidate cluster count
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(K_range, silhouette_scores, color='b', linestyle='--', marker='o')
ax.set(
    xlabel='Number of Clusters',
    ylabel='Silhouette Score',
    title='Silhouette Score for Different Clusters',
)
plt.show()

# Refit KMeans with the candidate k that achieved the highest silhouette score
# and store each row's cluster label alongside the features.
best_index = int(np.argmax(silhouette_scores))
best_k = K_range[best_index]
kmeans = KMeans(n_clusters=best_k, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(df_scaled)
df['Cluster'] = cluster_labels

# Pairwise scatter plots of the features, coloured by assigned cluster
pair_grid = sns.pairplot(data=df, hue='Cluster', palette='tab10')
plt.show()

# Silhouette plot: one horizontal band per cluster, with each sample's silhouette
# coefficient drawn as a bar and sorted in ascending order inside its band.
fig, ax = plt.subplots(figsize=(8, 5))
silhouette_vals = silhouette_samples(df_scaled, df['Cluster'])

y_ticks = []
band_start = 0
for cluster_id in range(best_k):
    # Coefficients of the samples assigned to this cluster, smallest first
    band_vals = np.sort(silhouette_vals[df['Cluster'] == cluster_id])
    band_end = band_start + len(band_vals)
    ax.barh(range(band_start, band_end), band_vals, height=1.0)
    # Put the cluster's tick label at the vertical centre of its band
    y_ticks.append((band_start + band_end) / 2)
    band_start = band_end

ax.set_yticks(y_ticks)
ax.set_yticklabels(range(best_k))
ax.set(
    xlabel='Silhouette Coefficient',
    ylabel='Cluster',
    title='Silhouette Plot',
)
plt.show()

OSError: Could not find kaggle.json. Make sure it's located in C:\Users\Engineer\.kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/