# Import Required Libraries
In this section, we will import the libraries needed for clustering and visualization: pandas, numpy, matplotlib, seaborn, and scikit-learn (`sklearn`).

# Load and Explore Dataset
In this section, we will load the dataset.csv file, display the first few rows, and check for missing values.

In [None]:
# Load the dataset from dataset.csv (path is relative to the working directory)
df = pd.read_csv('dataset.csv')
# Show the first few rows. This must be printed explicitly: a bare
# `df.head()` that is not the last expression of the cell is evaluated and
# then discarded, so nothing would be displayed.
print(df.head())
# Count missing values per column. As the final expression of the cell, the
# result is rendered by the notebook.
df.isnull().sum()

In [None]:
# Columns used as inputs to the clustering
features = ['IngresosAnuales (k$)', 'ScoreGasto (1-100)']
# Remove every row that has a missing value in any column
df = df.dropna(axis=0, how='any')
# Feature matrix as a NumPy array, one column per entry in `features`
X = df.loc[:, features].to_numpy()

In [None]:
# Visualize the distribution of each selected feature, one histogram per
# feature, laid out side by side.
# The grid width follows len(features) rather than a hard-coded 2, so editing
# the feature list does not break the subplot layout. squeeze=False keeps
# `axes` two-dimensional even when there is only one feature.
fig, axes = plt.subplots(1, len(features), figsize=(15, 6), squeeze=False)
for i, (ax, feature) in enumerate(zip(axes[0], features)):
    # Histogram with 20 bins and a kernel density estimate overlay
    sns.histplot(X[:, i], kde=True, bins=20, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    # Only a bare array column is passed to seaborn, so label the x axis
    # with the feature name explicitly.
    ax.set_xlabel(feature)
plt.tight_layout()
plt.show()

In [None]:
# Elbow method: fit KMeans for k = 1..10 and record the inertia of each fit
cluster_counts = range(1, 11)
inertia = []
for n in cluster_counts:
    # fit() returns the estimator itself, so `kmeans` is the fitted model
    kmeans = KMeans(
        n_clusters=n,
        init='k-means++',
        n_init=10,
        max_iter=300,
        random_state=42,
    ).fit(X)
    inertia.append(kmeans.inertia_)

# Plot inertia against the number of clusters to look for the "elbow"
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(cluster_counts, inertia, marker='o', linestyle='--')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method to Determine Optimal Number of Clusters')
plt.show()

In [None]:
# Fit the final KMeans model with the chosen number of clusters (5 here)
kmeans = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
    random_state=42,
).fit(X)
# Cluster index assigned to each row of X, and the fitted cluster centers
labels, centroids = kmeans.labels_, kmeans.cluster_centers_

In [None]:
# Scatter plot of the two features, colored by cluster, with centroids on top
fig, ax = plt.subplots(figsize=(15, 7))
income, spending = X[:, 0], X[:, 1]
ax.scatter(income, spending, c=labels, s=100, cmap='viridis', label='Data Points')
# Centroids are drawn larger and in red so they stand out from the points
ax.scatter(centroids[:, 0], centroids[:, 1], s=300, c='red', label='Centroids')
ax.set_xlabel('Ingresos Anuales (k$)')
ax.set_ylabel('Score de Gasto (1-100)')
ax.set_title('Clusters and Centroids')
ax.legend()
plt.show()

In [None]:
# Add cluster labels to the dataset as a new 'Cluster' column.
# `labels` has one entry per row of X, and X was built from this same `df`
# after dropping rows with missing values, so the lengths line up.
df['Cluster'] = labels
# Display the first rows of the updated dataset (rendered by the notebook
# because it is the last expression of the cell)
df.head()

# Assign Clusters to Data
In this section, we will add the cluster labels as a new column in the dataset and display the updated dataset.

# Visualize Clusters
In this section, we will plot the clusters along with their centroids using matplotlib.

# Apply KMeans Clustering
In this section, we will apply the KMeans algorithm with the number of clusters chosen from the elbow plot (5 in this notebook) and fit it to the selected features.

# Determine Optimal Number of Clusters
In this section, we will use the elbow method to calculate inertia for different numbers of clusters and plot the results to determine the optimal number of clusters.

# Exploratory Data Analysis (EDA)
In this section, we will visualize the distribution of selected features using seaborn and matplotlib.

# Data Preprocessing
In this section, we will drop rows with missing values and select the two features used for clustering: annual income (`IngresosAnuales (k$)`) and spending score (`ScoreGasto (1-100)`).

In [None]:
# Data handling
import pandas as pd
import numpy as np
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Clustering
from sklearn.cluster import KMeans
import warnings
# NOTE(review): this silences every warning from every library for the rest
# of the session, including deprecation and convergence warnings.
warnings.filterwarnings("ignore")