In [None]:

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset that every later cell works on.
df = pd.read_csv(filepath_or_buffer="FINAL_FROM_DF.csv")

In [None]:

# Quick look at the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Split the columns of `df` by dtype so each group can be routed to its own
# transformer.
#
# A numeric-dtype test is used instead of `dtype == 'object'` because text
# columns are not always stored as `object` (e.g. pandas `category` or the
# dedicated string dtype); with the equality test those columns would land in
# `numerical_columns` and be handed to StandardScaler.
#
# "Cluster" is skipped because a later cell writes the KMeans labels into
# `df` under that name; without the guard, re-running this cell afterwards
# would treat the labels as an input feature.
numerical_columns = []
categorical_columns = []
for column in df.columns:
    if column == "Cluster":
        continue
    if pd.api.types.is_numeric_dtype(df[column]):
        numerical_columns.append(column)
    else:
        categorical_columns.append(column)

In [None]:
# Preprocessing step: standardize the numeric columns and one-hot encode the
# categorical ones, each group selected by the column lists built above.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_columns),
        ("cat", OneHotEncoder(), categorical_columns),
    ]
)

In [None]:
# Fit the preprocessor on the full frame and build the feature matrix used
# by the clustering and PCA cells.
X = preprocessor.fit_transform(X=df)

In [None]:
# Elbow method: fit KMeans for k = 1..10 and record the inertia of each fit,
# printing how much the inertia dropped relative to the previous k.
inertias = []
for k in range(1, 11):
    # n_init is set explicitly: its default differs between scikit-learn
    # releases, so leaving it out makes the curve depend on the installed
    # version (and triggers a FutureWarning on some of them).
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X)
    inertias.append(kmeans.inertia_)
    # There is no previous fit to compare against when k == 1.
    diff = inertias[-2] - inertias[-1] if k > 1 else 0
    print(f'K={k}, Inercia = {kmeans.inertia_:,.6f} - Diferencia = {diff:,.6f}')

In [None]:
# Elbow plot: inertia of each KMeans fit against the number of clusters.
ax = sns.lineplot(x=range(1, 11), y=inertias, marker="o")
ax.set_xlabel("Número de clusters (k)")
ax.set_ylabel("Inercia (SSE - Suma de errores al cuadrado)")
ax.set_title("Método del codo para determinar el número óptimo de clusters")
# One tick per evaluated k.
ax.set_xticks(range(1, 11))
ax.grid()
plt.show()

In [None]:
# Final clustering with k = 5 (presumably read off the elbow plot — confirm).
# n_init is set explicitly: its default differs between scikit-learn
# releases, so leaving it out makes the labels depend on the installed
# version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(X)
# Attach the assigned cluster label to every row of the original frame.
df["Cluster"] = kmeans.labels_
df.head()

In [None]:
# Project the feature matrix onto its first two principal components for
# plotting. random_state is fixed so the projection is reproducible when PCA
# picks a randomized solver, matching the seeded KMeans fits.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)

In [None]:
# Scatter of the two principal components, colored by assigned cluster.
first_component = X_pca[:, 0]
second_component = X_pca[:, 1]
ax = sns.scatterplot(x=first_component, y=second_component, hue=df["Cluster"])
ax.set_title("Visualización de clusters con PCA")
plt.show()

In [None]:
# Number of rows assigned to each cluster.
cluster_labels = df["Cluster"]
cluster_labels.value_counts()

In [None]:

# Per-cluster mean of every numeric column.
(
    df[[*numerical_columns, "Cluster"]]
    .groupby(by="Cluster")
    .mean()
)

In [None]:
def _most_frequent(values):
    """Return the most frequent value of a Series, or None if it has no mode."""
    modes = values.mode()
    # The mode is empty when a group has no non-missing values for the
    # column; indexing position 0 unconditionally would raise in that case.
    return modes.iloc[0] if not modes.empty else None


# Per-cluster most frequent value of every categorical column.
df[categorical_columns + ["Cluster"]].groupby("Cluster").agg(_most_frequent)

In [None]:
# Mean TOTALTRADES per cluster.
sns.barplot(
    data=df,
    x="Cluster",
    y="TOTALTRADES",
    estimator="mean",
)

In [None]:

# OPEN against CLOSE for every row, colored by assigned cluster.
sns.scatterplot(
    data=df,
    x="OPEN",
    y="CLOSE",
    hue="Cluster",
)