# Explore here: K-Means clustering on housing data (Latitude, Longitude, MedInc)

In [178]:
from pickle import dump

import pandas as pd
import plotly.express as px
import plotly.io as pio
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [179]:
# Read only the three columns used in this notebook from the hosted CSV.
df = pd.read_csv(
    "https://raw.githubusercontent.com/4GeeksAcademy/k-means-project-tutorial/main/housing.csv",
    usecols=["Latitude", "Longitude", "MedInc"],
)
# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,MedInc,Latitude,Longitude
0,8.3252,37.88,-122.23
1,8.3014,37.86,-122.22
2,7.2574,37.85,-122.24
3,5.6431,37.85,-122.25
4,3.8462,37.85,-122.25


In [180]:
# Sanity-check the size of the loaded frame as (rows, columns).
df.shape

(20640, 3)

In [181]:
# Show column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   MedInc     20640 non-null  float64
 1   Latitude   20640 non-null  float64
 2   Longitude  20640 non-null  float64
dtypes: float64(3)
memory usage: 483.9 KB


In [182]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test = train_test_split(df, random_state=42, test_size=0.2)

In [183]:
# Fit K-Means with 6 clusters on the feature columns only.
# A later cell adds a string "cluster" column to X_train; dropping it here
# (when present) keeps this cell re-runnable, instead of feeding the previous
# labels back in as a feature or failing on the extra column.
model = KMeans(n_clusters=6, random_state=42)
model.fit(X_train.drop(columns=["cluster"], errors="ignore"))

In [184]:
# Attach the cluster assigned to each training row. Labels are stored as
# strings (presumably so the plots treat them as discrete categories).
train_labels = model.labels_.astype(str)
X_train["cluster"] = train_labels
# Preview the first assignments as the cell output.
X_train["cluster"].head()

14196    3
8267     1
17445    1
14265    3
2271     5
Name: cluster, dtype: object

In [185]:
# Longitude/latitude map of the training rows, colored by K-Means cluster.
train_scatter_options = dict(
    x='Longitude',
    y='Latitude',
    color='cluster',
    title='Clusters K-Means (train set)',
    labels={'cluster': 'Cluster'},
)
# Route figures to the 'browser' renderer before showing.
pio.renderers.default = 'browser'
fig = px.scatter(X_train, **train_scatter_options)
fig.show()

In [186]:
# Assign every held-out row to a cluster with the fitted K-Means model.
# After the first run X_test also contains "cluster"; dropping it (when
# present) keeps this cell re-runnable, because predict() only accepts the
# columns the model was fitted on.
predictions = model.predict(X_test.drop(columns=['cluster'], errors='ignore'))
# Stored as strings, matching the labels attached to the training rows.
X_test['cluster'] = predictions.astype(str)

In [187]:
# Stack the labelled train and test rows into one frame (row-wise; the
# original index labels are kept).
df_all = pd.concat((X_train, X_test), axis=0)

In [190]:
# Longitude/latitude map of all rows, colored by cluster, with the marker
# symbol showing which split each row came from. An explicit "train"/"test"
# column gives a readable legend instead of raw True/False values under a
# generic "symbol" title.
is_test = df_all.index.isin(X_test.index)
fig = px.scatter(
    # assign() returns a new frame, so df_all itself is left unchanged.
    df_all.assign(split=["test" if flag else "train" for flag in is_test]),
    x='Longitude',
    y='Latitude',
    color='cluster',
    symbol='split',
    title='Clusters K-Means con train y test',
    labels={'cluster': 'Cluster', 'split': 'Split'}
)
# Route figures to the 'browser' renderer before showing.
pio.renderers.default = 'browser'
fig.show()

In [191]:
# Average MedInc within each cluster, over train and test rows combined.
medinc_by_cluster = df_all.groupby('cluster')['MedInc']
medinc_by_cluster.mean()

cluster
0     5.380710
1     4.340447
2     6.944277
3     2.431684
4    11.742077
5     2.737183
Name: MedInc, dtype: float64

In [192]:
# 3D view of all rows: position on the horizontal axes, MedInc on the
# vertical axis, colored by cluster.
axes_3d = {'x': 'Longitude', 'y': 'Latitude', 'z': 'MedInc'}
fig = px.scatter_3d(
    df_all,
    color='cluster',
    title='Clusters K-Means en 3D (Longitude, Latitude, MedInc)',
    labels={'cluster': 'Cluster'},
    **axes_3d,
)
# Route figures to the 'browser' renderer before showing.
pio.renderers.default = 'browser'
fig.show()

In [193]:
# Supervised step: train a random forest to reproduce the K-Means cluster
# labels from the remaining columns, then score it on the held-out rows.
label_col = 'cluster'

# Target is the cluster label; features are every other column.
X_train_sup, y_train = X_train.drop(columns=[label_col]), X_train[label_col]
X_test_sup, y_test = X_test.drop(columns=[label_col]), X_test[label_col]

# fit() returns the estimator itself, so model2 is the fitted classifier.
model2 = RandomForestClassifier(random_state=42).fit(X_train_sup, y_train)

# Per-cluster precision/recall/F1 on the test split.
y_pred = model2.predict(X_test_sup)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       561
           1       0.99      1.00      1.00       983
           2       0.99      1.00      1.00       299
           3       1.00      1.00      1.00      1111
           4       1.00      0.97      0.98        62
           5       1.00      0.99      1.00      1112

    accuracy                           1.00      4128
   macro avg       1.00      0.99      0.99      4128
weighted avg       1.00      1.00      1.00      4128



In [194]:
# Persist the fitted K-Means model with pickle. The context manager ensures
# the file handle is flushed and closed even if pickling raises.
with open("../models/modelo_kmeans_6_42.sav", "wb") as model_file:
    dump(model, model_file)