In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [3]:
# Importar conjunto de dados
X, y = fetch_california_housing(return_X_y=True, as_frame=True)

X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [4]:
# Separar o conjunto de dados em conjunto de treinamento e conjunto de teste
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

In [6]:
print("Conjunto de treinamento: ", X_train.shape)
print("Conjunto de teste: ", X_test.shape)

Conjunto de treinamento:  (14447, 8)
Conjunto de teste:  (6193, 8)


### Treinando o modelo de Regressão Linear

In [7]:
# Configurando o algoritmo (config default)
reg = LinearRegression()
# Ajustar o modelo
reg.fit(X_train, y_train)

### Validando o modelo ajustado

In [9]:
# Avaliar as métricas do modelo
y_pred = reg.predict(X_test)

print("Erro Médio Quadrático: ", mean_squared_error(y_test, y_pred))
print("Coeficiente de Determinação (R2): ", r2_score(y_test, y_pred))

Erro Médio Quadrático:  0.5600684073830428
Coeficiente de Determinação (R2):  0.5926411062561453


In [10]:
from sklearn.dummy import DummyRegressor
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train)

print("Erro Médio Quadrático: ", mean_squared_error(y_test, dummy_regr.predict(X_test)))
print("Coeficiente de Determinação (R2): ", r2_score(y_test, dummy_regr.predict(X_test)))

Erro Médio Quadrático:  1.3749417558640837
Coeficiente de Determinação (R2):  -4.703933954530726e-05


### Mineirando padrões

In [12]:
# Coeficientes 
coeficientes = pd.concat([pd.DataFrame(
    fetch_california_housing().feature_names),
    pd.DataFrame(
        np.transpose(reg.coef_))], axis=1)

print(coeficientes)
print("Intercepto: ", reg.intercept_)

            0         0
0      MedInc  0.444152
1    HouseAge  0.009328
2    AveRooms -0.117691
3   AveBedrms  0.659260
4  Population -0.000005
5    AveOccup -0.004584
6    Latitude -0.412442
7   Longitude -0.426155
Intercepto:  -36.240868509857606


### Agrupamento (Cluster)

In [13]:
# Importar biblioetecas
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score

In [17]:
# Normalizar os dados de X_train
Xn_train = normalize(X_train)
Xn_train[0:5,:]

array([[ 2.20834025e-02,  2.23372347e-01,  2.62944138e-02,
         5.46715534e-03,  7.20883483e-01,  1.84841919e-02,
         1.98852155e-01, -6.23970344e-01],
       [ 7.61177393e-03,  5.78494819e-02,  1.87345643e-02,
         3.81274442e-03,  9.19502291e-01,  8.28380442e-03,
         1.22640902e-01, -3.68288070e-01],
       [ 1.97444585e-03,  3.96370344e-02,  3.84829886e-03,
         1.07858895e-03,  9.90925860e-01,  4.21670579e-03,
         3.54438534e-02, -1.23292039e-01],
       [ 2.05251083e-03,  7.77576673e-03,  2.27571339e-03,
         5.06500089e-04,  9.98214054e-01,  1.45300445e-03,
         1.59889203e-02, -5.69283322e-02],
       [ 2.29795681e-03,  8.34139726e-03,  2.98227573e-03,
         4.81107136e-04,  9.98023649e-01,  1.49628733e-03,
         1.84541148e-02, -5.93465882e-02]])

In [18]:
# Configurar o algoritmo
kmeans = KMeans(n_clusters=5, init='random')
# Ajustar o modelo
kmeans.fit(Xn_train)

  super()._check_params_vs_input(X, default_n_init=10)


In [19]:
# Verificar o centro de cada cluster
kmeans.cluster_centers_

array([[ 2.16381848e-03,  1.50079609e-02,  2.97132203e-03,
         6.07854840e-04,  9.97035374e-01,  1.77154453e-03,
         2.01784934e-02, -6.80782106e-02],
       [ 2.61066267e-02,  2.04539364e-01,  6.25169913e-02,
         1.36451305e-02,  3.96611474e-01,  2.02303592e-02,
         2.49449961e-01, -8.26211829e-01],
       [ 4.67443440e-03,  3.76417897e-02,  6.43715212e-03,
         1.26705629e-03,  9.87859154e-01,  3.25474722e-03,
         4.20829852e-02, -1.40817014e-01],
       [ 8.40678326e-03,  7.01492117e-02,  1.28742112e-02,
         2.60723000e-03,  9.57223260e-01,  5.82207874e-03,
         7.89448430e-02, -2.61478569e-01],
       [ 1.55851928e-02,  1.27275390e-01,  2.51551932e-02,
         5.29136901e-03,  8.24694930e-01,  1.21066294e-02,
         1.54777640e-01, -5.11130934e-01]])

In [20]:
# Distância de cada amostra para cada grupo
dist = kmeans.fit_transform(Xn_train)
dist[0:5,:]

  super()._check_params_vs_input(X, default_n_init=10)


array([[0.38777179, 0.18646974, 0.47502372, 0.67960254, 0.60393089],
       [0.72332966, 0.18806075, 0.12220581, 0.32985166, 0.25200257],
       [0.96163192, 0.44798661, 0.1522579 , 0.06270981, 0.01948183],
       [1.02551234, 0.52037905, 0.22704878, 0.01400622, 0.09351701],
       [1.02287199, 0.51738307, 0.22396371, 0.0111689 , 0.09043715]])

In [23]:
# Rotular amostras
labels = kmeans.labels_
labels[0:5]

array([1, 2, 4, 3, 3], dtype=int32)

In [24]:
# Valor de ajuste
kmeans.inertia_

30.69986101789494

In [25]:
silhouette_score(Xn_train, labels)

0.5180903449315377