In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Qualitative T10 color sequence from Plotly Express; the distribution-plot
# loop further down picks one color per column from it by index.
palette = px.colors.qualitative.T10

In [2]:
import plotly.figure_factory as ff

In [3]:
# Load the housing dataset from a CSV in the current working directory and
# preview its first rows.
df = pd.read_csv('./housing.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
# Dataset dimensions as (rows, columns).
df.shape

(20640, 10)

1. *Tipo de supervisión*: problema supervisado, de regresión múltiple (varios *features*) univariada (un *target*).
2. *Tipo de entrenamiento*: batch learning porque no es un conjunto tan grande y cabe en memoria.
3. *Generalización*: basada en modelos matemáticos (*model-based learning*).

In [5]:
# Work on a 500-row random sample to keep the plots light. The seed is fixed so
# the sample (and every figure derived from it) is identical after
# Restart Kernel -> Run All.
df_sample = df.sample(500, random_state=42)

In [6]:
# Geographic scatter of the sampled rows: marker size and color both encode
# median_house_value, and the hover text shows the ocean proximity category
# together with the value itself.
fig = go.Figure()

fig.add_trace(
    go.Scattergeo(
        lat=df_sample['latitude'],
        lon=df_sample['longitude'],
        text=(
            df_sample['ocean_proximity']
            + '<br>'
            + 'Median house value: '
            + df_sample['median_house_value'].map(str)
        ),
        marker=dict(
            # Scale the raw values down so they are usable as marker sizes.
            size=df_sample['median_house_value'] / 30_000,
            color=df_sample['median_house_value'],
            colorscale='RdBu_r',
            colorbar=dict(title=''),
        ),
    )
)

fig.update_layout(
    title='Valor medio de las casas por vecindario',
    template='ggplot2',
    width=900,
    height=600,
    margin={'l': 10, 'r': 10, 'b': 10, 't': 50},
    geo=dict(
        scope='usa',
        landcolor='rgb(217, 217, 217)',
    ),
)

# Zoom the map to the extent of the plotted points.
fig.update_geos(fitbounds='locations')

fig.show()

In [7]:
# Export the map figure as SVG. Forward slashes are used because the original
# backslashes formed invalid escape sequences ("\i", "\m") and only resolved
# as directory separators on Windows; this also matches the other export paths
# in this notebook.
fig.write_image('Curso_ML/images/mapa.svg')

Gráfico de coordenadas paralelas para apoyar la selección de características.

In [None]:
# Parallel-coordinates view of the sample: one axis for every column except
# the last two, with each line colored by median_house_value.
fig = px.parallel_coordinates(
    data_frame=df_sample,
    dimensions=df_sample.columns[:-2],
    color='median_house_value',
    color_continuous_scale=px.colors.diverging.Tealrose,
)

fig.update_layout(
    template='ggplot2',
    width=1200,
    height=600,
    margin={'l': 40, 'r': 10, 'b': 10, 't': 50},
    # Titles are explicitly blanked out.
    title='',
    xaxis=dict(title=''),
    yaxis=dict(title=''),
)

fig.show()

In [31]:
# Write whatever figure `fig` currently holds to an SVG file; in notebook
# order that is the parallel-coordinates plot built in the previous cell.
fig.write_image('Curso_ML/images/paralelas.svg')

In [6]:
# NOTE: despite the name, this is not a DataFrame but the column labels of
# df_sample with the first two and last two columns skipped. The plotting loop
# below iterates over these labels and indexes `bin_size` by position, so the
# two must stay in the same order.
df_numeric = df_sample.columns[2:-2]

In [None]:
# Histogram bin width for each column in df_numeric, in the same order.
bin_size = [2.5, 750, 200, 500, 150, 0.5]

# Draw one distribution plot per column listed in df_numeric.
for idx, col in enumerate(df_numeric):
    # Drop missing values for this column only: a NaN in some other column
    # must not discard valid observations from this distribution, and there is
    # no need to rebuild a filtered copy of the whole frame on every iteration.
    hist_data = [df_sample[col].dropna()]
    group_labels = [col]

    fig = ff.create_distplot(
        hist_data,
        group_labels,
        colors=[palette[idx]],
        show_rug=False,
        bin_size=bin_size[idx]
        )

    fig.update_layout(
        # The <b> tag is closed so the bold markup in the title is well-formed.
        title=f'Distribución de <b>{col}</b>',
        xaxis_title='Item',
        yaxis_title='Frecuencia relativa',
        height=500,
        width=700,
        showlegend=False,
        template='plotly_white',
        margin=dict(l=10, r=10, b=10, t=50)
    )

    fig.show(renderer='svg')

    # Only the plot of the second column (idx == 1) is exported to disk.
    if idx == 1:
        fig.write_image('Curso_ML/images/hist.svg')