# GPS MÓDULO 2

In [2]:
import pandas as pd
import numpy as np

# Notebook-wide pandas display configuration:
#  - show up to 500 columns before a wide frame gets truncated
#  - render floats with 3 decimals, which also suppresses scientific notation
# (To show the full content of every cell, 'display.max_colwidth' can be set to None.)
pd.options.display.max_columns = 500
pd.options.display.float_format = lambda value: f'{value:.3f}'

In [3]:
# Global font-size defaults applied to every matplotlib figure in the notebook.
import matplotlib.pyplot as plt

plt.rcParams.update({
    # 'font.size' (default text size) is not overridden.
    'axes.titlesize': 15,    # axes title
    'axes.labelsize': 14,    # x and y axis labels
    'xtick.labelsize': 11,   # x tick labels
    'ytick.labelsize': 11,   # y tick labels
    'legend.fontsize': 9,    # legend text
    'figure.titlesize': 19,  # figure-level title
})

In [4]:
# también se puede usar
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#    print(data_agrup_type_state)

## 0. CARGAMOS EL DATASET DE PROPERATI

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Load the Properati listings; the CSV column 'Unnamed: 0' becomes the index.
# The path lives in one named constant, and a missing file is reported with its
# resolved location so the reader knows where the notebook looked for it.
from pathlib import Path

PROPERATI_CSV = Path('properati.csv')  # relative to the notebook's working directory
if not PROPERATI_CSV.is_file():
    raise FileNotFoundError(
        f"Dataset not found at '{PROPERATI_CSV.resolve()}'. "
        "Place properati.csv there or update PROPERATI_CSV."
    )
df = pd.read_csv(PROPERATI_CSV, index_col='Unnamed: 0')

FileNotFoundError: [Errno 2] No such file or directory: 'properati.csv'

In [None]:
# Preview the first two rows of the loaded dataset.
df.head(2)

In [None]:
# Show the full content of every column cell (no truncation); the previous value was 50.
pd.options.display.max_colwidth = None

In [None]:
# Full description text of the listing at position 1.
df['description'].iloc[1]

## 2. PANDAS 2

### Groupby y transform para completar nulos

Queremos completar los nulos de price_aprox_usd con la media correspondiente a cada combinación de tipo de propiedad (property_type) y state_name.

In [None]:
# The three columns involved in the imputation: the two grouping keys and the price.
columnas_precio = ['property_type', 'state_name', 'price_aprox_usd']
df[columnas_precio].head()

In [None]:
# Boolean mask: True for rows whose price_aprox_usd is missing.
mask_nulos = df['price_aprox_usd'].isna()

In [None]:
# First rows with a missing price, limited to the grouping keys and the price column.
df.loc[mask_nulos, ['property_type', 'state_name', 'price_aprox_usd']].head()

In [None]:
# Group the listings by property type and state name; the per-group mean price
# is computed from this groupby in the following cells.
df_groupby = df.groupby(by=['property_type', 'state_name'])

In [None]:
# Display up to 100 rows before pandas truncates the output (None would mean all rows).
pd.options.display.max_rows = 100

In [None]:
# Mean price for every (property_type, state_name) group.
df_groupby['price_aprox_usd'].mean()

In [None]:
# Fill each missing price with the mean price of the row's (property_type, state_name) group.
# The filled Series is only displayed here; df itself is not modified.
def _rellenar_con_media(precios):
    """Return the group's prices with NaN replaced by the group's mean."""
    return precios.fillna(precios.mean())


df_groupby['price_aprox_usd'].transform(_rellenar_con_media)

In [None]:
# Sanity check of the fill: start by recalling which rows had a missing price.
columnas_check = ['property_type', 'state_name', 'price_aprox_usd']
df.loc[mask_nulos, columnas_check].head()

In [None]:
# Filled price at position 67 (presumably one of the rows with a missing price
# listed above; TODO confirm).
precios_rellenos = df_groupby['price_aprox_usd'].transform(
    lambda precios: precios.fillna(precios.mean())
)
precios_rellenos.iloc[67]

In [None]:
# Group means again, to compare the filled value against the mean of one specific group.
media_check_fill = df_groupby['price_aprox_usd'].mean()

media_check_fill.loc[('house', 'Bs.As. G.B.A. Zona Oeste')]

### Pivot table

In [None]:
# Keep only the listings located in Capital Federal.
en_capital = df['state_name'].eq('Capital Federal')
df_capital = df.loc[en_capital]

In [None]:
# Per neighbourhood (rows) and property type (columns): mean price and number of
# non-null prices.
# Both aggregations are given by name: passing the np.mean callable to a pandas
# aggregation is deprecated (FutureWarning since pandas 2.1), while the string
# 'mean' produces the same table.
df_capital.pivot_table(index='place_name', columns='property_type',
                       aggfunc={'price_aprox_usd': ['mean', 'count']})

In [None]:
# Bar chart of the price per neighbourhood, sorted ascending. pivot_table is called
# without aggfunc, so its default aggregation (the mean) is what gets plotted.
# plot() reference: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.html
sns.set_style('darkgrid')
precio_medio_barrio = df_capital.pivot_table(index='place_name', values='price_aprox_usd')
precio_medio_barrio = precio_medio_barrio.sort_values(by='price_aprox_usd')
precio_medio_barrio.plot(kind='bar', figsize=(15, 4), title='Precio promedio por barrio',
                         xlabel='Barrios', ylabel='Precio aprox USD (promedio)',
                         color='red', legend=False);

### Merge + groupby (+lambda) para sacar outliers

Primero un groupby combinado con lambda para calcular el percentil 90 por barrio

In [None]:
# Group the Capital Federal listings by neighbourhood.
df_groupby_barrios = df_capital.groupby(by='place_name')

In [None]:
# 90th percentile of price for each neighbourhood.
# GroupBy.quantile ignores NaN on its own and returns NaN for a neighbourhood with
# no priced listings, so np.percentile is never called on an empty array. The
# default linear interpolation is the same one np.percentile uses by default.
umbral_90 = df_groupby_barrios['price_aprox_usd'].quantile(0.9)

In [None]:
# Display the per-neighbourhood 90th-percentile price thresholds.
umbral_90

Unimos el resultado del groupby con un merge a nuestro dataframe original

In [None]:
# Attach each neighbourhood's 90th-percentile price to its listings as 'percentil_90'.
# The left join keeps every listing of df_capital.
percentiles_por_barrio = umbral_90.reset_index().rename(columns={'price_aprox_usd': 'percentil_90'})
df_capital_new = df_capital.merge(percentiles_por_barrio, how='left', on='place_name')

In [None]:
# Preview the merged frame, which now carries the 'percentil_90' column.
df_capital_new.head(2)

Filtramos el dataset según la columna percentil_90

In [None]:
# Keep only listings priced strictly below their neighbourhood's 90th percentile.
# Rows with a missing price or threshold compare as False and are dropped.
bajo_percentil_90 = df_capital_new['price_aprox_usd'].lt(df_capital_new['percentil_90'])
df_capital_clean_for_plot = df_capital_new.loc[bajo_percentil_90]

Hacemos el plot con seaborn usando boxplots

In [None]:
sns.set_style('darkgrid')

# Neighbourhood order on the x axis: ascending median price, taken from the
# per-neighbourhood groupby of df_capital (before the percentile filter).
orden_barrios = df_groupby_barrios['price_aprox_usd'].median().sort_values().index.values

# One box per neighbourhood with the filtered price distribution.
fig, ax0 = plt.subplots(1, 1, figsize=(15, 6))
ax0 = sns.boxplot(data=df_capital_clean_for_plot,
                  x='place_name', y='price_aprox_usd',
                  order=orden_barrios, palette='magma',
                  ax=ax0)
ax0.set(xlabel="Barrios", ylabel="Precio en USD", title="Boxplot por Barrios")
# Vertical tick labels so the neighbourhood names do not overlap.
plt.xticks(rotation=90);

## 3. DATA WRANGLING

### Qcut para filtrar superficies muy grandes

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(18, 13))

# Decile bin of each listing's total surface; computed once and reused by both decile panels.
superficie_deciles = pd.qcut(df_capital_clean_for_plot['surface_total_in_m2'], q=10)

# Top-left: number of listings in each decile (qcut bins).
superficie_deciles.value_counts().sort_index().plot(kind='barh', ax=axs[0, 0])
axs[0, 0].set(xlabel="Cantidad de observaciones", ylabel="Categorias", title="Superficie por deciles")

# Top-right: number of listings in each of 10 bins produced by cut.
superficie_bines = pd.cut(df_capital_clean_for_plot['surface_total_in_m2'], bins=10)
superficie_bines.value_counts().sort_index().plot(kind='barh', ax=axs[0, 1], color='red')
axs[0, 1].set(xlabel="Cantidad de observaciones", ylabel="", title="Superficie por bines (10)")

# Bottom-left: same deciles as the first panel, shown as a share of the observations.
superficie_deciles.value_counts(normalize=True).sort_index().plot(kind='barh', ax=axs[1, 0], color='c')
axs[1, 0].set(xlabel="Proporción de observaciones", ylabel="Categorias", title="Superficie por deciles")

# Only three panels are drawn; hide the unused bottom-right axes instead of
# leaving an empty frame in the figure.
axs[1, 1].set_visible(False)

plt.tight_layout()

In [None]:
# Boolean mask that drops the listings in the top decile of total surface.
# The top bin is found by its position among the qcut categories rather than by the
# hard-coded label '(150.0, 8053.0]' (presumably the top-decile label for the original
# data). A fixed label silently stops matching, and filters nothing, when the data changes.
deciles_superficie = pd.qcut(df_capital_clean_for_plot['surface_total_in_m2'], q=10)
codigo_ultimo_decil = len(deciles_superficie.cat.categories) - 1
# Rows with a missing surface have category code -1, so they stay in the mask as True,
# exactly as they did with the string comparison.
maskara_filtrar_superficies = deciles_superficie.cat.codes != codigo_ultimo_decil

In [None]:
# Apply the surface mask to get the frame used by the plotly charts.
df_capital_clean_for_plot_superficie = df_capital_clean_for_plot.loc[maskara_filtrar_superficies]

In [None]:
# (rows, columns) remaining after the surface filter.
df_capital_clean_for_plot_superficie.shape

### Variables categóricas y get dummies

In [None]:
# One-hot encode property_type. drop_first=True omits the indicator column of the
# first category, and the original property_type column is replaced by the indicators.
df_clean_dummies = pd.get_dummies(
    data=df_capital_clean_for_plot,
    columns=['property_type'],
    drop_first=True,
)

In [None]:
# Optional: pd.set_option('display.max_colwidth', 50) truncates long cell text again
# before previewing (left disabled).
df_clean_dummies.head()

Vamos a usar la columna dummy `property_type_apartment` para quedarnos solo con los departamentos

In [None]:
# Keep only the rows flagged as apartments by the dummy column.
es_departamento = df_clean_dummies['property_type_apartment'].eq(1)
df_clean_dummies_dpto = df_clean_dummies.loc[es_departamento]

### Plotly

In [None]:
# Plotly Express (px) builds the interactive charts and maps in the cells below.
import plotly.express as px
import plotly as pl
# Initialise plotly's offline notebook mode so figures render inside the notebook.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it; confirm for the installed plotly version.
pl.offline.init_notebook_mode(connected=True)

In [None]:
# df_capital_clean_for_plot_superficie = df_capital_clean_for_plot.copy()

In [None]:
# Rows missing any of the plotted columns are dropped first: the marker size cannot
# be mapped to a column that contains missing values.
columnas_scatter = ['price_usd_per_m2', 'surface_total_in_m2', 'surface_covered_in_m2']
datos_scatter = df_capital_clean_for_plot_superficie.dropna(subset=columnas_scatter)

# Price per m2 (x) against total surface (y); marker size is the covered surface
# and colour is the property type.
fig = px.scatter(
    data_frame=datos_scatter,
    x="price_usd_per_m2",
    y="surface_total_in_m2",
    size='surface_covered_in_m2',
    size_max=45,
    color="property_type",
    opacity=0.6,
    width=800,
    height=500,
    hover_data={'property_type': False},
)

fig.update_layout(
    title='Relación entre precio y superficie',
    xaxis=dict(title='Precio por m2 (USD)'),
    yaxis=dict(title='Superficie en m2'),
)

# Range slider under the x axis for zooming into a price interval.
fig.update_xaxes(rangeslider_visible=True)

fig.show()

**Plotly en 3D**

In [None]:
# 3D scatter: price per m2 (x), total surface (y) and covered surface (z),
# coloured by property type.
fig = px.scatter_3d(
    df_capital_clean_for_plot_superficie,
    x="price_usd_per_m2",
    y="surface_total_in_m2",
    z="surface_covered_in_m2",
    color="property_type",
)

etiquetas_ejes = {
    'xaxis_title': 'Precio por m2 (USD)',
    'yaxis_title': 'Sup. Total en m2',
    'zaxis_title': 'Sup. Cubierta en m2',
}
fig.update_layout(
    title='Relación entre precio, superficie total y superficie cubierta',
    scene=etiquetas_ejes,
)
fig.show()

In [None]:
# Column names available in the filtered frame, as a NumPy array.
df_capital_clean_for_plot_superficie.columns.to_numpy()

In [None]:
# Sunburst with property type as the inner ring and neighbourhood as the outer ring.
# Sector size comes from price_usd_per_m2 and colour from surface_total_in_m2.
# Rows missing any of the three numeric columns below are dropped beforehand.
columnas_sunburst = ['price_usd_per_m2', 'surface_total_in_m2', 'surface_covered_in_m2']
datos_sunburst = df_capital_clean_for_plot_superficie.dropna(subset=columnas_sunburst)

fig = px.sunburst(
    datos_sunburst,
    path=['property_type', 'place_name'],
    values='price_usd_per_m2',
    color='surface_total_in_m2',
)

fig.update_layout(title='Relación tipo de inmueble y barrios (precio total y superficie)')

fig.show()

### Hagamos Mapas con Plotly

In [None]:
# Preview two listings that have both latitude and longitude, i.e. rows that can be mapped.
df_capital_clean_for_plot_superficie.dropna(subset=['lat','lon']).head(2)

In [None]:
# Map of the listings: colour is the property type and marker size is the price per m2.
# Rows without coordinates or price per m2 are dropped because they cannot be drawn.
#
# Base map: 'carto-positron' replaces 'stamen-toner'. The Stamen styles are now served
# through Stadia Maps and need an account/token, so the background would not load
# without one. Token-free alternatives: open-street-map, white-bg, carto-positron,
# carto-darkmatter.
#
# color_continuous_scale was removed: colour is mapped to a categorical column
# (property_type), so a continuous colour scale is never applied.
fig = px.scatter_mapbox(df_capital_clean_for_plot_superficie.dropna(subset=['lat', 'lon', 'price_usd_per_m2']),
                        title='Inmuebles CABA - por tipo de propiedad y precio x m2 (USD)',
                        lat="lat", lon="lon", color="property_type", size="price_usd_per_m2",
                        size_max=20, zoom=10,
                        mapbox_style="carto-positron")

fig.show()

In [None]:
# Map of apartments only: colour is the price per m2 (colour range fixed to 0-8000)
# and marker size is the total surface. Rows without coordinates or price per m2
# are dropped because they cannot be drawn.
#
# Base map: 'carto-positron' replaces 'stamen-toner'. The Stamen styles are now served
# through Stadia Maps and need an account/token, so the background would not load
# without one. Token-free alternatives: open-street-map, white-bg, carto-positron,
# carto-darkmatter.
# Other colour scales (e.g. px.colors.cyclical.IceFire) are listed at
# https://plotly.com/python/builtin-colorscales/
fig = px.scatter_mapbox(df_clean_dummies_dpto.dropna(subset=['lat', 'lon', 'price_usd_per_m2']),
                        title='Apartamentos - precio m2 (USD) y superficie total',
                        lat="lat", lon="lon", color="price_usd_per_m2", size="surface_total_in_m2",
                        color_continuous_scale=px.colors.sequential.Hot,
                        range_color=[0, 8000],
                        size_max=15, zoom=10,
                        mapbox_style="carto-positron")

fig.show()

In [None]:
# NOTE(review): this cell called pd.read_csv('') with an empty path, which cannot be
# read and only raises an error, interrupting "Run All". The call is disabled; the
# dataset is loaded into `df` at the top of the notebook.
# pd.read_csv('')

In [None]:
# Show a bounded preview rather than the whole frame: with the wide display options
# set earlier (500 columns, untruncated cell text), the bare frame is a very large output.
df.head()