### Imports

In [1]:
import json
import os
import re
import time

import numpy as np
import pandas as pd
import requests as rq
import unidecode
from scipy.stats import spearmanr, pearsonr

# Raise pandas' display limits (up to 200 columns and 500 sequence items)
# so wide frames and long sequences are shown without truncation.
pd.options.display.max_columns = 200
pd.options.display.max_seq_items = 500

In [2]:
# Static plotting stack: matplotlib rendered inline, styled through seaborn.
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style('darkgrid')
sns.set(font_scale=1.4)

# Interactive plotting stack (plotly), initialised for offline notebook use.
# NOTE(review): the recorded output of this cell is a ModuleNotFoundError for
# 'plotly' — the package has to be installed before the map cell can run.
# NOTE(review): `py` is not referenced in the visible part of the notebook.
import plotly.offline as offline
import plotly.plotly as py
import plotly.graph_objs as go

offline.init_notebook_mode()

ModuleNotFoundError: No module named 'plotly'

### Google API para lat y lon de aglomerados

In [None]:
def geodecode(local, timeout=10):
    """Look up the coordinates of a place through the Google Geocoding endpoint.

    Parameters
    ----------
    local : str
        Free-text place name or address to geocode.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up, so a stalled
        connection cannot hang the notebook.

    Returns
    -------
    tuple
        ``(lat, lng)`` taken from the first result in the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    LookupError
        If the response carries no results (for example when the query is
        refused); the message includes the ``status`` field of the response.
    """
    url = 'https://maps.googleapis.com/maps/api/geocode/json'
    headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Mobile Safari/537.36'}
    # Let requests build the query string: it escapes every reserved or
    # non-ASCII character, not only the spaces.
    resp = rq.get(url, params={'address': local}, headers=headers, timeout=timeout)
    print(resp.url)
    resp.raise_for_status()
    data = resp.json()
    results = data.get('results')
    if not results:
        raise LookupError(
            'no geocoding results for {!r} (status: {})'.format(local, data.get('status'))
        )
    # Read the two coordinates by key instead of relying on dict ordering.
    location = results[0]['geometry']['location']
    return location['lat'], location['lng']

In [None]:
# Agglomeration table, one "code = name" entry per line. The text is passed
# through unidecode before being parsed (presumably to turn accented letters
# and en dashes into plain ASCII for the regex and the geocoding queries).
s = unidecode.unidecode(""" 02 = Gran La Plata
 03 = Bahía Blanca - Cerri
 04 = Gran Rosario
 05 = Gran Santa Fé
 06 = Gran Paraná
 07 = Posadas
 08 = Gran Resistencia
 09 = Cdro. Rivadavia - R.Tilly
 10 = Gran Mendoza
 12 = Corrientes
 13 = Gran Córdoba
 14 = Concordia
 15 = Formosa
 17 = Neuquén – Plottier
 18 = S.del Estero - La Banda
 19 = Jujuy - Palpalá
 20 = Río Gallegos
 22 = Gran Catamarca
 23 = Salta
 25 = La Rioja
 26 = San Luis - El Chorrillo
 27 = Gran San Juan
 29 = Gran Tucumán - T. Viejo
 30 = Santa Rosa - Toay
 31 = Ushuaia - Río Grande
 32 = Ciudad de Bs As
 33 = Partidos del GBA
 34 = Mar del Plata - Batán
 36 = Río Cuarto
 38 = San Nicolás – Villa Constitución
 91 = Rawson – Trelew
 93 = Viedma – Carmen de Patagones
 """)

In [None]:
# Map each agglomeration name to a one-element list holding its code (kept as
# a string); the geocoding step later appends the coordinates to that list.
# The pattern is a raw string so `\d`, `\s` and `\w` reach the regex engine
# without being treated as (invalid) string escapes, and names are stripped
# so stray surrounding whitespace never ends up in a key.
aglomerados = {
    name.strip(): [code]
    for code, name in re.findall(r'(\d+)\s=\s([\w\s/./-]+?)\n', s)
}

In [None]:
# Switch the country suffix from ' argentina' to ' arg' after a failure, to
# work around Google blocking the queries.
MAX_GEOCODE_ROUNDS = 5  # upper bound on retry passes so the cell always terminates

for _ in range(MAX_GEOCODE_ROUNDS):
    # Entries that still hold only their code have not been geocoded yet.
    # Working from this list (instead of a counter) keeps the cell safe to
    # re-run once some or all entries are already filled in.
    pending = [t for t, vals in aglomerados.items() if len(vals) == 1]
    if not pending:
        break
    offset = ' argentina'
    for t in pending:
        try:
            aglomerados[t].extend(geodecode(t + offset))
            print(t, 'ok')
        except Exception as exc:
            # Show why the lookup failed, then retry later with the other suffix.
            print('err', t, repr(exc))
            offset = ' arg'
        time.sleep(2)

# Number of agglomerations that now carry coordinates.
coords = sum(len(vals) > 1 for vals in aglomerados.values())
if coords < len(aglomerados):
    missing = [t for t, vals in aglomerados.items() if len(vals) == 1]
    raise RuntimeError(
        'geocoding failed after {} rounds for: {}'.format(MAX_GEOCODE_ROUNDS, ', '.join(missing))
    )

In [None]:
# One row per agglomeration name, with its code and coordinates as columns;
# the code arrives as text from the regex, so convert it to a number.
coords_df = pd.DataFrame.from_dict(aglomerados, orient='index')
coords_df.columns = 'cod, lat, lon'.split(', ')
coords_df = coords_df.assign(cod=pd.to_numeric(coords_df['cod']))

In [None]:
# Persist the geocoded table to 'coord_df.json' in the working directory
# (presumably so the lookups do not have to be repeated in later sessions).
coords_df.to_json('coord_df.json')

## EPH

In [None]:
# Reload the agglomeration codes/coordinates table from 'coord_df.json'.
coords_df = pd.read_json('coord_df.json')

In [None]:
# Location of the ';'-separated individual-level EPH file. It can be
# overridden through the EPH_INDIVIDUAL_PATH environment variable so the
# notebook is not tied to one user's machine; the original path is the default.
fname = os.environ.get(
    'EPH_INDIVIDUAL_PATH',
    r'C:\Users\mgrinberg\Downloads\EPH_usu_1_Trim_2018_txt\usu_individual_t118.txt',
)
df = pd.read_csv(fname, sep=';')

In [None]:
# Treat empty strings as missing values.
df = df.replace('', np.nan)

In [None]:
# Show 10 random rows as a quick look at the loaded data.
df.sample(10)

In [None]:
# Short aliases for the survey columns used in the analysis below.
# NOTE(review): the glosses are read off the Spanish alias names only —
# confirm each variable's definition and codes against the EPH codebook.
edu = 'NIVEL_ED'  # education level
edad = 'CH06'  # age
parentesco = 'CH03'  # kinship / relationship
analf = 'CH09'  # used below for the illiteracy share
busco_trab = 'PP10A'  # "looked for work"

In [None]:
# Keep only respondents aged 18 to 40, both ends included.
df = df[df[edad].between(18, 40)]

In [None]:
# Author's note: the mean age is similar across agglomerations.
# Share of respondents per agglomeration whose education code is <= 3,
# relative to all respondents with a non-null education code.
edu_aglo = (
    df.loc[df[edu] <= 3].groupby('AGLOMERADO')[edu].count()
    .div(df.groupby('AGLOMERADO')[edu].count())
)
coords_df = coords_df.join(edu_aglo, on='cod')

In [None]:
# Share of respondents per agglomeration whose `analf` answer equals 2,
# relative to all respondents with a non-null answer.
analf_aglo = (
    df.loc[df[analf] == 2].groupby('AGLOMERADO')[analf].count()
    .div(df.groupby('AGLOMERADO')[analf].count())
)
coords_df = coords_df.join(analf_aglo, on='cod')

In [None]:
# Distribution of the per-agglomeration illiteracy share, using 8 bins.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases —
# confirm the installed version still provides it.
sns.distplot(tuple(analf_aglo), bins=8);

In [None]:
# Sample size per agglomeration: non-null count of the first column within
# each group, drawn as a distribution with 8 bins.
sns.distplot(
    df.groupby('AGLOMERADO').count().iloc[:, 0].sort_values(),
    bins=8,
)

In [None]:
# Unemployment share: per agglomeration, rows with a non-null `busco_trab`
# value divided by all rows (both counted on the frame's first column).
desempl_aglo = (
    df[df[busco_trab].notnull()].groupby('AGLOMERADO').count().iloc[:, 0]
    .div(df.groupby('AGLOMERADO').count().iloc[:, 0])
)
coords_df = coords_df.join(desempl_aglo.rename(busco_trab), on='cod')

In [None]:
# Human-readable indicator names, applied to both the per-agglomeration
# table and the individual-level frame.
x, y, z = 'Desempleo', 'Analfabetismo', 'Secundaria Incompleta'
_indicator_names = {busco_trab: x, analf: y, edu: z}
coords_df = coords_df.rename(columns=_indicator_names)
df = df.rename(columns=_indicator_names)

In [None]:
# One horizontal bar chart per indicator, all sharing the agglomeration axis.
# Rows are ordered by the first indicator (x) so the three panels line up.
fig, ax = plt.subplots(1, 3, figsize=(13, 8), sharey=True)

# Reassign instead of sorting in place, so the step is explicit and re-runnable.
coords_df = coords_df.sort_values(x)

# (column to plot, x-axis label) for each panel, left to right.
panels = [(x, x), (y, y), (z, '% Secundario incompleto')]
for axis, (col, label) in zip(ax, panels):
    # Pass the data vectors by keyword: recent seaborn releases no longer
    # accept them positionally.
    sns.barplot(x=coords_df[col], y=coords_df.index, palette='copper_r', ax=axis)
    axis.set_xlabel(label, fontsize=14)
    plt.setp(axis.xaxis.get_majorticklabels(), rotation=45, fontsize=12)

plt.tight_layout(pad=0)

In [None]:
# Show 5 random rows of the per-agglomeration table.
coords_df.sample(5)

In [None]:
# Spearman rank correlation between the individual-level variables, computed
# on the rows that have no missing value in any of them.
cols = [x, y, z, parentesco, edad]
X = df[cols].dropna()
corrs, ps = spearmanr(X.values)

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(17, 6))
ax_corr, ax_pval = ax

# Left panel: correlation coefficients.
sns.heatmap(corrs, ax=ax_corr, cmap='Blues', cbar=False, annot=True,
            annot_kws={"size": 20}, linewidths=1)
ax_corr.set_title('Tamaño de efecto, r Spearman')
ax_corr.set_xticklabels(cols, rotation=30)
ax_corr.set_yticklabels(cols, rotation=30)

# Right panel: the matching p-values; its row labels are blanked.
sns.heatmap(ps, ax=ax_pval, cmap='Blues_r', annot=True,
            annot_kws={"size": 20}, linewidths=1)
ax_pval.set_title('P-Valor')
ax_pval.set_xticklabels(cols, rotation=30)
ax_pval.set_yticklabels([])

plt.tight_layout()

In [None]:
def corrcoef_loop(matrix):
    """Pairwise Pearson correlations between the rows of ``matrix``.

    Parameters
    ----------
    matrix : array-like of shape (n_variables, n_observations)
        Each row is one variable; rows are correlated with each other.

    Returns
    -------
    r : numpy.ndarray of shape (n_variables, n_variables)
        Symmetric matrix of Pearson coefficients, with ones on the diagonal.
    p : numpy.ndarray of shape (n_variables, n_variables)
        Symmetric matrix of the matching p-values. The diagonal is zero (a
        variable against itself), which agrees with the matrix that
        ``spearmanr`` returns.
    """
    rows = matrix.shape[0]
    r = np.ones(shape=(rows, rows))
    # Zeros, not ones: a p-value of 1 on the diagonal would wrongly read as
    # "no evidence of correlation" for a variable with itself.
    p = np.zeros(shape=(rows, rows))
    for i in range(rows):
        # Only the upper triangle is computed; both halves are filled by symmetry.
        for j in range(i+1, rows):
            r_, p_ = pearsonr(matrix[i], matrix[j])
            r[i, j] = r[j, i] = r_
            p[i, j] = p[j, i] = p_
    return r, p

In [None]:
# corrcoef_loop correlates rows, so hand it the variables as rows
# (the transpose of the observations-by-variables array).
corrs, ps = corrcoef_loop(X.values.T)

In [None]:
# Same two-panel layout as the Spearman figure, drawn from the current
# `corrs` / `ps` matrices: coefficients on the left, p-values on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(17, 6))
ax_corr, ax_pval = ax

# Left panel: correlation coefficients.
sns.heatmap(corrs, ax=ax_corr, cmap='Purples', cbar=False, annot=True,
            annot_kws={"size": 20}, linewidths=1)
ax_corr.set_title('Tamaño de efecto, p Pearson')
ax_corr.set_xticklabels(cols, rotation=30)
ax_corr.set_yticklabels(cols, rotation=30)

# Right panel: p-values; its row labels are blanked.
sns.heatmap(ps, ax=ax_pval, cmap='Purples_r', annot=True,
            annot_kws={"size": 20}, linewidths=1)
ax_pval.set_title('P-Valor')
ax_pval.set_xticklabels(cols, rotation=30)
ax_pval.set_yticklabels([])

plt.tight_layout()

In [None]:
# Initial centre of the map view (presumably chosen to frame Argentina).
lat = -40
lon = -66
# Mapbox access token: read from the MAPBOX_ACCESS_TOKEN environment variable
# when it is set, otherwise fall back to the token the notebook shipped with.
api_k = os.environ.get(
    'MAPBOX_ACCESS_TOKEN',
    'pk.eyJ1IjoiY2hyaWRkeXAiLCJhIjoiY2lxMnVvdm5iMDA4dnhsbTQ5aHJzcGs0MyJ9.X9o_rzNLNesDxdra4neC_A',
)


def _indicator_trace(col, size_factor, colorscale, opacity, colorbar_xpad,
                     reversescale=False, **marker_extra):
    """Build the bubble-map trace for one indicator column of ``coords_df``.

    Parameters
    ----------
    col : str
        Column of ``coords_df`` to plot; also used as the trace name and as
        the colour-bar title.
    size_factor : float
        Multiplier that turns the indicator value into a marker size.
    colorscale : str or list
        Plotly colour scale for the markers.
    opacity : float
        Marker opacity.
    colorbar_xpad : int
        Horizontal padding of the colour bar, so the bars of the different
        traces sit side by side instead of overlapping.
    reversescale : bool, optional
        Whether to reverse the colour scale.
    **marker_extra
        Additional marker attributes (e.g. ``symbol``).

    Returns
    -------
    plotly.graph_objs.Scattermapbox
    """
    # Hover label: agglomeration name, then the indicator as a percentage
    # with three decimals.
    pct = coords_df[col].apply(lambda v: v*100).apply('{0:.3f}'.format).astype(str)
    return go.Scattermapbox(
        lat=coords_df['lat'],
        lon=coords_df['lon'],
        mode='markers',
        name=col,
        marker=dict(
            sizemin=6,
            size=coords_df[col]*size_factor,
            color=coords_df[col],
            colorscale=colorscale,
            reversescale=reversescale,
            opacity=opacity,
            showscale=True,
            colorbar=dict(
                title=col,
                titleside='right',
                x=1,
                xpad=colorbar_xpad,
                tickmode='auto',
                tickangle=20,
            ),
            **marker_extra
        ),
        text=coords_df.index.values + '<br> {}: %'.format(col) + pct,
        hoverinfo='text',
    )


# One trace per indicator; size factors and colour-bar paddings differ so the
# three layers stay readable when drawn on the same map.
data = [
    _indicator_trace(x, 500, 'Blues', 0.8, 15, reversescale=True),
    _indicator_trace(
        y, 2000,
        [[0, 'rgb(255,200,200)'], [0.5, 'rgb(255,100,100)'], [1, 'rgb(150,0,0)']],
        0.6, 110,
    ),
    _indicator_trace(
        z, 100,
        [[0, 'rgb(210,235,215)'], [0.5, 'rgb(10,180,60)'], [1, 'rgb(50,120,50)']],
        0.6, 208,
        symbol='circle',
    ),
]

layout = go.Layout(
    title='Indicadores relacionados con el mercado laboral en Argentina',
    autosize=True,
    hovermode='closest',
    height=800,
    legend=dict(x=-.25, y=1),
    mapbox=dict(
        accesstoken=api_k,
        bearing=0,
        center=dict(lat=lat, lon=lon),
        pitch=0,
        zoom=3.2,
        style='satellite-streets',
    ),
)

fig = dict(data=data, layout=layout)
offline.iplot(fig, filename='d3-cloropleth-map')