# Estudio final para artículo de Computación científica y analítica

In [None]:
# Installing Altair
!pip install vega altair

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import altair as alt
# Select Altair's 'default' renderer for displaying charts
alt.renderers.enable('default')

In [None]:
# Allow Altair charts to be built from DataFrames with more than 5000 rows
# by registering and enabling a transformer whose row limit is 100000.
# NOTE(review): Altair's docs also describe
# alt.data_transformers.enable('default', max_rows=...) — consider using it
# instead of a hand-registered transformer.
from altair import pipe, limit_rows, to_values


def t(data):
    """Pass `data` through limit_rows(max_rows=100000) and then to_values."""
    return pipe(data, limit_rows(max_rows=100000), to_values)


alt.data_transformers.register('custom', t)
alt.data_transformers.enable('custom')

In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Load the pseudo-Facebook CSV; the file is decoded as latin-1.
df = pd.read_csv(
    "./pseudo_facebook/pseudo_facebook.csv",
    encoding="latin-1",
)

In [None]:
# Peek at one randomly chosen row. The fixed seed keeps the displayed row
# identical on every Restart & Run All.
df.sample(random_state=42)

In [None]:
# Show the (rows, columns) dimensions of the loaded DataFrame
df.shape

## Análisis Exploratorio de Datos - EDA

In [None]:
# Look for columns that contain null / NaN values
columns_with_nulls = df.columns[df.isnull().any()]
print(f'Columnas con valores nulos: {columns_with_nulls}')

In [None]:
# Count the null values in each target column
for template, column in (('Gender: %d', 'gender'), ('Quantity Post: %d', 'qty_post')):
    print(template % df[column].isnull().sum())

In [None]:
# Replace missing post counts with 0 and store the column as int64.
# Replace missing gender values with the placeholder 'notDefined'.
df['qty_post'] = df['qty_post'].fillna(0).astype('int64')
df['gender'] = df['gender'].fillna('notDefined')
df.head()

In [None]:
# Re-check for null / NaN values and report the DataFrame size
remaining_null_columns = df.columns[df.isnull().any()]
print(f'Columnas con valores nulos: {remaining_null_columns}')
print(f'Tamaño del DataFrame: {df.shape}')

In [None]:
# Keep only users younger than 100 whose gender is 'male' or 'female'.
# Rows without a defined gender are dropped (original note: to avoid age bias).
keep_rows = (df['age'] < 100) & df['gender'].isin(['male', 'female'])
df = df[keep_rows]
df.head()

In [None]:
# Sanity check: confirm that no placeholder-gender rows survived the filter.
# The placeholder written when filling missing genders is 'notDefined'
# (lowercase "n"); comparing against 'NotDefined' could never match, so the
# check would report False even if the filter had failed.
exist_NotDefined = (df['gender'] == 'notDefined').any()
print(f'Existe algun NotDefined en el género: { exist_NotDefined }')

## Descriptive Statistics

In [None]:
# Per-gender summary: userid min/max and qty_post mean/median/std/count,
# rounded to two decimals.
(
    df.groupby(['gender'])
    .agg({
        'userid': ['min', 'max'],
        'qty_post': ['mean', 'median', 'std', 'count'],
    })
    .round(2)
)

In [None]:
# Horizontal bar chart of the number of rows per gender; the y axis is sorted
# by the x value in descending order ('-x').
alt.Chart(df).mark_bar().encode(
    x=alt.X('count():Q'),
    y=alt.Y('gender:N', sort='-x'),
    color=alt.Color('gender:N'),
    tooltip=[alt.Tooltip('count():Q')],
).properties(width=500, height=50)

In [None]:
# Box plot of qty_post for each gender, colored by gender
alt.Chart(df).mark_boxplot().encode(
    x=alt.X('gender:N'),
    y=alt.Y('qty_post:Q'),
    color=alt.Color('gender:N'),
).properties(width=1000, height=250)

In [None]:
# Interactive scatter plot of likes received via mobile (x) against likes
# received via web (y); point size encodes age and color encodes gender.
likes_tooltip = [
    alt.Tooltip('mobile_likes_received:Q'),
    alt.Tooltip('www_likes_received:Q'),
    alt.Tooltip('gender:N'),
    alt.Tooltip('age:O'),
]

alt.Chart(df).mark_point(filled=True).encode(
    x=alt.X('mobile_likes_received:Q'),
    y=alt.Y('www_likes_received:Q'),
    size=alt.Size('age:Q'),
    color=alt.Color('gender:N'),
    opacity=alt.OpacityValue(0.7),
    tooltip=likes_tooltip,
).properties(
    title='Correlación entre Likes por Móvil vs Likes por PC',
    width=600,
    height=500,
).interactive()

In [None]:
# Per-gender min and max of the likes and post-count columns, rounded to two decimals
qty_post_he = (
    df.groupby(['gender'])
    .agg({
        column: ['min', 'max']
        for column in ('likes', 'mobile_likes', 'www_likes', 'qty_post')
    })
    .round(2)
)

qty_post_he

In [None]:
def _area_by_age(column):
    """Build an unstacked, half-transparent area chart of `column` against age.

    Args:
        column: Name of the `df` column plotted on the y axis as a
            quantitative field.

    Returns:
        An Altair chart, 1000 wide and 100 high, colored by gender.
    """
    # NOTE(review): the y values are plotted without aggregation; confirm that
    # one area per gender over raw rows is the intended view.
    return alt.Chart(df).mark_area(opacity=0.5).encode(
        x="age:Q",
        y=alt.Y(f"{column}:Q", stack=None),
        color="gender:N",
    ).properties(
        height=100,
        width=1000,
    )


# One chart per metric, all sharing the same encoding and size
qty_post = _area_by_age("qty_post")
likes = _area_by_age("likes")
mobile_likes = _area_by_age("mobile_likes")
www_likes = _area_by_age("www_likes")

# Stack the four charts vertically
alt.vconcat(qty_post, likes, mobile_likes, www_likes)