# Entendiendo nuestros datos

In [None]:
# Import libraries
import pandas as pd
from optimus import Optimus
# Create the Optimus entry point; `op` is used below to read the dataset.
op = Optimus()

In [None]:
# Read the data
# Data from http://rpubs.com/rhuebner/HRCodebook-13
df = op.read.csv("data/hr-data.csv", header=True)

In [None]:
# See the data
df.table()

In [None]:
# Número de datos faltantes por columna
from pyspark.sql.functions import when, count, col, isnull

df.select([count(when(isnull(c), c)).alias(c) for c in df.columns]).table()

In [None]:
df = df.dropna(how="all")

In [None]:
df.select([count(when(isnull(c), c)).alias(c) for c in df.columns]).table()

In [None]:
df.printSchema()

In [None]:
integer_cols = ["MaritalStatusID", "EmpStatusID", "DeptID", "PerfScoreID", "PositionID", "Termd", "ManagerID", 
                "EmpSatisfaction", "SpecialProjectsCount", "DaysLateLast30"]

In [None]:
for col_name in integer_cols:
    df = df.withColumn(col_name, col(col_name).cast('int'))

In [None]:
float_cols = ["PayRate", "EngagementSurvey"]

In [None]:
for col_name in float_cols:
    df = df.withColumn(col_name, col(col_name).cast('float'))

In [None]:
df.printSchema()

## Demográficos 

In [None]:
# Number of employees (row count of the DataFrame)
df.count()

In [None]:
# IPython-only `?` suffix: shows the help for `years_between`.
# This line is not valid plain Python; it only works inside IPython/Jupyter.
df.cols.years_between?

In [None]:
# Compute the age in years from the "DOB" column and store it in "Age".
# The pattern must use "MM" (month-of-year): in Spark/Java date patterns
# lowercase "mm" means minute-of-hour, so "mm/dd/yy" would read the month
# field of DOB as minutes and skew the resulting ages.
df = df.cols.years_between("DOB", date_format="MM/dd/yy", output_cols="Age")

In [None]:
# For plots let's use pandas
df_pd = df.toPandas()

In [None]:
df.plot.hist("Age", buckets=20)

In [None]:
# Using plotly
# hist
import plotly.express as px
px.histogram(df_pd, "Age", nbins=100)

In [None]:
# Gender
px.histogram(df_pd, x="Sex")

In [None]:
# Estado civil
px.histogram(df_pd, x="MaritalDesc")

In [None]:
# Raza
px.histogram(df_pd, x="RaceDesc")

## Información sobre empleo

In [None]:
# Departamento
px.histogram(df_pd, "Department")

In [None]:
# Fuente de empleo
px.histogram(df_pd, "RecruitmentSource")

In [None]:
# Satisfacción
px.histogram(df_pd, "EmpSatisfaction", nbins=10)

In [None]:
# Proyectos especiales
px.histogram(df_pd, "SpecialProjectsCount", nbins=10)

In [None]:
# Performance 
px.histogram(df_pd, "PerformanceScore")

## Relaciones

In [None]:
df.table()

In [None]:
px.histogram(df_pd, x="Department", color="Termd", barmode="group")

In [None]:
px.histogram(df_pd, x="MaritalDesc", color="Termd", barmode="group")

In [None]:
px.histogram(df_pd, x="RecruitmentSource", color="Termd", barmode="group")

In [None]:
px.histogram(df_pd, x="Age", color="Termd", barmode="group")

In [None]:
px.histogram(df_pd, x="PayRate", color="Termd", barmode="group")

In [None]:
px.scatter(df_pd, x="PayRate", y="EmpSatisfaction")

In [None]:
px.histogram(df_pd, "PerformanceScore", color="Termd", barmode="group")

In [None]:
px.scatter(df_pd, x="PayRate", y="Age")

In [None]:
px.bar(df_pd, x="ManagerName", y="PerformanceScore", color="PerformanceScore")

## Información departamental

In [None]:
df.table()

In [None]:
# Cuántos empleados por departamento
df.groupby("Department").count().table()

In [None]:
# Salario promedio por departamento
df.groupby("Department").avg("PayRate").table()

In [None]:
# Edad promedio por departamento
df.groupby("Department").avg("Age").table()

In [None]:
# Sexo por departamento
px.histogram(df_pd,x="Sex", color="Department")

In [None]:
# Activos por departamento
px.histogram(df_pd,x="Department", color="Termd")