INSTALACIÓN DE LIBRERÍAS Y CARGA DE DATOS

In [5]:
# ============================================
# Taller 1 – Estadística Probabilística
# Exploración y visualización del dataset DS Salaries
# Google Colab + Drive
# ============================================


In [6]:
# Instalamos librerías necesarias en Colab
!pip install pandas openpyxl

# Imports básicos
import pandas as pd
import numpy as np

# Conexión con Google Drive para acceder al archivo
from google.colab import drive
drive.mount('/content/drive')

# Carga del archivo desde Google Drive
salarios = pd.read_excel("/content/drive/MyDrive/ds_salaries.xlsx", sheet_name='ds_salaries')
salarios


Mounted at /content/drive


Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L
...,...,...,...,...,...,...,...,...,...,...,...,...
602,602,2022,SE,FT,Data Engineer,154000,USD,154000,US,100,US,M
603,603,2022,SE,FT,Data Engineer,126000,USD,126000,US,100,US,M
604,604,2022,SE,FT,Data Analyst,129000,USD,129000,US,0,US,M
605,605,2022,SE,FT,Data Analyst,150000,USD,150000,US,100,US,M


EXPLORACIÓN INICIAL Y LIMPIEZA

In [None]:
# Basic information about the DataFrame as loaded
salarios.info()

# Drop the unneeded 'Unnamed: 0' column.
# The result is rebound instead of using inplace=True, and errors='ignore'
# makes the cell safe to re-run: a second execution no longer raises KeyError
# because the column has already been removed.
salarios = salarios.drop(columns=['Unnamed: 0'], errors='ignore')
salarios.info()

# Dataset size as (rows, columns)
salarios.shape


NORMALIZACIÓN DE VARIABLES CATEGÓRICAS

In [7]:
# Translate numeric values / short codes into readable labels.
# One mapping per column: {column: {code: label}}.
label_maps = {
    'remote_ratio': {
        100: 'remote',
        50: 'partially remote',
        0: 'on-site',
    },
    'experience_level': {
        'EN': 'Entry',
        'MI': 'Mid',
        'SE': 'Senior',
        'EX': 'Executive',
    },
    'employment_type': {
        'PT': 'Part-time',
        'FT': 'Full_time',
        'CT': 'Contract',
        'FL': 'Freelance',
    },
    'company_size': {
        'S': 'less than 50 employees',
        'M': '50 to 250 employees',
        'L': 'more than 250 employees',
    },
}

# A single DataFrame.replace with a nested dict replaces each code only in its
# own column and returns a new frame. This avoids calling
# `salarios.<column>.replace(..., inplace=True)`, a chained in-place update on
# a column accessor that pandas warns about and that does not modify the
# DataFrame when Copy-on-Write is enabled.
salarios = salarios.replace(label_maps)

# Preview
salarios.head()


Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,Mid,Full_time,Data Scientist,70000,EUR,79833,DE,on-site,DE,more than 250 employees
1,1,2020,Senior,Full_time,Machine Learning Scientist,260000,USD,260000,JP,on-site,JP,less than 50 employees
2,2,2020,Senior,Full_time,Big Data Engineer,85000,GBP,109024,GB,partially remote,GB,50 to 250 employees
3,3,2020,Mid,Full_time,Product Data Analyst,20000,USD,20000,HN,on-site,HN,less than 50 employees
4,4,2020,Senior,Full_time,Machine Learning Engineer,150000,USD,150000,US,partially remote,US,more than 250 employees


CONFIGURACIÓN DE VISUALIZACIÓN

In [8]:
# Librerías de visualización
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Estilos gráficos
%matplotlib inline
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.facecolor'] = '#00000000'

import warnings
warnings.filterwarnings("ignore")


ANÁLISIS EXPLORATORIO Y VISUALIZACIONES

DISTRIBUCIÓN DE NIVELES DE EXPERIENCIA

In [9]:
# Number of rows per experience level.
# The original expression was not the last statement of the cell, so its
# result was computed and silently discarded; display() makes it visible.
display(salarios.groupby(['experience_level']).size())

# Bar chart of the same counts, one bar per experience level.
fig = px.histogram(
    data_frame=salarios,
    x="experience_level",
    # NOTE(review): the palette is wrapped in an extra list, so the whole
    # Aggrnyl sequence is passed as a single entry — confirm this is intended.
    color_discrete_sequence=[px.colors.sequential.Aggrnyl],
    template='plotly_white',
    title="Distribution of Experience Level",
    text_auto='.2f',
)

# Axis titles, and categories ordered from the largest total to the smallest.
fig.update_layout(
    yaxis_title="Count",
    xaxis_title="Experience Level",
    xaxis={'categoryorder': 'total descending'},
)
fig.show()


DISTRIBUCIÓN POR TIPO DE CONTRATO

In [10]:
# Number of rows per employment type.
# The original expression was not the last statement of the cell, so its
# result was computed and silently discarded; display() makes it visible.
display(salarios.groupby(['employment_type']).size())

# Bar chart of the same counts, one bar per employment type.
fig = px.histogram(
    data_frame=salarios,
    x="employment_type",
    # NOTE(review): the palette is wrapped in an extra list, so the whole
    # Aggrnyl sequence is passed as a single entry — confirm this is intended.
    color_discrete_sequence=[px.colors.sequential.Aggrnyl],
    template='plotly_white',
    title="Distribution of employment_type",
    text_auto='.2f',
)

# Axis titles, and categories ordered from the largest total to the smallest.
fig.update_layout(
    yaxis_title="Count",
    xaxis_title="employment_type",
    xaxis={'categoryorder': 'total descending'},
)
fig.show()


TABLA CRUZADA (EXPERIENCIA X TIPO DE EMPLEO)

In [11]:
# Contingency table: experience level (rows) by employment type (columns),
# with an extra "Total" row and column holding the marginal counts.
tabla_cruzada = pd.crosstab(
    salarios['experience_level'],
    salarios['employment_type'],
    margins=True,
    margins_name="Total",
)
tabla_cruzada


employment_type,Contract,Freelance,Full_time,Part-time,Total
experience_level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Entry,2,0,79,7,88
Executive,1,0,25,0,26
Mid,1,3,206,3,213
Senior,1,1,278,0,280
Total,5,4,588,10,607


HISTOGRAMAS AGRUPADOS

In [12]:
# Grouped bar chart: experience level on the x axis, one bar per employment
# type, with bar heights expressed as percentages instead of raw counts.
# NOTE(review): presumably the percentages are normalised within each
# employment type rather than within each experience level — confirm.
fig = px.histogram(
    salarios,
    x="experience_level",
    color="employment_type",
    barmode="group",
    histnorm="percent",
    color_discrete_sequence=px.colors.sequential.Aggrnyl,
    template="plotly_white",
    title="Distribution of Experience Level/Employment Type %",
    text_auto=".2f",
)

# Axis titles, and categories ordered from the largest total to the smallest.
layout_options = {
    "yaxis_title": "Percent",
    "xaxis_title": "Experience Level",
    "xaxis": {"categoryorder": "total descending"},
}
fig.update_layout(**layout_options)
fig.show()
