In [12]:
import pycountry 
import pandas as pd
import plotly.express as px
import numpy as np

In [5]:
# Load the salaries dataset straight from the course's GitHub repository.
url_salarios = "https://raw.githubusercontent.com/guilhermeonrails/data-jobs/refs/heads/main/salaries.csv"
df = pd.read_csv(url_salarios)

In [6]:
# Display the column labels of the freshly loaded frame.
df.columns

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='str')

In [7]:
# Rename the original English column labels to the Portuguese names below.
# The result is reassigned instead of using inplace=True: in-place mutation
# brings no performance benefit and makes notebook state harder to follow.
df = df.rename(columns={
    'work_year': 'ano',
    'experience_level': 'senioridade',
    'employment_type': 'contrato',
    'job_title': 'cargo',
    'salary': 'salario',
    'salary_currency': 'moeda',
    'salary_in_usd': 'usd',
    'employee_residence': 'residencia',
    'remote_ratio': 'remoto',
    'company_location': 'empresa',
    'company_size': 'tamanho_empresa',
})

In [8]:
# Swap the seniority codes for readable labels; values that are not keys of
# the mapping are left untouched by Series.replace.
rotulos_senioridade = {
    'EN': 'Júnior',
    'MI': 'Pleno',
    'SE': 'Sênior',
    'EX': 'Executivo',
}
df["senioridade"] = df["senioridade"].replace(rotulos_senioridade)

In [9]:
# Swap the numeric values 0 / 50 / 100 of the "remoto" column for text labels.
rotulos_remoto = {
    0: 'Presencial',
    50: 'Híbrido',
    100: 'Remoto',
}
df["remoto"] = df["remoto"].replace(rotulos_remoto)

In [10]:
# Swap the one-letter company-size codes for readable labels.
rotulos_tamanho = {
    'S': 'Pequeno',
    'M': 'Médio',
    'L': 'Grande',
}
df["tamanho_empresa"] = df["tamanho_empresa"].replace(rotulos_tamanho)

In [13]:
df_salarios = pd.DataFrame({
    'nome': ["Ana",'Bruno','Carla','Daniel','Val'],
    'salario': [4000, np.nan, 5000, np.nan, 100000]
})
df_salarios['salario_media'] = df_salarios['salario'].fillna(df_salarios['salario'].mean().round(2))
df_salarios['salario_mediana'] = df_salarios['salario'].fillna(df_salarios['salario'].median().round(2))
df_salarios

Unnamed: 0,nome,salario,salario_media,salario_mediana
0,Ana,4000.0,4000.0,4000.0
1,Bruno,,36333.33,5000.0
2,Carla,5000.0,5000.0,5000.0
3,Daniel,,36333.33,5000.0
4,Val,100000.0,100000.0,100000.0


In [14]:
# Toy example: fill gaps in a sequence by propagating the previous valid value
# (ffill) or the next valid value (bfill), each stored in its own column.
df_temperaturas = pd.DataFrame(
    {
        'Dia': ["Segunda", "Terça", "Quarta", "Quinta", "Sexta"],
        'Temperatura': [30, np.nan, np.nan, 28, 27],
    }
).assign(
    preenchido_ffill=lambda tabela: tabela['Temperatura'].ffill(),
    preenchido_bfill=lambda tabela: tabela['Temperatura'].bfill(),
)
df_temperaturas

Unnamed: 0,Dia,Temperatura,preenchido_ffill,preenchido_bfill
0,Segunda,30.0,30.0,30.0
1,Terça,,30.0,28.0
2,Quarta,,30.0,28.0
3,Quinta,28.0,28.0,28.0
4,Sexta,27.0,27.0,27.0


In [15]:
# Keep only the rows that have no missing value in any column, then preview
# the first five rows of the cleaned frame.
df_limpo = df.dropna(axis='index', how='any')
df_limpo.head(5)

Unnamed: 0,ano,senioridade,contrato,cargo,salario,moeda,usd,residencia,remoto,empresa,tamanho_empresa
0,2025.0,Sênior,FT,Solutions Engineer,214000,USD,214000,US,Remoto,US,Médio
1,2025.0,Sênior,FT,Solutions Engineer,136000,USD,136000,US,Remoto,US,Médio
2,2025.0,Pleno,FT,Data Engineer,158800,USD,158800,AU,Presencial,AU,Médio
3,2025.0,Pleno,FT,Data Engineer,139200,USD,139200,AU,Presencial,AU,Médio
4,2025.0,Júnior,FT,Data Engineer,90000,USD,90000,US,Presencial,US,Médio


In [16]:
# Cast the "ano" column to pandas' nullable integer dtype (Int64).
df_limpo = df_limpo.assign(ano=lambda tabela: tabela['ano'].astype('Int64'))

In [17]:
# Helper that converts an ISO-2 country code into its ISO-3 equivalent.
def iso2_to_iso3(code):
    """Convert a two-letter (alpha-2) country code to its three-letter (alpha-3) form.

    Args:
        code: The alpha-2 code to look up, e.g. ``"BR"``. Non-string values
            (``None``, ``NaN``, numbers) are treated as "no country".

    Returns:
        The ``alpha_3`` attribute of the country found by
        ``pycountry.countries.get(alpha_2=code)``, or ``None`` when ``code``
        is not a string or no country matches.

    Raises:
        Any non-lookup error raised by pycountry is propagated instead of
        being silently turned into ``None``.
    """
    # Missing values can reach this function through Series.apply; answer
    # them explicitly rather than relying on an exception being swallowed.
    if not isinstance(code, str):
        return None

    try:
        country = pycountry.countries.get(alpha_2=code)
    except LookupError:
        # LookupError also covers KeyError, so a failed lookup maps to None
        # whichever of the two the library raises.
        return None

    # An unknown code may come back as None rather than an exception.
    return None if country is None else country.alpha_3

# Add a column with the ISO-3 code derived from each row's "residencia" value.
df_limpo['residencia_iso3'] = df_limpo['residencia'].apply(iso2_to_iso3)

# Mean USD salary per ISO-3 code, restricted to rows whose job title is
# exactly 'Data Scientist'.
is_data_scientist = df_limpo['cargo'] == 'Data Scientist'
df_ds = df_limpo.loc[is_data_scientist]
media_ds_pais = (
    df_ds
    .groupby('residencia_iso3')['usd']
    .mean()
    .reset_index()
)

# Build the choropleth map, coloured by the mean salary, and render it.
fig = px.choropleth(
    media_ds_pais,
    locations='residencia_iso3',
    color='usd',
    color_continuous_scale='rdylgn',
    title='Salário médio de Cientista de Dados por país',
    labels={
        'usd': 'Salário médio (USD)',
        'residencia_iso3': 'País',
    },
)

fig.show()

In [18]:
# Preview the first rows of df_limpo, now including the residencia_iso3 column.
df_limpo.head()

Unnamed: 0,ano,senioridade,contrato,cargo,salario,moeda,usd,residencia,remoto,empresa,tamanho_empresa,residencia_iso3
0,2025,Sênior,FT,Solutions Engineer,214000,USD,214000,US,Remoto,US,Médio,USA
1,2025,Sênior,FT,Solutions Engineer,136000,USD,136000,US,Remoto,US,Médio,USA
2,2025,Pleno,FT,Data Engineer,158800,USD,158800,AU,Presencial,AU,Médio,AUS
3,2025,Pleno,FT,Data Engineer,139200,USD,139200,AU,Presencial,AU,Médio,AUS
4,2025,Júnior,FT,Data Engineer,90000,USD,90000,US,Presencial,US,Médio,USA


In [19]:
# Write the cleaned frame to a CSV file, leaving the row index out of it.
caminho_saida = 'dados-imersao-final.csv'
df_limpo.to_csv(caminho_saida, index=False)