In [None]:
# Load the cleaned developer-survey data produced by the EDA step.
import pandas as pd

# NOTE(review): the rendered preview of this frame lists an 'Unnamed: 0' column,
# which may mean the CSV was saved together with its index -- confirm, and if so
# consider reading it with index_col=0 so it does not leak into later outputs.
clean_csv_path = '../data/processed/devs_limpios.csv'
df = pd.read_csv(clean_csv_path)

# Preview the first rows as the cell output.
df.head()


Unnamed: 0,DevType,Country,RemoteWork,LearnCode,ConvertedCompYearly
0,Data scientist or machine learning specialist,Pakistan,"Hybrid (some remote, some in-person)",On the job training;Other online resources (e....,7322.0
1,Academic researcher,Austria,"Hybrid (some remote, some in-person)",Books / Physical media;Colleague;On the job tr...,30074.0
2,Data scientist or machine learning specialist,Turkey,Remote,Books / Physical media;Other online resources ...,91295.0
3,"Developer, back-end",France,Remote,Books / Physical media;On the job training;Oth...,53703.0
4,Student,United States of America,Remote,Books / Physical media;Colleague;On the job tr...,110000.0


In [2]:
# Cleaning and transformation
# DevType -> one binary (0/1) column per role (multi-label encoding).
from sklearn.preprocessing import MultiLabelBinarizer

# Number of most frequent roles kept as feature columns.
TOP_N_ROLES = 10

# Split the ';'-separated role string into a list of roles per row.
# The result is kept in its own Series instead of being written back to
# df['DevType'], so this cell can be re-run: the source column still holds the
# original strings every time. Rows without a DevType get an empty list (all
# role columns 0) because MultiLabelBinarizer needs an iterable for every row.
devtype_lists = df['DevType'].str.split(';').apply(
    lambda roles: roles if isinstance(roles, list) else []
)

# One indicator column per distinct role, aligned with df's index so the
# frame can later be concatenated column-wise with df.
mlb = MultiLabelBinarizer()
devtype_dummies = pd.DataFrame(mlb.fit_transform(devtype_lists),
                               columns=mlb.classes_,
                               index=df.index)

# Keep only the most common roles (column sums = number of rows having the role).
top_roles = devtype_dummies.sum().sort_values(ascending=False).head(TOP_N_ROLES).index
devtype_dummies = devtype_dummies[top_roles]


In [3]:
# Country -> keep the most frequent countries and fold every other value into "Other".
TOP_N_COUNTRIES = 15
OTHER_COUNTRY = 'Other'

# Rank only values that are not already the "Other" bucket. This keeps the cell
# idempotent: on a re-run the bucket created below could otherwise rank among
# the top values and push one more genuine country out of the list.
# NOTE(review): assumes the input has no country literally named 'Other' -- confirm.
not_other = df['Country'] != OTHER_COUNTRY
top_countries = df.loc[not_other, 'Country'].value_counts().head(TOP_N_COUNTRIES).index

# Vectorised replacement: values outside top_countries (missing ones included,
# since value_counts does not count them) become "Other".
df['Country'] = df['Country'].where(df['Country'].isin(top_countries), OTHER_COUNTRY)


In [4]:
# RemoteWork -> binary flag: 1 when the answer text mentions "remote", otherwise 0.
# NOTE(review): this is a case-insensitive substring match, so an answer such as
# "Hybrid (some remote, some in-person)" is flagged 1 as well -- confirm that
# hybrid work is meant to count as remote.
mentions_remote = (
    df['RemoteWork']
    .astype(str)
    .str.lower()
    .str.contains('remote', regex=False, na=False)  # literal match; missing -> False
)
df['Remote'] = mentions_remote.astype('int64')


In [5]:
# LearnCode -> binary flag: 1 when the (lower-cased) answer contains 'self-taught'.
# NOTE(review): the LearnCode values visible in the data preview (truncated) do
# not show a 'self-taught' token -- confirm the column actually contains it,
# otherwise SelfTaught will be 0 for every row.
df['LearnCode'] = df['LearnCode'].str.lower()
reports_self_taught = (
    df['LearnCode']
    .astype(str)
    .str.contains('self-taught', regex=False, na=False)  # literal match; missing -> False
)
df['SelfTaught'] = reports_self_taught.astype('int64')


In [6]:
# Put everything together: append the role indicator columns to df, then
# discard the raw text columns that the encoded features replace.
raw_text_columns = ['DevType', 'LearnCode', 'RemoteWork']
df_final = pd.concat([df, devtype_dummies], axis=1).drop(columns=raw_text_columns)


In [7]:
# Optional: remove extreme salary outliers.
# Only rows whose salary lies between the 5th and 95th percentile (both bounds
# included) are kept.
# NOTE: df_final is reassigned here, so running this cell again recomputes the
# percentiles on the already-trimmed data and trims a second time.
salary = df_final['ConvertedCompYearly']
q_low = salary.quantile(0.05)
q_high = salary.quantile(0.95)
df_final = df_final[salary.between(q_low, q_high)]


In [None]:
# Write the final dataset to disk; the row index is not written.
final_csv_path = '../data/processed/devs_final.csv'
df_final.to_csv(final_csv_path, index=False)

# What does devs_final.csv contain?
# - ConvertedCompYearly: yearly salary.
# - Country, SelfTaught, Remote: cleaned variables.
# - Roles as binary columns: one 0/1 column per retained DevType role.