In [73]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
import xgboost as xgb

with open('../Pickles/data_2023.pickle', 'rb') as archivo:
    df1 = pickle.load(archivo)
with open('../Pickles/data_2024.pickle', 'rb') as archivo:
    df = pickle.load(archivo)
df = pd.concat([df1, df], ignore_index=True, join='inner')
# El modelo lo entrenare solo con estas columnas
df = df[['YearsCodePro', 'LearnCodeOnline', 'DevType', 'LearnCode', 'CodingActivities', 
         'DatabaseHaveWorkedWith', 'YearsCode', 'LanguageWantToWorkWith', 
         'LanguageHaveWorkedWith',  'EdLevel', 'Employment', 'ToolsTechHaveWorkedWith', 
           'AISent',  'Industry', 'Frequency_2', 'Frequency_1', 'CompTotal']]
# YearsCodePro
moda = df['YearsCodePro'].mode()[0]
df['YearsCodePro'] = df['YearsCodePro'].fillna(moda)
df['YearsCodePro']=df['YearsCodePro'].replace('Less than 1 year', 0)
df['YearsCodePro']=df['YearsCodePro'].astype(int)
# YearsCode
moda = df['YearsCode'].mode()[0]
df['YearsCode'] = df['YearsCode'].fillna(moda)
df['YearsCode'] = df['YearsCode'].replace('Less than 1 year', 0)
df['YearsCode'] = df['YearsCode'].replace('More than 50 years', 50)
df['YearsCode']=df['YearsCode'].astype(int)


# Funciones:
def process_multiple_categories(df, category_column, target_column, separator=','):
    """
    Realiza target encoding para columnas con múltiples valores separados por un delimitador.
    """
    # Llenar valores NaN en la columna categórica con 'Unknown'
    df[category_column] = df[category_column].fillna('Unknown')
    
    # Expandir los valores separados por comas en listas
    df[category_column] = df[category_column].apply(lambda x: x.split(separator) if isinstance(x, str) else [x])
    
    # Crear un DataFrame temporal para aplanar las listas
    exploded_df = df.explode(category_column)
    
    # Calcular el promedio del target por categoría
    target_map = exploded_df.groupby(category_column)[target_column].mean().to_dict()
    
    # Función para calcular el promedio de los valores codificados para una fila
    def calculate_row_encoding(categories):
        encoded_values = [target_map.get(cat, 0) for cat in categories]  # 0 para categorías desconocidas
        return sum(encoded_values) / len(encoded_values) if encoded_values else 0
    
    # Crear la nueva columna con el promedio del encoding
    df[f'{category_column}_encoded'] = df[category_column].apply(calculate_row_encoding)
    
    return df, target_map
#LearnCodeOnline
df, target_map = process_multiple_categories(df, 'LearnCodeOnline', 'CompTotal')
df = df.drop(['LearnCodeOnline'], axis=1, errors='ignore')
# DevType
df, target_map = process_multiple_categories(df, 'DevType', 'CompTotal')
df = df.drop(['DevType'], axis=1, errors='ignore')
# LearnCode
df, target_map = process_multiple_categories(df, 'LearnCode', 'CompTotal')
df = df.drop(['LearnCode'], axis=1, errors='ignore')
# CodingActivities
df, target_map = process_multiple_categories(df, 'CodingActivities', 'CompTotal')
df = df.drop(['CodingActivities'], axis=1, errors='ignore')

# # Crear un diccionario para almacenar los target_maps
# target_map_learncodeonline = {}

# # Lista de columnas a procesar
# columns_to_encode = ['LearnCodeOnline', 'DevType', 'LearnCode', 'CodingActivities']
# process_multiple_categories(df, category_column='LearnCodeOnline', target_column='CompTotal')

# # Aplicar la función a cada columna y guardar el target_map
# for column in columns_to_encode:
#     df, target_map = process_multiple_categories(df, category_column=column, target_column='CompTotal')
#     target_maps[column] = target_map  # Guardar el target_map en el diccionario


def process_and_encode(df, columns):
    for column in columns:
        # Separamos las categorías por el delimitador ";"
        df[column] = df[column].fillna('').str.split(';')

        # Creamos un objeto MultiLabelBinarizer
        mlb = MultiLabelBinarizer()

        # Aplicamos MultiLabelBinarizer a las categorías separadas
        encoded_values = mlb.fit_transform(df[column])

        # Creamos un DataFrame con los valores codificados
        encoded_df = pd.DataFrame(encoded_values, columns=[f"{column}_{c}" for c in mlb.classes_], index=df.index)

        # Añadimos los resultados al DataFrame copiado
        df = pd.concat([df, encoded_df], axis=1)

        # Eliminamos la columna original del DataFrame copiado
        df.drop(column, axis=1, inplace=True)

    return df



# DatabaseHaveWorkedWith, LanguageWantToWorkWith, LanguageHaveWorkedWith, ToolsTechHaveWorkedWith
columns_to_encode = ['DatabaseHaveWorkedWith', 'LanguageWantToWorkWith', 'LanguageHaveWorkedWith', 'ToolsTechHaveWorkedWith']
df = process_and_encode(df, columns_to_encode)

# EdLevel
labels5 = {
    'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)': 5,
    'Bachelor’s degree (B.A., B.S., B.Eng., etc.)': 4,
    'Professional degree (JD, MD, Ph.D, Ed.D, etc.)': 6,
    'Some college/university study without earning a degree': 2, 
    'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)': 1,
    'Associate degree (A.A., A.S., etc.)': 3, 
    'Something else': -1,
    'Primary/elementary school': 0}
df['EdLevel'] = df['EdLevel'].map(labels5).fillna(-1)
#Employment
df['Employment'] = df['Employment'].replace('Retired', 'I prefer not to say')
df['is_full_time'] = df['Employment'].str.contains('Employed, full-time').fillna(False).astype(int)
df['is_part_time'] = df['Employment'].str.contains('Employed, part-time').fillna(False).astype(int)
df['is_independent'] = df['Employment'].str.contains('Independent contractor, freelancer, or self-employed').fillna(False).astype(int)
df['num_jobs'] = df['Employment'].str.split(';').str.len().fillna(0).astype(int)
df['is_other_employment'] = ((df['is_full_time'] == 0) & (df['is_part_time'] == 0) &
                            (df['is_independent'] == 0)).astype(int)
df.drop('Employment', axis=1, inplace=True)
#AISent
df['AISent'] = df['AISent'].fillna('Unsure')
labels61 = {
    'Very favorable': 5, 
    'Favorable': 4, 
    'Indifferent': 3, 
    'Unfavorable': 2,
    'Very unfavorable': 1,
    'Unsure': 0}
df['AISent'] = df['AISent'].map(labels61).fillna(-1)
#Industry
industry_map = {
    'Information Services, IT, Software Development, or other Technology': 'Tecnología y Servicios Digitales',
    'Other:': 'Otros Servicios', 
    'Healthcare': 'Salud y Educación',
    'Retail and Consumer Services': 'Otros Servicios',
    'Legal Services': 'Otros Servicios',
    'Higher Education': 'Salud y Educación',
    'Financial Services': 'Servicios Financieros',
    'Advertising Services': 'Otros Servicios',
    'Manufacturing, Transportation, or Supply Chain': 'Industria y Energía',
    'Insurance': 'Servicios Financieros',
    'Wholesale': 'Otros Servicios',
    'Oil & Gas': 'Industria y Energía'}
def map_industry(value):
    if pd.isna(value):
        return 'Desconocido'
    return industry_map.get(value, 'Otros Servicios')
most_frequent_industry = df['Industry'].map(industry_map).mode()[0]
df['Industry_Category'] = df['Industry'].map(industry_map).fillna(most_frequent_industry)
encoder = OneHotEncoder(sparse_output=False)
encoded = encoder.fit_transform(df[['Industry_Category']])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['Industry_Category']), index=df.index)
df = pd.concat([df, encoded_df], axis=1)
df = df.drop(['Industry_Category', 'Industry'], axis=1)
# Frequency_1, Frequency_2
labels88 = {
    '10+ times a week': 4, 
    '6-10 times a week': 3, 
    '3-5 times a week': 2,
    '1-2 times a week': 1, 
    'Never': 0, 
    'Other': -1
}
median_freq1 = df['Frequency_1'].map(labels88).median()
df['Frequency_1'] = df['Frequency_1'].map(labels88).fillna(median_freq1)
median_freq2 = df['Frequency_2'].map(labels88).median()
df['Frequency_2'] = df['Frequency_2'].map(labels88).fillna(median_freq2)

In [None]:

# Aplicar la función a cada columna y guardar el target_map
for column in columns_to_encode:
    df, target_map = process_multiple_categories(df, category_column=column, target_column='CompTotal')
    target_maps[column] = target_map  # Guardar el target_map en el diccionario


def process_and_encode(df, columns):
    for column in columns:
        # Separamos las categorías por el delimitador ";"
        df[column] = df[column].fillna('').str.split(';')

        # Creamos un objeto MultiLabelBinarizer
        mlb = MultiLabelBinarizer()

        # Aplicamos MultiLabelBinarizer a las categorías separadas
        encoded_values = mlb.fit_transform(df[column])

        # Creamos un DataFrame con los valores codificados
        encoded_df = pd.DataFrame(encoded_values, columns=[f"{column}_{c}" for c in mlb.classes_], index=df.index)

        # Añadimos los resultados al DataFrame copiado
        df = pd.concat([df, encoded_df], axis=1)

        # Eliminamos la columna original del DataFrame copiado
        df.drop(column, axis=1, inplace=True)

    return df


# DatabaseHaveWorkedWith, LanguageWantToWorkWith, LanguageHaveWorkedWith, ToolsTechHaveWorkedWith
columns_to_encode = ['DatabaseHaveWorkedWith', 'LanguageWantToWorkWith', 'LanguageHaveWorkedWith', 'ToolsTechHaveWorkedWith']
df = process_and_encode(df, columns_to_encode)

# EdLevel
labels5 = {
    'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)': 5,
    'Bachelor’s degree (B.A., B.S., B.Eng., etc.)': 4,
    'Professional degree (JD, MD, Ph.D, Ed.D, etc.)': 6,
    'Some college/university study without earning a degree': 2, 
    'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)': 1,
    'Associate degree (A.A., A.S., etc.)': 3, 
    'Something else': -1,
    'Primary/elementary school': 0}
df['EdLevel'] = df['EdLevel'].map(labels5).fillna(-1)
#Employment
df['Employment'] = df['Employment'].replace('Retired', 'I prefer not to say')
df['is_full_time'] = df['Employment'].str.contains('Employed, full-time').fillna(False).astype(int)
df['is_part_time'] = df['Employment'].str.contains('Employed, part-time').fillna(False).astype(int)
df['is_independent'] = df['Employment'].str.contains('Independent contractor, freelancer, or self-employed').fillna(False).astype(int)
df['num_jobs'] = df['Employment'].str.split(';').str.len().fillna(0).astype(int)
df['is_other_employment'] = ((df['is_full_time'] == 0) & (df['is_part_time'] == 0) &
                            (df['is_independent'] == 0)).astype(int)
df.drop('Employment', axis=1, inplace=True)
#AISent
df['AISent'] = df['AISent'].fillna('Unsure')
labels61 = {
    'Very favorable': 5, 
    'Favorable': 4, 
    'Indifferent': 3, 
    'Unfavorable': 2,
    'Very unfavorable': 1,
    'Unsure': 0}
df['AISent'] = df['AISent'].map(labels61).fillna(-1)
#Industry
industry_map = {
    'Information Services, IT, Software Development, or other Technology': 'Tecnología y Servicios Digitales',
    'Other:': 'Otros Servicios', 
    'Healthcare': 'Salud y Educación',
    'Retail and Consumer Services': 'Otros Servicios',
    'Legal Services': 'Otros Servicios',
    'Higher Education': 'Salud y Educación',
    'Financial Services': 'Servicios Financieros',
    'Advertising Services': 'Otros Servicios',
    'Manufacturing, Transportation, or Supply Chain': 'Industria y Energía',
    'Insurance': 'Servicios Financieros',
    'Wholesale': 'Otros Servicios',
    'Oil & Gas': 'Industria y Energía'}
def map_industry(value):
    if pd.isna(value):
        return 'Desconocido'
    return industry_map.get(value, 'Otros Servicios')
most_frequent_industry = df['Industry'].map(industry_map).mode()[0]
df['Industry_Category'] = df['Industry'].map(industry_map).fillna(most_frequent_industry)
encoder = OneHotEncoder(sparse_output=False)
encoded = encoder.fit_transform(df[['Industry_Category']])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['Industry_Category']), index=df.index)
df = pd.concat([df, encoded_df], axis=1)
df = df.drop(['Industry_Category', 'Industry'], axis=1)
# Frequency_1, Frequency_2
labels88 = {
    '10+ times a week': 4, 
    '6-10 times a week': 3, 
    '3-5 times a week': 2,
    '1-2 times a week': 1, 
    'Never': 0, 
    'Other': -1
}
median_freq1 = df['Frequency_1'].map(labels88).median()
df['Frequency_1'] = df['Frequency_1'].map(labels88).fillna(median_freq1)
median_freq2 = df['Frequency_2'].map(labels88).median()
df['Frequency_2'] = df['Frequency_2'].map(labels88).fillna(median_freq2)

In [72]:
target_map

{'Bootstrapping a business': 88874.97297297297,
 'Bootstrapping a business;Freelance/contract work': 106727.27272727272,
 'Bootstrapping a business;Professional development or self-paced learning from online courses': 95467.5,
 'Bootstrapping a business;Professional development or self-paced learning from online courses;Freelance/contract work': 79500.0,
 'Bootstrapping a business;Professional development or self-paced learning from online courses;School or academic work': 45000.0,
 'Bootstrapping a business;School or academic work': 140000.0,
 'Bootstrapping a business;School or academic work;Professional development or self-paced learning from online courses': 30000.0,
 'Contribute to open-source projects': 92454.54545454546,
 'Contribute to open-source projects;Bootstrapping a business': 65333.333333333336,
 'Contribute to open-source projects;Bootstrapping a business;Freelance/contract work': 56333.333333333336,
 'Contribute to open-source projects;Bootstrapping a business;Professi

In [70]:
def process_years_code(df, column_name):
    moda = df[column_name].mode()[0]
    df[column_name] = df[column_name].fillna(moda)
    df[column_name] = df[column_name].replace('Less than 1 year', 0)
    df[column_name] = df[column_name].astype(int)
    return df

In [None]:
# Mapeos y configuraciones
edlevel_mapping = {
    'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)': 5,
    'Bachelor’s degree (B.A., B.S., B.Eng., etc.)': 4,
    'Professional degree (JD, MD, Ph.D, Ed.D, etc.)': 6,
    'Some college/university study without earning a degree': 2,
    'Secondary school (e.g. American high school)': 1,
    'Associate degree (A.A., A.S., etc.)': 3,
    'Something else': -1,
    'Primary/elementary school': 0
}

frequency_mapping = {
    '10+ times a week': 4, 
    '6-10 times a week': 3, 
    '3-5 times a week': 2,
    '1-2 times a week': 1, 
    'Never': 0, 
    'Other': -1
}

industry_mapping = {
    'Information Services, IT, Software Development, or other Technology': 'Tecnología y Servicios Digitales',
    'Healthcare': 'Salud y Educación',
    'Retail and Consumer Services': 'Otros Servicios',
    'Legal Services': 'Otros Servicios',
    'Higher Education': 'Salud y Educación',
    'Financial Services': 'Servicios Financieros',
    'Manufacturing': 'Industria y Energía',
    'Insurance': 'Servicios Financieros',
    'Oil & Gas': 'Industria y Energía'
}

# Función para aplicar transformaciones
def process_new_samples(df):
    for col in df.columns:
        if col in ['YearsCodePro', 'YearsCode']:
            df = process_years_code(df, col)
        elif col in ['LearnCodeOnline', 'DevType', 'LearnCode', 'CodingActivities']:
            df, _ = process_multiple_categories(df, col, 'CompTotal')
        elif col in ['DatabaseHaveWorkedWith', 'LanguageWantToWorkWith', 'LanguageHaveWorkedWith', 'ToolsTechHaveWorkedWith']:
            df = process_and_encode(df, [col])
        elif col == 'EdLevel':
            df = process_ordinal_column(df, col, edlevel_mapping)
        elif col == 'Employment':
            df = process_employment(df)
        elif col == 'Industry':
            df = process_industry(df, industry_mapping)
        elif col in ['Frequency_1', 'Frequency_2']:
            df = process_frequency(df, col, frequency_mapping)
        elif col == 'AISent':
            df = process_aisent(df)
    return df

In [68]:
new_df = pd.DataFrame({
    'YearsCodePro': ['1'],
    'LearnCodeOnline': ['Recorded coding sessions;How-to videos;Written Tutorials;Click to write Choice 20;Stack Overflow;Interactive tutorial'],
    'DevType': ['Developer, front-end'], 
    'LearnCode': ['Online Courses or Certification;On the job training;Other online resources (e.g., videos, blogs, forum);School (i.e., University, College, etc)'], 
    'CodingActivities': ['Hobby;Contribute to open-source projects;Professional development or self-paced learning from online courses;Freelance/contract work'], 
    'DatabaseHaveWorkedWith': ['BigQuery;Elasticsearch;MariaDB;MySQL;PostgreSQL;Redis'], 
    'YearsCode': ['5'],
    'LanguageWantToWorkWith': ['C;HTML/CSS;JavaScript;Python;Rust;SQL;TypeScript'], 
    'LanguageHaveWorkedWith': ['C;Dart;Java;JavaScript;Python;SQL;TypeScript'], 
    'EdLevel':['Master’s degree (M.A., M.S., M.Eng., MBA, etc.)'],
    'Employment':['Employed, full-time;Independent contractor, freelancer, or self-employed'], 
    'ToolsTechHaveWorkedWith':['Docker;npm'], 
    'AISent': ['Favorable'], 
    'Industry':['Information Services, IT, Software Development, or other Technology'], 
    'Frequency_2':['6-10 times a week'], 
    'Frequency_1':['1-2 times a week']
})

In [None]:
# Aplicar al nuevo dataset
new_df = process_new_samples(new_df)

In [65]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
import xgboost as xgb

def process_multiple_categories(df, category_column, target_map=None, separator=','):
    # Validar que target_map sea un diccionario
    if not isinstance(target_map, dict):
        raise ValueError(f"Se esperaba un diccionario para target_map, pero se recibió {type(target_map)}.")
    
    df[category_column] = df[category_column].fillna('Unknown')
    df[category_column] = df[category_column].apply(lambda x: x.split(separator) if isinstance(x, str) else [x])

    def calculate_row_encoding(categories):
        # Asegurarse de que target_map sea accesible como un diccionario
        encoded_values = [target_map.get(cat, 0) for cat in categories]
        return sum(encoded_values) / len(encoded_values) if encoded_values else 0

    df[f'{category_column}_encoded'] = df[category_column].apply(calculate_row_encoding)
    return df.drop(category_column, axis=1)

# Función para codificar columnas con valores múltiples usando MultiLabelBinarizer
def process_and_encode(df, columns):
    for column in columns:
        df[column] = df[column].fillna('').str.split(';')
        mlb = MultiLabelBinarizer()
        encoded_values = mlb.fit_transform(df[column])
        encoded_df = pd.DataFrame(encoded_values, columns=[f"{column}_{c}" for c in mlb.classes_], index=df.index)
        df = pd.concat([df, encoded_df], axis=1)
        df.drop(column, axis=1, inplace=True)
    return df

# Función para imputar, transformar y codificar YearsCode y YearsCodePro
def process_years_code(df, column):
    moda = df[column].mode()[0]
    df[column] = df[column].fillna(moda).replace({'Less than 1 year': 0, 'More than 50 years': 50}).astype(int)
    return df

# Función para codificar columnas categóricas ordinales
def process_ordinal_column(df, column, mapping):
    df[column] = df[column].map(mapping).fillna(-1)
    return df

# Función para procesar Employment
def process_employment(df):
    df['is_full_time'] = df['Employment'].str.contains('Employed, full-time').fillna(False).astype(int)
    df['is_part_time'] = df['Employment'].str.contains('Employed, part-time').fillna(False).astype(int)
    df['is_independent'] = df['Employment'].str.contains('Independent contractor').fillna(False).astype(int)
    df['num_jobs'] = df['Employment'].str.split(';').str.len().fillna(0).astype(int)
    df['is_other_employment'] = ((df['is_full_time'] == 0) & (df['is_part_time'] == 0) & 
                                 (df['is_independent'] == 0)).astype(int)
    return df.drop('Employment', axis=1)

# Función para procesar Industry
def process_industry(df, industry_map):
    df['Industry_Category'] = df['Industry'].map(industry_map).fillna('Otros Servicios')
    encoder = OneHotEncoder(sparse_output=False)
    encoded = encoder.fit_transform(df[['Industry_Category']])
    encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['Industry_Category']), index=df.index)
    df = pd.concat([df, encoded_df], axis=1)
    return df.drop(['Industry_Category', 'Industry'], axis=1)

# Función para procesar frecuencia
def process_frequency(df, column, mapping):
    median_value = df[column].map(mapping).median()
    df[column] = df[column].map(mapping).fillna(median_value)
    return df

# Función para procesar AISent
def process_aisent(df):
    df['AISent'] = df['AISent'].fillna('Unsure')
    mapping = {
        'Very favorable': 5, 
        'Favorable': 4, 
        'Indifferent': 3, 
        'Unfavorable': 2,
        'Very unfavorable': 1,
        'Unsure': 0
    }
    df['AISent'] = df['AISent'].map(mapping).fillna(-1)
    return df

# Main Workflow
with open('../Pickles/data_2023.pickle', 'rb') as archivo:
    df1 = pickle.load(archivo)
with open('../Pickles/data_2024.pickle', 'rb') as archivo:
    df2 = pickle.load(archivo)

# Combinar los datasets
df = pd.concat([df1, df2], ignore_index=True, join='inner')

# Selección de columnas relevantes
df = df[['YearsCodePro', 'LearnCodeOnline', 'DevType', 'LearnCode', 'CodingActivities', 
         'DatabaseHaveWorkedWith', 'YearsCode', 'LanguageWantToWorkWith', 
         'LanguageHaveWorkedWith', 'EdLevel', 'Employment', 'ToolsTechHaveWorkedWith', 
         'AISent', 'Industry', 'Frequency_2', 'Frequency_1', 'CompTotal']]

# Procesar YearsCodePro y YearsCode
df = process_years_code(df, 'YearsCodePro')
df = process_years_code(df, 'YearsCode')

# Procesar columnas con múltiples categorías y realizar target encoding
target_columns = ['LearnCodeOnline', 'DevType', 'LearnCode', 'CodingActivities']
for col in target_columns:
    df, _ = process_multiple_categories(df, col, 'CompTotal')

# Procesar columnas con valores múltiples usando MultiLabelBinarizer
multi_label_columns = ['DatabaseHaveWorkedWith', 'LanguageWantToWorkWith', 'LanguageHaveWorkedWith', 'ToolsTechHaveWorkedWith']
df = process_and_encode(df, multi_label_columns)

# Procesar columnas ordinales
edlevel_mapping = {
    'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)': 5,
    'Bachelor’s degree (B.A., B.S., B.Eng., etc.)': 4,
    'Professional degree (JD, MD, Ph.D, Ed.D, etc.)': 6,
    'Some college/university study without earning a degree': 2,
    'Secondary school (e.g. American high school)': 1,
    'Associate degree (A.A., A.S., etc.)': 3,
    'Something else': -1,
    'Primary/elementary school': 0
}
df = process_ordinal_column(df, 'EdLevel', edlevel_mapping)

# Procesar Employment
df = process_employment(df)

# Procesar Industry
industry_mapping = {
    'Information Services, IT, Software Development, or other Technology': 'Tecnología y Servicios Digitales',
    'Healthcare': 'Salud y Educación',
    'Retail and Consumer Services': 'Otros Servicios',
    'Legal Services': 'Otros Servicios',
    'Higher Education': 'Salud y Educación',
    'Financial Services': 'Servicios Financieros',
    'Manufacturing': 'Industria y Energía',
    'Insurance': 'Servicios Financieros',
    'Oil & Gas': 'Industria y Energía'
}
df = process_industry(df, industry_mapping)

# Procesar AISent
df = process_aisent(df)

# Procesar frecuencia
frequency_mapping = {
    '10+ times a week': 4, 
    '6-10 times a week': 3, 
    '3-5 times a week': 2,
    '1-2 times a week': 1, 
    'Never': 0, 
    'Other': -1
}
df = process_frequency(df, 'Frequency_1', frequency_mapping)
df = process_frequency(df, 'Frequency_2', frequency_mapping)

ValueError: Se esperaba un diccionario para target_map, pero se recibió <class 'str'>.

In [None]:

# # Dividir los datos
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Entrenar el pipeline
# pipeline.fit(X_train, y_train)

# # Guardar el pipeline
# with open('pipeline_model.pkl', 'wb') as f:
#     pickle.dump(pipeline, f)

# print("Pipeline entrenado y guardado con éxito.")

In [46]:
new_df

Unnamed: 0,YearsCodePro,LearnCodeOnline,DevType,LearnCode,CodingActivities,DatabaseHaveWorkedWith,YearsCode,LanguageWantToWorkWith,LanguageHaveWorkedWith,EdLevel,Employment,ToolsTechHaveWorkedWith,AISent,Industry,Frequency_2,Frequency_1
0,1,Recorded coding sessions;How-to videos;Written...,"Developer, front-end",Online Courses or Certification;On the job tra...,Hobby;Contribute to open-source projects;Profe...,BigQuery;Elasticsearch;MariaDB;MySQL;PostgreSQ...,5,C;HTML/CSS;JavaScript;Python;Rust;SQL;TypeScript,C;Dart;Java;JavaScript;Python;SQL;TypeScript,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","Employed, full-time;Independent contractor, fr...",Docker;npm,Favorable,"Information Services, IT, Software Development...",6-10 times a week,1-2 times a week


In [28]:
# Filtrar el DataFrame según límites razonables de la variable objetivo (CompTotal)
limite_inferior = 18000
limite_superior = 55000
df_filtrado = df[(df['CompTotal'] >= limite_inferior) & (df['CompTotal'] <= limite_superior)]
df = df_filtrado

# Aplicar una transformación logarítmica a la variable objetivo para reducir la variabilidad
df['CompTotal'] = np.log1p(df['CompTotal'])

# Separar la variable objetivo (y) del resto de las características
y = df['CompTotal']
df = df.drop(columns=['CompTotal'], axis=1)

# Seleccionar las columnas más relevantes para el modelo
df = df[['YearsCodePro', 'LearnCodeOnline_encoded', 'DevType_encoded', 'LearnCode_encoded', 'CodingActivities_encoded', 
         'DatabaseHaveWorkedWith_Redis', 'ToolsTechHaveWorkedWith_Docker', 'ToolsTechHaveWorkedWith_Ant', 'YearsCode', 
         'ToolsTechHaveWorkedWith_Homebrew', 'ToolsTechHaveWorkedWith_Kubernetes', 'LanguageWantToWorkWith_GDScript', 
         'LanguageHaveWorkedWith_PHP', 'DatabaseHaveWorkedWith_IBM DB2', 'DatabaseHaveWorkedWith_MySQL', 'EdLevel', 
         'LanguageWantToWorkWith_OCaml', 'is_full_time', 'ToolsTechHaveWorkedWith_Google Test', 'LanguageWantToWorkWith_Kotlin', 
         'ToolsTechHaveWorkedWith_APT', 'LanguageWantToWorkWith_PHP', 'AISent', 'LanguageWantToWorkWith_Lua', 
         'LanguageHaveWorkedWith_Kotlin', 'DatabaseHaveWorkedWith_PostgreSQL', 'Industry_Category_Salud y Educación', 
         'LanguageHaveWorkedWith_Ruby', 'LanguageWantToWorkWith_Visual Basic (.Net)', 'DatabaseHaveWorkedWith_Oracle', 
         'LanguageWantToWorkWith_Swift', 'Frequency_2', 'Frequency_1', 'LanguageHaveWorkedWith_MATLAB', 'LanguageWantToWorkWith_C']]

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

# Escalar los datos para normalizar las características
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Definir los modelos base
random_forest = RandomForestRegressor(random_state=42)
xgb_reg = xgb.XGBRegressor(random_state=42)
gb_model = GradientBoostingRegressor(random_state=42)

# Crear un modelo Voting Regressor con pesos ajustados
weights = [2.8, 0.6, 1.5]  # Pesos asignados a los modelos base
voting_regressor = VotingRegressor(
    estimators=[
        ('rf', random_forest), 
        ('xgb', xgb_reg), 
        ('gb', gb_model)
    ],
    weights=weights
)

# Ajustar el Voting Regressor al conjunto de entrenamiento
voting_regressor.fit(X_train, y_train)

# Validación cruzada con 5 folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores, mse_scores, mae_scores, mape_scores = [], [], [], []

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Entrenar el modelo en el fold
    voting_regressor.fit(X_train_fold, y_train_fold)

    # Predecir en el conjunto de validación
    y_val_pred_log = voting_regressor.predict(X_val_fold)
    y_val_pred = np.expm1(y_val_pred_log)
    y_val_actual = np.expm1(y_val_fold)

    # Calcular métricas de validación
    r2_scores.append(r2_score(y_val_actual, y_val_pred))
    mse_scores.append(mean_squared_error(y_val_actual, y_val_pred))
    mae_scores.append(mean_absolute_error(y_val_actual, y_val_pred))
    mape_scores.append(np.mean(np.abs((y_val_actual - y_val_pred) / y_val_actual)) * 100)

# Calcular métricas promedio de validación cruzada
mean_r2_cv = np.mean(r2_scores)
mean_mse_cv = np.mean(mse_scores)
mean_rmse_cv = np.sqrt(mean_mse_cv)
mean_mae_cv = np.mean(mae_scores)
mean_mape_cv = np.mean(mape_scores)

print("=== Métricas de Cross-Validation ===")
print(f"R² (promedio CV): {mean_r2_cv:.4f}")
print(f"MSE (promedio CV): {mean_mse_cv:.4f}")
print(f"RMSE (promedio CV): {mean_rmse_cv:.4f}")
print(f"MAE (promedio CV): {mean_mae_cv:.4f}")
print(f"MAPE (promedio CV): {mean_mape_cv:.4f}%")

# Predicciones en el conjunto de prueba
y_test_pred_log = voting_regressor.predict(X_test)
y_test_pred = np.expm1(y_test_pred_log)
y_test_actual = np.expm1(y_test)

# Calcular métricas en el conjunto de prueba
r2_test = r2_score(y_test_actual, y_test_pred)
mse_test = mean_squared_error(y_test_actual, y_test_pred)
rmse_test = np.sqrt(mse_test)
mae_test = mean_absolute_error(y_test_actual, y_test_pred)
mape_test = np.mean(np.abs((y_test_actual - y_test_pred) / y_test_actual)) * 100

print("=== Métricas de Test ===")
print(f"R² (Test): {r2_test:.4f}")
print(f"MSE (Test): {mse_test:.4f}")
print(f"RMSE (Test): {rmse_test:.4f}")
print(f"MAE (Test): {mae_test:.4f}")
print(f"MAPE (Test): {mape_test:.4f}%")

=== Métricas de Cross-Validation ===
R² (promedio CV): 0.4924
MSE (promedio CV): 52655555.1991
RMSE (promedio CV): 7256.4148
MAE (promedio CV): 5519.3986
MAPE (promedio CV): 15.7585%
=== Métricas de Test ===
R² (Test): 0.5304
MSE (Test): 47033318.5395
RMSE (Test): 6858.0842
MAE (Test): 5229.4953
MAPE (Test): 15.1307%
