In [1]:
import os
import pickle

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from catboost import CatBoostRegressor, Pool, cv
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [124]:
# Load the DataFrame stored in the pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
ruta_pickle = '../Pickles/df_final.pickle'
with open(ruta_pickle, 'rb') as archivo:
    df = pickle.load(archivo)

In [125]:
# Feature names: every column label except 'CompTotal'.
nombres = df.columns.drop('CompTotal')

In [126]:
# True if at least one cell in the DataFrame is missing.
df.isna().to_numpy().any()

False

In [127]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [128]:
# Keep only the rows whose CompTotal lies inside the closed interval
# [limite_inferior, limite_superior].
limite_inferior, limite_superior = 15000, 100000

dentro_de_rango = df['CompTotal'].between(limite_inferior, limite_superior)
df_filtrado = df[dentro_de_rango]

print(f"Tamaño del DataFrame original: {df.shape}")
print(f"Tamaño del DataFrame filtrado: {df_filtrado.shape}")

Tamaño del DataFrame original: (1934, 397)
Tamaño del DataFrame filtrado: (1769, 397)


In [129]:
# From here on `df` names the filtered DataFrame (a rebinding, not a copy).
df = df_filtrado

In [130]:
# Target vector: the 'CompTotal' column.
y = df.loc[:, 'CompTotal']

In [131]:
# Remove the target from the feature matrix (`columns=` already implies the column axis).
df = df.drop(columns='CompTotal')

In [10]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible. `train_test_split` comes from the imports cell at the top of
# the notebook, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=42
)

In [11]:
# Standardize the features. The scaler is fitted on the training split only
# and the same transform is then applied to the test split.
# `StandardScaler` comes from the imports cell at the top of the notebook,
# so it is not re-imported here.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
# Sanity check: print the shape of each train/test object, one per line.
for particion in (X_train, X_test, y_train, y_test):
    print(particion.shape)

(1415, 396)
(354, 396)
(1415,)
(354,)


In [13]:
# CatBoost hyperparameters, kept in a single dict so that later cells can
# build models with exactly the same configuration.
best_params = {
    'colsample_bylevel': 0.7,
    'depth': 7,
    'grow_policy': 'Depthwise',
    'iterations': 300,
    'l2_leaf_reg': 7,
    'learning_rate': 0.05,
    'max_bin': 128,
    'min_child_samples': 1,
    'subsample': 0.6,
    'random_seed': 42,
    'loss_function': 'RMSE',
    'verbose': 0,
}

# Build the CatBoost regressor and fit it on the training split.
catboost_model = CatBoostRegressor(**best_params)
catboost_model.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = catboost_model.predict(X_test)

# --- Model evaluation ---
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# MAPE expressed as a percentage: mean of |error / actual| times 100.
errores_relativos = np.abs((y_test - y_pred) / y_test)
mape = np.mean(errores_relativos) * 100

print(
    f"MSE: {mse:.2f}",
    f"RMSE: {rmse:.2f}",
    f"MAE: {mae:.2f}",
    f"R²: {r2:.2f}",
    f"MAPE: {mape:.2f}",
    sep="\n",
)

MSE: 136235543.56
RMSE: 11672.00
MAE: 8917.17
R²: 0.70
MAPE: 20.35


In [14]:
# --- Feature importance ---
# Number of top-ranked features to keep. This used to be a bare `50` next to
# a comment that talked about "the top 10"; it is now a single named constant.
N_TOP_FEATURES = 50

feature_importances = catboost_model.get_feature_importance()

# Pair every importance score with its column name (taken from `nombres`)
# and rank the features from most to least important.
importance_df = pd.DataFrame({
    'Feature': nombres,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Names of the N_TOP_FEATURES most important features.
top_features = importance_df['Feature'].head(N_TOP_FEATURES).tolist()

# Wrap the train/test matrices in DataFrames labelled with the original
# column names so that columns can be selected by name below.
X_train = pd.DataFrame(X_train, columns=nombres)
X_test = pd.DataFrame(X_test, columns=nombres)

# Restrict both splits to the selected features.
X_train_top = X_train.loc[:, top_features]
X_test_top = X_test.loc[:, top_features]

print(X_train_top.shape)
print(X_test_top.shape)
print(y_train.shape)
print(y_test.shape)

(1415, 50)
(354, 50)
(1415,)
(354,)


In [118]:
# Names of the 150 lowest-ranked features (the bottom rows of `importance_df`).
importance_df['Feature'].tail(150).to_numpy()

array(['LanguageWantToWorkWith_Julia', 'WebframeHaveWorkedWith_Svelte',
       'LanguageHaveWorkedWith_Assembly',
       'OfficeStackAsyncHaveWorkedWith_Microsoft Planner',
       'LanguageHaveWorkedWith_Swift',
       'NEWCollabToolsHaveWorkedWith_Xcode',
       'MiscTechHaveWorkedWith_Cordova', 'LanguageHaveWorkedWith_Perl',
       'PlatformHaveWorkedWith_OVH', 'LanguageHaveWorkedWith_R',
       'LanguageWantToWorkWith_PowerShell',
       'NEWCollabToolsHaveWorkedWith_Emacs',
       'LanguageWantToWorkWith_GDScript',
       'ToolsTechHaveWorkedWith_Unity 3D',
       'WebframeHaveWorkedWith_CodeIgniter',
       'PlatformHaveWorkedWith_Netlify', 'LanguageHaveWorkedWith_MATLAB',
       'NEWCollabToolsHaveWorkedWith_DataGrip',
       'MiscTechHaveWorkedWith_Tauri',
       'MiscTechHaveWorkedWith_Torch/PyTorch',
       'LanguageWantToWorkWith_Fortran', 'LanguageHaveWorkedWith_Dart',
       'WebframeHaveWorkedWith_AngularJS',
       'LanguageWantToWorkWith_Groovy',
       'ToolsTechHaveWor

In [119]:
# Drop the N_DROP least important features.
# `importance_df` is sorted by descending importance, so `.tail(N_DROP)`
# yields the weakest features. Deriving the list from `importance_df`
# replaces a 150-name literal that had been pasted from the previous cell's
# output and had to be re-pasted by hand for every experiment.
N_DROP = 150  # 50, 100 and 150 were tried; see the results log below
columnas_poco_importantes = importance_df['Feature'].tail(N_DROP).tolist()
df = df.drop(columns=columnas_poco_importantes)

In [120]:
# Split the reduced feature set again: 20% test rows, seed 42.
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=42
)

In [121]:
# Standardize the reduced feature set. The scaler is fitted on the training
# split only and the same transform is then applied to the test split.
# `StandardScaler` comes from the imports cell at the top of the notebook,
# so it is not re-imported here.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [122]:
# Sanity check: print the shape of each train/test object, one per line.
for particion in (X_train, X_test, y_train, y_test):
    print(particion.shape)

(1415, 246)
(354, 246)
(1415,)
(354,)


In [123]:
# Build a fresh CatBoost regressor with the same hyperparameters and fit it
# on the current training split.
catboost_model = CatBoostRegressor(**best_params)
catboost_model.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = catboost_model.predict(X_test)

# --- Model evaluation ---
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# MAPE expressed as a percentage: mean of |error / actual| times 100.
errores_relativos = np.abs((y_test - y_pred) / y_test)
mape = np.mean(errores_relativos) * 100

print(
    f"MSE: {mse:.2f}",
    f"RMSE: {rmse:.2f}",
    f"MAE: {mae:.2f}",
    f"R²: {r2:.2f}",
    f"MAPE: {mape:.2f}",
    sep="\n",
)

MSE: 144458104.06
RMSE: 12019.07
MAE: 9266.24
R²: 0.68
MAPE: 21.13


Pruebo eliminando columnas: 
- LanguageWantToWorkWith_MATLAB: 
    - MSE: 139936185.68
    - RMSE: 11829.46
    - MAE: 9002.46
    - R²: 0.69
    - MAPE: 20.49
- LanguageWantToWorkWith_MATLAB y ToolsTechHaveWorkedWith_Chef:
    - MSE: 139768834.69
    - RMSE: 11822.39
    - MAE: 8958.95
    - R²: 0.69
    - MAPE: 20.43
- ToolsTechHaveWorkedWith_Chef:
    - MSE: 143268961.64
    - RMSE: 11969.50
    - MAE: 9114.46
    - R²: 0.68
    - MAPE: 20.90
- LanguageWantToWorkWith_MATLAB, ToolsTechHaveWorkedWith_Chef y OfficeStackAsyncHaveWorkedWith_Linear:
    - MSE: 136320556.08
    - RMSE: 11675.64
    - MAE: 8846.72
    - R²: 0.70
    - MAPE: 20.13
- LanguageWantToWorkWith_MATLAB, ToolsTechHaveWorkedWith_Chef, OfficeStackAsyncHaveWorkedWith_Linear y LanguageWantToWorkWith_Perl:
    - MSE: 136416563.51
    - RMSE: 11679.75
    - MAE: 8866.41
    - R²: 0.70
    - MAPE: 20.34
- drop.tail(50)
    - MSE: 134018285.27
    - RMSE: 11576.63
    - MAE: 8902.83
    - R²: 0.70
    - MAPE: 20.17
- drop.tail(100)
    - MSE: 141070392.49
    - RMSE: 11877.31
    - MAE: 9017.42
    - R²: 0.69
    - MAPE: 20.65
- drop.tail(150)
    - MSE: 144458104.06
    - RMSE: 12019.07
    - MAE: 9266.24
    - R²: 0.68
    - MAPE: 21.13

Pruebo con PCA (100 componentes), pero no mejora: el R² baja de 0.70 a 0.47. 

In [133]:
# PCA experiment: project the features onto principal components and retrain
# CatBoost on the projection. `PCA` comes from the imports cell at the top of
# the notebook.

# --- Scale the data (PCA is sensitive to feature scale) ---
# NOTE(review): when the notebook runs top to bottom, X_train/X_test have
# already been standardized by an earlier cell, so this pass is probably
# close to a no-op - confirm and drop it if so.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Apply PCA ---
n_components = 100  # number of principal components to keep
# random_state pins the randomized SVD solver that PCA's default 'auto' mode
# can select for data of this size; without it the components (and every
# metric below) may change from run to run.
pca = PCA(n_components=n_components, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Cumulative share of variance retained by the kept components.
print(f"Varianza explicada acumulada por {n_components} componentes: {sum(pca.explained_variance_ratio_):.2f}")

# --- Build the CatBoost model ---
# Reuse the `best_params` dict defined with the baseline model instead of
# keeping a second copy that could drift, so the comparison is like for like.
catboost_model = CatBoostRegressor(**best_params)

# Fit on the PCA-reduced training features.
catboost_model.fit(X_train_pca, y_train)

# Predict on the PCA-reduced test features.
y_pred = catboost_model.predict(X_test_pca)

# --- Model evaluation ---
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# MAPE expressed as a percentage: mean of |error / actual| times 100.
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R²: {r2:.2f}")
print(f"MAPE: {mape:.2f}")

Varianza explicada acumulada por 100 componentes: 0.73
MSE: 241490568.94
RMSE: 15539.97
MAE: 11983.64
R²: 0.47
MAPE: 27.46
