# Model Comparison

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Importar los resultados guardados en formato CSV
mlr_results = pd.read_csv('mlr_results.csv')
ridge_results = pd.read_csv('ridge_results.csv')
lasso_results = pd.read_csv('lasso_results.csv')
elasticnet_results = pd.read_csv('elasticnet_results.csv')
rf_results = pd.read_csv('rf_results.csv')
adaboost_results = pd.read_csv('adaboost_results.csv')
xgb_results = pd.read_csv('xgb_results.csv')
svr_results = pd.read_csv('svr_results.csv')
knn_results = pd.read_csv('knn_results.csv')
mlp_results = pd.read_csv('mlp_results.csv')

# Crear un DataFrame con los resultados
results_df = pd.DataFrame({
    'Model': ['MLR', 'Ridge', 'Lasso', 'ElasticNet', 'Random Forest', 'AdaBoost', 'XGBoost', 'SVR', 'KNN', 'MLP'],
    
    'R2': [mlr_results['R2'].values[0], ridge_results['R2'].values[0], lasso_results['R2'].values[0], elasticnet_results['R2'].values[0], 
           rf_results['R2'].values[0], adaboost_results['R2'].values[0], xgb_results['R2'].values[0], svr_results['R2'].values[0], 
           knn_results['R2'].values[0], mlp_results['R2'].values[0]],
    
    'RMSE': [mlr_results['RMSE'].values[0], ridge_results['RMSE'].values[0], lasso_results['RMSE'].values[0], elasticnet_results['RMSE'].values[0], 
             rf_results['RMSE'].values[0], adaboost_results['RMSE'].values[0], xgb_results['RMSE'].values[0], svr_results['RMSE'].values[0], 
             knn_results['RMSE'].values[0], mlp_results['RMSE'].values[0]],
    
    'MAE': [mlr_results['MAE'].values[0], ridge_results['MAE'].values[0], lasso_results['MAE'].values[0], elasticnet_results['MAE'].values[0], 
            rf_results['MAE'].values[0], adaboost_results['MAE'].values[0], xgb_results['MAE'].values[0], svr_results['MAE'].values[0], 
            knn_results['MAE'].values[0], mlp_results['MAE'].values[0]]
})


FileNotFoundError: [Errno 2] No such file or directory: 'adaboost_results.csv'

In [None]:
results_df

Unnamed: 0,Model,R2,RMSE,MAE
0,MLR,-5.650413e+27,4.22247e+18,3.943971e+17
1,Ridge,0.3739627,49180.79,33406.17
2,Lasso,0.3891542,48580.15,32801.76
3,ElasticNet,0.390006,48545.97,32845.87
4,Random Forest,0.3738945,49592.2,33364.49
5,AdaBoost,-0.01213525,62794.92,52973.72
6,XGBoost,0.3279544,51395.84,34663.72
7,SVR,-0.07020983,64812.09,45460.8
8,KNN,0.1244763,58617.12,41800.96
9,MLP,0.3537812,50363.38,34391.49


In [None]:
results_df = results_df.round(3)
formatted_results_df = results_df.applymap(lambda x: "{:.3f}".format(x) if isinstance(x, (int, float)) else x)
best_r2 = formatted_results_df.sort_values(by=['R2'], ascending=False).reset_index(drop=True)
best_r2

Unnamed: 0,Model,R2,RMSE,MAE
0,ElasticNet,0.39,48545.975,32845.867
1,Lasso,0.389,48580.148,32801.765
2,Ridge,0.374,49180.786,33406.166
3,Random Forest,0.374,49592.199,33364.492
4,MLP,0.354,50363.381,34391.49
5,XGBoost,0.328,51395.839,34663.72
6,KNN,0.124,58617.117,41800.956
7,MLR,-5.650413461605654e+27,4.222470028657365e+18,3.943970612292197e+17
8,SVR,-0.07,64812.089,45460.8
9,AdaBoost,-0.012,62794.921,52973.719


In [None]:
df = pd.read_csv('clean_and_encoded_df.csv')

# Separar la variable objetivo (Annual_salary) de las variables independientes
X = df.drop('Annual_salary', axis=1)
y = df['Annual_salary']

# Calcular la media y la mediana de los salarios
mean_salary = y.mean()
median_salary = y.median()

# Crear predicciones constantes usando la media y la mediana de los salarios
mean_predictions = np.full(y.shape, mean_salary)
median_predictions = np.full(y.shape, median_salary)

# Calcular MAE y RMSE para el modelo de referencia utilizando la media
mean_mae = mean_absolute_error(y, mean_predictions)
mean_rmse = np.sqrt(mean_squared_error(y, mean_predictions))

# Calcular MAE y RMSE para el modelo de referencia utilizando la mediana
median_mae = mean_absolute_error(y, median_predictions)
median_rmse = np.sqrt(mean_squared_error(y, median_predictions))

print("Modelo de referencia usando la media:")
print(f"MAE: {mean_mae:.3f}")
print(f"RMSE: {mean_rmse:.3f}")

print("\nModelo de referencia usando la mediana:")
print(f"MAE: {median_mae:.3f}")
print(f"RMSE: {median_rmse:.3f}")


Modelo de referencia usando la media:
MAE: 47160.63
RMSE: 62143.18

Modelo de referencia usando la mediana:
MAE: 45208.09
RMSE: 64283.82


In [None]:
# Crear un DataFrame con los resultados de los modelos de referencia
reference_results_df = pd.DataFrame({
    'Model': ['Mean', 'Median'],
    'R2': [np.nan, np.nan],  # R2 no es aplicable en este caso
    'RMSE': [mean_rmse, median_rmse],
    'MAE': [mean_mae, median_mae]
})

# Combinar los resultados de los modelos de referencia con los resultados originales
all_results_df = pd.concat([results_df, reference_results_df], ignore_index=True)

# Agregar columnas adicionales para calcular las diferencias entre los valores de RMSE y MAE
results_df['VS Mean RMSE'] = results_df['RMSE'] - mean_rmse
results_df['VS Median RMSE'] = results_df['RMSE'] - median_rmse
results_df['VS Mean MAE'] = results_df['MAE'] - mean_mae
results_df['VS Median MAE'] = results_df['MAE'] - median_mae

# Redondear los valores y aplicar el formato deseado
formatted_results_df = results_df.round(3).applymap(lambda x: "{:.3f}".format(x) if isinstance(x, (int, float)) else x)

# Ordenar el DataFrame por RMSE y MAE
best_r2 = formatted_results_df.sort_values(by=['R2'], ascending=False).reset_index(drop=True)

# Mostrar el DataFrame ordenado por RMSE y MAE
best_r2

Unnamed: 0,Model,R2,RMSE,MAE,VS Mean RMSE,VS Median RMSE,VS Mean MAE,VS Median MAE
0,ElasticNet,0.39,48545.975,32845.867,-13597.206,-15737.849,-14314.761,-12362.224
1,Lasso,0.389,48580.148,32801.765,-13563.033,-15703.676,-14358.863,-12406.326
2,Ridge,0.374,49180.786,33406.166,-12962.395,-15103.038,-13754.462,-11801.925
3,Random Forest,0.374,49592.199,33364.492,-12550.982,-14691.625,-13796.136,-11843.599
4,MLP,0.354,50363.381,34391.49,-11779.8,-13920.443,-12769.138,-10816.601
5,XGBoost,0.328,51395.839,34663.72,-10747.342,-12887.985,-12496.908,-10544.371
6,KNN,0.124,58617.117,41800.956,-3526.064,-5666.707,-5359.672,-3407.135
7,MLR,-5.650413461605654e+27,4.222470028657365e+18,3.943970612292197e+17,4.222470028657303e+18,4.222470028657301e+18,3.943970612291725e+17,3.9439706122917446e+17
8,SVR,-0.07,64812.089,45460.8,2668.908,528.265,-1699.828,252.709
9,AdaBoost,-0.012,62794.921,52973.719,651.74,-1488.903,5813.091,7765.628


## Conclusions

In certain types of data or situations, the algorithms of Multiple Linear Regression, SVR (Support Vector Regression), and AdaBoost might not perform well. Below are some situations where these algorithms may not be the best choice:

**Multiple Linear Regression:**

- _Non-linear relationships:_ If the relationships between the independent variables and the dependent variable are non-linear, multiple linear regression might not perform well. In this case, it would be better to consider non-linear regression algorithms.

- _Multicollinearity:_ If there is a high correlation between some of the independent variables, this can negatively affect the accuracy of the coefficient estimates and the generalization ability of the model.

- _Outliers:_ Multiple linear regression is sensitive to outliers in the data. These atypical points can have a disproportionate impact on the estimated coefficients and the model's predictions.

**SVR (Support Vector Regression):**

- _Feature scaling:_ SVR is sensitive to the scale of the features. If the features have very different ranges, the model's performance may be negatively affected. In this case, it would be necessary to scale the features before training the model.

- _Large datasets:_ Training SVR models on very large datasets can be computationally expensive. Depending on the algorithm's implementation and configuration, SVR might not be the best choice for problems with a large amount of data.

- _Noise in the data:_ If the data has a lot of noise, SVR might not perform well. The algorithm tries to fit the maximum margin, which could be difficult in noisy data.

**AdaBoost:**

- _Noise in the data and outliers:_ AdaBoost is sensitive to noise and outliers in the data. This is because the algorithm assigns more weight to misclassified observations in each iteration, which could lead to overfitting in the presence of noise and outliers.

- _Complex base estimators:_ If a base estimator that is too complex and performs well on its own is used, AdaBoost might not significantly improve the performance. In this case, it would be better to consider simpler base estimators or try other ensemble algorithms.

- _Training time:_ Although AdaBoost works sequentially, which can increase the training time compared to other ensemble algorithms that can be trained in parallel, such as Random Forest.