# Cuanto ganara una persona al mes segun el ordenador prediga

In [49]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib

In [32]:
# Cargar el dataset
df = pd.read_csv('../data/raw/adult.data',  header=None, names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'], na_values=' ?', skipinitialspace=True)


In [33]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [34]:
df.shape

(32561, 15)

In [35]:
# Contar los valores nulos después del reemplazo
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

## Tranformamos las Variables Categoricas a Numericas

#### Trasformamos la variable objetivo "Income" en variable binaria para luego poder predecirla mejor.

In [36]:
df['income'] = df['income'].apply(lambda x: 1 if x == '>50K' else 0)

In [37]:
df = df.dropna()
df = df.drop(columns=['fnlwgt', 'education'])
df_encoded = pd.get_dummies(df, drop_first=True)

In [38]:
# Separar X e y
X = df_encoded.drop('income', axis=1)
y = df_encoded['income']

In [39]:
# Normalizar variables numéricas
numerical_cols = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

## Contruccion y Entrenamiento del Modelo de Clasificacion

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=18
)

In [41]:
# Inicializar el modelo
model = RandomForestClassifier(n_estimators=100, random_state=18)

In [42]:
# Entrenar el modelo
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Analisis de la Importancia de las Caracteristicas

In [43]:
# Obtener la importancia de las características
feature_importances = pd.Series(model.feature_importances_, index=X.columns)

# Ordenar las importancias y mostrar las 15 más importantes
top_features = feature_importances.nlargest(15)
print("Top 15 Feature Importances:\n", top_features)

Top 15 Feature Importances:
 age                                  0.226419
education-num                        0.130103
capital-gain                         0.114953
hours-per-week                       0.110884
marital-status_Married-civ-spouse    0.082953
capital-loss                         0.037139
marital-status_Never-married         0.034427
occupation_Exec-managerial           0.022712
relationship_Not-in-family           0.020593
sex_Male                             0.019935
occupation_Prof-specialty            0.019485
relationship_Own-child               0.013320
workclass_Private                    0.010666
relationship_Unmarried               0.008956
workclass_Self-emp-not-inc           0.008856
dtype: float64


# Construccion del Sistema de recomendacion

In [46]:
# Seleccionar un perfil para la prueba (el primer perfil con ingreso <=50K)
test_user_index = y_test[y_test == 0].index[0]
user_profile = X_test.loc[test_user_index].copy()
initial_prediction_proba = model.predict_proba(user_profile.values.reshape(1, -1))[0][1]
print(f"Probabilidad inicial de superar los $50K: {initial_prediction_proba:.2%}")

Probabilidad inicial de superar los $50K: 0.00%




### Tuve que utilizar una funcion pues no me dejaba hacer el proyecto con Shap. Con esta funcion cree la recomendacion, se puede ampliar y crear mas recomendaciones, completarla con mas clases y mas subclases pero la deje sencillita.

In [47]:
def get_recommendations(user_profile, model, X_columns, top_features_list):
    recommendations = {}
    initial_prob = model.predict_proba(user_profile.values.reshape(1, -1))[0][1]


    # Simular un aumento de horas de trabajo
    simulated_profile_hours = user_profile.copy()
    original_hours_normalized = simulated_profile_hours[numerical_cols[4]]
    simulated_profile_hours[numerical_cols[4]] = original_hours_normalized + 0.5
    new_prob_hours = model.predict_proba(simulated_profile_hours.values.reshape(1, -1))[0][1]
    recommendations['Aumentar horas trabajadas'] = new_prob_hours - initial_prob


    return recommendations


### Comprobamos la prediccion del modelo

In [48]:
# Hacemos el perfil de cada persona
single_person_profile = X_test.iloc[[0]]

# Predecimos la clase
predicted_class = model.predict(single_person_profile)

# Predecimos las probabilidades de cada clase
probabilities = model.predict_proba(single_person_profile)

print("--- Demostración de la Predicción del Modelo ---")
print("Perfil del individuo (normalizado):")
print(single_person_profile)
print("\nEl modelo predice que esta persona ganará:")
if predicted_class[0] == 1:
    print("> 50,000 USD al año")
else:
    print("<= 50,000 USD al año")

print(f"\nProbabilidad de que gane <= 50K: {probabilities[0][0]:.2%}")
print(f"Probabilidad de que gane > 50K:  {probabilities[0][1]:.2%}")

--- Demostración de la Predicción del Modelo ---
Perfil del individuo (normalizado):
            age  education-num  capital-gain  capital-loss  hours-per-week  \
27865 -0.042642       0.746039      -0.14592      -0.21666       -0.035429   

       workclass_Federal-gov  workclass_Local-gov  workclass_Never-worked  \
27865                  False                False                   False   

       workclass_Private  workclass_Self-emp-inc  ...  \
27865               True                   False  ...   

       native-country_Portugal  native-country_Puerto-Rico  \
27865                    False                       False   

       native-country_Scotland  native-country_South  native-country_Taiwan  \
27865                    False                 False                  False   

       native-country_Thailand  native-country_Trinadad&Tobago  \
27865                    False                           False   

       native-country_United-States  native-country_Vietnam  \
27865   

## Guardamos los modelos y sus X_train y X_test

In [51]:
# Guardar el modelo entrenado
# joblib.dump es ideal para modelos de scikit-learn
joblib.dump(model, '../models/random_forest_model.joblib')

# Guardar el scaler, que es crucial para normalizar nuevos datos para la predicción
joblib.dump(scaler, '../models/scaler.joblib')

# Usamos .to_pickle() para guardar el DataFrame.
df_encoded.to_pickle('../data/processed/df_encoded.pkl')

# Guardar los conjuntos de datos de entrenamiento y prueba
# .to_pickle() es un método de pandas para guardar DataFrames y Series
X_train.to_pickle('../data/processed/X_train.pkl')
X_test.to_pickle('../data/processed/X_test.pkl')
y_train.to_pickle('../data/processed/y_train.pkl')
y_test.to_pickle('../data/processed/y_test.pkl')

print("Todos los modelos y DataFrames han sido guardados en los archivos correspondientes.")

Todos los modelos y DataFrames han sido guardados en los archivos correspondientes.
