# Explore here

In [52]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD

import warnings

In [53]:
df = pd.read_csv('https://breathecode.herokuapp.com/asset/internal-link?id=2326&path=adult-census-income.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [54]:
df.shape

(32561, 15)

In [55]:
df.info()
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [56]:
duplicados = df.duplicated()
num_duplicados = duplicados.sum()
print(num_duplicados)

24


Dado que hay un n√∫mero de duplicados muy bajo respecto del total de datos, estos van a ser eliminados del dataset.

In [57]:
df = df.drop_duplicates(keep='first')
df.shape

(32537, 15)

In [58]:
#Tratar valors faltantes
df = df.replace("?", np.nan)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [59]:
#Valores faltantes
df.isnull().sum().sort_values(ascending=False)

occupation        1843
workclass         1836
native.country     582
fnlwgt               0
education            0
education.num        0
age                  0
marital.status       0
relationship         0
sex                  0
race                 0
capital.gain         0
capital.loss         0
hours.per.week       0
income               0
dtype: int64

Dado que tenemos tres variables categ√≥ricas con valores faltantes, se las va a tratar con una imputaci√≥n para no intentar perder la m√≠nima informaci√≥n.

In [60]:
df_clean = df.copy()

# Contar valores antes
workclass_antes = df_clean['workclass'].isnull().sum()
occupation_antes = df_clean['occupation'].isnull().sum()
country_antes = df_clean['native.country'].isnull().sum()

# Imputar todos a la vez sin warnings
df_clean = df_clean.fillna({
    'workclass': 'Unknown',
    'occupation': 'Unknown', 
    'native.country': df_clean['native.country'].mode()[0]
})

print(f"‚úÖ Valores imputados:")
print(f"   - workclass: {workclass_antes} ‚Üí 'Unknown'")
print(f"   - occupation: {occupation_antes} ‚Üí 'Unknown'") 
print(f"   - native.country: {country_antes} ‚Üí '{df_clean['native.country'].mode()[0]}'")
print(f"üìä Filas preservadas: {len(df_clean):,} (100% de los datos)")

‚úÖ Valores imputados:
   - workclass: 1836 ‚Üí 'Unknown'
   - occupation: 1843 ‚Üí 'Unknown'
   - native.country: 582 ‚Üí 'United-States'
üìä Filas preservadas: 32,537 (100% de los datos)


#### Codificaci√≥n de variables categ√≥ricas

In [61]:
from IPython.display import display

categorical_cols = df_clean.select_dtypes(include=["object"]).columns 
print("=== AN√ÅLISIS DETALLADO CATEG√ìRICAS ===\n")
    
for col in categorical_cols:
    print(f"üìä {col.upper()}:")
    
    # Trabajar con copia de los datos
    col_data = df_clean[col].copy()
    value_counts = col_data.value_counts()
    n_unique = len(value_counts)  
    
    print(f"   Total valores √∫nicos: {n_unique}")

    display(value_counts) 
      
    if n_unique > 20:
        print(f"   ‚ö†Ô∏è  ALTA CARDINALIDAD - considerar agrupaci√≥n")
    elif n_unique > 10:
        print(f"   üí° Cardinalidad moderada")
    else:
        print(f"   ‚úÖ Cardinalidad baja - ideal para One-Hot Encoding")
        
    print("-" * 50)

=== AN√ÅLISIS DETALLADO CATEG√ìRICAS ===

üìä WORKCLASS:
   Total valores √∫nicos: 9


workclass
Private             22673
Self-emp-not-inc     2540
Local-gov            2093
Unknown              1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64

   ‚úÖ Cardinalidad baja - ideal para One-Hot Encoding
--------------------------------------------------
üìä EDUCATION:
   Total valores √∫nicos: 16


education
HS-grad         10494
Some-college     7282
Bachelors        5353
Masters          1722
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           645
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           332
1st-4th           166
Preschool          50
Name: count, dtype: int64

   üí° Cardinalidad moderada
--------------------------------------------------
üìä MARITAL.STATUS:
   Total valores √∫nicos: 7


marital.status
Married-civ-spouse       14970
Never-married            10667
Divorced                  4441
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           23
Name: count, dtype: int64

   ‚úÖ Cardinalidad baja - ideal para One-Hot Encoding
--------------------------------------------------
üìä OCCUPATION:
   Total valores √∫nicos: 15


occupation
Prof-specialty       4136
Craft-repair         4094
Exec-managerial      4065
Adm-clerical         3768
Sales                3650
Other-service        3291
Machine-op-inspct    2000
Unknown              1843
Transport-moving     1597
Handlers-cleaners    1369
Farming-fishing       992
Tech-support          927
Protective-serv       649
Priv-house-serv       147
Armed-Forces            9
Name: count, dtype: int64

   üí° Cardinalidad moderada
--------------------------------------------------
üìä RELATIONSHIP:
   Total valores √∫nicos: 6


relationship
Husband           13187
Not-in-family      8292
Own-child          5064
Unmarried          3445
Wife               1568
Other-relative      981
Name: count, dtype: int64

   ‚úÖ Cardinalidad baja - ideal para One-Hot Encoding
--------------------------------------------------
üìä RACE:
   Total valores √∫nicos: 5


race
White                 27795
Black                  3122
Asian-Pac-Islander     1038
Amer-Indian-Eskimo      311
Other                   271
Name: count, dtype: int64

   ‚úÖ Cardinalidad baja - ideal para One-Hot Encoding
--------------------------------------------------
üìä SEX:
   Total valores √∫nicos: 2


sex
Male      21775
Female    10762
Name: count, dtype: int64

   ‚úÖ Cardinalidad baja - ideal para One-Hot Encoding
--------------------------------------------------
üìä NATIVE.COUNTRY:
   Total valores √∫nicos: 41


native.country
United-States                 29735
Mexico                          639
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica                          81
South                            80
China                            75
Italy                            73
Dominican-Republic               70
Vietnam                          67
Guatemala                        62
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           51
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
Greece                           29
France       

   ‚ö†Ô∏è  ALTA CARDINALIDAD - considerar agrupaci√≥n
--------------------------------------------------
üìä INCOME:
   Total valores √∫nicos: 2


income
<=50K    24698
>50K      7839
Name: count, dtype: int64

   ‚úÖ Cardinalidad baja - ideal para One-Hot Encoding
--------------------------------------------------


In [62]:
# Variables de baja cardinalidad - OneHotEncoder
baja_cardinalidad = ['workclass', 'marital.status', 'relationship', 'race', 'sex']
baja_cardinalidad = [col for col in baja_cardinalidad if col in df_clean.columns]

encoder_baja = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoded_baja = encoder_baja.fit_transform(df_clean[baja_cardinalidad])

feature_names_baja = []
for i, col in enumerate(baja_cardinalidad):
    for category in encoder_baja.categories_[i][1:]:
        feature_names_baja.append(f"{col}_{category}")

df_encoded = pd.DataFrame(encoded_baja, columns=feature_names_baja, index=df_clean.index)

In [63]:
# Variables de cardinalidad moderada - OneHotEncoder
moderada_cardinalidad = ['education', 'occupation']
encoder_moderada = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoded_moderada = encoder_moderada.fit_transform(df_clean[moderada_cardinalidad])

feature_names_moderada = []
for i, col in enumerate(moderada_cardinalidad):
    for category in encoder_moderada.categories_[i][1:]:
        feature_names_moderada.append(f"{col}_{category}")

df_moderada = pd.DataFrame(encoded_moderada, columns=feature_names_moderada, index=df_clean.index)
df_encoded = pd.concat([df_encoded, df_moderada], axis=1)

In [64]:
# native.country - Agrupar y OneHotEncoder
top_countries = df_clean['native.country'].value_counts().head(10).index
df_country_processed = df_clean['native.country'].apply(lambda x: x if x in top_countries else 'Other')

encoder_country = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoded_country = encoder_country.fit_transform(df_country_processed.values.reshape(-1, 1))

feature_names_country = []
for category in encoder_country.categories_[0][1:]:
    feature_names_country.append(f"country_{category}")

df_country = pd.DataFrame(encoded_country, columns=feature_names_country, index=df_clean.index)
df_encoded = pd.concat([df_encoded, df_country], axis=1)

In [65]:
# A√±adir variables num√©ricas
numerical_cols = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
for col in numerical_cols:
    df_encoded[col] = df_clean[col]

In [66]:
# Variable objetivo - LabelEncoder
df_encoded['income_encoded'] = LabelEncoder().fit_transform(df_clean['income'])

print(f"Dataset final: {df_encoded.shape}")

Dataset final: (32537, 70)


#### Escalado

In [67]:
# Split
X = df_encoded.drop('income_encoded', axis=1)
y = df_encoded['income_encoded']

# Columnas num√©ricas que necesitan escalado
numeric_cols = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
numeric_cols = [col for col in numeric_cols if col in X.columns]

# Las columnas One-Hot (0/1) NO se escalan
one_hot_cols = [col for col in X.columns if col not in numeric_cols]

print(f"üî¢ Num√©ricas a escalar: {len(numeric_cols)}")
print(f"üî§ One-Hot a mantener: {len(one_hot_cols)}")

üî¢ Num√©ricas a escalar: 6
üî§ One-Hot a mantener: 63


In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [69]:
# Escalar solo las num√©ricas
scaler = StandardScaler()

# Crear copias
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Aplicar escalado solo a columnas num√©ricas
X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

Ahora que el preprocesamiento de los datos ya est√° hecho, se fija el objetivo en recomendaci√≥n para ganar >=50k, donde los usuarios seran cada fila del dataset y las variables que definen el perfil seran todas excepto:
- income_encoded (es el target)
- fnlwgt (peso muestral, no relevante para recomendaci√≥n)
- native.country_* (poca relevancia para ocupaci√≥n)

Con todo esto, se pretende crear un sistema de recomendaci√≥n h√≠brido.

In [70]:
# Excluir variables no relevantes para el perfil
variables_excluir = ['fnlwgt'] + [col for col in X_train_scaled.columns if 'native.country' in col or 'country_' in col]

# Filtrar solo variables relevantes para el perfil
variables_perfil = [col for col in X_train_scaled.columns if col not in variables_excluir]

print(f"Variables excluidas: {len(variables_excluir)}")
print(f"Variables para perfil: {len(variables_perfil)}")
print(f"Shape final: {X_train_scaled[variables_perfil].shape}")

# Crear datasets para recomendaci√≥n
X_train_rec = X_train_scaled[variables_perfil]
X_test_rec = X_test_scaled[variables_perfil]

print(f"\n Datasets preparados:")
print(f"   X_train_rec: {X_train_rec.shape}")
print(f"   X_test_rec: {X_test_rec.shape}")

Variables excluidas: 11
Variables para perfil: 58
Shape final: (26029, 58)

 Datasets preparados:
   X_train_rec: (26029, 58)
   X_test_rec: (6508, 58)


Despu√©s de haber hecho algunas pruebas y dado que el kernel no aguanta una matriz tan grande para calcular las similitudes se va a hacer un random forest para encontrar las 15 variables mas relacionadas con el target. Por el mismo motivo, se decide hacer un sistema de recomendaci√≥n bsado en contenido en lugar de uno h√≠brido.

In [71]:
#Random Forest para selecci√≥n de variables m√°s importates
print("üîÑ Seleccionando variables importantes...")
rf_selector = RandomForestClassifier(n_estimators=30, max_depth=10, random_state=42, n_jobs=-1)
rf_selector.fit(X_train_rec, y_train)

# Top 15 features m√°s importantes
importancias = pd.Series(rf_selector.feature_importances_, index=X_train_rec.columns)
top_15_features = importancias.nlargest(15).index.tolist()

print("‚úÖ TOP 15 VARIABLES:")
for i, feat in enumerate(top_15_features, 1):
    print(f"   {i:2d}. {feat}")

# Reducir datasets
X_train_light = X_train_rec[top_15_features].copy()
X_test_light = X_test_rec[top_15_features].copy()
print(f"üìä X_train_light: {X_train_light.shape}")
print(f"üìä X_test_light: {X_test_light.shape}")

üîÑ Seleccionando variables importantes...


‚úÖ TOP 15 VARIABLES:
    1. capital.gain
    2. marital.status_Married-civ-spouse
    3. education.num
    4. marital.status_Never-married
    5. age
    6. capital.loss
    7. hours.per.week
    8. relationship_Own-child
    9. education_Bachelors
   10. relationship_Not-in-family
   11. occupation_Exec-managerial
   12. sex_Male
   13. occupation_Prof-specialty
   14. relationship_Unmarried
   15. education_Masters
üìä X_train_light: (26029, 15)
üìä X_test_light: (6508, 15)


In [72]:
# An√°lisis de datos
df_high_income = df_clean[df_clean['income'] == '>50K']
df_low_income = df_clean[df_clean['income'] == '<=50K']

print("\nüìä DIFERENCIAS CLAVE:")
print(f"   Educaci√≥n >50K: {df_high_income['education.num'].mean():.1f} a√±os")
print(f"   Educaci√≥n <=50K: {df_low_income['education.num'].mean():.1f} a√±os")
print(f"   Horas >50K: {df_high_income['hours.per.week'].mean():.1f} h/semana")
print(f"   Horas <=50K: {df_low_income['hours.per.week'].mean():.1f} h/semana")


üìä DIFERENCIAS CLAVE:
   Educaci√≥n >50K: 11.6 a√±os
   Educaci√≥n <=50K: 9.6 a√±os
   Horas >50K: 45.5 h/semana
   Horas <=50K: 38.8 h/semana


### Sistema de recomendaci√≥n

In [73]:
print("üéØ RECOMENDACIONES PARA GANAR >50K")

perfiles_test = [
    idx for idx in X_train_light.index 
    if y_train.loc[idx] == 0
][:3]

for idx in perfiles_test:
    perfil = df_clean.loc[idx]
    print(f"\n=== PERFIL {idx} ===")
    print(f"Actual: {perfil['occupation']} - {perfil['income']}")
    
    # C√°lculos con X_train_light (escalado)
    high_income_mask = y_train == 1  
    X_high = X_train_light[high_income_mask]
    
    similitudes = cosine_similarity(X_train_light.loc[[idx]], X_high)[0]
    top_3 = similitudes.argsort()[-3:][::-1]
    similares = X_high.iloc[top_3]
    
    print("üí° MEJORAS:")
    
    # Visualizaci√≥n con df_clean (valores reales)
    educ_actual = perfil['education.num']
    educ_promedio_high = df_clean.loc[X_high.index, 'education.num'].mean()
    
    if educ_actual < educ_promedio_high:
        print(f"  üìö Educaci√≥n: {educ_actual} ‚Üí {educ_promedio_high:.1f} a√±os")
    
    horas_actual = perfil['hours.per.week']
    horas_promedio_high = df_clean.loc[X_high.index, 'hours.per.week'].mean()
    
    if horas_actual < horas_promedio_high:
        print(f"  ‚è∞ Horas: {horas_actual} ‚Üí {horas_promedio_high:.1f} h/semana")
    
    ocupaciones_recomendadas = df_clean.loc[similares.index, 'occupation'].unique()[:2]
    print(f"  üíº Ocupaciones: {list(ocupaciones_recomendadas)}")

üéØ RECOMENDACIONES PARA GANAR >50K

=== PERFIL 25267 ===
Actual: Unknown - <=50K
üí° MEJORAS:
  üìö Educaci√≥n: 10 ‚Üí 11.6 a√±os
  ‚è∞ Horas: 20 ‚Üí 45.5 h/semana
  üíº Ocupaciones: ['Other-service', 'Sales']

=== PERFIL 4584 ===
Actual: Handlers-cleaners - <=50K
üí° MEJORAS:
  üìö Educaci√≥n: 9 ‚Üí 11.6 a√±os
  ‚è∞ Horas: 40 ‚Üí 45.5 h/semana
  üíº Ocupaciones: ['Other-service', 'Craft-repair']

=== PERFIL 7003 ===
Actual: Prof-specialty - <=50K
üí° MEJORAS:
  üìö Educaci√≥n: 9 ‚Üí 11.6 a√±os
  ‚è∞ Horas: 18 ‚Üí 45.5 h/semana
  üíº Ocupaciones: ['Sales', 'Tech-support']


Con esto obtenemos un sistema de recomendaci√≥n basado en contenido para mejora profesional. 

### Probar con casos simulados


In [None]:
# Crear perfiles simulados
perfiles_simulados = [
    {
        'nombre': 'JOVEN CON POCA EDUCACI√ìN',
        'age': 22,
        'education.num': 9,  # Algo de secundaria
        'hours.per.week': 30,
        'occupation': 'Handlers-cleaners',
        'workclass': 'Private',
        'expected_improvement': 'alta'
    },
    {
        'nombre': 'ADULTO EDUCACI√ìN MEDIA', 
        'age': 35,
        'education.num': 12,  # Secundaria completa
        'hours.per.week': 40,
        'occupation': 'Other-service',
        'workclass': 'Local-gov',
        'expected_improvement': 'media'
    },
    {
        'nombre': 'PROFESIONAL MAL PAGADO',
        'age': 28, 
        'education.num': 14,  # Algo de universidad
        'hours.per.week': 35,
        'occupation': 'Adm-clerical',
        'workclass': 'Private',
        'expected_improvement': 'media-alta'
    },
    {
        'nombre': 'TRABAJADOR MAYOR',
        'age': 55,
        'education.num': 10,  # Secundaria incompleta
        'hours.per.week': 45,
        'occupation': 'Farming-fishing',
        'workclass': 'Self-emp-not-inc',
        'expected_improvement': 'baja'
    }
]

In [76]:
def recomendar_mejoras_50k(indices_perfiles):
    for idx in indices_perfiles:
        perfil = df_clean.loc[idx]
        print(f"\n=== PERFIL {idx} ===")
        print(f"Actual: {perfil['occupation']} - {perfil['income']}")
        
        # C√°lculos con X_train_light (escalado)
        high_income_mask = y_train == 1  
        X_high = X_train_light[high_income_mask]
        
        similitudes = cosine_similarity(X_train_light.loc[[idx]], X_high)[0]
        top_3 = similitudes.argsort()[-3:][::-1]
        similares = X_high.iloc[top_3]
        
        print("üí° MEJORAS:")
        
        # Visualizaci√≥n con df_clean (valores reales)
        educ_actual = perfil['education.num']
        educ_promedio_high = df_clean.loc[X_high.index, 'education.num'].mean()
        
        if educ_actual < educ_promedio_high:
            print(f"  üìö Educaci√≥n: {educ_actual} ‚Üí {educ_promedio_high:.1f} a√±os")
        
        horas_actual = perfil['hours.per.week']
        horas_promedio_high = df_clean.loc[X_high.index, 'hours.per.week'].mean()
        
        if horas_actual < horas_promedio_high:
            print(f"  ‚è∞ Horas: {horas_actual} ‚Üí {horas_promedio_high:.1f} h/semana")
        
        ocupaciones_recomendadas = df_clean.loc[similares.index, 'occupation'].unique()[:2]
        print(f"  üíº Ocupaciones: {list(ocupaciones_recomendadas)}")

In [79]:
def encontrar_indices_reales(perfiles_simulados):
    indices_reales = []
    
    for perfil_sim in perfiles_simulados:
        # Encontrar el perfil real m√°s similar al simulado
        low_income_mask = y_train == 0
        X_low = X_train_light[low_income_mask]
        
        # Crear vector del perfil simulado
        perfil_vector = np.array([
            perfil_sim['age'], 
            perfil_sim['education.num'], 
            perfil_sim['hours.per.week']
        ] + [0] * (len(X_train_light.columns) - 3)).reshape(1, -1)
        
        # Calcular similitudes
        similitudes = cosine_similarity(perfil_vector, X_low)[0]
        mejor_match_idx = similitudes.argmax()
        idx_real = X_low.iloc[mejor_match_idx].name
        
        indices_reales.append(idx_real)
        print(f"‚úÖ {perfil_sim['nombre']} -> √çndice real: {idx_real}")
    
    return indices_reales

# Convertir perfiles simulados a √≠ndices reales
indices_reales = encontrar_indices_reales(perfiles_simulados)

# Usar funci√≥n de recomendaci√≥n
print("\nüéØ RECOMENDACIONES PARA PERFILES SIMULADOS")
recomendar_mejoras_50k(indices_reales)

‚úÖ JOVEN CON POCA EDUCACI√ìN -> √çndice real: 3269
‚úÖ ADULTO EDUCACI√ìN MEDIA -> √çndice real: 1679
‚úÖ PROFESIONAL MAL PAGADO -> √çndice real: 3269
‚úÖ TRABAJADOR MAYOR -> √çndice real: 1679

üéØ RECOMENDACIONES PARA PERFILES SIMULADOS

=== PERFIL 3269 ===
Actual: Sales - <=50K
üí° MEJORAS:
  ‚è∞ Horas: 40 ‚Üí 45.5 h/semana
  üíº Ocupaciones: ['Craft-repair', 'Sales']

=== PERFIL 1679 ===
Actual: Farming-fishing - <=50K
üí° MEJORAS:
  üíº Ocupaciones: ['Prof-specialty', 'Exec-managerial']

=== PERFIL 3269 ===
Actual: Sales - <=50K
üí° MEJORAS:
  ‚è∞ Horas: 40 ‚Üí 45.5 h/semana
  üíº Ocupaciones: ['Craft-repair', 'Sales']

=== PERFIL 1679 ===
Actual: Farming-fishing - <=50K
üí° MEJORAS:
  üíº Ocupaciones: ['Prof-specialty', 'Exec-managerial']
