# Explore here

In [43]:
import pickle
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD

In [44]:
# Download the Adult Census Income CSV once and keep a local copy, so that
# re-running the notebook does not need the network again.
url = "https://breathecode.herokuapp.com/asset/internal-link?id=2326&path=adult-census-income.csv"
raw_csv_path = Path('../data/raw/census_income.csv')
if not raw_csv_path.exists():
    # to_csv does not create missing folders, so make sure data/raw is there first.
    raw_csv_path.parent.mkdir(parents=True, exist_ok=True)
    pd.read_csv(url).to_csv(raw_csv_path, index=False)

In [45]:
# Load the locally stored census CSV and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../data/raw/census_income.csv')
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [46]:
# Show column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [47]:
# keep=False discards every row that has an exact twin (no representative is kept).
# NOTE(review): confirm this is intended rather than keep="first".
# race, native.country and sex are then left out of the working frame.
df = (
    df.drop_duplicates(keep=False)
      .drop(columns=["race", "native.country", "sex"])
)

In [48]:
# Unknown values appear as a literal "?"; turn them into real NaN so pandas'
# missing-value tools can see them.
df = df.replace(to_replace="?", value=np.nan)
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,capital.gain,capital.loss,hours.per.week,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,0,4356,40,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,0,4356,18,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,0,4356,40,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,0,3900,40,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,0,3900,40,<=50K


In [49]:
# Number of missing values in each column
print(df.isna().sum())


age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
capital.gain         0
capital.loss         0
hours.per.week       0
income               0
dtype: int64


In [50]:
# List the distinct workclass values (NaN included) to see which categories exist.
df["workclass"].unique()

array([nan, 'Private', 'State-gov', 'Federal-gov', 'Self-emp-not-inc',
       'Self-emp-inc', 'Local-gov', 'Without-pay', 'Never-worked'],
      dtype=object)

In [51]:
# List the distinct occupation values (NaN included).
df['occupation'].unique()

array([nan, 'Exec-managerial', 'Machine-op-inspct', 'Prof-specialty',
       'Other-service', 'Adm-clerical', 'Craft-repair',
       'Transport-moving', 'Handlers-cleaners', 'Sales',
       'Farming-fishing', 'Tech-support', 'Protective-serv',
       'Armed-Forces', 'Priv-house-serv'], dtype=object)

In [52]:
# List the distinct marital.status values.
df['marital.status'].unique()

array(['Widowed', 'Divorced', 'Separated', 'Never-married',
       'Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'],
      dtype=object)

In [53]:
# List the distinct education values.
df["education"].unique()

array(['HS-grad', 'Some-college', '7th-8th', '10th', 'Doctorate',
       'Prof-school', 'Bachelors', 'Masters', '11th', 'Assoc-acdm',
       'Assoc-voc', '1st-4th', '5th-6th', '12th', '9th', 'Preschool'],
      dtype=object)

In [54]:
# Rows whose workclass is missing
nan_workclass = df.loc[df["workclass"].isna()]

# Ages that occur among those rows
print(nan_workclass.loc[:, "age"].unique())


[90 66 51 61 71 68 67 41 72 65 43 63 60 26 19 55 21 31 50 28 42 20 33 23
 22 30 25 18 39 53 76 27 24 69 58 17 75 79 38 52 49 48 80 77 64 59 34 57
 62 35 74 47 54 36 40 29 70 78 32 45 56 46 82 83 37 73 81 44 84 87]


In [55]:
# Drop every row that lacks a workclass or an occupation, then renumber the
# index from 0 so later positional and label lookups line up.
df_clean = df.dropna(axis=0, how="any", subset=["workclass", "occupation"])
df_clean = df_clean.reset_index(drop=True)

print(df_clean.shape)


(30671, 12)


In [56]:


# Label-encode every object-typed column of df_clean in place and remember,
# per column, which integer code each original label received.
categorical_cols = df_clean.select_dtypes(include=["object"]).columns
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col].astype(str))
    encoders[col] = {
        label: code
        for label, code in zip(le.classes_, le.transform(le.classes_))
    }

# Show the label -> code mapping of each encoded column
for col, mapping in encoders.items():
    print(f"\nColumna: {col}", mapping, sep="\n")


Columna: workclass
{'Federal-gov': np.int64(0), 'Local-gov': np.int64(1), 'Private': np.int64(2), 'Self-emp-inc': np.int64(3), 'Self-emp-not-inc': np.int64(4), 'State-gov': np.int64(5), 'Without-pay': np.int64(6)}

Columna: education
{'10th': np.int64(0), '11th': np.int64(1), '12th': np.int64(2), '1st-4th': np.int64(3), '5th-6th': np.int64(4), '7th-8th': np.int64(5), '9th': np.int64(6), 'Assoc-acdm': np.int64(7), 'Assoc-voc': np.int64(8), 'Bachelors': np.int64(9), 'Doctorate': np.int64(10), 'HS-grad': np.int64(11), 'Masters': np.int64(12), 'Preschool': np.int64(13), 'Prof-school': np.int64(14), 'Some-college': np.int64(15)}

Columna: marital.status
{'Divorced': np.int64(0), 'Married-AF-spouse': np.int64(1), 'Married-civ-spouse': np.int64(2), 'Married-spouse-absent': np.int64(3), 'Never-married': np.int64(4), 'Separated': np.int64(5), 'Widowed': np.int64(6)}

Columna: occupation
{'Adm-clerical': np.int64(0), 'Armed-Forces': np.int64(1), 'Craft-repair': np.int64(2), 'Exec-managerial': n

In [57]:
# Standardise the feature columns only; the 'income' label is left out.
features = df_clean.drop(columns="income")
scaler = StandardScaler()
df_features_scaled = pd.DataFrame(
    data=scaler.fit_transform(features),
    index=features.index,
    columns=features.columns,
)

print(df_features_scaled.head())

        age  workclass    fnlwgt  ...  capital.gain  capital.loss  hours.per.week
0  3.321325  -0.209642 -0.540267  ...     -0.147631     10.511127       -1.915735
1  1.185856  -0.209642 -0.469260  ...     -0.147631      9.387826       -0.079795
2  0.194388  -0.209642  0.709329  ...     -0.147631      9.387826       -0.079795
3 -0.339479  -0.209642  0.256123  ...     -0.147631      9.067586        0.337464
4 -0.034412  -0.209642 -0.372150  ...     -0.147631      9.067586       -0.079795

[5 rows x 11 columns]


In [58]:
# Look up the integer code that the label encoding assigned to the '>50K' class.
income_mapping = encoders['income']
high_income_value = income_mapping['>50K']


In [59]:
# Count how many rows fall in each encoded income class.
print(df_clean['income'].value_counts())


income
0    23025
1     7646
Name: count, dtype: int64


In [60]:
# Keep only the rows labelled '>50K': the neighbour search is built on them alone.
df_high_income = df_clean.loc[df_clean['income'].eq(high_income_value)]
y_high = df_high_income['income']
X_high = df_features_scaled.loc[df_high_income.index]

X_train, X_test, y_train, y_test = train_test_split(
    X_high,
    y_high,
    test_size=0.2,
    random_state=42,
)


In [61]:
# Label counts of the train and test parts, one after the other.
print(y_train.value_counts(), y_test.value_counts(), sep="\n")

income
1    6116
Name: count, dtype: int64
income
1    1530
Name: count, dtype: int64


In [62]:
# Inspect the training labels.
y_train

2229     1
407      1
19820    1
18188    1
638      1
        ..
18494    1
19328    1
1564     1
30428    1
28867    1
Name: income, Length: 6116, dtype: int64

In [63]:

# Fit a 10-nearest-neighbour index (Euclidean distance) on the scaled training rows.
knn = NearestNeighbors(metric="euclidean", n_neighbors=10).fit(X_train)
knn


0,1,2
,n_neighbors,10
,radius,1.0
,algorithm,'auto'
,leaf_size,30
,metric,'euclidean'
,p,2
,metric_params,
,n_jobs,


In [64]:
def recomendador(user):
    """Find the training rows closest to ``user`` and return both in readable form.

    Relies on notebook-level objects created in earlier cells: ``knn`` (fitted
    NearestNeighbors), ``X_train`` / ``y_train`` (the scaled rows and labels it
    was fitted on), ``scaler`` (fitted StandardScaler) and ``encoders``
    (column name -> {label: integer code}).

    Parameters
    ----------
    user : pandas.DataFrame
        One scaled row with the same columns as ``X_train``,
        e.g. ``X_test.iloc[[3]]``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``neighbours``: the nearest training rows in original units, with
        encoded columns decoded back to labels and an ``income`` column added.
        ``user``: the query row converted the same way (without ``income``).
    """
    # Only the neighbour positions are needed; the distances are discarded.
    _, index = knn.kneighbors(user)
    positions = index[0]

    neighbours = pd.DataFrame(
        scaler.inverse_transform(X_train.iloc[positions]),
        columns=X_train.columns,
    )
    neighbours["income"] = y_train.iloc[positions].values

    user = pd.DataFrame(scaler.inverse_transform(user), columns=X_train.columns)

    for col in user.columns:
        if col in encoders and col != "income":
            reverse_mapping = {int(code): label for label, code in encoders[col].items()}
            # inverse_transform returns floats, so a code can come back as
            # 1.9999999 instead of 2 and miss its dictionary key (-> NaN).
            # Round to the nearest integer code before decoding.
            neighbours[col] = neighbours[col].round().astype(int).map(reverse_mapping)
            user[col] = user[col].round().astype(int).map(reverse_mapping)

    return neighbours, user

In [65]:
# Query the recommender with one held-out row (position 3 of X_test) and
# display the neighbours it returns.
vecinos, user = recomendador(X_test.iloc[[3]])

vecinos

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,capital.gain,capital.loss,hours.per.week,income
0,50.0,Private,143953.0,HS-grad,9.0,Married-civ-spouse,Sales,Husband,0.0,0.0,65.0,1
1,44.0,Private,191256.0,HS-grad,9.0,Married-civ-spouse,Transport-moving,Husband,0.0,0.0,65.0,1
2,40.0,Private,168538.0,HS-grad,9.0,Married-civ-spouse,Sales,Husband,0.0,0.0,60.0,1
3,43.0,Private,200355.0,HS-grad,9.0,Married-civ-spouse,Transport-moving,Husband,0.0,0.0,70.0,1
4,40.0,Private,146659.0,HS-grad,9.0,Married-civ-spouse,Sales,Husband,0.0,0.0,60.0,1
5,49.0,Private,128132.0,HS-grad,9.0,Married-civ-spouse,Sales,Husband,0.0,0.0,60.0,1
6,40.0,Private,155972.0,HS-grad,9.0,Married-civ-spouse,Transport-moving,Husband,0.0,0.0,60.0,1
7,50.0,Private,137815.0,HS-grad,9.0,Married-civ-spouse,Transport-moving,Husband,0.0,0.0,60.0,1
8,52.0,Private,147629.0,HS-grad,9.0,Married-civ-spouse,Transport-moving,Husband,0.0,0.0,60.0,1
9,50.0,Private,185846.0,HS-grad,9.0,Married-civ-spouse,Sales,Husband,0.0,0.0,55.0,1


In [66]:
# Display the query row as returned by recomendador.
user

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,capital.gain,capital.loss,hours.per.week
0,46.0,Private,185385.0,HS-grad,9.0,Married-civ-spouse,Sales,Husband,0.0,0.0,65.0


In [67]:
def analyze_neighbors(neighbors, user, numeric_threshold=2, min_mode_share=30):
    """
    Print the user's profile, the neighbors and a list of suggested changes.

    For every column of ``user``:

    * numeric columns (except ``age`` and ``fnlwgt``) are compared with the
      neighbors' mean; a gap larger than ``numeric_threshold`` is reported as
      INCREASE or DECREASE;
    * ``workclass``, ``marital.status`` and ``occupation`` are compared through
      the hand-written rankings below; a change is suggested only when the
      user's value ranks lower than the neighbors' most common value;
    * any other non-numeric column is reported when the user differs from the
      neighbors' most common value and more than ``min_mode_share`` percent of
      the neighbors share that value.

    Parameters
    ----------
    neighbors : pandas.DataFrame
        Neighbor rows; must contain every column present in ``user``.
    user : pandas.DataFrame
        Frame whose first row is the profile to analyse.
    numeric_threshold : float, default 2
        Minimum absolute gap between the neighbors' mean and the user's value
        for a numeric recommendation to be printed.
    min_mode_share : float, default 30
        Minimum share (in percent) of neighbors holding the most common value
        for an unranked categorical recommendation to be printed.

    Returns
    -------
    None
        Everything is written to standard output.
    """

    # Rankings ordered from least to most favourable (a lower position counts
    # as "worse"). Keys must match the DataFrame column names exactly.
    # NOTE(review): "Armed-Forces" is not ranked, so it never triggers an
    # occupation recommendation - confirm whether that is intended.
    category_orders = {
        "workclass": [
            "Never-worked",
            "Without-pay",
            "Local-gov",
            "State-gov",
            "Federal-gov",
            "Self-emp-not-inc",
            "Self-emp-inc",
            "Private"],
        "marital.status": [
            "Never-married",
            "Separated",
            "Divorced",
            "Widowed",
            "Married-spouse-absent",
            "Married-AF-spouse",
            "Married-civ-spouse"],
        "occupation": [
            "Priv-house-serv",
            "Handlers-cleaners",
            "Other-service",
            "Farming-fishing",
            "Machine-op-inspct",
            "Transport-moving",
            "Protective-serv",
            "Adm-clerical",
            "Sales",
            "Craft-repair",
            "Tech-support",
            "Exec-managerial",
            "Prof-specialty"]}

    # Numeric columns that are never turned into a recommendation.
    skipped_numeric = ("age", "fnlwgt")

    def is_worse(user_value, best_value, order_list):
        """Returns True if user_value is lower-ranked than best_value."""
        if pd.isna(user_value):
            return True
        if user_value not in order_list or best_value not in order_list:
            # Unranked values cannot be compared, so no change is suggested.
            return False
        return order_list.index(user_value) < order_list.index(best_value)

    def print_header(title):
        """Prints a section title framed by two separator lines."""
        print("\n" + "="*80)
        print(title)
        print("="*80)

    # DISPLAY USER PROFILE
    print_header("YOUR PROFILE:")
    print(user.iloc[0].to_string())

    # DISPLAY NEIGHBORS
    print_header("NEIGHBORS WITH INCOME >50K:")
    print(neighbors.to_string())

    # RECOMMENDATIONS
    print_header("RECOMMENDATIONS - WHAT TO CHANGE:")

    user_values = user.iloc[0]

    for col in user.columns:
        neighbor_vals = neighbors[col]

        # NUMERIC VARIABLES
        if pd.api.types.is_numeric_dtype(neighbor_vals):
            avg = neighbor_vals.mean()
            diff = avg - user_values[col]

            if abs(diff) > numeric_threshold and col not in skipped_numeric:
                print(f"\n• {col.upper()}:")
                print(f"  Your value: {user_values[col]:.1f}")
                print(f"  Neighbors' average: {avg:.1f}")
                print(f"  Difference: {diff:+.1f}")
                print("  → Recommendation:", "INCREASE" if diff > 0 else "DECREASE")
            continue

        # CATEGORICAL VARIABLES
        modes = neighbor_vals.mode()
        if modes.empty:
            # Every neighbor value is missing: there is nothing to compare with.
            continue
        mode = modes.iloc[0]

        if col in category_orders:
            if is_worse(user_values[col], mode, category_orders[col]):
                print(f"\n• {col.upper()}:")
                print(f"  Your value: {user_values[col]}")
                print(f"  Better common value: {mode}")
                print(f"  → Recommendation: CHANGE to {mode}")
        elif user_values[col] != mode:
            # Fallback: suggest changing to the most common value
            mode_pct = (neighbor_vals == mode).sum() / len(neighbor_vals) * 100
            if mode_pct > min_mode_share:
                print(f"\n• {col.upper()}:")
                print(f"  Your value: {user_values[col]}")
                print(f"  Most common value: {mode} ({mode_pct:.1f}% of neighbors)")
                print(f"  → Recommendation: CHANGE to {mode}")

In [72]:
# Print the profile, the neighbours and the recommendations for the sample user.
analyze_neighbors(vecinos, user)


YOUR PROFILE:
age                             46.0
workclass                    Private
fnlwgt                      185385.0
education                    HS-grad
education.num                    9.0
marital.status    Married-civ-spouse
occupation                     Sales
relationship                 Husband
capital.gain                     0.0
capital.loss                     0.0
hours.per.week                  65.0

NEIGHBORS WITH INCOME >50K:
    age workclass    fnlwgt education  education.num      marital.status        occupation relationship  capital.gain  capital.loss  hours.per.week  income
0  50.0   Private  143953.0   HS-grad            9.0  Married-civ-spouse             Sales      Husband           0.0           0.0            65.0       1
1  44.0   Private  191256.0   HS-grad            9.0  Married-civ-spouse  Transport-moving      Husband           0.0           0.0            65.0       1
2  40.0   Private  168538.0   HS-grad            9.0  Married-civ-spouse         