# Explore here

In [143]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.neighbors import NearestNeighbors

In [144]:
# Download the Adult Census Income CSV and store a local copy under data/raw.
# to_csv is called with its defaults, so the DataFrame index is written as an
# extra unnamed first column of the saved file.
url = 'https://breathecode.herokuapp.com/asset/internal-link?id=2326&path=adult-census-income.csv'
pd.read_csv(url).to_csv('../data/raw/adult-census-income.csv')

In [145]:
# Reload the local copy, discard the index column that was saved alongside the
# data ('Unnamed: 0'), and remove exact duplicate rows.
df = (
    pd.read_csv('../data/raw/adult-census-income.csv')
    .drop(columns='Unnamed: 0')
    .drop_duplicates()
)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [146]:
# Column dtypes and non-null counts after de-duplication
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32537 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32537 non-null  int64 
 1   workclass       32537 non-null  object
 2   fnlwgt          32537 non-null  int64 
 3   education       32537 non-null  object
 4   education.num   32537 non-null  int64 
 5   marital.status  32537 non-null  object
 6   occupation      32537 non-null  object
 7   relationship    32537 non-null  object
 8   race            32537 non-null  object
 9   sex             32537 non-null  object
 10  capital.gain    32537 non-null  int64 
 11  capital.loss    32537 non-null  int64 
 12  hours.per.week  32537 non-null  int64 
 13  native.country  32537 non-null  object
 14  income          32537 non-null  object
dtypes: int64(6), object(9)
memory usage: 4.0+ MB


In [147]:
# Missing values are stored as the literal '?'; convert them to NaN so pandas
# can detect them, then count the missing entries per column.
df = df.replace('?', np.nan)
df.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     582
income               0
dtype: int64

In [148]:
# Distinct workclass categories (NaN marks the former '?' entries)
df['workclass'].unique()

array([nan, 'Private', 'State-gov', 'Federal-gov', 'Self-emp-not-inc',
       'Self-emp-inc', 'Local-gov', 'Without-pay', 'Never-worked'],
      dtype=object)

In [149]:
# Distinct marital.status categories
df['marital.status'].unique()

array(['Widowed', 'Divorced', 'Separated', 'Never-married',
       'Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'],
      dtype=object)

In [150]:
# Distinct occupation categories (NaN marks the former '?' entries)
df['occupation'].unique()

array([nan, 'Exec-managerial', 'Machine-op-inspct', 'Prof-specialty',
       'Other-service', 'Adm-clerical', 'Craft-repair',
       'Transport-moving', 'Handlers-cleaners', 'Sales',
       'Farming-fishing', 'Tech-support', 'Protective-serv',
       'Armed-Forces', 'Priv-house-serv'], dtype=object)

In [151]:
# Distinct native.country categories (NaN marks the former '?' entries)
df['native.country'].unique()

array(['United-States', nan, 'Mexico', 'Greece', 'Vietnam', 'China',
       'Taiwan', 'India', 'Philippines', 'Trinadad&Tobago', 'Canada',
       'South', 'Holand-Netherlands', 'Puerto-Rico', 'Poland', 'Iran',
       'England', 'Germany', 'Italy', 'Japan', 'Hong', 'Honduras', 'Cuba',
       'Ireland', 'Cambodia', 'Peru', 'Nicaragua', 'Dominican-Republic',
       'Haiti', 'El-Salvador', 'Hungary', 'Columbia', 'Guatemala',
       'Jamaica', 'Ecuador', 'France', 'Yugoslavia', 'Scotland',
       'Portugal', 'Laos', 'Thailand', 'Outlying-US(Guam-USVI-etc)'],
      dtype=object)

In [152]:
# Drop every row that has at least one missing value and renumber the index
# from 0 so later positional and label-based lookups agree.
df = df.dropna().reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30139 entries, 0 to 30138
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             30139 non-null  int64 
 1   workclass       30139 non-null  object
 2   fnlwgt          30139 non-null  int64 
 3   education       30139 non-null  object
 4   education.num   30139 non-null  int64 
 5   marital.status  30139 non-null  object
 6   occupation      30139 non-null  object
 7   relationship    30139 non-null  object
 8   race            30139 non-null  object
 9   sex             30139 non-null  object
 10  capital.gain    30139 non-null  int64 
 11  capital.loss    30139 non-null  int64 
 12  hours.per.week  30139 non-null  int64 
 13  native.country  30139 non-null  object
 14  income          30139 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.4+ MB


In [153]:
# Drop columns that will not be used as features

df = df.drop(['race', 'sex', 'native.country'], axis= 1) # people cannot change race or sex; native.country is dropped as well

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,capital.gain,capital.loss,hours.per.week,income
0,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,0,4356,18,<=50K
1,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,0,3900,40,<=50K
2,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,0,3900,40,<=50K
3,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,0,3770,45,<=50K
4,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,0,3770,40,<=50K


In [154]:
# Label-encode every categorical feature (the target column is excluded)
df_encoded = df.drop(columns=['income'])

# Column names of the text-valued features, captured once before the loop
categorical_cols = df_encoded.select_dtypes(include='object').columns

# Per-column {label: integer code} lookups, kept so codes can be decoded later
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    codes = le.fit_transform(df_encoded[col])
    df_encoded[col] = codes
    encoders[col] = dict(zip(le.classes_, le.transform(le.classes_)))

In [155]:
# Standardise every feature column (zero mean, unit variance).
# NOTE: df_encoded is overwritten with its scaled version, so running this cell
# twice would refit the scaler on already-scaled data.

scaler = StandardScaler()

scaled_values = scaler.fit_transform(df_encoded)

df_encoded = pd.DataFrame(scaled_values, columns=df_encoded.columns)

df_encoded.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,capital.gain,capital.loss,hours.per.week
0,3.317157,-0.20898,-0.538773,0.174959,-0.440434,2.283147,-0.73457,-0.261297,-0.147502,10.551814,-1.914647
1,1.184832,-0.20898,-0.467892,-1.39902,-2.402221,-1.722039,0.009847,1.611826,-0.147502,9.424325,-0.078031
2,0.194824,-0.20898,0.708595,1.224279,-0.048076,1.615616,0.754264,0.987452,-0.147502,9.424325,-0.078031
3,-0.338257,-0.20898,0.256197,0.174959,-0.440434,-1.722039,0.257986,1.611826,-0.147502,9.102892,0.339381
4,-0.033639,-0.20898,-0.370956,-2.71067,-1.617506,1.615616,-1.478987,1.611826,-0.147502,9.102892,-0.078031


In [156]:
# Train/test split: X holds the scaled features, y the binary income label
# (0 for '<=50K', 1 for '>50K').
X = df_encoded
y = df['income'].map({'<=50K': 0, '>50K': 1})

# Shuffled 80/20 split with a fixed seed so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size= 0.2, random_state= 42)


In [157]:
# Peek at the training labels
y_train

2290     1
4472     0
5478     0
27310    1
28148    0
        ..
29802    0
5390     0
860      0
15795    0
23654    0
Name: income, Length: 24111, dtype: int64

In [158]:
# Select the rows labelled '>50K' from the unscaled, un-encoded frame

high_income = df[df['income'] == '>50K']
high_income.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,capital.gain,capital.loss,hours.per.week,income
5,74,State-gov,88638,Doctorate,16,Never-married,Prof-specialty,Other-relative,0,3683,20,>50K
7,45,Private,172274,Doctorate,16,Divorced,Prof-specialty,Unmarried,0,3004,35,>50K
8,38,Self-emp-not-inc,164526,Prof-school,15,Never-married,Prof-specialty,Not-in-family,0,2824,45,>50K
9,52,Private,129177,Bachelors,13,Widowed,Other-service,Not-in-family,0,2824,20,>50K
10,32,Private,136204,Masters,14,Separated,Exec-managerial,Not-in-family,0,2824,55,>50K


In [159]:
# Index labels that are both in the training split and labelled '>50K'
common_indices = X_train.index.intersection(high_income.index)

# Keep only those high-income rows in the training features and labels
X_train = X_train.loc[common_indices]
y_train = y_train.loc[common_indices]

# NOTE(review): common_indices contains training labels only, and the train and
# test splits do not overlap, so this difference does not appear to remove any
# test row (it only returns the test index in sorted order). If the intent was
# to exclude high-income users from the test set, high_income.index would be
# the set to subtract - confirm before changing, since later positional lookups
# into X_test depend on this ordering.
test_indices = X_test.index.difference(common_indices)
X_test = X_test.loc[test_indices]
y_test = y_test.loc[test_indices]

In [160]:
# Inspect the test labels after re-indexing
y_test

6        0
7        1
17       0
29       1
34       1
        ..
30108    0
30119    0
30121    0
30122    0
30124    0
Name: income, Length: 6028, dtype: int64

In [161]:
# Unsupervised nearest-neighbour index: 10 neighbours ranked by cosine distance
model_content = NearestNeighbors(n_neighbors= 10, metric= 'cosine')

model_content.fit(X_train) # fit only on high-income rows so every recommended neighbour is a high-income profile

0,1,2
,n_neighbors,10
,radius,1.0
,algorithm,'auto'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,


In [162]:
def recommender(user):
    """
    Return the nearest high-income neighbours of a user in readable form.

    Relies on these notebook-level objects: ``model_content`` (fitted
    NearestNeighbors), ``X_train`` / ``y_train`` (the rows the model was fitted
    on and their labels), ``scaler`` (fitted StandardScaler) and ``encoders``
    (per-column ``{label: integer code}`` dictionaries).

    Parameters
    ----------
    user : pandas.DataFrame
        Scaled feature row(s) with the same columns as ``X_train``. Only the
        neighbours of the first row are returned.

    Returns
    -------
    neighbors : pandas.DataFrame
        The neighbours in original units with decoded categorical labels, plus
        an ``income`` column and a ``similarity`` column (1 - cosine distance).
    user : pandas.DataFrame
        The input row(s) converted back to original units and labels.
    """
    distance, index = model_content.kneighbors(user)
    neighbor_pos = index[0]  # positions within X_train of the first row's neighbours

    # Undo the standardisation so values are back in their original units
    neighbors = pd.DataFrame(
        scaler.inverse_transform(X_train.iloc[neighbor_pos]),
        columns=X_train.columns,
    )
    neighbors['income'] = y_train.iloc[neighbor_pos].values
    neighbors['similarity'] = 1 - distance[0]  # cosine similarity

    user = pd.DataFrame(scaler.inverse_transform(user), columns=X_train.columns)

    for col, forward_map in encoders.items():
        if col not in neighbors.columns:
            continue
        reverse_map = {int(code): label for label, code in forward_map.items()}
        # inverse_transform yields floats that may differ from the integer code
        # by rounding error; snap to the nearest integer so the lookup cannot
        # silently produce NaN.
        neighbors[col] = neighbors[col].round().astype(int).map(reverse_map)
        user[col] = user[col].round().astype(int).map(reverse_map)

    return neighbors, user

In [163]:
# Query the recommender with one test user; the double brackets keep the
# selection as a one-row DataFrame instead of a Series.
nn, user = recommender(X_test.iloc[[23]])
nn


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,capital.gain,capital.loss,hours.per.week,income,similarity
0,38.0,Private,237608.0,Bachelors,13.0,Never-married,Sales,Not-in-family,0.0,2444.0,45.0,1,0.993143
1,34.0,Private,203034.0,Bachelors,13.0,Separated,Sales,Not-in-family,0.0,2824.0,50.0,1,0.981526
2,51.0,Private,216475.0,Bachelors,13.0,Never-married,Sales,Not-in-family,0.0,1564.0,43.0,1,0.965283
3,45.0,Private,196584.0,Assoc-voc,11.0,Never-married,Prof-specialty,Not-in-family,0.0,1564.0,40.0,1,0.961505
4,39.0,Private,141584.0,Masters,14.0,Never-married,Sales,Not-in-family,0.0,2444.0,45.0,1,0.96092
5,39.0,Private,284166.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Husband,0.0,1902.0,50.0,1,0.954595
6,37.0,Private,241998.0,Bachelors,13.0,Married-civ-spouse,Tech-support,Husband,0.0,1977.0,40.0,1,0.953616
7,43.0,Private,293305.0,Bachelors,13.0,Married-civ-spouse,Sales,Husband,0.0,1887.0,40.0,1,0.953061
8,34.0,Private,345705.0,Bachelors,13.0,Married-civ-spouse,Sales,Husband,0.0,1977.0,50.0,1,0.952995
9,36.0,Private,237943.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Husband,0.0,1977.0,45.0,1,0.952208


In [164]:
# The queried user's profile as returned by recommender
user

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,capital.gain,capital.loss,hours.per.week
0,40.0,Private,287983.0,Bachelors,13.0,Never-married,Tech-support,Not-in-family,0.0,2258.0,48.0


In [165]:
def analyze_neighbors(neighbors, user, min_numeric_diff=2, min_mode_pct=30):
    """
    Print the user's profile, the neighbour table, and suggested changes.

    Numeric columns are compared against the neighbours' mean; categorical
    columns are compared against the neighbours' most frequent value, using an
    ordered ranking when one is defined for the column.

    Parameters
    ----------
    neighbors : pandas.DataFrame
        Neighbour profiles; must contain the columns of ``user`` (extra columns
        such as ``income`` or ``similarity`` are only displayed).
    user : pandas.DataFrame
        The user's profile; only the first row is analysed.
    min_numeric_diff : float, default 2
        A numeric column is reported only when the absolute difference between
        the neighbours' mean and the user's value exceeds this amount.
    min_mode_pct : float, default 30
        For categorical columns without a ranking, the most common value is
        suggested only when more than this percentage of neighbours share it.

    Returns
    -------
    None
        Everything is written to standard output.
    """

    # Ordered rankings for categorical variables (lowest-ranked value first)
    workclass_order = [
        "Never-worked",
        "Without-pay",
        "Local-gov",
        "State-gov",
        "Federal-gov",
        "Self-emp-not-inc",
        "Self-emp-inc",
        "Private"
    ]

    marital_status_order = [
        "Never-married",
        "Separated",
        "Divorced",
        "Widowed",
        "Married-spouse-absent",
        "Married-AF-spouse",
        "Married-civ-spouse"
    ]

    occupation_order = [
        "Priv-house-serv",
        "Handlers-cleaners",
        "Other-service",
        "Farming-fishing",
        "Machine-op-inspct",
        "Transport-moving",
        "Protective-serv",
        "Adm-clerical",
        "Sales",
        "Craft-repair",
        "Tech-support",
        "Exec-managerial",
        "Prof-specialty"
    ]

    # Keys must match the DataFrame column names exactly (dots, not hyphens);
    # otherwise the ranking is skipped and the "most common value" fallback
    # can recommend moving to a lower-ranked category.
    category_orders = {
        "workclass": workclass_order,
        "marital.status": marital_status_order,
        "occupation": occupation_order
    }

    # Numeric columns that are never turned into a recommendation
    ignored_numeric = ["age", "fnlwgt"]

    def is_worse(user_value, best_value, order_list):
        """Returns True if user_value is lower-ranked than best_value."""
        if pd.isna(user_value):
            return True
        if user_value not in order_list or best_value not in order_list:
            return False
        return order_list.index(user_value) < order_list.index(best_value)

    def print_section(title):
        """Print a section title framed by two 80-character rules."""
        print("\n" + "="*80)
        print(title)
        print("="*80)

    # DISPLAY USER PROFILE
    print_section("YOUR PROFILE:")
    print(user.iloc[0].to_string())

    # DISPLAY NEIGHBORS
    print_section("NEIGHBORS WITH INCOME >50K:")
    print(neighbors.to_string())

    # RECOMMENDATIONS
    print_section("RECOMMENDATIONS - WHAT TO CHANGE:")

    user_values = user.iloc[0]

    for col in user.columns:
        if col not in neighbors.columns:
            # Nothing to compare against for this column
            continue

        neighbor_vals = neighbors[col]

        # NUMERIC VARIABLES
        if neighbor_vals.dtype in ["int64", "float64"]:
            avg = neighbor_vals.mean()
            diff = avg - user_values[col]

            if abs(diff) > min_numeric_diff and col not in ignored_numeric:
                print(f"\n• {col.upper()}:")
                print(f"  Your value: {user_values[col]:.1f}")
                print(f"  Neighbors' average: {avg:.1f}")
                print(f"  Difference: {diff:+.1f}")
                print("  → Recommendation:", "INCREASE" if diff > 0 else "DECREASE")
            continue

        # CATEGORICAL VARIABLES
        modes = neighbor_vals.mode()
        if modes.empty:
            # Every neighbour value is missing, so there is nothing to suggest
            continue
        mode = modes.iloc[0]

        if col in category_orders:
            if is_worse(user_values[col], mode, category_orders[col]):
                print(f"\n• {col.upper()}:")
                print(f"  Your value: {user_values[col]}")
                print(f"  Better common value: {mode}")
                print(f"  → Recommendation: CHANGE to {mode}")
        elif user_values[col] != mode:
            # Fallback: suggest the most common value when it is frequent enough
            mode_pct = (neighbor_vals == mode).sum() / len(neighbor_vals) * 100
            if mode_pct > min_mode_pct:
                print(f"\n• {col.upper()}:")
                print(f"  Your value: {user_values[col]}")
                print(f"  Most common value: {mode} ({mode_pct:.1f}% of neighbors)")
                print(f"  → Recommendation: CHANGE to {mode}")


In [166]:
# Print the profile, the neighbour table, and the suggested changes for this user
analyze_neighbors(nn, user)


YOUR PROFILE:
age                        40.0
workclass               Private
fnlwgt                 287983.0
education             Bachelors
education.num              13.0
marital.status    Never-married
occupation         Tech-support
relationship      Not-in-family
capital.gain                0.0
capital.loss             2258.0
hours.per.week             48.0

NEIGHBORS WITH INCOME >50K:
    age workclass    fnlwgt  education  education.num      marital.status      occupation   relationship  capital.gain  capital.loss  hours.per.week  income  similarity
0  38.0   Private  237608.0  Bachelors           13.0       Never-married           Sales  Not-in-family           0.0        2444.0            45.0       1    0.993143
1  34.0   Private  203034.0  Bachelors           13.0           Separated           Sales  Not-in-family           0.0        2824.0            50.0       1    0.981526
2  51.0   Private  216475.0  Bachelors           13.0       Never-married           Sales  Not-in