# Explore here

In [215]:
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder

import warnings

In [216]:
url = "https://breathecode.herokuapp.com/asset/internal-link?id=2326&path=adult-census-income.csv"
pd.read_csv(url).to_csv('../data/raw/census_income.csv', index=False)

In [217]:
df = pd.read_csv('../data/raw/census_income.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [218]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [219]:
df = df.drop_duplicates(keep=False)

In [220]:
df = df.replace("?", np.nan)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [221]:
# Conteo de NaN por columna
print(df.isnull().sum())


age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     581
income               0
dtype: int64


In [222]:
#  NaN en workclass
nan_workclass = df[df["workclass"].isnull()]

# Ver solo la columna age
print(nan_workclass["age"].unique())


[90 66 51 61 71 68 67 41 72 65 43 63 60 26 19 55 21 31 50 28 42 20 33 23
 22 30 25 18 39 53 76 27 24 69 58 17 75 79 38 52 49 48 80 77 64 59 34 57
 62 35 74 47 54 36 40 29 70 78 32 45 56 46 82 83 37 73 81 44 84 87]


In [223]:
df_clean = df.dropna(subset=["workclass", "occupation", "native.country"]).reset_index(drop=True)

print(df_clean.shape) 


(30117, 15)


In [224]:


categorical_cols = df_clean.select_dtypes(include=["object"]).columns 
# LabelEncoder a cada columna categórica 
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col].astype(str))
    encoders[col] = dict(zip(le.classes_, le.transform(le.classes_)))

# Ver el mapeo de cada columna
for col, mapping in encoders.items():
    print(f"\nColumna: {col}")
    print(mapping)


Columna: workclass
{'Federal-gov': np.int64(0), 'Local-gov': np.int64(1), 'Private': np.int64(2), 'Self-emp-inc': np.int64(3), 'Self-emp-not-inc': np.int64(4), 'State-gov': np.int64(5), 'Without-pay': np.int64(6)}

Columna: education
{'10th': np.int64(0), '11th': np.int64(1), '12th': np.int64(2), '1st-4th': np.int64(3), '5th-6th': np.int64(4), '7th-8th': np.int64(5), '9th': np.int64(6), 'Assoc-acdm': np.int64(7), 'Assoc-voc': np.int64(8), 'Bachelors': np.int64(9), 'Doctorate': np.int64(10), 'HS-grad': np.int64(11), 'Masters': np.int64(12), 'Preschool': np.int64(13), 'Prof-school': np.int64(14), 'Some-college': np.int64(15)}

Columna: marital.status
{'Divorced': np.int64(0), 'Married-AF-spouse': np.int64(1), 'Married-civ-spouse': np.int64(2), 'Married-spouse-absent': np.int64(3), 'Never-married': np.int64(4), 'Separated': np.int64(5), 'Widowed': np.int64(6)}

Columna: occupation
{'Adm-clerical': np.int64(0), 'Armed-Forces': np.int64(1), 'Craft-repair': np.int64(2), 'Exec-managerial': n

In [225]:
# escalador
scaler = StandardScaler()

df_encoded = scaler.fit_transform(df_clean)

df_scaled = pd.DataFrame(df_encoded, 
                         columns=df_clean.columns, 
                         index=df_clean.index)

print(df_scaled.head())

        age  workclass    fnlwgt  education  education.num  marital.status  \
0  3.317717  -0.209003 -0.538766   0.175212      -0.440959        2.283298   
1  1.184867  -0.209003 -0.467888  -1.398794      -2.403417       -1.721716   
2  0.194615  -0.209003  0.708556   1.224549      -0.048467        1.615796   
3 -0.338597  -0.209003  0.256174   0.175212      -0.440959       -1.721716   
4 -0.033905  -0.209003 -0.370955  -2.710466      -1.618434        1.615796   

   occupation  relationship     race       sex  capital.gain  capital.loss  \
0   -0.734621     -0.261341  0.38516 -1.443668     -0.147557     10.547987   
1    0.009713      1.611470  0.38516 -1.443668     -0.147557      9.420889   
2    0.754046      0.987200  0.38516 -1.443668     -0.147557      9.420889   
3    0.257824      1.611470  0.38516 -1.443668     -0.147557      9.099568   
4   -1.478954      1.611470  0.38516  0.692680     -0.147557      9.099568   

   hours.per.week  native.country    income  
0       -1.91516

In [226]:
#train test
X = df_scaled.drop("income", axis=1)   # todas las columnas menos la target
y = df_scaled["income"]  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Tamaño train:", X_train.shape)
print("Tamaño test:", X_test.shape)


Tamaño train: (24093, 14)
Tamaño test: (6024, 14)
