In [1]:
import pandas as pd

df = pd.read_csv("../data/adult.csv")
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [2]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [3]:
num_cols = df.select_dtypes(include=['int64','float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

num_cols, cat_cols


(Index(['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss',
        'hours-per-week'],
       dtype='object'),
 Index(['workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'race', 'gender', 'native-country', 'income'],
       dtype='object'))

In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['income_encoded'] = le.fit_transform(df['income'])

df[['income', 'income_encoded']].head()


Unnamed: 0,income,income_encoded
0,<=50K,0
1,<=50K,0
2,>50K,1
3,>50K,1
4,<=50K,0


### Scaling
StandardScaler transforms features to mean = 0 and std = 1.
This helps algorithms like KNN, SVM, Logistic Regression perform better.


In [6]:
# One-Hot Encoding for unordered categorical columns
cat_cols = df.select_dtypes(include=['object']).columns
cat_cols = cat_cols.drop('income')  # we already encoded income

df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)
df_encoded.head()


Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,income_encoded,workclass_Federal-gov,workclass_Local-gov,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,25,226802,7,0,0,40,<=50K,0,False,False,...,False,False,False,False,False,False,False,True,False,False
1,38,89814,9,0,0,50,<=50K,0,False,False,...,False,False,False,False,False,False,False,True,False,False
2,28,336951,12,0,0,40,>50K,1,False,True,...,False,False,False,False,False,False,False,True,False,False
3,44,160323,10,7688,0,40,>50K,1,False,False,...,False,False,False,False,False,False,False,True,False,False
4,18,103497,10,0,0,30,<=50K,0,False,False,...,False,False,False,False,False,False,False,True,False,False


### One-Hot Encoding
One-Hot Encoding is used when categories have no natural order, such as
workclass, education, and occupation.  
Each category becomes a separate binary column (0 or 1).


In [7]:
from sklearn.preprocessing import StandardScaler

# Identify numerical columns after encoding
num_cols = df_encoded.select_dtypes(include=['int64', 'float64']).columns

scaler = StandardScaler()
df_encoded[num_cols] = scaler.fit_transform(df_encoded[num_cols])

df_encoded[num_cols].head()


Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income_encoded
0,-0.995129,0.351675,-1.197259,-0.144804,-0.217127,-0.034087,-0.560845
1,-0.046942,-0.945524,-0.419335,-0.144804,-0.217127,0.77293,-0.560845
2,-0.776316,1.394723,0.74755,-0.144804,-0.217127,-0.034087,1.783024
3,0.390683,-0.277844,-0.030373,0.886874,-0.217127,-0.034087,1.783024
4,-1.505691,-0.815954,-0.030373,-0.144804,-0.217127,-0.841104,-0.560845


### Scaling
StandardScaler transforms numerical features to have mean = 0 and standard deviation = 1.
This helps machine learning algorithms like KNN, SVM, and Logistic Regression
perform better and converge faster.


In [8]:
print("Before scaling:")
print(df[num_cols].describe())

print("\nAfter scaling:")
print(df_encoded[num_cols].describe())


Before scaling:
                age        fnlwgt  educational-num  capital-gain  \
count  48842.000000  4.884200e+04     48842.000000  48842.000000   
mean      38.643585  1.896641e+05        10.078089   1079.067626   
std       13.710510  1.056040e+05         2.570973   7452.019058   
min       17.000000  1.228500e+04         1.000000      0.000000   
25%       28.000000  1.175505e+05         9.000000      0.000000   
50%       37.000000  1.781445e+05        10.000000      0.000000   
75%       48.000000  2.376420e+05        12.000000      0.000000   
max       90.000000  1.490400e+06        16.000000  99999.000000   

       capital-loss  hours-per-week  income_encoded  
count  48842.000000    48842.000000    48842.000000  
mean      87.502314       40.422382        0.239282  
std      403.004552       12.391444        0.426649  
min        0.000000        1.000000        0.000000  
25%        0.000000       40.000000        0.000000  
50%        0.000000       40.000000        0.00

In [9]:
df_encoded.to_csv("../data/processed_adult.csv", index=False)
print("processed_adult.csv saved successfully.")


processed_adult.csv saved successfully.
