In [15]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler


In [16]:
# Load dataset
df = pd.read_csv("../Datasets/adult.csv")

# Display first 5 rows
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [17]:
# Shape of dataset
df.shape


(32561, 15)

In [18]:
# Data types and non-null counts
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [19]:
# Statistical summary
df.describe()


Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [20]:
# Replace '?' with NaN
df.replace('?', np.nan, inplace=True)

# Check missing values
df.isnull().sum()


age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

In [21]:
# Drop rows with missing values
df.dropna(inplace=True)

# Confirm removal
df.isnull().sum()


age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [22]:
# Separate features by type
categorical_features = df.select_dtypes(include='object').columns.tolist()
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

categorical_features, numerical_features


(['workclass',
  'education',
  'marital.status',
  'occupation',
  'relationship',
  'race',
  'sex',
  'native.country',
  'income'],
 ['age',
  'fnlwgt',
  'education.num',
  'capital.gain',
  'capital.loss',
  'hours.per.week'])

In [23]:
# Encode target variable
le_income = LabelEncoder()
df['income'] = le_income.fit_transform(df['income'])

df['income'].value_counts()


income
0    22654
1     7508
Name: count, dtype: int64

In [24]:
# Nominal categorical features
nominal_features = [
    'workclass',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native.country'
]

# Apply One-Hot Encoding
df = pd.get_dummies(df, columns=nominal_features, drop_first=True)


In [25]:
df[numerical_features].describe()


Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0
mean,38.437902,189793.8,10.121312,1092.007858,88.372489,40.931238
std,13.134665,105653.0,2.549995,7406.346497,404.29837,11.979984
min,17.0,13769.0,1.0,0.0,0.0,1.0
25%,28.0,117627.2,9.0,0.0,0.0,40.0
50%,37.0,178425.0,10.0,0.0,0.0,40.0
75%,47.0,237628.5,13.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [26]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])


In [27]:
df[numerical_features].describe()


Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,30162.0,30162.0,30162.0,30162.0,30162.0,30162.0
mean,1.545375e-16,1.696143e-17,-3.128442e-16,1.507683e-17,-6.030732e-17,-2.789214e-16
std,1.000017,1.000017,1.000017,1.000017,1.000017,1.000017
min,-1.632189,-1.666094,-3.577051,-0.1474446,-0.218586,-3.333218
25%,-0.7946967,-0.6830644,-0.4397382,-0.1474446,-0.218586,-0.07773411
50%,-0.1094756,-0.1076072,-0.04757405,-0.1474446,-0.218586,-0.07773411
75%,0.6518811,0.4527602,1.128918,-0.1474446,-0.218586,0.3396356
max,3.925715,12.25647,2.305411,13.35458,10.55581,4.847229


In [28]:
# Save final processed dataset
df.to_csv("adult_processed.csv", index=False)

print("Processed dataset saved successfully.")


Processed dataset saved successfully.


In [29]:
df.head()


Unnamed: 0,age,fnlwgt,education,education.num,capital.gain,capital.loss,hours.per.week,income,workclass_Local-gov,workclass_Private,...,native.country_Portugal,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia
1,3.31663,-0.53879,HS-grad,-0.439738,-0.147445,10.555814,-1.914161,0,False,True,...,False,False,False,False,False,False,False,True,False,False
3,1.184831,-0.467906,7th-8th,-2.400559,-0.147445,9.427915,-0.077734,0,False,True,...,False,False,False,False,False,False,False,True,False,False
4,0.195067,0.708645,Some-college,-0.047574,-0.147445,9.427915,-0.077734,0,False,True,...,False,False,False,False,False,False,False,True,False,False
5,-0.337883,0.256222,HS-grad,-0.439738,-0.147445,9.106365,0.339636,0,False,True,...,False,False,False,False,False,False,False,True,False,False
6,-0.03334,-0.370964,10th,-1.616231,-0.147445,9.106365,-0.077734,0,False,True,...,False,False,False,False,False,False,False,True,False,False


In [30]:
df.shape


(30162, 83)