In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# Read the raw records from adult.csv (path is relative to the notebook's
# working directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='adult.csv')

df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [3]:
# Summarise the frame: per-column dtype and non-null count, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
# Count cells holding the literal '?' in each column, largest count first.
placeholder_mask = df.eq('?')
placeholder_mask.sum().sort_values(ascending=False)

occupation        1843
workclass         1836
native.country     583
fnlwgt               0
education            0
education.num        0
age                  0
marital.status       0
relationship         0
sex                  0
race                 0
capital.gain         0
capital.loss         0
hours.per.week       0
income               0
dtype: int64

In [5]:
# Turn every '?' cell into NaN so pandas treats it as missing.
df = df.replace(to_replace='?', value=np.nan)

# Re-run the '?' count to confirm no placeholder is left behind.
df.eq('?').sum().sort_values(ascending=False)

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [6]:
# Number of rows that repeat an earlier row in every column.
df.duplicated(keep='first').sum()

np.int64(24)

In [7]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates(keep='first')

# Verify: the duplicate count should now be zero.
df.duplicated(keep='first').sum()

np.int64(0)

In [8]:
# Preview the first five rows after placeholder replacement and de-duplication.
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [9]:
# Missing-value count for each column.
df.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     582
income               0
dtype: int64

In [10]:
# Target is the 'income' column; every other column is a predictor.
y = df['income']
X = df.drop('income', axis=1)

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [11]:
# First five rows of the training predictors right after the split.
X_train.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
4278,50,Federal-gov,251585,Bachelors,13,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,55,United-States
22615,25,Private,209286,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States
14364,29,Private,232784,Assoc-acdm,12,Never-married,Exec-managerial,Own-child,White,Male,0,0,40,United-States
29870,19,Private,340567,1st-4th,2,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,55,Mexico
3362,21,Private,20728,HS-grad,9,Never-married,Sales,Own-child,White,Female,4101,0,40,United-States


In [12]:
# Fill gaps in the three columns that contain NaN with each column's most
# frequent value. The fill value is taken from the training split only and
# then applied to both splits, so nothing is learned from the test rows.
columns_with_gaps = ('workclass', 'occupation', 'native.country')
train_modes = {column: X_train[column].mode()[0] for column in columns_with_gaps}

for column, most_frequent in train_modes.items():
    X_train[column] = X_train[column].fillna(most_frequent)
    X_test[column] = X_test[column].fillna(most_frequent)

In [13]:
# Report the remaining missing-value counts for both splits after imputation.
for heading, split in (("X training set\n\n", X_train), ("\nX test set\n\n", X_test)):
    print(heading, split.isnull().sum())

X training set

 age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
dtype: int64

X test set

 age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
dtype: int64


In [14]:
# First five rows of the training predictors after imputation.
X_train.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
4278,50,Federal-gov,251585,Bachelors,13,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,55,United-States
22615,25,Private,209286,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States
14364,29,Private,232784,Assoc-acdm,12,Never-married,Exec-managerial,Own-child,White,Male,0,0,40,United-States
29870,19,Private,340567,1st-4th,2,Never-married,Handlers-cleaners,Not-in-family,White,Male,0,0,55,Mexico
3362,21,Private,20728,HS-grad,9,Never-married,Sales,Own-child,White,Female,4101,0,40,United-States


In [15]:
# Encode the categorical predictors as integer codes learned from the training
# split only.
#
# OrdinalEncoder is scikit-learn's encoder for feature columns (LabelEncoder is
# intended for the target). Unlike LabelEncoder.transform, it can be configured
# not to raise when the test split contains a category that never appeared in
# training. Known categories receive the same sorted-order integer codes that
# LabelEncoder would assign.
categorical_features = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

feature_encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,  # categories unseen during fit map to -1 instead of raising
    dtype=np.int64,
)

encoded_train = np.asarray(feature_encoder.fit_transform(X_train[categorical_features]))
encoded_test = np.asarray(feature_encoder.transform(X_test[categorical_features]))

# Assign column by column so each column is replaced outright with an integer
# column rather than written into the existing object-dtype storage.
for position, feature in enumerate(categorical_features):
    X_train[feature] = encoded_train[:, position]
    X_test[feature] = encoded_test[:, position]

# Dedicated encoder for the target; it stays fitted on the income classes so
# predictions can be mapped back with label_encoder.inverse_transform.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [16]:
# First five rows of the training predictors after categorical encoding.
X_train.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
4278,50,0,251585,9,13,0,3,1,4,1,0,0,55,38
22615,25,3,209286,15,10,2,2,0,4,1,0,0,40,38
14364,29,3,232784,7,12,4,3,3,4,1,0,0,40,38
29870,19,3,340567,3,2,4,5,1,4,1,0,0,55,25
3362,21,3,20728,11,9,4,11,3,4,0,4101,0,40,38


In [None]:
# Classifier settings gathered in one mapping so they are easy to find and tune.
xgb_params = {
    'n_estimators': 300,
    'max_depth': 7,
    'learning_rate': 0.1,
    'random_state': 42,  # fixed seed for repeatable training
}

# Fit on the training split.
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Predict the held-out split and compare against the encoded true labels.
y_pred = model.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy Score:", score)

Accuracy Score: 0.8708051628764597
