In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the Adult Income CSV from the Kaggle input directory and preview the first rows.
adult_csv_path = '/kaggle/input/adult-income-dataset/adult.csv'
df = pd.read_csv(adult_csv_path)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
# Print a summary of the frame: index range, column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [4]:
# Keep the original column index around; it is reused further down the notebook.
cols = df.columns

# Map every column name to the array of distinct values found in that column.
unique_values = {col: df[col].unique() for col in cols}
print(unique_values)

# workclass, occupation and native-country each contain '?' as a value.
# Print, per column, the fraction of rows that hold this placeholder.
for placeholder_col in ('workclass', 'occupation', 'native-country'):
    print(np.sum(df[placeholder_col] == '?') / len(df))

{'age': array([25, 38, 28, 44, 18, 34, 29, 63, 24, 55, 65, 36, 26, 58, 48, 43, 20,
       37, 40, 72, 45, 22, 23, 54, 32, 46, 56, 17, 39, 52, 21, 42, 33, 30,
       47, 41, 19, 69, 50, 31, 59, 49, 51, 27, 57, 61, 64, 79, 73, 53, 77,
       80, 62, 35, 68, 66, 75, 60, 67, 71, 70, 90, 81, 74, 78, 82, 83, 85,
       76, 84, 89, 88, 87, 86]), 'workclass': array(['Private', 'Local-gov', '?', 'Self-emp-not-inc', 'Federal-gov',
       'State-gov', 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object), 'fnlwgt': array([226802,  89814, 336951, ..., 129912, 255835, 257302]), 'education': array(['11th', 'HS-grad', 'Assoc-acdm', 'Some-college', '10th',
       'Prof-school', '7th-8th', 'Bachelors', 'Masters', 'Doctorate',
       '5th-6th', 'Assoc-voc', '9th', '12th', '1st-4th', 'Preschool'],
      dtype=object), 'educational-num': array([ 7,  9, 12, 10,  6, 15,  4, 13, 14, 16,  3, 11,  5,  8,  2,  1]), 'marital-status': array(['Never-married', 'Married-civ-spouse', 'Widowed', 'Divorce

In [5]:
# Columns in which '?' stands in for a missing entry.
placeholder_cols = ['workclass', 'occupation', 'native-country']

# Swap the '?' placeholder for a real NaN so pandas treats it as missing.
for name in placeholder_cols:
    df[name] = df[name].replace('?', np.nan)

# Sanity check: the fraction of rows still equal to '?' in each column.
for name in placeholder_cols:
    print(np.sum(df[name] == '?') / len(df))


0.0
0.0
0.0


In [6]:
# Drop every row that has a missing value in any column; inplace=True mutates df itself.
df.dropna(inplace=True)
# Re-inspect the row count and non-null counts now that those rows are gone.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45222 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              45222 non-null  int64 
 1   workclass        45222 non-null  object
 2   fnlwgt           45222 non-null  int64 
 3   education        45222 non-null  object
 4   educational-num  45222 non-null  int64 
 5   marital-status   45222 non-null  object
 6   occupation       45222 non-null  object
 7   relationship     45222 non-null  object
 8   race             45222 non-null  object
 9   gender           45222 non-null  object
 10  capital-gain     45222 non-null  int64 
 11  capital-loss     45222 non-null  int64 
 12  hours-per-week   45222 non-null  int64 
 13  native-country   45222 non-null  object
 14  income           45222 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.5+ MB


## Prepare the label for KNN model

In [7]:
# Encode the target as an integer label: 0 for '<=50K', 1 for every other value.
# NOTE: this cell overwrites the string column, so running it a second time
# would compare integers against '<=50K' and set every label to 1.
is_low_income = df['income'] == '<=50K'
# np.where(..., 0, 1) yields the platform default integer, which is not int64
# everywhere; pin the dtype so the later dtype-based column selection sees the
# same type as the other integer columns.
df['income'] = np.where(is_low_income, 0, 1).astype('int64')

## Scale data

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Pick the numeric feature columns from the frame as it is right now and leave
# the target out by name. This avoids relying on a column index captured in an
# earlier cell and on the target having exactly the int64 dtype (list.remove
# raises ValueError when 'income' is not in the list).
numeric_cols = [
    col
    for col in df.select_dtypes(include='number').columns
    if col != 'income'
]

# NOTE(review): the scaler is fitted on every row, before the train/test split
# done later in the notebook, so test-set statistics leak into the scaling.
# Consider fitting on the training split only and transforming the test split.
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,-1.024983,Private,0.350889,11th,-1.221559,Never-married,Machine-op-inspct,Own-child,Black,Male,-0.146733,-0.21878,-0.07812,United-States,0
1,-0.041455,Private,-0.945878,HS-grad,-0.438122,Married-civ-spouse,Farming-fishing,Husband,White,Male,-0.146733,-0.21878,0.754701,United-States,0
2,-0.798015,Local-gov,1.393592,Assoc-acdm,0.737034,Married-civ-spouse,Protective-serv,Husband,White,Male,-0.146733,-0.21878,-0.07812,United-States,1
3,0.412481,Private,-0.27842,Some-college,-0.046403,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,0.877467,-0.21878,-0.07812,United-States,1
5,-0.344079,Private,0.084802,10th,-1.613277,Never-married,Other-service,Not-in-family,White,Male,-0.146733,-0.21878,-0.910942,United-States,0


In [9]:
# One-hot encode the categorical columns as float indicators; drop_first=True
# omits the first level of each category so the indicators are not redundant.
df = pd.get_dummies(
    data=df,
    dtype=float,
    drop_first=True,
)
df

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,-1.024983,0.350889,-1.221559,-0.146733,-0.21878,-0.078120,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-0.041455,-0.945878,-0.438122,-0.146733,-0.21878,0.754701,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.798015,1.393592,0.737034,-0.146733,-0.21878,-0.078120,1,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.412481,-0.278420,-0.046403,0.877467,-0.21878,-0.078120,1,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,-0.344079,0.084802,-1.613277,-0.146733,-0.21878,-0.910942,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,-0.873671,0.639611,0.737034,-0.146733,-0.21878,-0.244684,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48838,0.109857,-0.334735,-0.438122,-0.146733,-0.21878,-0.078120,1,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48839,1.471665,-0.358060,-0.438122,-0.146733,-0.21878,-0.078120,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48840,-1.251951,0.111279,-0.438122,-0.146733,-0.21878,-1.743763,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Train data

In [10]:
from sklearn.model_selection import train_test_split

# Separate the binary income target from the feature matrix.
y = df['income']
X = df.drop(columns='income')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier configured with 3 neighbours.
knn_model = KNeighborsClassifier(n_neighbors=3)
# Fit on the training split; fit() returns the estimator, which the cell displays.
knn_model.fit(X=X_train, y=y_train)

In [12]:
from sklearn.metrics import classification_report

# Predict income labels for the held-out rows.
y_pred = knn_model.predict(X_test)
# classification_report returns a multi-line string; print it so the table
# renders with real line breaks instead of as a quoted repr full of '\n'.
print(classification_report(y_test, y_pred))

'              precision    recall  f1-score   support\n\n           0       0.87      0.89      0.88      6842\n           1       0.64      0.60      0.62      2203\n\n    accuracy                           0.82      9045\n   macro avg       0.76      0.75      0.75      9045\nweighted avg       0.82      0.82      0.82      9045\n'