# Adult Salary Prediction

## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier

## Importing the dataset

In [2]:
df = pd.read_csv("adult_data.csv")
df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,martial-status,occupation,relationship,race,sex,capital-gain,capital-loss,hpw,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


### Checking the Info

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   martial-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hpw             32561 non-null  int64 
 13  country         32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


### Checking null values

In [4]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
martial-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hpw               0
country           0
salary            0
dtype: int64

### Removing the duplicate values 

In [5]:
df.drop_duplicates(inplace=True)
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,martial-status,occupation,relationship,race,sex,capital-gain,capital-loss,hpw,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [6]:
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hpw
count,32537.0,32537.0,32537.0,32537.0,32537.0,32537.0
mean,38.585549,189780.8,10.081815,1078.443741,87.368227,40.440329
std,13.637984,105556.5,2.571633,7387.957424,403.101833,12.346889
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,236993.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


## Handling the missing values 

In [7]:
df.replace(' ?', np.nan, inplace=True)
imputer = SimpleImputer(strategy='most_frequent')
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

### Droping the similar information columns

In [8]:
df.drop(['education','workclass'],axis=1,inplace=True)
df

Unnamed: 0,age,fnlwgt,education-num,martial-status,occupation,relationship,race,sex,capital-gain,capital-loss,hpw,country,salary
0,39,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32532,27,257302,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32533,40,154374,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32534,58,151910,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32535,22,201490,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## Separating the features and target

In [9]:
X = df.drop('salary', axis=1).values
y = df['salary'].values

## Splitting the dataset

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Encoding the categorical data

In [11]:
ct = ColumnTransformer(transformers=[
    ('encoder', OneHotEncoder(), [3, 4, 5, 6, 7, 11]),
], remainder='passthrough')
le = LabelEncoder()

In [12]:
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [13]:
print(X_train)

  (0, 5)	1.0
  (0, 7)	1.0
  (0, 22)	1.0
  (0, 31)	1.0
  (0, 32)	1.0
  (0, 72)	1.0
  (0, 75)	36.0
  (0, 76)	127573.0
  (0, 77)	9.0
  (0, 80)	38.0
  (1, 0)	1.0
  (1, 9)	1.0
  (1, 22)	1.0
  (1, 31)	1.0
  (1, 33)	1.0
  (1, 72)	1.0
  (1, 75)	52.0
  (1, 76)	114228.0
  (1, 77)	9.0
  (1, 78)	3325.0
  (1, 80)	40.0
  (2, 2)	1.0
  (2, 7)	1.0
  (2, 21)	1.0
  (2, 31)	1.0
  :	:
  (26026, 72)	1.0
  (26026, 75)	18.0
  (26026, 76)	216284.0
  (26026, 77)	7.0
  (26026, 80)	20.0
  (26027, 2)	1.0
  (26027, 9)	1.0
  (26027, 21)	1.0
  (26027, 31)	1.0
  (26027, 33)	1.0
  (26027, 72)	1.0
  (26027, 75)	34.0
  (26027, 76)	242960.0
  (26027, 77)	11.0
  (26027, 80)	50.0
  (26028, 4)	1.0
  (26028, 12)	1.0
  (26028, 24)	1.0
  (26028, 31)	1.0
  (26028, 33)	1.0
  (26028, 72)	1.0
  (26028, 75)	23.0
  (26028, 76)	329925.0
  (26028, 77)	10.0
  (26028, 80)	30.0


In [14]:
print(y_train)

[0 0 1 ... 0 0 0]


In [15]:
print(y_test)

[0 0 0 ... 0 0 1]


In [16]:
print(X_test)

  (0, 2)	1.0
  (0, 9)	1.0
  (0, 21)	1.0
  (0, 31)	1.0
  (0, 33)	1.0
  (0, 72)	1.0
  (0, 75)	31.0
  (0, 76)	33308.0
  (0, 77)	11.0
  (0, 80)	40.0
  (1, 4)	1.0
  (1, 10)	1.0
  (1, 22)	1.0
  (1, 31)	1.0
  (1, 33)	1.0
  (1, 72)	1.0
  (1, 75)	24.0
  (1, 76)	130534.0
  (1, 77)	13.0
  (1, 80)	40.0
  (2, 2)	1.0
  (2, 14)	1.0
  (2, 21)	1.0
  (2, 28)	1.0
  (2, 33)	1.0
  :	:
  (6505, 75)	43.0
  (6505, 76)	183479.0
  (6505, 77)	9.0
  (6505, 80)	30.0
  (6506, 2)	1.0
  (6506, 17)	1.0
  (6506, 21)	1.0
  (6506, 31)	1.0
  (6506, 33)	1.0
  (6506, 72)	1.0
  (6506, 75)	32.0
  (6506, 76)	112650.0
  (6506, 77)	9.0
  (6506, 80)	40.0
  (6507, 2)	1.0
  (6507, 19)	1.0
  (6507, 21)	1.0
  (6507, 31)	1.0
  (6507, 33)	1.0
  (6507, 72)	1.0
  (6507, 75)	46.0
  (6507, 76)	33794.0
  (6507, 77)	13.0
  (6507, 78)	3103.0
  (6507, 80)	40.0



## Function to train and evaluate a classifier

In [17]:
# Function to train and evaluate the model
def train_and_evaluate(model, x_train, y_train, x_test, y_test):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    return model, cm, accuracy

## Different Predictions

In [18]:
# Train and evaluate different classification models
models = [
    LogisticRegression(random_state=0),
    KNeighborsClassifier(n_neighbors=5),
    # SVC(kernel='linear', random_state=0),
    SVC(kernel='rbf', random_state=0),
    DecisionTreeClassifier(criterion='entropy', max_depth=10),
    RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0),
    # GaussianNB(),
    xgb.XGBClassifier()
]

## Predicting the values 

In [19]:
for model in models:
    model_name = model.__class__.__name__
    trained_model, cm, accuracy = train_and_evaluate(model, X_train, y_train, X_test, y_test)
    print(f"{model_name}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Accuracy: {accuracy:.2f}\n")

LogisticRegression
Confusion Matrix:
[[4730  175]
 [1154  449]]
Accuracy: 0.80

KNeighborsClassifier
Confusion Matrix:
[[4496  409]
 [1070  533]]
Accuracy: 0.77

SVC
Confusion Matrix:
[[4900    5]
 [1324  279]]
Accuracy: 0.80

DecisionTreeClassifier
Confusion Matrix:
[[4685  220]
 [ 709  894]]
Accuracy: 0.86

RandomForestClassifier
Confusion Matrix:
[[4559  346]
 [ 634  969]]
Accuracy: 0.85

XGBClassifier
Confusion Matrix:
[[4602  303]
 [ 547 1056]]
Accuracy: 0.87

