In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
sns.set(style="whitegrid")
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report
import pickle

In [34]:
# Load the income dataset from a CSV file located next to the notebook
# (relative path).
# NOTE(review): the column headers and string values printed further down
# carry a leading space (e.g. ' workclass', ' State-gov'); passing
# skipinitialspace=True here would strip them at load time — confirm before
# changing, because the class labels the model is trained on would change too.
df=pd.read_csv('IncomeData.csv')

In [35]:
df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [36]:
df.shape

(32561, 14)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1    workclass       30725 non-null  object
 2    education       32561 non-null  object
 3    education-num   32561 non-null  int64 
 4    marital-status  32561 non-null  object
 5    occupation      30718 non-null  object
 6    relationship    32561 non-null  object
 7    race            32561 non-null  object
 8    sex             32561 non-null  object
 9    capital-gain    32561 non-null  int64 
 10   capital-loss    32561 non-null  int64 
 11   hours-per-week  32561 non-null  int64 
 12   native-country  31978 non-null  object
 13   income          32561 non-null  object
dtypes: int64(5), object(9)
memory usage: 3.5+ MB


In [38]:
df.columns

Index(['age', ' workclass', ' education', ' education-num', ' marital-status',
       ' occupation', ' relationship', ' race', ' sex', ' capital-gain',
       ' capital-loss', ' hours-per-week', ' native-country', ' income'],
      dtype='object')

## Removing spaces from column names and replacing hyphens with underscores

In [39]:
# Normalise the headers in one pass: remove every space, then turn hyphens
# into underscores so each column name is a plain identifier.
df.columns = df.columns.str.replace(' ', '').str.replace('-', '_')

In [40]:
df.columns

Index(['age', 'workclass', 'education', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'sex', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country', 'income'],
      dtype='object')

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   education       32561 non-null  object
 3   education_num   32561 non-null  int64 
 4   marital_status  32561 non-null  object
 5   occupation      30718 non-null  object
 6   relationship    32561 non-null  object
 7   race            32561 non-null  object
 8   sex             32561 non-null  object
 9   capital_gain    32561 non-null  int64 
 10  capital_loss    32561 non-null  int64 
 11  hours_per_week  32561 non-null  int64 
 12  native_country  31978 non-null  object
 13  income          32561 non-null  object
dtypes: int64(5), object(9)
memory usage: 3.5+ MB


In [42]:
df.shape

(32561, 14)

## Unique Values in each column

In [43]:
# Print, for every column, its name, a separator line, the distinct values it
# holds, and a spacer line before the next column.
for column_name in df.columns:
    print(
        column_name,
        '*****************',
        df[column_name].unique(),
        '                  ',
        sep='\n',
    )

age
*****************
[39 50 38 53 28 37 49 52 31 42 30 23 32 40 34 25 43 54 35 59 56 19 20 45
 22 48 21 24 57 44 41 29 18 47 46 36 79 27 67 33 76 17 55 61 70 64 71 68
 66 51 58 26 60 90 75 65 77 62 63 80 72 74 69 73 81 78 88 82 83 84 85 86
 87]
                  
workclass
*****************
[' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 nan ' Self-emp-inc' ' Without-pay' ' Never-worked']
                  
education
*****************
[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th']
                  
education_num
*****************
[13  9  7 14  5 10 12 11  4 16 15  3  6  2  1  8]
                  
marital_status
*****************
[' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed']
                  
occupation
*****************
[' Adm-clerical' ' Exe

## Numerical Features

### capital_gain

In [44]:
# Number of rows whose capital_gain is exactly zero.
int(df['capital_gain'].eq(0).sum())

29849

In [45]:
# Number of rows whose capital_gain is anything other than zero.
int(df['capital_gain'].ne(0).sum())

2712

### capital_loss

In [46]:
# Number of rows whose capital_loss is exactly zero.
int(df['capital_loss'].eq(0).sum())

31042

In [47]:
# Number of rows whose capital_loss is anything other than zero.
int(df['capital_loss'].ne(0).sum())

1519

#### capital_gain and capital_loss are zero for the vast majority of rows, so their distributions are heavily skewed
#### education and education_num provide the same information, so only one of them is needed
##### hence we drop capital_gain, capital_loss and education

In [48]:
# Drop the two mostly-zero capital columns and the textual education column
# (education_num is kept in its place).
# Rebinding `df` instead of using inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError on the already-removed columns.
df = df.drop(columns=['capital_gain', 'capital_loss', 'education'], errors='ignore')

In [49]:
df.head()

Unnamed: 0,age,workclass,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,native_country,income
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   education_num   32561 non-null  int64 
 3   marital_status  32561 non-null  object
 4   occupation      30718 non-null  object
 5   relationship    32561 non-null  object
 6   race            32561 non-null  object
 7   sex             32561 non-null  object
 8   hours_per_week  32561 non-null  int64 
 9   native_country  31978 non-null  object
 10  income          32561 non-null  object
dtypes: int64(3), object(8)
memory usage: 2.7+ MB


## Categorical Features

#### imputing MODE values in empty categorical features

In [51]:
df.isnull().sum()

age                  0
workclass         1836
education_num        0
marital_status       0
occupation        1843
relationship         0
race                 0
sex                  0
hours_per_week       0
native_country     583
income               0
dtype: int64

In [52]:
# Fill the gaps in each categorical column with that column's most frequent
# value. The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df[col] selection: that chained in-place form
# acts on an intermediate object, is deprecated by pandas, and no longer
# updates `df` once copy-on-write is the default behaviour.
for column in ['workclass', 'occupation', 'native_country']:
    df[column] = df[column].fillna(df[column].mode()[0])

In [53]:
df.isnull().sum()

age               0
workclass         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
hours_per_week    0
native_country    0
income            0
dtype: int64

### function to change categorical to numerical

In [54]:
def feature_engineering(df, mappings=None):
    """Replace the categorical text columns with integer codes.

    Each of the columns workclass, marital_status, occupation, relationship,
    race, sex and native_country is encoded by numbering its distinct values
    in order of first appearance (0, 1, 2, ...). All other columns are
    returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the seven categorical columns listed above.
    mappings : dict, optional
        Dictionary of ``{column_name: {label: code}}``. For a column already
        present in it, the stored codes are reused instead of being derived
        from ``df``; for a column that is absent, the codes derived from
        ``df`` are stored into it. Pass an empty dict when encoding the
        training data and the same dict again when encoding new data so both
        use identical codes. Labels missing from a reused mapping become NaN.

    Returns
    -------
    pandas.DataFrame
        An encoded copy of ``df``; the frame passed in is not modified.

    Raises
    ------
    KeyError
        If one of the seven categorical columns is missing from ``df``.
    """
    categorical_columns = (
        'workclass',
        'marital_status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'native_country',
    )

    if mappings is None:
        mappings = {}

    # Work on a copy so the caller's frame (and anything that still refers to
    # it in an interactive session) keeps its original text values.
    encoded = df.copy()

    for column in categorical_columns:
        if column not in mappings:
            # First-appearance order, exactly as Series.unique() returns it.
            mappings[column] = {
                label: code
                for code, label in enumerate(encoded[column].unique())
            }
        encoded[column] = encoded[column].map(mappings[column])

    return encoded

In [55]:
# Encode the categorical text columns as integer codes; the numeric columns
# and the income target are left as they are.
df=feature_engineering(df)

In [56]:
df.head()

Unnamed: 0,age,workclass,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,native_country,income
0,39,0,13,0,0,0,0,0,40,0,<=50K
1,50,1,13,1,1,1,0,0,13,0,<=50K
2,38,2,9,2,2,0,0,0,40,0,<=50K
3,53,2,7,1,2,1,1,0,40,0,<=50K
4,28,2,13,1,3,2,1,1,40,1,<=50K


In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  int64 
 2   education_num   32561 non-null  int64 
 3   marital_status  32561 non-null  int64 
 4   occupation      32561 non-null  int64 
 5   relationship    32561 non-null  int64 
 6   race            32561 non-null  int64 
 7   sex             32561 non-null  int64 
 8   hours_per_week  32561 non-null  int64 
 9   native_country  32561 non-null  int64 
 10  income          32561 non-null  object
dtypes: int64(10), object(1)
memory usage: 2.7+ MB


## Independent and Dependent Feature

In [58]:
# Feature matrix: every column except the last one. This relies on the
# target (income) being the final column of df.
X=df.iloc[:,:-1]

In [59]:
X.head()

Unnamed: 0,age,workclass,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,native_country
0,39,0,13,0,0,0,0,0,40,0
1,50,1,13,1,1,1,0,0,13,0
2,38,2,9,2,2,0,0,0,40,0
3,53,2,7,1,2,1,1,0,40,0
4,28,2,13,1,3,2,1,1,40,1


In [60]:
# Target vector: the raw income labels, kept as strings.
y=df['income']

In [61]:
y.value_counts()

 <=50K    24720
 >50K      7841
Name: income, dtype: int64

## Splitting Data

In [62]:
# Hold out 30% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible.
# NOTE(review): the split is not stratified although the two income classes
# are imbalanced (24720 vs 7841 rows) — consider stratify=y.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [63]:
X_train.shape,X_test.shape

((22792, 10), (9769, 10))

## Fitting Model

In [64]:
from xgboost import XGBClassifier

# Classifier created with the library defaults; no hyper-parameters are
# overridden here.
xgb=XGBClassifier()

In [65]:
# Train the classifier on the training split.
# NOTE(review): y_train holds the raw string labels (' <=50K' / ' >50K').
# Newer xgboost releases are understood to require integer-encoded classes
# (0..n-1) and to reject string labels — confirm against the installed
# version before upgrading.
xgb.fit(X_train,y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [66]:
# Predicted income class for every row of the held-out test split.
y_pred=xgb.predict(X_test)

In [67]:
accuracy_score(y_test,y_pred)

0.8378544375063978

In [68]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       <=50K       0.88      0.92      0.90      7455
        >50K       0.68      0.59      0.63      2314

    accuracy                           0.84      9769
   macro avg       0.78      0.75      0.76      9769
weighted avg       0.83      0.84      0.83      9769



In [69]:
# Pair the true test labels with the model's predictions so they can be shown
# side by side in one table. The second label was misspelled 'Predcited' and
# appeared that way as the column header of the displayed frame.
result = {
    'Actual': y_test,
    'Predicted': y_pred,
}

In [70]:
pd.DataFrame(result)

Unnamed: 0,Actual,Predcited
14160,<=50K,<=50K
27048,<=50K,>50K
28868,>50K,>50K
5667,<=50K,<=50K
7827,<=50K,<=50K
...,...,...
32476,<=50K,>50K
21100,<=50K,<=50K
27131,<=50K,<=50K
25526,>50K,>50K


###### As we can see, most of the predictions are good (overall accuracy ≈ 0.84), although recall for the >50K class is only 0.59.

## Saving our Model

In [71]:
# Serialise the fitted classifier to xgbModel.pkl in the working directory.
# NOTE(review): only the estimator is written; the label-to-code dictionaries
# built inside feature_engineering are not saved, so whoever loads this file
# must reproduce the same integer encoding for new rows.
with open('xgbModel.pkl', mode='wb') as model_file:
    pickle.dump(xgb, model_file)