In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import make_pipeline
from feature_engine.encoding import OrdinalEncoder
from xgboost import XGBClassifier



df = pd.read_csv("train_.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income_>50K
0,67,Private,366425,Doctorate,16,Divorced,Exec-managerial,Not-in-family,White,Male,99999,0,60,United-States,1
1,17,Private,244602,12th,8,Never-married,Other-service,Own-child,White,Male,0,0,15,United-States,0
2,31,Private,174201,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
3,58,State-gov,110199,7th-8th,4,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,0
4,25,State-gov,149248,Some-college,10,Never-married,Other-service,Not-in-family,Black,Male,0,0,40,United-States,0


In [2]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income_>50K'],
      dtype='object')

In [3]:
df.shape

(43957, 15)

In [4]:
# cheaking for null values
df.isna().sum()

age                   0
workclass          2498
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2506
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      763
income_>50K           0
dtype: int64

In [5]:
df['workclass'].head()

0      Private
1      Private
2      Private
3    State-gov
4    State-gov
Name: workclass, dtype: object

In [6]:
df['workclass'].value_counts()

workclass
Private             30587
Self-emp-not-inc     3464
Local-gov            2822
State-gov            1756
Self-emp-inc         1518
Federal-gov          1284
Without-pay            20
Never-worked            8
Name: count, dtype: int64

In [7]:
df['occupation']

0          Exec-managerial
1            Other-service
2          Exec-managerial
3         Transport-moving
4            Other-service
               ...        
43952      Exec-managerial
43953        Other-service
43954                Sales
43955                Sales
43956    Handlers-cleaners
Name: occupation, Length: 43957, dtype: object

In [8]:
df['occupation'].value_counts()

occupation
Craft-repair         5519
Prof-specialty       5518
Exec-managerial      5506
Adm-clerical         5004
Sales                4965
Other-service        4448
Machine-op-inspct    2711
Transport-moving     2121
Handlers-cleaners    1878
Farming-fishing      1348
Tech-support         1321
Protective-serv       874
Priv-house-serv       225
Armed-Forces           13
Name: count, dtype: int64

In [9]:
df['native-country']

0        United-States
1        United-States
2        United-States
3        United-States
4        United-States
             ...      
43952    United-States
43953    United-States
43954    United-States
43955    United-States
43956    United-States
Name: native-country, Length: 43957, dtype: object

In [10]:
df['native-country'].value_counts()

native-country
United-States                 39429
Mexico                          880
Philippines                     273
Germany                         188
Puerto-Rico                     167
Canada                          158
El-Salvador                     145
India                           134
Cuba                            124
China                           113
England                         109
South                           105
Dominican-Republic               97
Jamaica                          97
Italy                            94
Japan                            83
Guatemala                        79
Vietnam                          77
Columbia                         75
Poland                           72
Haiti                            71
Portugal                         59
Taiwan                           58
Iran                             52
Nicaragua                        46
Greece                           44
Ecuador                          42
Peru         

In [11]:
"""
after cheaking null values, i realised i can not change it with mean, 
median or mode because those are categorical columns so i have decided to drop null values
"""

df.dropna(inplace = True)
df.isna().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income_>50K        0
dtype: int64

In [12]:
df.shape
# around 3k value has been deleted which whould not metter as those are null values

(40727, 15)

In [13]:
# categorical columns 
cat_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship','race', 'gender', 'native-country']
num_cols = [col for col in df.columns if col not in cat_cols]
num_cols.remove("income_>50K")
print(num_cols)

['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']


In [14]:
df["income_>50K"].value_counts()

income_>50K
0    30635
1    10092
Name: count, dtype: int64

In [15]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income_>50K'],
      dtype='object')

In [16]:
X = df.drop(columns = ["income_>50K"])
y = df["income_>50K"]

print(X.columns)
print(y.name)

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')
income_>50K


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1234)

# scalling the numerical columns 
X_train_num = X_train[num_cols]
X_test_num = X_test[num_cols]
sc = StandardScaler()
X_train_num_sc = sc.fit_transform(X_train_num)
X_test_num_sc = sc.transform(X_test_num)

# converting scalled np array to df
X_train_sc_df = pd.DataFrame(X_train_num_sc, columns = num_cols, index=X_train.index)
X_test_sc_df = pd.DataFrame(X_test_num_sc, columns = num_cols, index=X_test.index)


# adding scalled df to other categorical 
X_train_final = pd.concat([X_train_sc_df, X_train[cat_cols]], axis = 1)
X_test_final = pd.concat([X_test_sc_df, X_test[cat_cols]], axis = 1)


# making a pipline in which i am encoding the categorical variables and also making the model with the hyperperameters
model =  make_pipeline(OrdinalEncoder(encoding_method='arbitrary', missing_values='ignore', variables=cat_cols),
                    XGBClassifier(n_estimators=100, 
                                  learning_rate=0.1, 
                                  max_depth=10,
                                  random_state=42))

model.fit(X_train_final, y_train)

In [19]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix

predictions = model.predict_proba(X_test_final)
pre = predictions[:,1] >= .5
X_test['predicted'] = pre


print("Acc:",accuracy_score(y_test, pre))
print("Precision:",precision_score(y_test, pre))
print("Recall:",recall_score(y_test, pre))
print("F1: ",f1_score(y_test, pre))

Acc: 0.8686471888043211
Precision: 0.7697841726618705
Recall: 0.6517766497461929
F1:  0.7058823529411765
