In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline  import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.tree import 	DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [30]:
# Load the dataset from adult.csv (expected in the working directory).
df = pd.read_csv("adult.csv")

# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [31]:
# Print column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [32]:
# Count missing (NaN) entries per column.
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [33]:
# List the distinct labels of the target column.
df["income"].unique()

array(['<=50K', '>50K'], dtype=object)

In [34]:
# Count, per column, how many cells hold the literal "?" string.
df.eq("?").sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

In [35]:
# Turn every "?" cell into NaN so it is treated as a missing value.
df = df.replace(to_replace="?", value=np.nan)

In [36]:
# Number of distinct values in each column.
df.nunique()

age                  73
workclass             8
fnlwgt            21648
education            16
education.num        16
marital.status        7
occupation           14
relationship          6
race                  5
sex                   2
capital.gain        119
capital.loss         92
hours.per.week       94
native.country       41
income                2
dtype: int64

In [37]:
# Count missing (NaN) entries per column.
df.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

In [38]:
# Separate the target column from the feature columns.
y = df["income"]
x = df.drop(columns=["income"])

In [39]:
# Split the feature names by dtype: int64/float64 columns are numeric,
# object columns are categorical.
num_col = x.select_dtypes(include=["int64", "float64"]).columns
cat_col = x.select_dtypes(include=["object"]).columns

In [40]:
# Keep 80% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

In [41]:
# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode.
cat_pipeline = Pipeline(
    steps=[
        ("cat_col", SimpleImputer(strategy="most_frequent")),
        # handle_unknown="ignore": a category that was not present when the
        # encoder was fitted is encoded as all zeros instead of raising at
        # transform time (the default is handle_unknown="error"). Rare levels
        # can otherwise break scoring on the test split or on a CV fold.
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numeric columns: fill missing values with the column mean, then standardize.
num_pipeline = Pipeline(
    steps=[
        ("num_col", SimpleImputer(strategy="mean")),
        ("scalar", StandardScaler()),
    ]
)

# Apply each sub-pipeline to its own group of columns.
preprocessing = ColumnTransformer(
    transformers=[
        ("cat_pre", cat_pipeline, cat_col),
        ("num_pre", num_pipeline, num_col),
    ]
)

In [42]:
# Baseline model: the preprocessing step followed by a single decision tree
# (seeded so repeated fits give the same tree).
pipeline = Pipeline(
    [
        ("preprocessor", preprocessing),
        ("decisiontree", DecisionTreeClassifier(random_state=42)),
    ]
)

In [43]:
# Display the pipeline object to inspect its steps.
pipeline

In [44]:
# Fit the preprocessing and the classifier on the training split only.
pipeline.fit(X=xtrain, y=ytrain)

In [45]:
# Accuracy on the held-out test split.
pipeline.score(X=xtest, y=ytest)

0.8142177184093352

In [46]:
# Accuracy on the training split; a large gap to the test accuracy
# indicates overfitting.
pipeline.score(X=xtrain, y=ytrain)

0.9999616093366094

# Bagging: random forest

In [47]:
# Random-forest model on top of the same preprocessing step.
# Bound only to `pipeline_bag`: the original chained assignment also rebound
# `pipeline`, silently replacing the decision-tree pipeline defined earlier.
# random_state is fixed so that scores are reproducible between runs.
pipeline_bag = Pipeline(
    steps=[
        ("preprocessor", preprocessing),
        ("randomforest", RandomForestClassifier(random_state=42)),
    ]
)

In [48]:
# Fit the preprocessing and the random forest on the training split only.
pipeline_bag.fit(X=xtrain, y=ytrain)

In [49]:
# Accuracy on the held-out test split.
pipeline_bag.score(X=xtest, y=ytest)

0.8473821587594043

In [50]:
# Accuracy on the training split; a large gap to the test accuracy
# indicates overfitting.
pipeline_bag.score(X=xtrain, y=ytrain)

0.9999616093366094

# Hyperparameter search with GridSearchCV

In [51]:
# Hyperparameter grid for the "randomforest" step of pipeline_bag
# (the "<step>__<param>" keys address parameters of a pipeline step).
# 2 * 5 * 5 * 5 = 250 candidates; with cv=5 that is 1250 fits.
params = dict(
    randomforest__criterion=["entropy", "gini"],
    randomforest__min_samples_split=[2, 3, 5, 7, 10],
    randomforest__max_depth=[10, 50, 100, 200, 300],
    randomforest__min_samples_leaf=[2, 3, 5, 7, 10],
)

# Exhaustive 5-fold cross-validated search over the grid.
gridsearchcv = GridSearchCV(
    estimator=pipeline_bag,
    param_grid=params,
    cv=5,
    n_jobs=-1,  # run fits in parallel on all available cores
)

In [52]:
# Display the (not yet fitted) search object to inspect its configuration.
gridsearchcv

In [55]:
# gridsearchcv.fit(xtrain,ytrain)

In [56]:
# gridsearchcv.score(xtest,ytest)