## Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Read the Dataset

In [3]:
df = pd.read_csv(r"C:/Users/Administrator/Downloads/adult_dataset.csv")

## View the Dataset

In [4]:
df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


## Inspection of Dataset

In [5]:
# to view length of dataset
df.shape

(32561, 15)

In [6]:
# to view datatypes of all columns 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [7]:
# to display all statistical view of dataset for all numerical column
df.describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [8]:
# replace all ? with null values in dataset

df.replace('?',np.nan,inplace = True)

In [9]:
# to ckeck all null values

df.isnull().sum() 

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

In [10]:
# Drop all 3 columns for null values because these col isn't important

df.drop(["workclass","occupation", "native.country"], axis = 1, inplace = True)

In [11]:
df.select_dtypes(include = "object").columns

Index(['education', 'marital.status', 'relationship', 'race', 'sex', 'income'], dtype='object')

In [12]:
# to know all distinct values count of this column
df["education"].value_counts()

education
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: count, dtype: int64

In [13]:
# to know all distinct values count of this column
df['marital.status'].value_counts()

marital.status
Married-civ-spouse       14976
Never-married            10683
Divorced                  4443
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           23
Name: count, dtype: int64

In [14]:
# to know all distinct values count of this column
df['relationship'].value_counts()

relationship
Husband           13193
Not-in-family      8305
Own-child          5068
Unmarried          3446
Wife               1568
Other-relative      981
Name: count, dtype: int64

In [15]:
# to know all distinct values count of this column
df['race'].value_counts()

race
White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: count, dtype: int64

In [16]:
# to know all distinct values count of this column
df['sex'].value_counts()

sex
Male      21790
Female    10771
Name: count, dtype: int64

In [17]:
# to know all distinct values count of this column
df['income'].value_counts()   #Class Imbalance means बहुत ज्यादा अंतर है वैल्यूस में । 

income
<=50K    24720
>50K      7841
Name: count, dtype: int64

In [18]:
# sex and income both are candidates of binary encoding so replace them with 0 and 1

df['sex'].replace({'Male': 1, 'Female': 0}, inplace=True)

df['income'].replace({'>50K': 1, '<=50K': 0}, inplace=True)

In [19]:
# now ["relationship", "race", "marital.status"] are candidates for dummy variables or one-hot encoding

dum = pd.get_dummies(df[["relationship", "race", "marital.status"]],drop_first = True, dtype = int)

In [20]:
dum

Unnamed: 0,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,marital.status_Married-AF-spouse,marital.status_Married-civ-spouse,marital.status_Married-spouse-absent,marital.status_Never-married,marital.status_Separated,marital.status_Widowed
0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1
2,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0
32557,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0
32558,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
32559,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1


In [21]:
# i've replaced these columns with one-hot encoding we now we don't need main columns

df.drop(["relationship","race","marital.status"], inplace =True,axis = 1)

In [22]:
# now concat df and dum into one dataframe and store this into new dataframe

df1 = pd.concat([df,dum],axis = 1)

In [23]:
df1.head(5)

Unnamed: 0,age,fnlwgt,education,education.num,sex,capital.gain,capital.loss,hours.per.week,income,relationship_Not-in-family,...,race_Asian-Pac-Islander,race_Black,race_Other,race_White,marital.status_Married-AF-spouse,marital.status_Married-civ-spouse,marital.status_Married-spouse-absent,marital.status_Never-married,marital.status_Separated,marital.status_Widowed
0,90,77053,HS-grad,9,0,0,4356,40,0,1,...,0,0,0,1,0,0,0,0,0,1
1,82,132870,HS-grad,9,0,0,4356,18,0,1,...,0,0,0,1,0,0,0,0,0,1
2,66,186061,Some-college,10,0,0,4356,40,0,0,...,0,1,0,0,0,0,0,0,0,1
3,54,140359,7th-8th,4,0,0,3900,40,0,0,...,0,0,0,1,0,0,0,0,0,0
4,41,264663,Some-college,10,0,0,3900,40,0,0,...,0,0,0,1,0,0,0,0,1,0


In [24]:
# because 'education' is in order form so here we need ordinal encoding for this column

from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
encoder.fit(df1[["education"]])
df1["education"] = encoder.transform(df1[["education"]])

In [25]:
df1

Unnamed: 0,age,fnlwgt,education,education.num,sex,capital.gain,capital.loss,hours.per.week,income,relationship_Not-in-family,...,race_Asian-Pac-Islander,race_Black,race_Other,race_White,marital.status_Married-AF-spouse,marital.status_Married-civ-spouse,marital.status_Married-spouse-absent,marital.status_Never-married,marital.status_Separated,marital.status_Widowed
0,90,77053,11.0,9,0,0,4356,40,0,1,...,0,0,0,1,0,0,0,0,0,1
1,82,132870,11.0,9,0,0,4356,18,0,1,...,0,0,0,1,0,0,0,0,0,1
2,66,186061,15.0,10,0,0,4356,40,0,0,...,0,1,0,0,0,0,0,0,0,1
3,54,140359,5.0,4,0,0,3900,40,0,0,...,0,0,0,1,0,0,0,0,0,0
4,41,264663,15.0,10,0,0,3900,40,0,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,310152,15.0,10,1,0,0,40,0,1,...,0,0,0,1,0,0,0,1,0,0
32557,27,257302,7.0,12,0,0,0,38,0,0,...,0,0,0,1,0,1,0,0,0,0
32558,40,154374,11.0,9,1,0,0,40,1,0,...,0,0,0,1,0,1,0,0,0,0
32559,58,151910,11.0,9,0,0,0,40,0,0,...,0,0,0,1,0,0,0,0,0,1


In [26]:
df1["education"].value_counts()

education
11.0    10501
15.0     7291
9.0      5355
12.0     1723
8.0      1382
1.0      1175
7.0      1067
0.0       933
5.0       646
14.0      576
6.0       514
2.0       433
10.0      413
4.0       333
3.0       168
13.0       51
Name: count, dtype: int64

### Now split dataset into two parts (X for all undependent columns and y for dependent column (which to be predict))

In [27]:
X = df1.drop("income",axis = 1)
y = df1["income"]

### now split the dataset into two parts, 1 for training and 2 for testing

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100)

### Normalization (Scaling)

In [29]:
from sklearn.preprocessing import MinMaxScaler

In [30]:
scaler = MinMaxScaler()

In [31]:
X_train = scaler.fit_transform(X_train)

In [32]:
X_test = scaler.fit_transform(X_test)

### Create a model using Decision Tree Model

In [33]:
from sklearn.tree import DecisionTreeClassifier

# Fitting the decision tree with default hyperparameters, apart from
# max_depth which is 5 so that we can plot and read the tree.
dt_default = DecisionTreeClassifier(max_depth=3)
dt_default.fit(X, y)

### To show all metrics report for this model

In [34]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [35]:
y_pred = dt_default.predict(X_test)

In [36]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      1.00      0.86      4920
           1       0.00      0.00      0.00      1593

    accuracy                           0.76      6513
   macro avg       0.38      0.50      0.43      6513
weighted avg       0.57      0.76      0.65      6513



### To display Decision Tree in PDF Format

In [37]:
# Importing required packages for visualization
from IPython.display import Image  
from six import StringIO  
from sklearn.tree import export_graphviz
import pydotplus, graphviz

# Putting features
features = list(X.columns[0:])
features

['age',
 'fnlwgt',
 'education',
 'education.num',
 'sex',
 'capital.gain',
 'capital.loss',
 'hours.per.week',
 'relationship_Not-in-family',
 'relationship_Other-relative',
 'relationship_Own-child',
 'relationship_Unmarried',
 'relationship_Wife',
 'race_Asian-Pac-Islander',
 'race_Black',
 'race_Other',
 'race_White',
 'marital.status_Married-AF-spouse',
 'marital.status_Married-civ-spouse',
 'marital.status_Married-spouse-absent',
 'marital.status_Never-married',
 'marital.status_Separated',
 'marital.status_Widowed']

In [38]:
# If you're on windows:
# Specifing path for dot file.
import os
os.environ["PATH"] += os.pathsep + "C:\Program Files (x86)/bin"

In [39]:
# plotting tree with max_depth=3
dot_data = StringIO()  
export_graphviz(dt_default, out_file=dot_data,
                feature_names=features, filled=True,rounded=True)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_pdf("adult_income.pdf")

True

In [40]:
# the pdf with the tree will be written in this directory on your system
import os
os.getcwd()

'C:\\Users\\Administrator\\Downloads\\python files'