# **Machine learning model using decision tree algorithm**

Mount the Google Drive to access the CSV file.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the CSV can be read by file path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Import the necessary libraries: NumPy and pandas.

In [None]:
import pandas as pd
import numpy as np

Read the CSV file into a pandas DataFrame, then display its first 10 rows and last 5 rows.

In [None]:
# Location of the dataset on the mounted Drive.
csv_path = '/content/drive/MyDrive/DataSet for Decision Tree.csv'
# Load the whole file into a DataFrame.
df = pd.read_csv(csv_path)

In [None]:
# Preview the first 10 rows of the raw dataset (column names and sample values).
df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,x,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,0,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,0,18,United-States,<=50K
2,66,x,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,0,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,0,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,0,40,United-States,<=50K
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,0,45,United-States,<=50K
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,0,40,United-States,<=50K
7,74,State-gov,88638,Doctorate,16,Never-married,Prof-specialty,Other-relative,White,Female,0,0,20,United-States,>50K
8,68,Federal-gov,422013,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,<=50K
9,41,Private,70037,Some-college,10,Never-married,Craft-repair,Unmarried,White,Male,0,0,60,?,>50K


In [None]:
# Preview the last 5 rows of the raw dataset; the index shown also hints at the row count.
df.tail(5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32560,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


Import the necessary scikit-learn utilities for preprocessing the data, building the decision tree classifier, and evaluating its performance with accuracy, precision, recall, F1 score, and a confusion matrix.

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

Replace all the missing values represented by "?" and "X" with NaN, and then drop the rows that contain NaN values to clean the dataset.

In [None]:
# Placeholder strings that stand for "missing" in the raw data. The raw preview
# shows a lowercase "x" in `workclass`, which an exact match on "X" alone would
# not catch, so both spellings are listed.
missing_markers = ["?", "x", "X"]

# Turn every placeholder cell into NaN, then drop each row that has at least
# one NaN. Reassigning (rather than mutating in place) keeps this a single
# chained expression.
df = df.replace(missing_markers, np.nan).dropna()


Use LabelEncoder from scikit-learn to convert categorical variables in the dataframe to numerical values for the decision tree classifier to work with

In [None]:
# Columns holding string categories (including the target `income`) that are
# converted to integer codes before being passed to the classifier.
categorical_columns = [
    'workclass', 'education', 'marital.status', 'occupation',
    'relationship', 'race', 'sex', 'native.country', 'income',
]

# Fit one encoder per column and keep it, so integer codes can later be mapped
# back to the original labels with `encoders[column].inverse_transform(...)`.
# Reusing a single encoder would discard every mapping except the last one.
encoders = {}
for column in categorical_columns:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Keep `le` bound to the encoder fitted on `income`, the last column encoded.
le = encoders['income']

In [None]:
# Preview the first five rows after encoding; categorical columns now hold integer codes.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,2,132870,11,9,6,3,1,4,0,0,0,18,38,0
3,54,2,140359,5,4,0,6,4,4,0,0,0,40,38,0
4,41,2,264663,15,10,5,9,3,4,0,0,0,40,38,0
5,34,2,216864,11,9,0,7,4,4,0,0,0,45,38,0
6,38,2,150601,0,6,5,0,4,4,1,0,0,40,38,0


Define the input features X and the target variable y for the decision tree classifier. X is the dataframe with the fnlwgt and income columns dropped; y is the income column.

In [None]:
# Target: the income column.
y = df['income']

# Features: every column except the target and `fnlwgt`.
X = df.drop(columns=['fnlwgt', 'income'])

Use train_test_split from scikit-learn to split the data into training and testing sets. It sets aside 20% of the data for testing and uses a random state of 42 for reproducibility.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Build the decision tree classifier, fit it to the training data, and predict the target variable for the testing data.

In [None]:
# Seed the classifier: scikit-learn's tree builder randomly permutes features
# when searching for splits, so without a fixed `random_state` the fitted tree
# (and every metric derived from it) can differ between runs even though the
# train/test split itself is seeded.
clf = DecisionTreeClassifier(random_state=42)

# Learn the tree from the training portion only.
clf.fit(X_train, y_train)

# Predicted class labels for the held-out rows.
y_pred = clf.predict(X_test)

Evaluate the performance of the decision tree classifier using accuracy, precision, recall, and F1 score.

In [None]:
# Compute all four scores on the held-out predictions first...
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ...then report them, one per line, rounded to three decimals.
for label, value in (
    ("Accuracy", acc),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 score", f1),
):
    print(f"{label}: {value:.3f}")

Accuracy: 0.775
Precision: 0.549
Recall: 0.529
F1 score: 0.539


Compute and print the confusion matrix

In [None]:
# Rows are the true classes, columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[3880  653]
 [ 706  794]]
