## Build a decision tree classifier to predict whether a customer will purchase a product or service based on their demographic and behavioral data.

# Fetch Dataset

In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
bank_marketing = fetch_ucirepo(id=222) 
  
# data (as pandas dataframes) 
X = bank_marketing.data.features 
y = bank_marketing.data.targets 
  
# metadata 
print(bank_marketing.metadata) 
  
# variable information 
print(bank_marketing.variables) 


{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'title': 'A data-driven approach to predict the success of bank telemarketing'

# Import Python Libraries

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score

In [4]:
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,


In [5]:
y

Unnamed: 0,y
0,no
1,no
2,no
3,no
4,no
...,...
45206,yes
45207,yes
45208,yes
45209,no


In [6]:
mapping={
    'yes':1,
    'no':0
}
X['default']=X['default'].map(mapping)
X['housing']=X['housing'].map(mapping)
X['loan']=X['loan'].map(mapping)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['default']=X['default'].map(mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['housing']=X['housing'].map(mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['loan']=X['loan'].map(mapping)


In [7]:
X

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,0,2143,1,0,,5,may,261,1,-1,0,
1,44,technician,single,secondary,0,29,1,0,,5,may,151,1,-1,0,
2,33,entrepreneur,married,secondary,0,2,1,1,,5,may,76,1,-1,0,
3,47,blue-collar,married,,0,1506,1,0,,5,may,92,1,-1,0,
4,33,,single,,0,1,0,0,,5,may,198,1,-1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,0,825,0,0,cellular,17,nov,977,3,-1,0,
45207,71,retired,divorced,primary,0,1729,0,0,cellular,17,nov,456,2,-1,0,
45208,72,retired,married,secondary,0,5715,0,0,cellular,17,nov,1127,5,184,3,success
45209,57,blue-collar,married,secondary,0,668,0,0,telephone,17,nov,508,4,-1,0,


# count  the values 

In [8]:
X['job'].value_counts()

job
blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
Name: count, dtype: int64

In [9]:
X['poutcome'].value_counts()

poutcome
failure    4901
other      1840
success    1511
Name: count, dtype: int64

# Mapping of Poutcome column values

In [10]:
mapping1={
    'failure':0,
    'success':1,
    'other':2
}
X['poutcome']=X['poutcome'].map(mapping1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['poutcome']=X['poutcome'].map(mapping1)


# Total null values

In [11]:
X.isnull().sum()

age                0
job              288
marital            0
education       1857
default            0
balance            0
housing            0
loan               0
contact        13020
day_of_week        0
month              0
duration           0
campaign           0
pdays              0
previous           0
poutcome       36959
dtype: int64

# Drop columns

In [12]:
X=X.drop(['education','contact','poutcome'],axis=1)

# Concatinate X and Y

In [13]:
data=pd.concat([X,y],axis=1)

In [14]:
data

Unnamed: 0,age,job,marital,default,balance,housing,loan,day_of_week,month,duration,campaign,pdays,previous,y
0,58,management,married,0,2143,1,0,5,may,261,1,-1,0,no
1,44,technician,single,0,29,1,0,5,may,151,1,-1,0,no
2,33,entrepreneur,married,0,2,1,1,5,may,76,1,-1,0,no
3,47,blue-collar,married,0,1506,1,0,5,may,92,1,-1,0,no
4,33,,single,0,1,0,0,5,may,198,1,-1,0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,0,825,0,0,17,nov,977,3,-1,0,yes
45207,71,retired,divorced,0,1729,0,0,17,nov,456,2,-1,0,yes
45208,72,retired,married,0,5715,0,0,17,nov,1127,5,184,3,yes
45209,57,blue-collar,married,0,668,0,0,17,nov,508,4,-1,0,no


# Drop all na values

In [15]:
data=data.dropna()

In [16]:
data

Unnamed: 0,age,job,marital,default,balance,housing,loan,day_of_week,month,duration,campaign,pdays,previous,y
0,58,management,married,0,2143,1,0,5,may,261,1,-1,0,no
1,44,technician,single,0,29,1,0,5,may,151,1,-1,0,no
2,33,entrepreneur,married,0,2,1,1,5,may,76,1,-1,0,no
3,47,blue-collar,married,0,1506,1,0,5,may,92,1,-1,0,no
5,35,management,married,0,231,1,0,5,may,139,1,-1,0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,0,825,0,0,17,nov,977,3,-1,0,yes
45207,71,retired,divorced,0,1729,0,0,17,nov,456,2,-1,0,yes
45208,72,retired,married,0,5715,0,0,17,nov,1127,5,184,3,yes
45209,57,blue-collar,married,0,668,0,0,17,nov,508,4,-1,0,no


In [20]:
X=data.drop('y',axis=1)

In [21]:
y=data['y']

In [22]:
X

Unnamed: 0,age,job,marital,default,balance,housing,loan,day_of_week,month,duration,campaign,pdays,previous
0,58,management,married,0,2143,1,0,5,may,261,1,-1,0
1,44,technician,single,0,29,1,0,5,may,151,1,-1,0
2,33,entrepreneur,married,0,2,1,1,5,may,76,1,-1,0
3,47,blue-collar,married,0,1506,1,0,5,may,92,1,-1,0
5,35,management,married,0,231,1,0,5,may,139,1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,0,825,0,0,17,nov,977,3,-1,0
45207,71,retired,divorced,0,1729,0,0,17,nov,456,2,-1,0
45208,72,retired,married,0,5715,0,0,17,nov,1127,5,184,3
45209,57,blue-collar,married,0,668,0,0,17,nov,508,4,-1,0


# Convert values of column job in frequency and map them as well

In [32]:
freq=X['job'].value_counts()

In [33]:
X['job']=X['job'].map(freq)

In [35]:
mapping2={
    'married':1,
    'divorced':2,
    'single':0
}
X['marital']=X['marital'].map(mapping2)

In [37]:
X=X.drop('month',axis=1)

In [38]:
X

Unnamed: 0,age,job,marital,default,balance,housing,loan,day_of_week,duration,campaign,pdays,previous
0,58,9458,1,0,2143,1,0,5,261,1,-1,0
1,44,7597,0,0,29,1,0,5,151,1,-1,0
2,33,1487,1,0,2,1,1,5,76,1,-1,0
3,47,9732,1,0,1506,1,0,5,92,1,-1,0
5,35,9458,1,0,231,1,0,5,139,1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,7597,1,0,825,0,0,17,977,3,-1,0
45207,71,2264,2,0,1729,0,0,17,456,2,-1,0
45208,72,2264,1,0,5715,0,0,17,1127,5,184,3
45209,57,9732,1,0,668,0,0,17,508,4,-1,0


In [40]:
y

0         no
1         no
2         no
3         no
5         no
        ... 
45206    yes
45207    yes
45208    yes
45209     no
45210     no
Name: y, Length: 44923, dtype: object

# Train the dataset

In [42]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=3)

# Assigment  and fitting of model 

In [44]:
model=DecisionTreeClassifier()

In [45]:
model.fit(X_train,y_train)

# Model prediction

In [46]:
y_pred=model.predict(X_test)

# Model accuracy

In [48]:
accuracy=accuracy_score(y_test,y_pred)

In [49]:
accuracy

0.8579802626697336

# SUMMARY

1.Loading the Data:

- Use the Bank Marketing dataset from the UCI Machine Learning Repository.

2.Data Cleaning and Preprocessing:

- Handle missing values, if any.
- Convert categorical variables to numerical using techniques like one-hot encoding.
- Split the dataset into training and testing sets.

3.Building the Decision Tree Classifier:

- Use scikit-learn to build the decision tree model.
- Train the model on the training dataset.

4.Evaluating the Model:

- Evaluate the model’s performance using metrics such as accuracy.
- Visualize the decision tree.
5.Interpreting Results:

- Interpret the decision tree to understand which features are most influential in predicting customer purchases.