In [1]:
import pandas as pd

# Read the ads table from the CSV that sits next to the notebook, then
# echo the frame as the cell output.
csv_path = "Social_Network_Ads.csv"
dataset = pd.read_csv(csv_path)
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [2]:
# Drop the identifier column. errors="ignore" keeps this cell idempotent:
# because it overwrites `dataset`, a second run would otherwise raise
# KeyError once "User ID" has already been removed.
dataset = dataset.drop(columns = ["User ID"], errors = "ignore")
# One-hot encode the remaining non-numeric columns as 0/1 integers,
# dropping the first level of each (Gender becomes a single Gender_Male
# column). Re-running on the already-encoded frame leaves it unchanged.
dataset = pd.get_dummies(dataset, drop_first = True, dtype = int)
dataset.columns

Index(['Age', 'EstimatedSalary', 'Purchased', 'Gender_Male'], dtype='object')

In [3]:
# Feature matrix: every column except the label.
independent = dataset.drop('Purchased', axis = 1)
# Label kept as a one-column DataFrame (list selector), not a Series.
dependent = dataset.loc[:, ['Purchased']]
# Show how many rows fall into each class.
dependent.value_counts()

Purchased
0            257
1            143
Name: count, dtype: int64

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed keeps the
# partition identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    test_size = 0.30,
    random_state = 0,
)
# Echo the four partitions as the cell output.
(x_train, x_test, y_train, y_test)

(     Age  EstimatedSalary  Gender_Male
 92    26            15000            1
 223   60           102000            1
 234   38           112000            0
 232   40           107000            1
 377   42            53000            0
 ..   ...              ...          ...
 323   48            30000            0
 192   29            43000            1
 117   36            52000            1
 47    27            54000            0
 172   26           118000            0
 
 [280 rows x 3 columns],
      Age  EstimatedSalary  Gender_Male
 132   30            87000            1
 309   38            50000            0
 341   35            75000            1
 196   30            79000            0
 246   35            50000            0
 ..   ...              ...          ...
 216   49            65000            1
 259   45           131000            0
 49    31            89000            0
 238   46            82000            0
 343   47            51000            0
 
 [120 rows 

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings("ignore")

In [9]:
def decision_tree_model(criterion_param, splitter_param, max_features_param, random_state = 0):
    """Fit one decision tree on the training split and print its test metrics.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test`` variables.

    Parameters
    ----------
    criterion_param
        Forwarded to ``DecisionTreeClassifier(criterion=...)``.
    splitter_param
        Forwarded to ``DecisionTreeClassifier(splitter=...)``.
    max_features_param
        Forwarded to ``DecisionTreeClassifier(max_features=...)``.
    random_state
        Seed forwarded to the classifier. Defaults to 0 so that stochastic
        settings (``splitter='random'``, a restricted ``max_features``) give
        the same metrics on every run and the configurations can be compared
        fairly. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    None
        The confusion matrix, the settings and the classification report are
        printed. Any exception is caught and printed together with the
        settings that caused it, so a surrounding search loop keeps going.
    """
    # Built once so the success line and the failure line name the same
    # combination; f-string formatting also tolerates non-string values.
    settings = f"Criterion = {criterion_param} splitter = {splitter_param} max_features = {max_features_param}"
    try:
        classifier = DecisionTreeClassifier(
            criterion = criterion_param,
            splitter = splitter_param,
            max_features = max_features_param,
            random_state = random_state,
        )
        classifier.fit(x_train, y_train)
        y_pred = classifier.predict(x_test)
        cm = confusion_matrix(y_test, y_pred)
        clf_report = classification_report(y_test, y_pred)
        print(f"{cm!s}: {settings}")
        print(clf_report)
    except Exception as e:
        # Deliberately best-effort: report which combination failed and why,
        # instead of printing a bare message with no context.
        print(f"Failed for {settings}: {e}")

In [11]:
# Placeholders for the best-performing settings; nothing in this cell
# assigns them.
max_criterion = max_splitter = max_max_features = None

# Candidate values for each hyperparameter.
criterion_list = ['gini', 'entropy', 'log_loss']
splitter_list = ['best', 'random']
max_features_list = [None, 'sqrt', 'log2']

# Enumerate every combination up front: criterion varies slowest,
# splitter fastest.
search_grid = [
    (criterion, max_features, splitter)
    for criterion in criterion_list
    for max_features in max_features_list
    for splitter in splitter_list
]

# Fit and report one tree per combination.
for criterion, max_features, splitter in search_grid:
    decision_tree_model(criterion, splitter, max_features)

[[71  8]
 [ 6 35]]: Criterion = gini splitter = best max_features = None
              precision    recall  f1-score   support

           0       0.92      0.90      0.91        79
           1       0.81      0.85      0.83        41

    accuracy                           0.88       120
   macro avg       0.87      0.88      0.87       120
weighted avg       0.89      0.88      0.88       120

[[72  7]
 [ 8 33]]: Criterion = gini splitter = random max_features = None
              precision    recall  f1-score   support

           0       0.90      0.91      0.91        79
           1       0.82      0.80      0.81        41

    accuracy                           0.88       120
   macro avg       0.86      0.86      0.86       120
weighted avg       0.87      0.88      0.87       120

[[72  7]
 [ 7 34]]: Criterion = gini splitter = best max_features = sqrt
              precision    recall  f1-score   support

           0       0.91      0.91      0.91        79
           1    

In [18]:
# Fit the final tree with one of the configurations from the search grid.
# random_state is pinned because splitter='random' and max_features='sqrt'
# both draw random numbers while the tree is built; without a seed the
# metrics below (and any later prediction) change on every run.
classifier = DecisionTreeClassifier(
    criterion = 'log_loss',
    splitter = 'random',
    max_features = 'sqrt',
    random_state = 0,
)
classifier = classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Per-class precision / recall / F1 on the held-out split.
clf_report = classification_report(y_test, y_pred)
print(clf_report)

[[74  5]
 [ 8 33]]
              precision    recall  f1-score   support

           0       0.90      0.94      0.92        79
           1       0.87      0.80      0.84        41

    accuracy                           0.89       120
   macro avg       0.89      0.87      0.88       120
weighted avg       0.89      0.89      0.89       120



In [19]:
# Score one hypothetical user (Age 40, EstimatedSalary 30000, Gender_Male 1).
# The model was fitted on a DataFrame, so the sample is wrapped in a frame
# carrying the training column names: this ties each value to its feature
# explicitly and avoids scikit-learn's "X does not have valid feature names"
# warning that a bare nested list triggers.
new_user = pd.DataFrame([[40, 30000, 1]], columns = x_train.columns)
classifier.predict(new_user)

array([0], dtype=int64)