In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

# Load the play-tennis dataset (tab-separated text file hosted on GitHub).
url = "https://raw.githubusercontent.com/Ben-Liao/MBA6693-Business-Data-Analysis/main/i01-information-based-learning/data/tennis.txt"
df = pd.read_csv(url, sep="\t")

# One-hot encode the categorical feature columns; "playtennis" is the target.
# handle_unknown='ignore' makes categories unseen at fit time encode as all zeros.
encoder = OneHotEncoder(handle_unknown='ignore')
X = df.drop("playtennis", axis=1)
encoded = encoder.fit_transform(X).toarray()

# OneHotEncoder.get_feature_names was deprecated in scikit-learn 1.0 and removed
# in 1.2. Prefer get_feature_names_out and fall back only on older releases so
# the cell runs on either side of that change.
if hasattr(encoder, "get_feature_names_out"):
    feature_names = encoder.get_feature_names_out(X.columns)
else:
    feature_names = encoder.get_feature_names(input_features=X.columns)

X = pd.DataFrame(encoded, columns=feature_names)
y = df["playtennis"]

# Hold out 20% of the rows for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Decision tree that chooses splits by entropy (information gain).
clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=0)
clf_entropy.fit(X_train, y_train)

# Accuracy of the entropy tree on the held-out rows.
y_pred = clf_entropy.predict(X_test)
acc_entropy = accuracy_score(y_test, y_pred)

# Decision tree that chooses splits by Gini impurity, trained on the same split.
clf_gini = DecisionTreeClassifier(criterion="gini", random_state=0)
clf_gini.fit(X_train, y_train)

# Accuracy of the Gini tree on the held-out rows (y_pred is rebound here).
y_pred = clf_gini.predict(X_test)
acc_gini = accuracy_score(y_test, y_pred)

# Compare the performance of the two models
print("Accuracy score using entropy criterion: ", acc_entropy)
print("Accuracy score using gini criterion: ", acc_gini)


Accuracy score using entropy criterion:  0.3333333333333333
Accuracy score using gini criterion:  0.3333333333333333




In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder

# Load the play-tennis dataset (tab-separated text file hosted on GitHub).
url = "https://raw.githubusercontent.com/Ben-Liao/MBA6693-Business-Data-Analysis/main/i01-information-based-learning/data/tennis.txt"
df = pd.read_csv(url, sep="\t")

# One-hot encode the categorical feature columns; "playtennis" is the target.
encoder = OneHotEncoder(handle_unknown='ignore')
X = df.drop("playtennis", axis=1)
encoded = encoder.fit_transform(X).toarray()

# OneHotEncoder.get_feature_names was deprecated in scikit-learn 1.0 and removed
# in 1.2. Prefer get_feature_names_out and fall back only on older releases.
if hasattr(encoder, "get_feature_names_out"):
    feature_names = encoder.get_feature_names_out(X.columns)
else:
    feature_names = encoder.get_feature_names(input_features=X.columns)

X = pd.DataFrame(encoded, columns=feature_names)

# Convert the target classes to binary values (yes -> 1, no -> 0).
# .map(...).astype(int) guarantees an integer dtype instead of relying on
# Series.replace's implicit downcasting, and raises if a label other than
# "yes"/"no" is present rather than passing it through silently.
y = df["playtennis"].map({"yes": 1, "no": 0}).astype(int)

# Hold out 20% of the rows for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Decision tree that chooses splits by entropy (information gain), unlimited depth.
clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=0)
clf_entropy.fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred_entropy = clf_entropy.predict(X_test)

# Evaluation metrics on the held-out rows; precision and recall treat label 1
# ("yes") as the positive class, which is why the target is encoded above.
acc_entropy = accuracy_score(y_test, y_pred_entropy)
prec_entropy = precision_score(y_test, y_pred_entropy)
recall_entropy = recall_score(y_test, y_pred_entropy)
bal_acc_entropy = balanced_accuracy_score(y_test, y_pred_entropy)

print("Model Entropy - no max depth")
print("Accuracy:", acc_entropy)
print("Balanced accuracy:", bal_acc_entropy)
print("Precision score:", prec_entropy)
print("Recall score:", recall_entropy)



Model Entropy - no max depth
Accuracy: 0.3333333333333333
Balanced accuracy: 0.3333333333333333
Precision score: 1.0
Recall score: 0.3333333333333333




In [3]:
# Fit a decision tree that splits on Gini impurity, using the existing
# X_train / y_train split (fit returns the estimator itself).
clf_gini = DecisionTreeClassifier(criterion="gini", random_state=0).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred_gini = clf_gini.predict(X_test)

# Accuracy, precision, recall and balanced accuracy on the held-out rows.
acc_gini = accuracy_score(y_test, y_pred_gini)
prec_gini = precision_score(y_test, y_pred_gini)
recall_gini = recall_score(y_test, y_pred_gini)
bal_acc_gini = balanced_accuracy_score(y_test, y_pred_gini)

# Report the metrics, one per line.
print("Model Gini impurity model")
for metric_label, metric_value in (
    ("Accuracy:", acc_gini),
    ("Balanced accuracy:", bal_acc_gini),
    ("Precision score:", prec_gini),
    ("Recall score:", recall_gini),
):
    print(metric_label, metric_value)


Model Gini impurity model
Accuracy: 0.3333333333333333
Balanced accuracy: 0.3333333333333333
Precision score: 1.0
Recall score: 0.3333333333333333




In [4]:
# Unlabelled observation to add to the dataset. It carries no "playtennis"
# value, so that column is NaN for the added row.
new_row = {"outlook": "sunny", "temperature": "hot", "humidity": "high", "wind": "weak"}

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with a one-row frame is the supported equivalent. ignore_index=True renumbers
# the index so the new row continues the existing 0..n-1 range.
# NOTE: this rebinds df, so re-running the cell adds the row again.
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

  df = df.append(new_row, ignore_index=True)


In [5]:
# Single observation to classify with the fitted entropy tree.
new_row = {"outlook": "sunny", "temperature": "hot", "humidity": "high", "wind": "weak"}

# Encode the observation with the already-fitted encoder so its columns line up
# with the features the classifier was trained on.
new_row_frame = pd.DataFrame([new_row])
new_row_encoded = encoder.transform(new_row_frame).toarray()

# OneHotEncoder.get_feature_names was deprecated in scikit-learn 1.0 and removed
# in 1.2. Prefer get_feature_names_out (which uses the column names seen at fit
# time) and fall back only on older releases.
if hasattr(encoder, "get_feature_names_out"):
    new_row_columns = encoder.get_feature_names_out()
else:
    new_row_columns = encoder.get_feature_names(input_features=new_row_frame.columns)

# new_row ends up bound to the encoded one-row DataFrame.
new_row = pd.DataFrame(new_row_encoded, columns=new_row_columns)

# predict returns one label per row; take the label for the single row.
prediction = clf_entropy.predict(new_row)[0]

# Label 1 corresponds to "yes" in the binary target encoding.
if prediction == 1:
    print("The player is playing tennis.")
else:
    print("The player is not playing tennis.")

The player is not playing tennis.


