In [None]:
# Load some test data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import graphviz


# Read the bank dataset from the course repository.
data = pd.read_csv("https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv")

# Drop every row that has a missing value in any column.
data = data.dropna()

# Preview the cleaned frame. This must be the last expression of the cell:
# a bare `data.head()` in the middle of a cell is evaluated and thrown away.
data.head()

The cell above loads the required libraries and reads the bank CSV file.

The code below trains a decision tree. There is nothing special about it; it follows the sample decision tree code provided in this module.

In [None]:
# Optional hard rule (currently disabled): drop every row whose job is "student".
# data = data[data['job'] != 'student']

# Collapse a hand-picked set of job categories into a single "unwanted" bucket.
unwanted_jobs = ['blue-collar', 'services', 'housemaid', 'entrepreneur']  # Add more as needed
data['job'] = data['job'].replace({job: 'unwanted' for job in unwanted_jobs})

# Merge the two basic education levels into one category.
unwanted_education = ['basic.9y', 'basic.6y']  # Add more as needed
data['education'] = data['education'].replace(
    {level: 'basic9n6y' for level in unwanted_education}
)

# Cap the campaign count: every value above 17 becomes 18.
data['campaign'] = data['campaign'].mask(data['campaign'] > 17, 18)

# Integer-encode the categorical columns. One fitted encoder per column is kept
# in `label_encoders` so the same mapping can be looked up later.
categorical_cols = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'day_of_week', 'poutcome',
]
label_encoders = {name: LabelEncoder().fit(data[name]) for name in categorical_cols}
for col, le in label_encoders.items():
    data[col] = le.transform(data[col])

# Feature matrix and binary target (yes -> 1, no -> 0).
features = [
    'age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
    'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate',
    'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
]
X = pd.get_dummies(data[features], drop_first=True)
y = data['y'].map({'yes': 1, 'no': 0})

# Hold out 30% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Decision tree hyperparameters, gathered in one place for easy tuning.
tree_params = {
    "criterion": "gini",      # alternative: "entropy"
    "max_depth": 4,           # shallow tree to limit overfitting
    "min_samples_split": 10,  # a node needs at least 10 samples to be split
    "random_state": 42,
}
clf = DecisionTreeClassifier(**tree_params)

# Fit on the training split, then score the predictions for the test split.
y_pred = clf.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.4f}")

The code below creates a visualization of the decision tree. Note that it will download an image.

In [None]:
# Export the fitted tree as Graphviz DOT source. The feature names are passed
# as a plain list of strings rather than a pandas Index.
dot_data = export_graphviz(clf, out_file=None,
                           feature_names=list(X.columns),
                           class_names=["No", "Yes"],
                           filled=True, rounded=True,
                           special_characters=True)

# Wrap the DOT source and select PNG output.
graph = graphviz.Source(dot_data)
graph.format = "png"

# Render once and open the result. Calling render() and then view() would
# render the same tree a second time, because view() re-renders before opening.
graph.render("decision_tree", view=True)

The code below uses the machine learning model trained above to make predictions on the holdout dataset.

Once the file is downloaded, the column still needs to be renamed to 'predictions'. I could add code to rename that column.



In [None]:
# Load the holdout dataset
holdout_data = pd.read_csv("https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank_holdout_test.csv")

# Replay the training-time feature engineering on a copy of the holdout rows.
# The model was fitted on grouped, capped and integer-encoded columns. Feeding
# it raw strings through get_dummies + reindex would silently replace every
# categorical feature with 0.
holdout_prepared = holdout_data.copy()
holdout_prepared['job'] = holdout_prepared['job'].replace(unwanted_jobs, 'unwanted')
holdout_prepared['education'] = holdout_prepared['education'].replace(unwanted_education, 'basic9n6y')
holdout_prepared['campaign'] = np.where(holdout_prepared['campaign'] > 17, 18, holdout_prepared['campaign'])

# Encode each categorical feature with the encoder fitted on the training data.
# A label's code is its position in `classes_`. Labels never seen during
# training get -1 (and a warning) instead of aborting the whole prediction run.
for encoded_col, encoder in label_encoders.items():
    if encoded_col not in features:
        continue  # encoded during training but not used by the model
    code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
    codes = holdout_prepared[encoded_col].map(code_by_label)
    unseen = codes.isna()
    if unseen.any():
        print(f"Warning: {int(unseen.sum())} holdout rows have a '{encoded_col}' "
              f"value not seen in training; encoded as -1")
    holdout_prepared[encoded_col] = codes.fillna(-1).astype(int)

# Mirror the training call, then align column order with X (training data)
holdout_data_encoded = pd.get_dummies(holdout_prepared[features], drop_first=True)
holdout_data_encoded = holdout_data_encoded.reindex(columns=X.columns, fill_value=0)

# Make predictions on the holdout dataset (one prediction per holdout row)
holdout_predictions = clf.predict(holdout_data_encoded)
assert len(holdout_predictions) == len(holdout_data), "expected one prediction per holdout row"

# Convert the predictions to a DataFrame and label the column 'y'
predictions_df = pd.DataFrame(holdout_predictions, columns=['y'])

# Save the predictions to a CSV file
team_number = "3"  # Replace with your team number
file_name = f"team{team_number}-module2-predictions.csv"
predictions_df.to_csv(file_name, index=False)

print(f"Predictions saved to {file_name}")