# Data Science Specialization (Spring 2025, RUC)
## Workshop: Decision Trees
## Exercise Part II

## 1. Imports

In [None]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

## 2. Data Preparation

In [None]:
# Read the airline train and test CSV files from the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('airline_kaggle_train.csv', 'airline_kaggle_test.csv')
)
# Preview the first rows of the training frame.
train.head()

In [None]:
# Use the 'id' column as the row index of both frames, then summarise
# their columns, dtypes and non-null counts.
train = train.set_index('id')
test = test.set_index('id')
for frame in (train, test):
    frame.info()

In [None]:
# Display the column labels of the training frame.
train.columns

In [None]:
# Every training column except the 'satisfaction' target; shown for inspection.
feature_cols = train.columns.drop('satisfaction')
feature_cols

In [None]:
# Summary statistics of the training frame (pandas reports numeric columns by default).
train.describe()

In [None]:
# Print the distinct values found in each categorical column (target included).
categorical_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'satisfaction']
for col in categorical_cols:
    distinct_values = train[col].unique()
    print("'{0}' values: {1}:\r".format(col, distinct_values))

In [None]:
# Convert Gender, Customer Type and Type of Travel to numeric by factorization.
#
# The label -> code mapping is learned on the training set only and then
# applied to the test set. pd.factorize numbers values by order of first
# appearance, so factorizing each frame on its own could give the same label
# different codes in train and test.
for feature in ['Gender', 'Customer Type', 'Type of Travel']:
    codes, categories = pd.factorize(train[feature])
    # get_indexer returns -1 for test values that are missing or were never
    # seen in training -- the same sentinel pd.factorize uses for NaN.
    test[feature] = categories.get_indexer(test[feature])
    train[feature] = codes

train.info()
test.info()

In [None]:
# Use one-hot encoding for Class (only remaining object type feature).
#
# Both frames are encoded against the category list observed in the training
# set, so they always end up with the same dummy columns in the same order,
# even when a class value is absent from one of the splits. A test value that
# never occurs in training gets all-zero dummies instead of an extra column.
# The list is sorted to keep the column order get_dummies produces for plain
# string columns.
class_categories = sorted(train['Class'].dropna().unique())
train = pd.get_dummies(
    train.assign(Class=pd.Categorical(train['Class'], categories=class_categories)),
    columns=['Class'])
test = pd.get_dummies(
    test.assign(Class=pd.Categorical(test['Class'], categories=class_categories)),
    columns=['Class'])

train.info()
test.info()

In [None]:
# Discard every row that still contains a missing value, in both frames.
# (Open question kept from the original: should NaN be set to zero instead?)
train, test = (
    frame.dropna(axis='index', how='any')
    for frame in (train, test)
)

for frame in (train, test):
    frame.info()

In [None]:
# Split each frame into a feature matrix and the 'satisfaction' target.
# The feature list comes from the training columns and is reused to select
# the test features, so both matrices share the same column order.
feature_cols = train.columns.drop('satisfaction')

X_train, y_train = train[feature_cols], train['satisfaction']
X_test, y_test = test[feature_cols], test['satisfaction']

## 3. Training a Single Decision Tree

In [None]:
# Fit one depth-unlimited entropy tree, then report its size and its
# accuracy / recall / precision on the test set ('satisfied' = positive class).
#
# random_state is fixed because scikit-learn randomly permutes the features at
# every split, even with the default 'best' splitter; without a seed the tree
# and its scores can change from run to run.
dtree = DecisionTreeClassifier(criterion='entropy', random_state=0)
dtree = dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)
print("Depth:", dtree.get_depth())
print("Leaves:", dtree.get_n_leaves())
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred, pos_label='satisfied'))
print("Precision:", metrics.precision_score(y_test, y_pred, pos_label='satisfied'))

In [None]:
from sklearn import tree
from sklearn.tree import export_graphviz
import graphviz

export_graphviz(dtree, out_file='airline_dt.dot',
                class_names=['neudis', 'satisfied'],
                feature_names=feature_cols,
                impurity=False,
                filled=True)

! dot -Tpng airline_dt.dot -o airline_dt.png

from IPython import display
display.Image("airline_dt.png")

## 4. Vary Metric and Other Parameters

In [None]:
# Compare both split criteria over tree depths 2..7 and print the accuracy of
# every combination on the test set.
# NOTE(review): the comparison is scored on the test set itself; consider a
# separate validation split if these scores are used to pick parameters.
criteria = ['gini', 'entropy']
for c in criteria:
    for md in range(2, 8):
        # Model initializing (seeded so the comparison is repeatable between runs)
        dtree = DecisionTreeClassifier(criterion=c, max_depth=md, random_state=0)

        # Model training/fitting
        dtree.fit(X_train, y_train)

        # Model validation/test
        y_pred = dtree.predict(X_test)

        print("criterion={0}, max_depth={1}:\r".format(c, md))
        print("Accuracy: {}\r\n".format(metrics.accuracy_score(y_test, y_pred)))

## 5. Ensemble Methods

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded random forest: 100 entropy trees, 10 candidate features per split.
forest = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_features=10,
    random_state=0,
)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)

# Report accuracy on the test set.
forest_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy of Random Forest: {}".format(forest_accuracy))

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Seeded extra-trees ensemble: 10 gini trees with no depth limit.
forest2 = ExtraTreesClassifier(
    n_estimators=10,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    random_state=0,
)
forest2.fit(X_train, y_train)
y_pred = forest2.predict(X_test)

# Report accuracy on the test set.
extra_trees_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy of Extra Trees: {}".format(extra_trees_accuracy))

The accuracy values are not overwhelming. Find out which parameters the classifiers accept (for example with `get_params()` or the scikit-learn documentation), and vary them!