# Lab Task: Decision Tree Classification on Golf Dataset

## Objective
Predict whether golf will be played, given the weather conditions, using Decision Tree classifiers.

In [None]:
# Task 1: Data Loading and Exploration
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read the raw CSV. header=None makes pandas treat every row, including the
# first one, as data rather than as column names.
# NOTE(review): if the file does contain a header row it would be ingested as
# a regular record - confirm against the actual file.
df = pd.read_csv('golf-dataset.csv', header=None)

# Give the positional columns descriptive names.
column_names = ['Outlook', 'Temp', 'Humidity', 'Windy', 'PlayGolf']
df.columns = column_names

# Preview the first rows of the frame.
print("First 5 rows:", df.head(), sep="\n")

# Number of distinct (non-missing) values in each column.
print("\nUnique values:")
for name, count in df.nunique().items():
    print(f"{name}: {count}")

# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

In [None]:
# Task 2: Data Preprocessing
# Fit a separate LabelEncoder for every column and keep each one, so the
# integer codes can be mapped back to the original labels later via
# encoders[col].classes_ or encoders[col].inverse_transform(...).
# Re-fitting one shared encoder would discard every mapping except the one
# learned for the last column.
encoders = {}
for col in df.columns:
    encoders[col] = preprocessing.LabelEncoder()
    # Convert text to numbers
    df[col] = encoders[col].fit_transform(df[col])

# Keep the original `le` name available: as before, it ends up being the
# encoder fitted on the final column.
le = encoders[df.columns[-1]]

# Print stats
# Mean and standard deviation are computed on the integer codes produced above.
print("\nStats (Encoded):")
for col in df.columns:
    print(f"{col} - Mean: {df[col].mean():.2f}, Std: {df[col].std():.2f}")

In [None]:
# Task 3: Data Splitting
# Separate the target column from the predictor columns.
y = df['PlayGolf']
X = df.drop(columns='PlayGolf')

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

In [None]:
# Task 4: Entropy Model
# Train a depth-limited tree that chooses splits by information gain (entropy).
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at every split; without a seed, equally good splits can be chosen
# differently from run to run, making the report below non-reproducible.
clf_entropy = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=42)
clf_entropy.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred_entropy = clf_entropy.predict(X_test)

print("\n--- Entropy Model Report ---")
print(classification_report(y_test, y_pred_entropy))

In [None]:
# Task 5: Gini Model
# Train a depth-limited tree that chooses splits by Gini impurity.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at every split; without a seed, equally good splits can be chosen
# differently from run to run, making the report below non-reproducible.
clf_gini = DecisionTreeClassifier(criterion="gini", max_depth=3, random_state=42)
clf_gini.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred_gini = clf_gini.predict(X_test)

print("\n--- Gini Model Report ---")
print(classification_report(y_test, y_pred_gini))

### Bonus Questions Answers

1. **Comparison:** Both usually give the same result on small datasets like this.
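2. **Difference:** Entropy measures impurity as $H = -\sum_i p_i \log_2 p_i$, which requires logarithms and is slightly slower to compute; Gini measures impurity as $G = 1 - \sum_i p_i^2$, which uses only squared class probabilities and is slightly faster.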
3. **Why differ?** Their math formulas are slightly different, so they might pick different split points on complex data.
4. **max_depth=3:** It stops the tree from growing deeper than 3 levels, which limits model complexity and reduces the risk of overfitting.