In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Fetch the UCI letter-recognition data set straight from the archive.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data"

# The file is read without a header row, so column labels are supplied here:
# the target column 'lettr' followed by sixteen columns named x1 .. x16.
column_names = ['lettr', *(f'x{i}' for i in range(1, 17))]

# header=None is what pandas applies implicitly when `names` is given;
# it is spelled out here so the first line of the file is clearly treated as data.
data = pd.read_csv(url, header=None, names=column_names)

# Quick sanity check: overall dimensions and a preview of the first rows.
print("Dataset shape:", data.shape)
print("\nFirst few rows:", data.head(), sep="\n")

Dataset shape: (20000, 17)

First few rows:
  lettr  x1  x2  x3  x4  x5  x6  x7  x8  x9  x10  x11  x12  x13  x14  x15  x16
0     T   2   8   3   5   1   8  13   0   6    6   10    8    0    8    0    8
1     I   5  12   3   7   2  10   5   5   4   13    3    9    2    8    4   10
2     D   4  11   6   8   6  10   6   2   6   10    3    7    3    7    3    9
3     N   7  11   6   6   3   5   9   4   6    4    4   10    6   10    2    8
4     G   2   1   3   1   1   8   6   6   6    6    5    9    1    7    5   10


In [3]:
# Separate the label column ('lettr') from the remaining predictor columns.
y = data['lettr']
X = data.drop(columns='lettr')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Model 1: a single decision tree with a fixed seed.
print("\n--- Decision Tree ---")

# fit() returns the estimator itself, so construction and training are chained.
dt_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the fitted tree on the test split.
dt_pred = dt_clf.predict(X_test)
dt_acc = accuracy_score(y_true=y_test, y_pred=dt_pred)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print(f"Accuracy: {dt_acc:.4f}")
print("\nClassification Report:", classification_report(y_test, dt_pred), sep="\n")


--- Decision Tree ---
Accuracy: 0.8835

Classification Report:
              precision    recall  f1-score   support

           A       0.96      0.93      0.95       149
           B       0.84      0.84      0.84       153
           C       0.93      0.82      0.87       137
           D       0.82      0.90      0.86       156
           E       0.79      0.91      0.84       141
           F       0.79      0.85      0.82       140
           G       0.84      0.87      0.86       160
           H       0.77      0.79      0.78       144
           I       0.90      0.91      0.91       146
           J       0.89      0.89      0.89       149
           K       0.84      0.84      0.84       130
           L       0.91      0.92      0.92       155
           M       0.92      0.91      0.91       168
           N       0.93      0.87      0.90       151
           O       0.89      0.87      0.88       145
           P       0.92      0.88      0.90       173
           Q     

In [5]:
# Model 2: a random forest of 100 trees with a fixed seed.
print("\n--- Random Forest ---")

# fit() returns the estimator itself, so construction and training are chained.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the fitted forest on the test split.
rf_pred = rf_clf.predict(X_test)
rf_acc = accuracy_score(y_true=y_test, y_pred=rf_pred)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print(f"Accuracy: {rf_acc:.4f}")
print("\nClassification Report:", classification_report(y_test, rf_pred), sep="\n")


--- Random Forest ---
Accuracy: 0.9615

Classification Report:
              precision    recall  f1-score   support

           A       0.98      1.00      0.99       149
           B       0.89      0.97      0.93       153
           C       0.99      0.92      0.95       137
           D       0.90      0.97      0.94       156
           E       0.90      0.99      0.95       141
           F       0.93      0.97      0.95       140
           G       0.98      0.96      0.97       160
           H       0.96      0.89      0.92       144
           I       0.99      0.92      0.96       146
           J       0.95      0.98      0.97       149
           K       0.94      0.91      0.93       130
           L       0.99      0.97      0.98       155
           M       0.97      0.99      0.98       168
           N       0.98      0.92      0.95       151
           O       0.95      0.95      0.95       145
           P       0.97      0.95      0.96       173
           Q     

In [6]:
# Pair each predictor column with the importance score reported by the fitted
# random forest, then order the table from most to least important.
feature_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': rf_clf.feature_importances_})
    .sort_values(by='importance', ascending=False)
)

# Show the five highest-ranked features (the row count matches the heading).
print("\nTop 5 Most Important Features:", feature_importance.head(5), sep="\n")


Top 5 Most Important Features:
   feature  importance
12     x13    0.116806
14     x15    0.104144
8       x9    0.093058
10     x11    0.083969
11     x12    0.082482
