Classifying Digit Images Using Decision Trees

In [1]:
import tensorflow as tf

# Fetch MNIST through the Keras dataset helper; it returns a
# (train, test) pair of (images, labels) tuples.
mnist = tf.keras.datasets.mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Sanity-check the image array dimensions.
print(X_train.shape, X_test.shape)  # expected: (60000, 28, 28) (10000, 28, 28)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
11490434/11490434 ━━━━━━━━━━━━━━━━━━━━ 50s 4us/step
(60000, 28, 28) (10000, 28, 28)


In [2]:
# Rescale both image sets by 1/255 so pixel values land in [0, 1]
# (assumes the raw intensities span 0-255 — TODO confirm).
X_train, X_test = X_train / 255.0, X_test / 255.0


In [3]:
# Each 28x28 image becomes one row of 784 features; -1 lets reshape
# infer the number of rows from the array size.
n_pixels = 28 * 28
X_train_flat = X_train.reshape(-1, n_pixels)
X_test_flat = X_test.reshape(-1, n_pixels)

# Confirm the flattened layout.
print(X_train_flat.shape, X_test_flat.shape)  # expected: (60000, 784) (10000, 784)


(60000, 784) (10000, 784)


In [4]:
from sklearn.tree import DecisionTreeClassifier

# Build the tree with a fixed random_state so repeated runs use the same seed.
clf = DecisionTreeClassifier(random_state=42)

# Train on the flattened training images and their digit labels.
clf.fit(X=X_train_flat, y=y_train)


In [5]:
# Predict a digit label for every flattened test image.
y_pred = clf.predict(X=X_test_flat)


In [6]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Fraction of test images whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Counts of (true label, predicted label) pairs; rows are true labels.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy:.4f}")
# sep="\n" puts the matrix on the line after its heading.
print("Confusion Matrix:", conf_matrix, sep="\n")


Accuracy: 0.8754
Confusion Matrix:
[[ 914    1    7    4    6    9   16    5    8   10]
 [   0 1084    9    8    2    9    5    3   14    1]
 [  13   11  887   29   15    6    9   24   30    8]
 [   7    8   34  861    8   40    3    6   17   26]
 [   8    4   11    6  858    5   18   10   20   42]
 [  15    8    5   39    6  740   25    6   32   16]
 [  21    5   11    9   23   15  846    3   20    5]
 [   2    7   21   24   12    5    3  925    9   20]
 [   8    9   33   34   21   32   14   12  785   26]
 [  14    5   10   22   45   10    6   20   23  854]]


Classifying Wine Quality Using Decision Trees



In [1]:
import pandas as pd

# Read the wine-quality table from a CSV in the working directory.
wine_data = pd.read_csv('WineQT.csv')

# Preview the first five rows to confirm the columns parsed as expected.
print(wine_data.head(n=5))


   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  Id  
0      9.4        5   0  
1      9.8        5   1  
2      9

In [3]:
# Report how many missing values each column contains.
print(wine_data.isnull().sum())

# Impute any gaps with the per-column mean.
# - The result is rebound instead of using inplace=True, so the cell does not
#   mutate the frame behind the reader's back.
# - numeric_only=True keeps the mean from raising if a non-numeric column is
#   ever present; such columns are simply left unfilled.
wine_data = wine_data.fillna(wine_data.mean(numeric_only=True))


fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
Id                      0
dtype: int64


In [4]:
# Binary target: 1 ("good") when quality >= 6, otherwise 0 ("bad").
# The comparison is vectorised and the boolean mask is cast to int64.
wine_data['quality_label'] = (wine_data['quality'] >= 6).astype('int64')


In [5]:
from sklearn.preprocessing import StandardScaler

# Physico-chemical columns used as model inputs ('quality', 'Id' and the
# derived label are deliberately left out).
features = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

# Standardise each feature column and write the scaled values back
# into the same columns of wine_data.
# NOTE(review): the scaler is fitted on every row, before the train/test
# split performed in a later cell, so test-set statistics feed into the
# scaling — confirm this is acceptable.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(wine_data[features])
wine_data[features] = scaled_values


In [6]:
# X holds the selected feature columns; y holds the binary quality label.
X, y = wine_data[features], wine_data['quality_label']


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [8]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed random_state so repeated runs use the same seed.
clf = DecisionTreeClassifier(random_state=42)

# Fit on the training split only.
clf.fit(X=X_train, y=y_train)


In [9]:
# Predicted quality label (0 or 1) for each held-out row.
y_pred = clf.predict(X=X_test)


In [10]:
from sklearn.metrics import accuracy_score

# Share of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.6880


In [11]:
from sklearn.metrics import roc_auc_score

# predict_proba returns one column per class; column index 1 is taken as the
# score for the positive ("good") label.
class_probabilities = clf.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# Area under the ROC curve computed from those positive-class scores.
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC Score: {roc_auc:.4f}")


ROC-AUC Score: 0.6870


In [12]:
from sklearn.metrics import confusion_matrix

# 2x2 table of (true label, predicted label) counts; rows are true labels.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
# sep="\n" puts the matrix on the line after its heading.
print("Confusion Matrix:", conf_matrix, sep="\n")


Confusion Matrix:
[[103  49]
 [ 58 133]]


In [1]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score



In [2]:
# Read the Lending Club table; the path is relative to the working
# directory, so adjust it if the CSV lives elsewhere.
df = pd.read_csv(filepath_or_buffer='lending_club.csv')

In [4]:
# Per-column count of missing entries (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0                             0
emp_title                            833
emp_length                           817
state                                  0
homeownership                          0
annual_income                          0
verified_income                        0
debt_to_income                        24
annual_income_joint                 8505
verification_income_joint           8545
debt_to_income_joint                8505
delinq_2y                              0
months_since_last_delinq            5658
earliest_credit_line                   0
inquiries_last_12m                     0
total_credit_lines                     0
open_credit_lines                      0
total_credit_limit                     0
total_credit_utilized                  0
num_collections_last_12m               0
num_historical_failed_to_pay           0
months_since_90d_late               7715
current_accounts_delinq                0
total_collection_amount_ever           0
current_installm