<a href="https://colab.research.google.com/github/AIsoroush/deep-learning-projects/blob/main/DeepLearning1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

"""
Project: Drug Classification using Deep Learning
Author : Soroush Taqaddos
Email  : itissoroush@gmail.com
Date   : 2025-09-13
Description:
    This project demonstrates a multi-class deep learning model
    to predict drug types based on patient features such as
    Sex, Age, BP, Cholesterol, and Na_to_K levels.

    The project includes:
    - Downloading dataset from Google Drive
    - Data preprocessing (Label Encoding, handling missing values)
    - Train/Test split
    - Deep Learning model with early stopping and dropout
    - Model evaluation using accuracy, confusion matrix, and classification report

    Instructions:
    1. Run `scripts/download_data.py` to download the dataset.
    2. Run `src/preprocess.py` for preprocessing.
    3. Run `src/train.py` to train the model.
    4. Run `src/evaluate.py` to evaluate the model performance.
"""


In [17]:
# -------------------------------
# Import required libraries
# -------------------------------
import tensorflow as tf        # TensorFlow for Deep Learning
from tensorflow import keras   # Keras API for building neural networks
import pandas as pd            # Pandas for data manipulation
import numpy as np             # NumPy for numerical operations


### Download dataset from Google Drive

In [18]:
# -------------------------------
# Download dataset
# -------------------------------
import gdown
import os

# Make sure the 'data' folder exists
os.makedirs("data", exist_ok=True)

# Google Drive file ID
file_id = "1jsbjvaITnAPPJ7LAEmnfyHOEvORmoJfw"

# Construct direct download URL
url = f"https://drive.google.com/uc?id={file_id}"

# Path to save the downloaded dataset
out_path = "data/drug_dataset.csv"

# Download the dataset
print("Downloading dataset...")
gdown.download(url, out_path, quiet=False)
print(f"✅ Dataset downloaded to {out_path}")

# Store file path in variable for later use
file = out_path


Downloading dataset...


Downloading...
From: https://drive.google.com/uc?id=1jsbjvaITnAPPJ7LAEmnfyHOEvORmoJfw
To: /content/data/drug_dataset.csv
100%|██████████| 5.83k/5.83k [00:00<00:00, 7.63MB/s]

✅ Dataset downloaded to data/drug_dataset.csv





In [19]:
# -------------------------------
# Import dataset from CSV file
# -------------------------------
import pandas as pd

df = pd.read_csv(file)  # Load dataset into a pandas DataFrame

# Preview the first 5 rows to verify the data
df.head()


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [9]:
# -------------------------------
# Convert categorical features to numeric using LabelEncoder
# -------------------------------
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Encode 'Sex' column (e.g., Male/Female → 0/1)
le.fit(df['Sex'])
df['Sex'] = le.transform(df['Sex'])

# Encode 'BP' column (e.g., HIGH/NORMAL/LOW → 0/1/2)
le.fit(df['BP'])
df['BP'] = le.transform(df['BP'])

# Encode 'Cholesterol' column (e.g., HIGH/NORMAL → 0/1)
le.fit(df['Cholesterol'])
df['Cholesterol'] = le.transform(df['Cholesterol'])

# Encode target column 'Drug' (convert drug types to numeric labels)
le.fit(df['Drug'])
df['Drug'] = le.transform(df['Drug'])

# Now all categorical columns are numeric, ready for ML/DL models


In [10]:
# -------------------------------
# Drop rows with missing values
# -------------------------------
df.dropna(inplace=True)  # Remove any rows that contain NaN values

# Preview first 5 rows to verify changes
df.head()


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,0,0,25.355,4
1,47,1,1,0,13.093,2
2,47,1,1,0,10.114,2
3,28,0,2,0,7.798,3
4,61,0,1,0,18.043,4


In [11]:
# Separate features (X) and target (y)
x = df.drop('Drug',axis=1).values #all columns except 'Drug' as input features
y = df['Drug'].values

# Split dataset into training and testing sets
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y = train_test_split(x,y,test_size=0.2,random_state=42)

# Print shapes of train and test sets
print(f'train data {train_x.shape} , {train_y.shape}')
print(f'test data {test_x.shape} , {test_y.shape}')


train data (160, 5) , (160,)
test data (40, 5) , (40,)


In [16]:
df.Drug.unique()

array([4, 2, 3, 0, 1])

In [23]:
# -------------------------------
# Build the Deep Learning Model
# -------------------------------
model = keras.Sequential([
    keras.Input(shape=(5,)),                  # Input layer with 5 features
    keras.layers.Dense(256, activation='relu'), # First hidden layer with 256 neurons and ReLU
    keras.layers.Dropout(0.3),               # Dropout layer to reduce overfitting (30% dropout)
    keras.layers.Dense(256, activation='relu'), # Second hidden layer
    keras.layers.Dense(256, activation='relu'), # Third hidden layer
    keras.layers.Dense(5, activation='softmax') # Output layer for 5 classes with softmax
])

# -------------------------------
# Compile the Model
# -------------------------------
model.compile(
    optimizer='Adam',                         # Adam optimizer
    loss='sparse_categorical_crossentropy',   # Sparse categorical crossentropy for multi-class classification
    metrics=['accuracy']                       # Track accuracy during training
)

# -------------------------------
# Setup Early Stopping Callback
# -------------------------------
from keras.callbacks import EarlyStopping

es = EarlyStopping(
    monitor='val_loss',         # Monitor validation loss
    patience=5,                 # Stop training after 5 epochs of no improvement
    restore_best_weights=True   # Restore model weights from the epoch with the best validation loss
)

# -------------------------------
# Train the Model
# -------------------------------
history = model.fit(
    train_x, train_y,           # Training data
    epochs=1000,                # Maximum number of epochs
    validation_data=(test_x, test_y),  # Validation data
    callbacks=[es]              # Use early stopping to prevent overfitting
)


Epoch 1/1000
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 381ms/step - accuracy: 0.3720 - loss: 3.2158 - val_accuracy: 0.5000 - val_loss: 1.7702
Epoch 2/1000
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.5095 - loss: 1.9209 - val_accuracy: 0.3750 - val_loss: 1.2985
Epoch 3/1000
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.3707 - loss: 1.5819 - val_accuracy: 0.4500 - val_loss: 1.1588
Epoch 4/1000
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.4560 - loss: 1.3583 - val_accuracy: 0.5000 - val_loss: 1.3495
Epoch 5/1000
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5223 - loss: 1.4450 - val_accuracy: 0.5500 - val_loss: 1.1181
Epoch 6/1000
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5696 - loss: 1.1965 - val_accuracy: 0.5250 - val_loss: 1.0784
Epoch 7/1000
[1m5/5[0m [32m━━━

In [28]:
# -------------------------------
# Evaluate the Model on Test Data
# -------------------------------
loss, acc = model.evaluate(test_x, test_y)  # Compute loss and accuracy on test set

# Print results with 2 decimal places
print(f'Loss: {loss:.2f}')
print(f'Accuracy: {acc:.2f}')


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.5542 - loss: 0.9738
loss: 0.98
accuracy : 0.55.


In [14]:
# -------------------------------
# Make Predictions on Test Data
# -------------------------------
y_pred = model.predict(test_x)  # Predict probabilities for each class

# Convert probabilities to class labels (choose the class with highest probability)
y_pred_classes = np.argmax(y_pred, axis=1)

# -------------------------------
# Evaluate Performance with Confusion Matrix and Classification Report
# -------------------------------
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix shows the counts of true vs predicted classes
print(confusion_matrix(test_y, y_pred_classes))

# Classification report shows precision, recall, f1-score, and support for each class
print(classification_report(test_y, y_pred_classes))


NameError: name 'model' is not defined