# Deep Learning Model Implementation

1. Data Loading
2. Label + One-Hot Encoding
3. Tokenization + Padding
4. Embedding
5. Data Splitting
6. Model Implementation

| Model | Accuracy |
| --- | --- |
| LSTM | |
| Bi-LSTM | |
| GRU | |
| Bi-GRU | |
| 1D-CNN | |

In [1]:
import os, joblib
import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding, Bidirectional, Flatten
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

2024-10-28 13:24:15.881622: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-28 13:24:15.895168: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730103855.906791  238674 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730103855.909807  238674 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-28 13:24:15.922047: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [18]:
# Root folder of the corpus (one sub-directory per poet is read from it later)
DATA_DIR = '../data/'

# Poets in the corpus; a poet's position in this list is used as its class id
poets = [
    "abbasinYousuf", "azizMazerwal", "ghaniKhan", "hamzaBaba", "khaliqZiari",
    "khatirAfridi", "khushalKhanKhattak", "mumtazOrakazi", "munirJan",
    "rahmanBaba", "rehmatShah", "sahibShahSabir", "salimRiaz",
]

# Map each poet name to its integer class id (0 .. len(poets) - 1)
poet_labels = dict(zip(poets, range(len(poets))))
print(poet_labels)

{'abbasinYousuf': 0, 'azizMazerwal': 1, 'ghaniKhan': 2, 'hamzaBaba': 3, 'khaliqZiari': 4, 'khatirAfridi': 5, 'khushalKhanKhattak': 6, 'mumtazOrakazi': 7, 'munirJan': 8, 'rahmanBaba': 9, 'rehmatShah': 10, 'sahibShahSabir': 11, 'salimRiaz': 12}


In [19]:
# Load and Label the Data
def load_and_label(data_dir, poets, poet_labels):
    """Read each poet's text file and tag every line with that poet's label.

    For every name in ``poets`` the file ``<data_dir>/<poet>/<poet>.txt`` is
    read as UTF-8. Each line (trailing newline included, exactly as returned
    by ``readlines``) becomes one sample. Poets whose file is missing are
    reported with a printed message and skipped.

    Args:
        data_dir: Directory containing one sub-directory per poet.
        poets: Iterable of poet names to load, in the order to concatenate.
        poet_labels: Mapping from poet name to the label stored per line.

    Returns:
        A ``(data, labels, df)`` tuple: the list of lines, the parallel list
        of labels, and a DataFrame with columns ``text`` and ``label``.
    """
    texts, targets = [], []

    for name in poets:
        source = os.path.join(data_dir, name, f'{name}.txt')

        if os.path.exists(source):
            with open(source, 'r', encoding='utf-8') as handle:
                poem_lines = handle.readlines()
            texts += poem_lines
            targets += [poet_labels[name]] * len(poem_lines)
        else:
            # Best effort: report the missing file and move on to the next poet
            print(f'{source} does not exist')

    frame = pd.DataFrame({'text': texts, 'label': targets})
    return texts, targets, frame

In [20]:
# Load every poet's file into parallel lists plus a DataFrame, then summarise the numeric column
data, labels, df = load_and_label(data_dir=DATA_DIR, poets=poets, poet_labels=poet_labels)
df.describe()

Unnamed: 0,label
count,30354.0
mean,7.198854
std,2.391132
min,0.0
25%,6.0
50%,7.0
75%,9.0
max,12.0


In [5]:
# Raw text lines and their integer poet ids
X, y = df['text'].values, df['label'].values

# Label encoding followed by one-hot expansion (one column per class)
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)
y_categorical = to_categorical(y_encoded)

# TF-IDF vectorization, capped at 5000 features and converted to a dense array
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so test-set statistics feed the features — confirm this is intended.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(X).toarray()

In [6]:
# Sanity check: feature matrix and one-hot label matrix must have the same number of rows
print(*(arr.shape for arr in (X_tfidf, y_categorical)))

(30354, 5000) (30354, 13)


In [7]:
# Hold out 20% of the TF-IDF rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y_categorical,
    test_size=0.2,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((24283, 5000), (6071, 5000), (24283, 13), (6071, 13))

In [8]:
# Useful Functions for Model Building
# Function to plot training history
def plot_training_history(history, title):
    """Draw loss and accuracy curves side by side for one training run.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            the keys ``loss``, ``val_loss``, ``accuracy`` and ``val_accuracy``
            (one value per epoch), e.g. the return value of ``model.fit``.
        title: Prefix used in both subplot titles.
    """
    curves = history.history
    # (subplot position, training key, validation key, display name)
    panels = (
        (1, 'loss', 'val_loss', 'Loss'),
        (2, 'accuracy', 'val_accuracy', 'Accuracy'),
    )

    plt.figure(figsize=(12, 5))
    for position, train_key, val_key, name in panels:
        plt.subplot(1, 2, position)
        plt.plot(curves[train_key], label=f'Training {name}')
        plt.plot(curves[val_key], label=f'Validation {name}')
        plt.title(f'{title} - {name}')
        plt.xlabel('Epoch')
        plt.ylabel(name)
        plt.legend()
    plt.show()

# Function to plot confusion matrix
def plot_confusion_matrix(y_true, y_pred, labels):
    """Plot the confusion matrix of a classifier as an annotated heatmap.

    Args:
        y_true: 1-D sequence of true class ids.
        y_pred: 1-D sequence of predicted class ids, same length as ``y_true``.
        labels: Tick labels for both axes; assumed to be ordered to match the
            sorted class ids that occur in ``y_true``/``y_pred``.
    """
    # Imported here because the notebook's import cell does not bring in
    # `confusion_matrix`; without this the call below raises NameError.
    from sklearn.metrics import confusion_matrix

    conf_matrix = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(10, 8))
    # fmt="d" renders the integer counts without decimals
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.title("Confusion Matrix")
    plt.show()

### Bi-LSTM Model, TF-IDF Vectorization

In [None]:
# Reshape data for LSTM: append a trailing axis so every TF-IDF row becomes a
# sequence of shape (n_features, 1), giving the 3-D input the recurrent layer needs
X_train_reshaped = np.expand_dims(X_train, -1)
X_test_reshaped = np.expand_dims(X_test, -1)

# Define the Bi-LSTM model: one bidirectional LSTM followed by a softmax over the classes
print("Defining the LSTM model...")
LSTM_bi_tfidf = Sequential()
LSTM_bi_tfidf.add(Bidirectional(LSTM(30)))
LSTM_bi_tfidf.add(Dense(y_categorical.shape[1], activation='softmax'))

# Compile the Model
print("Compiling the model...")
LSTM_bi_tfidf.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the Model
# NOTE(review): the test split doubles as the validation set here, so early
# stopping / best-weight selection sees the test data — consider a separate validation split.
print("Training the model...")
history_lstm_tfidf = LSTM_bi_tfidf.fit(
    X_train_reshaped, y_train,
    epochs=10, batch_size=32,
    validation_data=(X_test_reshaped, y_test),
    # patience must be smaller than `epochs`; with patience == epochs the
    # callback could never stop training early
    callbacks=[EarlyStopping(patience=3, restore_best_weights=True)],
)

In [None]:
# Evaluate: turn one-hot targets and predicted probabilities into class ids
print("Evaluating the model...")
y_pred_lstm_tfidf = LSTM_bi_tfidf.predict(X_test_reshaped).argmax(axis=1)
y_true_lstm_tfidf = y_test.argmax(axis=1)

# Confusion matrix with poet names on both axes
plot_confusion_matrix(y_true=y_true_lstm_tfidf, y_pred=y_pred_lstm_tfidf, labels=poets)

# Loss / accuracy curves recorded during training
plot_training_history(history=history_lstm_tfidf, title='LSTM with TF-IDF')

### Bi-LSTM Model, Tokenization + Padding, Embedding

In [9]:
# Tokenization: learn the vocabulary from all lines, then map each line to a list of word ids
# NOTE(review): the tokenizer is fitted on the full corpus before the train/test split — confirm this is intended.
tokenizer = Tokenizer(num_words=500000, oov_token='<OOV>')
tokenizer.fit_on_texts(X)
X_tokenized = tokenizer.texts_to_sequences(X)
word_index = tokenizer.word_index

# Padding: bring every sequence up to the length of the longest one
# (maxlen equals the longest sequence, so nothing is actually truncated)
max_sequence_length = max(map(len, X_tokenized))
X_padded = pad_sequences(
    X_tokenized,
    maxlen=max_sequence_length,
    truncating='post',
)

In [None]:
# Split the padded sequences with the same ratio and seed as the TF-IDF split
X_train_tok, X_test_tok, y_train_tok, y_test_tok = train_test_split(
    X_padded,
    y_categorical,
    test_size=0.2,
    random_state=42,
)
tuple(part.shape for part in (X_train_tok, X_test_tok, y_train_tok, y_test_tok))

In [None]:
# Define the Bi-LSTM model on top of a trainable embedding
embedding_dim = 300  # size of each learned word vector

print("Defining the LSTM model...")
LSTM_tok_emb = Sequential()
# Declaring the input shape up front builds the model immediately, so
# summary() below can report output shapes and parameter counts.
LSTM_tok_emb.add(tf.keras.Input(shape=(max_sequence_length,)))
# The padded inputs are 2-D integer ids; the Embedding layer turns them into the
# 3-D (batch, timesteps, embedding_dim) tensor the LSTM requires. Without it the
# recurrent layer rejects the input. +1 leaves room for the padding id 0.
LSTM_tok_emb.add(Embedding(len(word_index) + 1, embedding_dim))
LSTM_tok_emb.add(Bidirectional(LSTM(30)))
LSTM_tok_emb.add(Dense(y_categorical.shape[1], activation='softmax'))

# Compile the Model
print("Compiling the model...")
LSTM_tok_emb.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

print("Model Summary:")
LSTM_tok_emb.summary()

# Train the Model
# NOTE(review): the test split doubles as the validation set here, so early
# stopping / best-weight selection sees the test data — consider a separate validation split.
print("Training the model...")
history_lstm_tok_emb = LSTM_tok_emb.fit(
    X_train_tok, y_train_tok,
    epochs=10, batch_size=32,
    validation_data=(X_test_tok, y_test_tok),
    # patience must be smaller than `epochs`; with patience == epochs the
    # callback could never stop training early
    callbacks=[EarlyStopping(patience=3, restore_best_weights=True)],
)

In [None]:
# Evaluate: turn one-hot targets and predicted probabilities into class ids
y_pred_lstm_embed = LSTM_tok_emb.predict(X_test_tok).argmax(axis=1)
y_true_lstm_embed = y_test_tok.argmax(axis=1)

# Confusion matrix with poet names on both axes
plot_confusion_matrix(y_true=y_true_lstm_embed, y_pred=y_pred_lstm_embed, labels=poets)

# Loss / accuracy curves recorded during training
plot_training_history(history=history_lstm_tok_emb, title='LSTM with Embedding')