# Deep Learning Model Implementation

1. Data Loading
2. Label + One-Hot Encoding
3. Tokenization + Padding
4. Embedding
5. Data Splitting
6. Model Implementation

| Model | Accuracy |
| --- | --- |
| LSTM | |
| Bi-LSTM | |
| GRU | |
| Bi-GRU | |
| 1D-CNN | |

In [17]:
import os, joblib
import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding, Bidirectional, Flatten
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [5]:
# Define Data directory
DATA_DIR = '../data/'

# list of poet names
poets = ["abbasinYousuf", "azizMazerwal", "ghaniKhan", "hamzaBaba", "khaliqZiari", "khatirAfridi", "khushalKhanKhattak", "mumtazOrakazi", "munirJan", "rahmanBaba", "rehmatShah", "sahibShahSabir", "salimRiaz"]

poet_labels = {poet: i for i, poet in enumerate(poets)}
print(poet_labels)

{'abbasinYousuf': 0, 'azizMazerwal': 1, 'ghaniKhan': 2, 'hamzaBaba': 3, 'khaliqZiari': 4, 'khatirAfridi': 5, 'khushalKhanKhattak': 6, 'mumtazOrakazi': 7, 'munirJan': 8, 'rahmanBaba': 9, 'rehmatShah': 10, 'sahibShahSabir': 11, 'salimRiaz': 12}


In [6]:
# Load and Label the Data
def load_and_label(data_dir, poets, poet_labels):
    data = []
    labels = []

    for poet in poets:
        poet_dir = os.path.join(data_dir, poet)
        file_path = os.path.join(poet_dir, f'{poet}.txt')

        # Check if the file exists
        if not os.path.exists(file_path):
            print(f'{file_path} does not exist')
            continue

        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            data.extend(lines)
            labels.extend([poet_labels[poet]] * len(lines))

    df = pd.DataFrame({'text': data, 'label': labels})
    return data, labels, df

In [7]:
data, labels, df = load_and_label(DATA_DIR, poets, poet_labels)
df.describe()

Unnamed: 0,label
count,30354.0
mean,7.198854
std,2.391132
min,0.0
25%,6.0
50%,7.0
75%,9.0
max,12.0


In [9]:
X = df['text'].values
y = df['label'].values

# Label Encoding
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(y)

In [15]:
# Plot training history
def plot_training_history(history, title="Model Training"):
    plt.figure(figsize=(12, 5))

    # Plot loss
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title(f"{title} - Loss")
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    # Plot accuracy
    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title(f"{title} - Accuracy")
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

In [10]:
# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, encoded_labels, test_size=0.2, random_state=42)
X_vals, X_test, y_vals, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [12]:
# Feature Extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_vals_tfidf = tfidf_vectorizer.transform(X_vals).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

# Reshape the data to add a third dimension for the LSTM model
X_train_tfidf = np.expand_dims(X_train_tfidf, axis=-1)
X_vals_tfidf = np.expand_dims(X_vals_tfidf, axis=-1)
X_test_tfidf = np.expand_dims(X_test_tfidf, axis=-1)

In [None]:
# Define the LSTM Model
model = Sequential()
model.add(LSTM(128, input_shape=(X_train_tfidf.shape[1], X_train_tfidf.shape[2]), return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.3))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(len(set(encoded_labels)), activation='softmax'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Define the early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train_tfidf, y_train, validation_data=(X_vals_tfidf, y_vals), epochs=10, batch_size=32, callbacks=[early_stopping])

In [None]:
plot_training_history(history, title="Second LSTM Model (TF-IDF) Training")

# Evaluate on the test set
test_loss, test_accuracy = model.evaluate(X_test_tfidf, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Make predictions on the test set
y_probs = model.predict(X_test_tfidf)
y_preds = np.argmax(y_probs, axis=1)

# Confusion Matrix
confusion_matrix = confusion_matrix(y_test, y_preds)
plt.figure(figsize=(10, 8))
sns.heatmap(confusion_matrix, annot=True, fmt='d', xticklabels=poets, yticklabels=poets)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - LSTM Model (TF-IDF)')

# Classification Report
classification_rep = classification_report(y_test, y_preds, target_names=poets)
print(classification_rep)
plt.show()