### importing packages

In [1]:
import pandas as pd
import numpy as np
import os
import cohere
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from dotenv import load_dotenv
from sklearn.preprocessing import LabelEncoder
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset


In [1]:
import warnings

# Install a blanket "ignore" filter so warnings raised by later cells are not shown.
warnings.simplefilter('ignore')

### Reading and exploring the dataset

In [2]:
# Locate the extracted-text CSV relative to the current working directory.
# Each path component is passed separately so os.path.join inserts the
# separator of the running platform (a literal backslash only works on Windows).
text_path = os.path.join(os.getcwd(), '..', 'Datasets', 'extracted_data.csv')
data = pd.read_csv(text_path)
data.head()

Unnamed: 0,custom_id,image_url,markdown_text,raw_text
0,Resume/0000153377.jpg,https://filstrg.blob.core.windows.net/tobacco-...,"## Fitzmaurice, Mary Anne\n\n### Research Biol...","\nFitzmaurice, Mary Anne\nResearch Biologist\n..."
1,Resume/10036815_10036823.jpg,https://filstrg.blob.core.windows.net/tobacco-...,# CURRICULUM VITAE\n\n## Name: \nPeter M. Howl...,"CURRICULUM VITAE\n\nName: Peter M. Howley, M.D..."
2,Resume/10087799_10087801.jpg,https://filstrg.blob.core.windows.net/tobacco-...,![Form No. 1a (For N.I.H. Continuation Grant a...,Form No. 1a (For N.I.H. Continuation Grant app...
3,Resume/10150247_10150256.jpg,https://filstrg.blob.core.windows.net/tobacco-...,### UNIVERSITY OF MIAMI\n\n#### CURRICULUM VIT...,UNIVERSITY OF MIAMI\n\nCURRICULUM VITAE\n\nSta...
4,Resume/11300115-0116.jpg,https://filstrg.blob.core.windows.net/tobacco-...,# CURRICULUM VITAE\n\n## WILLIAM CARSON HINDS\...,CURRICULUM VITAE\n\nWILLIAM CARSON HINDS\n\nBo...


In [3]:
# The document class is the part of custom_id that precedes the first '/'.
data['class'] = data['custom_id'].map(lambda cid: cid.partition('/')[0])

In [4]:
# Number of rows per document class.
data['class'].value_counts()

class
Memo          620
Email         596
Letter        566
Form          430
Report        265
scientific    258
ADVE          222
Note          200
News          188
Resume        120
Name: count, dtype: int64

In [5]:
# (rows, columns) of the DataFrame.
data.shape

(3465, 5)

In [6]:
# Count of missing values in each column.
data.isna().sum()

custom_id        0
image_url        0
markdown_text    1
raw_text         1
class            0
dtype: int64

In [7]:
# Drop every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')

### generating text embeddings

In [8]:
# Read the variables defined in the local .env file into the process environment.
load_dotenv()

# Embedding client: the endpoint comes from AZURE_ML_COHERE_EMBED_ENDPOINT and
# the credential from AZURE_ML_COHERE_EMBED_CREDENTIAL.
co_embed = cohere.Client(
    base_url=os.getenv("AZURE_ML_COHERE_EMBED_ENDPOINT"),
    api_key=os.getenv("AZURE_ML_COHERE_EMBED_CREDENTIAL"),
)

In [9]:
# Smoke-test the embedding endpoint on the first few documents.
docs = data["raw_text"].head().tolist()
embeddings = co_embed.embed(texts=docs, input_type="classification").embeddings
len(embeddings)

5

In [10]:
# Length of a single embedding vector.
len(embeddings[0])

1024

In [11]:
# Hold out 20% of the documents for testing; the seed is fixed so the split is repeatable.
docs_train, docs_test, classes_train, classes_test = train_test_split(
    data['raw_text'].tolist(),
    data['class'].tolist(),
    test_size=0.20,
    random_state=0,
)


#### generating raw text embeddings

In [12]:
# Embed the training documents with the classification input type.
embeddings_train = co_embed.embed(
    input_type='classification',
    texts=docs_train,
).embeddings

In [13]:
# (length of one embedding vector, number of training embeddings)
len(embeddings_train[0]), len(embeddings_train)

(1024, 2598)

In [15]:
# Embed the held-out documents with the same input type used for the training set.
embeddings_test = co_embed.embed(
    input_type='classification',
    texts=docs_test,
).embeddings

In [16]:
# (length of one embedding vector, number of test embeddings)
len(embeddings_test[0]), len(embeddings_test)

(1024, 866)

In [2]:
# saving the embeddings



### Reading the text embeddings

### encoding the target

In [17]:
# Learn the class-name -> integer mapping from the training labels only,
# then apply that same mapping to both splits.
encoder = LabelEncoder().fit(classes_train)

classes_train_encoded = encoder.transform(classes_train)
classes_test_encoded = encoder.transform(classes_test)

### modelling

In [18]:
# Embeddings become float32 feature tensors.
docs_train_tensor = torch.tensor(embeddings_train, dtype=torch.float32)
docs_test_tensor = torch.tensor(embeddings_test, dtype=torch.float32)

# Encoded labels are stored as float32 column vectors of shape (N, 1).
classes_train_tensor = torch.tensor(classes_train_encoded, dtype=torch.float32).unsqueeze(1)
classes_test_tensor = torch.tensor(classes_test_encoded, dtype=torch.float32).unsqueeze(1)

In [19]:
class TextClassifier(nn.Module):
    """Feed-forward classifier head for fixed-size text embeddings.

    Each hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout, followed by
    a final Linear layer that emits one raw logit per class (no softmax).

    Args:
        input_dim: Size of each input embedding vector.
        hidden_dims: Width of every hidden block, in order.
        num_classes: Number of output logits.
        dropout: Dropout probability applied after every hidden block.

    With the default arguments the layer order, and therefore the
    ``state_dict`` keys ``layer_stack.0`` ... ``layer_stack.8``, is the
    fixed 1024 -> 512 -> 256 -> 10 network.
    """

    def __init__(self, input_dim=1024, hidden_dims=(512, 256), num_classes=10, dropout=0.5):
        super().__init__()
        layers = []
        in_features = input_dim
        for width in hidden_dims:
            layers += [
                nn.Linear(in_features, width),
                nn.BatchNorm1d(width),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
            in_features = width
        # output layer
        layers.append(nn.Linear(in_features, num_classes))
        self.layer_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for x of shape (batch, input_dim)."""
        return self.layer_stack(x)


In [20]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TextClassifier().to(device)
model

TextClassifier(
  (layer_stack): Sequential(
    (0): Linear(in_features=1024, out_features=512, bias=True)
    (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=512, out_features=256, bias=True)
    (5): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.5, inplace=False)
    (8): Linear(in_features=256, out_features=10, bias=True)
  )
)

In [21]:
# Inverse-frequency class weights for the loss.
# Counts come from the training labels only, so the held-out split does not
# influence training. They are reindexed by encoder.classes_ so that entry i
# of the weight vector belongs to the class that LabelEncoder maps to integer i.
class_counts = pd.Series(classes_train).value_counts().reindex(encoder.classes_)
weights = 1.0 / torch.tensor(class_counts.values, dtype=torch.float32)
weights = weights / weights.sum()  # normalise so the weights sum to 1
criterion = nn.CrossEntropyLoss(weight=weights.to(device))

In [22]:
# AdamW optimiser over all model parameters.
optimizer = optim.AdamW(model.parameters(), lr=0.0001, weight_decay=0.01)

# Scale the learning rate by `factor` when the value passed to scheduler.step()
# stops decreasing (mode='min').
# NOTE(review): `verbose` is reported as deprecated in recent PyTorch releases - confirm
# against the installed version.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=3,
    verbose=True,
)



In [23]:
# Mini-batch size shared by both loaders.
batch_size = 32

# Pair every embedding with its label.
train_dataset = TensorDataset(docs_train_tensor, classes_train_tensor)
test_dataset = TensorDataset(docs_test_tensor, classes_test_tensor)

# Only the training batches are shuffled; evaluation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


### training and evaluation

In [24]:
# Train for up to `num_epochs` epochs, evaluating on `test_loader` after each one.
# The checkpoint with the lowest evaluation loss is written to best_model.pth,
# training stops after `patience` consecutive epochs without improvement, and
# the best checkpoint is loaded back into `model` at the end.
# NOTE(review): `test_loader` is used both for checkpoint selection / early
# stopping and for the reported "Validation" numbers, so those numbers are
# optimistic - consider a separate validation split.
num_epochs = 30
best_val_loss = float('inf')
patience = 5
patience_counter = 0

for epoch in range(num_epochs):
    # Training Phase
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        # Class indices must be int64 with shape (batch,). reshape(-1) keeps the
        # batch dimension even for a single-sample batch, where a bare
        # squeeze() would collapse the target to a 0-d tensor.
        targets = targets.to(device, dtype=torch.long).reshape(-1)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == targets).sum().item()
        total += targets.size(0)

    # Loss is averaged over batches; accuracy is computed over samples.
    train_loss = total_loss / len(train_loader)
    train_accuracy = 100 * correct / total

    # Validation Phase
    model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs = inputs.to(device)
            targets = targets.to(device, dtype=torch.long).reshape(-1)
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_correct += (predicted == targets).sum().item()
            val_total += targets.size(0)

    val_loss = val_loss / len(test_loader)
    val_accuracy = 100 * val_correct / val_total

    # Learning rate scheduling
    scheduler.step(val_loss)

    # Early stopping check
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # Save best model
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'best_val_loss': best_val_loss
        }, 'best_model.pth')
    else:
        patience_counter += 1

    # Print epoch statistics
    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"Training - Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.2f}%")
    print(f"Validation - Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.2f}%")

    # Early stopping
    if patience_counter >= patience:
        print("Early stopping triggered!")
        break

# Load best model for final evaluation. map_location remaps the stored tensors
# onto the current device, so a checkpoint written on another device still loads.
checkpoint = torch.load('best_model.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

Epoch [1/30]
Training - Loss: 2.0243, Accuracy: 26.64%
Validation - Loss: 1.7439, Accuracy: 57.04%
Epoch [2/30]
Training - Loss: 1.4874, Accuracy: 51.62%
Validation - Loss: 1.2616, Accuracy: 67.67%
Epoch [3/30]
Training - Loss: 1.1988, Accuracy: 61.86%
Validation - Loss: 1.0725, Accuracy: 72.52%
Epoch [4/30]
Training - Loss: 1.0152, Accuracy: 68.32%
Validation - Loss: 0.9296, Accuracy: 74.48%
Epoch [5/30]
Training - Loss: 0.8744, Accuracy: 72.48%
Validation - Loss: 0.8452, Accuracy: 78.18%
Epoch [6/30]
Training - Loss: 0.7829, Accuracy: 76.79%
Validation - Loss: 0.7786, Accuracy: 77.37%
Epoch [7/30]
Training - Loss: 0.6676, Accuracy: 80.22%
Validation - Loss: 0.7243, Accuracy: 80.60%
Epoch [8/30]
Training - Loss: 0.6068, Accuracy: 83.26%
Validation - Loss: 0.6844, Accuracy: 80.37%
Epoch [9/30]
Training - Loss: 0.5482, Accuracy: 84.45%
Validation - Loss: 0.6574, Accuracy: 80.72%
Epoch [10/30]
Training - Loss: 0.4880, Accuracy: 85.95%
Validation - Loss: 0.6362, Accuracy: 81.41%
Epoch [11

<All keys matched successfully>