# Data

In [None]:
import pandas as pd
# Load the labelled dataset into `data`; fill in the path below.
# Later cells read the 'text' and 'Category' columns of this frame.
# NOTE: only unpickle files from a trusted source -- unpickling can run arbitrary code.
data = pd.read_pickle('')
# Alternative loaders for other file formats:
# data = pd.read_table('')
# data = pd.read_csv('')
# NOTE(review): later cells also use a `data_test` frame that is not loaded
# anywhere in the visible notebook -- presumably it should be loaded here too.

# Tokenize, clean, and simplify data

Optional

In [None]:
import spacy
import string
# Load the spaCy pipeline used by `spacy_tokenizer` for lemmatisation.
nlp = spacy.load("en_core_web_lg")
# Stop-word collection shipped with the pipeline's language defaults.
stop_words = nlp.Defaults.stop_words
print(stop_words)
# `string.punctuation` is a single str of ASCII punctuation characters,
# so `x in punctuations` is a substring test, not a set lookup.
punctuations = string.punctuation
print(punctuations)

In [None]:
def spacy_tokenizer(sentence):
    """Return `sentence` as a space-joined string of cleaned lemmas.

    Each token produced by the module-level `nlp` pipeline is replaced by its
    lemma, lower-cased and stripped of surrounding whitespace. A lemma is then
    dropped when it is found in `stop_words` or in `punctuations`. Because
    `punctuations` is a str (`string.punctuation`), that second check is a
    substring test, so an empty lemma is dropped as well.
    """
    lemmas = (token.lemma_.lower().strip() for token in nlp(sentence))
    kept = [
        lemma
        for lemma in lemmas
        if not (lemma in stop_words or lemma in punctuations)
    ]
    return " ".join(kept)

In [None]:
# Store the cleaned, lemmatised version of every document in a new column.
# NOTE(review): the cells below keep reading the raw 'text' column, so this
# column is not used anywhere else in the visible notebook.
data['tokenize'] = data['text'].apply(spacy_tokenizer)

# Text Classification Methods:

## Sentence_transformers + non-deep models

In [None]:
from sentence_transformers import SentenceTransformer,util,losses
# Sentence encoder used to embed each document for the classical classifiers.
# NOTE(review): the name `model` is reused for the CNN further down; re-run
# this cell before encoding again after that section has been executed.
model = SentenceTransformer('AHDMK/Sentence-GISTEmbedLoss-BioBert-Allnli-scinli') #use a suitable model for the task from huggingface

In [None]:
# Encode every document in one batched call instead of a per-row `.apply`:
# `encode` accepts a list of sentences and batches them internally, which
# avoids one forward pass per row and is much faster, especially on a GPU.
text_embeddings = model.encode(data['text'].tolist(), show_progress_bar=True)
# Keep the same column layout as before: one 1-D embedding vector per row.
# NOTE(review): `data_test` needs an 'embeddings' column as well before the
# train/test lists are built further down; nothing visible here creates it.
data['embeddings'] = list(text_embeddings)

In [None]:
#if a gpu is available, encode the texts as one batch instead of row by row:
#list_data = [x for x in data['text'][:5000]]
#x = model.encode(list_data)
#then store the vectors in a dictionary and concat them to the dataframe

In [None]:
#get classes
# Distinct labels in order of first appearance; a label's position in this
# list is the integer id it is mapped to in the next cell.
classes=data['Category'].unique().tolist()
nb_classes = len(classes)
print(nb_classes)
print(classes)

In [None]:
#turn categories to indices
# Build the label -> index lookup once and apply it in a single vectorised
# pass. The previous nested loops were O(rows x classes), used the positional
# counter as a `.loc` label (wrong for any non-default index), and rewrote the
# column while still scanning it, so labels that are themselves small integers
# could be remapped twice.
class_to_index = {label: index for index, label in enumerate(classes)}

# Labels missing from `classes` (e.g. categories only present in the test
# frame) are left untouched, exactly as before.
data['Category'] = data['Category'].map(
    lambda label: class_to_index.get(label, label))
data_test['Category'] = data_test['Category'].map(
    lambda label: class_to_index.get(label, label))

In [None]:

from sklearn.model_selection import train_test_split
# Plain Python lists of embedding vectors and integer labels, as consumed by
# the scikit-learn estimators below.
X_train = data['embeddings'].to_list()
y_train = data['Category'].to_list()
# NOTE(review): `data_test` and its 'embeddings' column are not created in the
# visible notebook -- they must exist before this cell runs.
X_test = data_test['embeddings'].to_list()
y_test = data_test['Category'].to_list()
# If there is no separate test set, build one with `train_test_split` instead.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# (display name, unfitted estimator) pairs compared by cross-validation below.
clfs = [
    (
        'LogisticRegression',
        LogisticRegression(class_weight='balanced', max_iter=3000),
    ),
    (
        'RandomForest',
        RandomForestClassifier(n_estimators=75, max_depth=18, random_state=0),
    ),
    (
        'KNN 5',
        KNeighborsClassifier(n_neighbors=5),
    ),
    (
        'SVM C1',
        SVC(C=1, class_weight='balanced'),
    ),
]

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score


def print_val_scores(scores: list[float]) -> None:
  """Print the mean and the individual cross-validation scores.

  Args:
    scores: Iterable of numeric scores (a list or a NumPy array).

  The scores are converted to plain Python floats first, so NumPy scalars are
  printed as `0.912` rather than `np.float64(0.912)`. An empty input prints a
  mean of `nan` without triggering NumPy's empty-mean warning.
  """
  values = [float(score) for score in scores]
  mean_score = float(np.mean(values)) if values else float('nan')

  print(f'Cross validation scores: mean: {mean_score:.3f}, '
        f'all: {[round(value, 3) for value in values]}')


def print_stratified_kfold(clfs: list[tuple[str, object]], X_train: pd.DataFrame,
                           y_train: pd.Series, n_splits: int = 5, cv=None,
                           ) -> None:
  """Cross-validate every classifier and print its scores.

  Args:
    clfs: (display name, unfitted estimator) pairs.
    X_train: Training features (any array-like accepted by scikit-learn).
    y_train: Training labels.
    n_splits: Number of stratified folds used when `cv` is not given.
    cv: Optional explicit value for `cross_val_score`'s `cv` argument (an int
      or a splitter). When provided it takes precedence over `n_splits`.

  Previously the `StratifiedKFold` built from `n_splits` was never passed to
  `cross_val_score`, so `n_splits` had no effect. With the defaults the
  behaviour is unchanged: five unshuffled stratified folds.
  """
  for name, estimator in clfs:
    print(f'\nStratifiedKFold - classifier: {name}:\n')
    splitter = cv if cv is not None else StratifiedKFold(n_splits=n_splits)

    scores = cross_val_score(estimator,
                             X_train,
                             y_train,
                             cv=splitter)

    print_val_scores(scores)

In [None]:
#cross validation
# Compare all candidate classifiers on the training embeddings.
print_stratified_kfold(clfs, X_train, y_train)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

# Final model: use the classifier with the best cross-validation accuracy
# (or try them all if the test data differs from the training data).
clf = SVC(class_weight='balanced', C=1)
clf.fit(X_train, y_train)

# Hard label predictions on the held-out set. Class probabilities would need
# the SVC to be created with probability=True:
# prob = clf.predict_proba(X_test)
# print(prob)
y_pred = clf.predict(X_test)

# Fraction of test samples whose predicted label equals the true label.
accuracy = np.mean(y_pred == y_test)

ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.title('SVM - acc {:.3f}'.format(accuracy), size=15)
plt.show()

## Word2vec + CNN

In [None]:
# 80/20 train/validation split.
# Shuffle first (fixed seed for reproducibility): a plain head/tail split gives
# a skewed validation set whenever the rows are ordered, e.g. by category.
shuffled = data.sample(frac=1.0, random_state=0).reset_index(drop=True)
split_at = int(len(shuffled) * 0.8)
# Both parts get a fresh 0..n-1 index so positional lookups behave the same
# way on either frame.
train = shuffled.iloc[:split_at].reset_index(drop=True)
validate = shuffled.iloc[split_at:].reset_index(drop=True)

In [None]:
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download
# Path of the pretrained word-vector file in word2vec binary format.
# NOTE(review): absolute Kaggle path -- adjust when running elsewhere.
Embedding = r"/kaggle/input/biowordvec/BioWordVec_PubMed_MIMICIII_d200.vec.bin"
# Alternative: download a general-purpose model from the Hugging Face hub.
#w2vmodel = KeyedVectors.load_word2vec_format(hf_hub_download(repo_id="Word2vec/nlpl_222", filename="model.bin"), binary=True, unicode_errors="ignore")
BioWordVec = KeyedVectors.load_word2vec_format(Embedding,binary=True)
# `weights` is the name the dataset and both networks read: `key_to_index` for
# token -> row lookups and `vectors` for the embedding matrix.
weights = BioWordVec #pick a model suitable for the task , load it and set weights = model

In [None]:
#create custom dataset
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchtext.data import get_tokenizer


# Fixed sequence length: longer texts are truncated, shorter ones padded.
max_words = 250
# Tokenizer applied to each text before looking tokens up in the vocabulary.
tokenizer = get_tokenizer("basic_english")

class CustomTextDataset(Dataset):
    """Map-style dataset yielding (token-id tensor, label tensor) pairs.

    Each text is tokenised, every token is looked up in the vocabulary, and
    the id sequence is truncated / right-padded to a fixed length.

    Args:
        df: Frame with a 'text' column and a 'Category' column of integer
            class ids.
        max_len: Output sequence length; defaults to the module-level
            `max_words`.
        pad_index: Id used both for padding and for out-of-vocabulary tokens
            (default 0, as before).
        tokenize: Callable str -> list of tokens; defaults to the
            module-level `tokenizer`.
        vocab: Mapping token -> id; defaults to `weights.key_to_index`.

    NOTE(review): the networks below declare the row of the word 'pad' as
    their `padding_idx`, while padding here uses id 0 -- confirm which is
    intended before changing `pad_index`.
    """

    def __init__(self, df, max_len=None, pad_index=0, tokenize=None, vocab=None):
        # Plain lists are indexed by position, so frames whose index is not
        # 0..n-1 (a slice, a filtered frame, ...) work as well; indexing the
        # Series directly looked rows up by label instead.
        self.labels = df['Category'].tolist()
        self.text = df['text'].tolist()
        self.max_len = max_words if max_len is None else max_len
        self.pad_index = pad_index
        self.tokenize = tokenizer if tokenize is None else tokenize
        self.vocab = weights.key_to_index if vocab is None else vocab

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Truncate first so no work is spent on tokens that are cut off anyway.
        tokens = self.tokenize(self.text[idx])[:self.max_len]
        # `.get` replaces the former bare `except`: only a missing token falls
        # back to the pad id, other errors are no longer silently swallowed.
        ids = [self.vocab.get(token, self.pad_index) for token in tokens]
        ids.extend([self.pad_index] * (self.max_len - len(ids)))
        # Labels are emitted as int64, the dtype CrossEntropyLoss expects.
        return (torch.tensor(ids, dtype=torch.int32),
                torch.tensor(self.labels[idx], dtype=torch.long))
    
# One dataset per split.
# NOTE(review): `data_test` is not created in the visible notebook.
train_set = CustomTextDataset(train)
validation_set = CustomTextDataset(validate)
test_set = CustomTextDataset(data_test)

In [None]:
# Mini-batch loaders. The evaluation helpers below return the true labels in
# the same order as the predictions, so shuffling the validation and test
# loaders does not affect the reported metrics.
batch_size=256
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=True)

In [None]:
# Select the compute device. `device_model` is where the networks, the input
# batches and the labels are moved to; the cells below reference it (and
# `device`) directly, so both names must actually be defined here instead of
# living only in comments.
device_model = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# `device` is the (optional) separate device for a very large embedding layer.
# By default it is the same device as the rest of the model.
device = device_model
# With 2 GPUs the embedding layer can be placed on the second one:
#device_model = torch.device('cuda:0')
#device = torch.device('cuda:1')

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import random
import torch.nn.init as init
import torch.nn.functional as F 

# Height of each convolution's output (max_words - kernel_height + 1) for the
# kernel heights 3, 4 and 5; used as max-pool kernel heights in ConvNet so
# that every branch is pooled over the whole sequence.
k1 = max_words+1-3
k2 = max_words+1-4
k3 = max_words+1-5
# Width of the convolution kernels, i.e. the word-vector dimensionality.
# NOTE(review): presumably matches the 200-d vectors loaded above -- confirm
# when switching to another embedding file.
vector_len = 200
class ConvNet(nn.Module):
    """Text CNN over frozen pretrained word vectors.

    Three parallel convolutions (kernel heights 3, 4 and 5, 100 filters each)
    span the full embedding width; each is max-pooled over the whole sequence,
    the pooled features are concatenated and passed through two linear layers.

    Args:
        embedding_vectors: 2-D array/tensor (vocab_size, embed_dim); defaults
            to the module-level `weights.vectors`.
        num_classes: Number of output classes; defaults to the module-level
            `nb_classes`.
        seq_len: Fixed input sequence length; defaults to the module-level
            `max_words`.
        pad_index: Row used as `padding_idx`; when the default vectors are
            used it defaults to the row of the word 'pad' if that word is in
            the vocabulary, otherwise no padding index is set.

    `forward` returns raw logits of shape (batch, num_classes).
    `nn.CrossEntropyLoss` applies log-softmax itself, so the former softmax
    inside `forward` was applied twice during training and flattened the
    gradients. Apply `F.softmax(logits, dim=1)` to the result when
    probabilities are needed.
    """

    def __init__(self, embedding_vectors=None, num_classes=None, seq_len=None,
                 pad_index=None):
        super().__init__()
        if embedding_vectors is None:
            embedding_vectors = weights.vectors
            if pad_index is None:
                # `.get` avoids a KeyError when the vocabulary has no 'pad'.
                pad_index = weights.key_to_index.get('pad')
        if num_classes is None:
            num_classes = nb_classes
        if seq_len is None:
            seq_len = max_words

        embedding_matrix = torch.as_tensor(embedding_vectors, dtype=torch.float32)
        # Kernel width is read from the matrix instead of a hard-coded size.
        embed_dim = embedding_matrix.size(1)
        # `from_pretrained` freezes the embedding weights by default.
        self.embed = nn.Embedding.from_pretrained(embedding_matrix, padding_idx=pad_index)

        def conv_branch(height):
            # Conv output height is seq_len - height + 1; pooling over exactly
            # that height leaves one value per filter.
            return nn.Sequential(
                nn.Conv2d(1, 100, kernel_size=(height, embed_dim), stride=1, padding=0),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=(seq_len - height + 1, 1), stride=1))

        # Attribute names are kept so existing checkpoints still load.
        self.layer1 = conv_branch(3)
        self.layer2 = conv_branch(4)
        self.layer3 = conv_branch(5)

        self.drop_out = nn.Dropout()
        # 3 branches x 100 filters after concatenation.
        self.fc1 = nn.Linear(1 * 1 * 100 * 3, 100)
        self.fc2 = nn.Linear(100, num_classes)

    def forward(self, x):
        #x=x.to(device)  #if the embedding layer is very large you can set it on a different gpu if available, move the data to it and then back after embedding
        x = self.embed(x)
        # Add the single input channel expected by Conv2d: (batch, 1, seq, dim).
        x = torch.unsqueeze(x, 1)
        #x=x.to(device_model)
        pooled = []
        for branch in (self.layer1, self.layer2, self.layer3):
            features = branch(x)
            features = features.reshape(features.size(0), -1)
            pooled.append(self.drop_out(features))
        out = torch.cat(pooled, 1)
        out = self.fc1(out)
        out = self.fc2(out)
        return out

In [None]:
# Derive the number of output units from the label list instead of
# hard-coding it, so the network always matches the data loaded above.
nb_classes = len(classes)
# NOTE(review): the training call in this section passes its own epoch count,
# so `num_epochs` is currently not read anywhere in the visible notebook.
num_epochs = 15
learning_rate = 0.001

In [None]:
# Build the CNN and place it on the training device.
# NOTE(review): this rebinds `model`, which earlier held the sentence encoder.
model = ConvNet()
model.to(device_model)
#model.embed.to(device)


def freeze_layer(layer):
    """Exclude every parameter of `layer` from gradient updates."""
    for param in layer.parameters():
        param.requires_grad_(False)


# Keep the pretrained word vectors fixed during training.
freeze_layer(model.embed)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import gc

def CalcValLossAndAccuracy(model, loss_fn, val_loader, device=None):
    """Evaluate `model` on `val_loader`, print and return loss and accuracy.

    Args:
        model: Network mapping a batch of inputs to per-class scores.
        loss_fn: Callable `(predictions, targets) -> scalar loss tensor`.
        val_loader: Iterable of `(X, Y)` batches.
        device: Device the batches are moved to; defaults to the module-level
            `device_model`.

    Returns:
        `(mean_batch_loss, accuracy)` as Python floats.

    The model is switched to eval mode for the duration of the call, so
    dropout no longer perturbs the validation metrics, and its previous
    train/eval mode is restored afterwards.
    """
    if device is None:
        device = device_model

    was_training = model.training
    model.eval()
    targets, predictions, losses = [], [], []
    try:
        with torch.no_grad():
            for X, Y in val_loader:
                X, Y = X.to(device), Y.to(device)
                preds = model(X)
                losses.append(loss_fn(preds, Y).item())

                targets.append(Y)
                predictions.append(preds.argmax(dim=-1))
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    targets = torch.cat(targets)
    predictions = torch.cat(predictions)

    mean_loss = torch.tensor(losses).mean().item()
    # Exact fraction of correct predictions, computed without leaving torch.
    accuracy = (predictions == targets).sum().item() / targets.numel()

    print("Valid Loss : {:.3f}".format(mean_loss))
    print("Valid Acc  : {:.3f}".format(accuracy))
    return mean_loss, accuracy


def TrainModel(model, loss_fn, optimizer, train_loader, val_loader, epochs=10):
    """Train `model` for `epochs` epochs, validating after each one.

    Args:
        model: Network to optimise; expected to live on `device_model`.
        loss_fn: Callable `(predictions, targets) -> scalar loss tensor`.
        optimizer: Optimiser over the model's parameters.
        train_loader: Iterable of `(X, Y)` training batches.
        val_loader: Iterable of `(X, Y)` validation batches.
        epochs: Number of passes over `train_loader`.

    Prints the mean training loss per epoch and delegates validation to
    `CalcValLossAndAccuracy`.
    """
    for epoch in range(1, epochs + 1):
        # Make sure dropout is active even if an earlier evaluation left the
        # model in eval mode.
        model.train()
        losses = []
        # The epoch counter is shown on the progress bar.
        for X, Y in tqdm(train_loader, desc="Epoch {}/{}".format(epoch, epochs)):
            X, Y = X.to(device_model), Y.to(device_model)
            Y_preds = model(X)  # forward pass

            loss = loss_fn(Y_preds, Y)
            losses.append(loss.item())

            optimizer.zero_grad()  # clear gradients of the previous step
            loss.backward()        # back-propagate
            optimizer.step()       # update the trainable weights

        print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
        CalcValLossAndAccuracy(model, loss_fn, val_loader)

In [None]:
# Train the CNN.
# NOTE(review): the epoch count is the literal 10, not `num_epochs` defined above.
TrainModel(model, criterion, optimizer, train_loader, validation_loader, 10)

In [None]:
def MakePredictions(model, loader, device=None):
    """Run `model` over `loader` and return true and predicted class ids.

    Args:
        model: Network mapping a batch of inputs to per-class scores.
        loader: Iterable of `(X, Y)` batches.
        device: Device the batches are moved to; defaults to the module-level
            `device_model`.

    Returns:
        `(y_true, y_pred)` as NumPy arrays, in the order the loader yielded
        the samples.

    Inference runs in eval mode under `torch.no_grad()`: dropout is disabled,
    so predictions are deterministic, and no autograd graph is kept alive for
    every batch. The model's previous train/eval mode is restored afterwards.
    """
    if device is None:
        device = device_model

    was_training = model.training
    model.eval()
    targets, scores = [], []
    try:
        with torch.no_grad():
            for X, Y in loader:
                X, Y = X.to(device), Y.to(device)
                scores.append(model(X))
                targets.append(Y)
    finally:
        model.train(was_training)

    scores, targets = torch.cat(scores), torch.cat(targets)
    # Softmax is monotonic, so the arg-max is the same for logits and for
    # already-normalised outputs.
    predicted = F.softmax(scores, dim=-1).argmax(dim=-1)
    return targets.cpu().numpy(), predicted.cpu().numpy()

# True and predicted class ids of the CNN on the test loader.
Y_actual, Y_preds = MakePredictions(model, test_loader)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# Test-set metrics of the Word2vec + CNN model.
print("Test Accuracy : {}".format(accuracy_score(Y_actual, Y_preds)))
print("\nClassification Report : ")
# NOTE(review): `target_names` is the label list collected earlier; its order
# defines the integer ids, so it must still hold the original label names.
print(classification_report(Y_actual, Y_preds, target_names=classes))
print("\nConfusion Matrix : ")
print(confusion_matrix(Y_actual, Y_preds))
ConfusionMatrixDisplay.from_predictions(Y_actual, Y_preds)
# This section evaluates the CNN; the title used to read 'LSTM' (copy-paste
# from the BiLSTM section).
plt.title('CNN', size=15)
plt.show()

In [None]:
text=''#trying the model give the input here
# Same preprocessing as the dataset: tokenise, map tokens to vocabulary ids
# (unknown tokens -> 0), truncate and right-pad with 0 to `max_words`.
tokens = tokenizer(text)
print(tokens)
# `.get` replaces the bare `except`, and the loop variable no longer
# overwrites `text`.
token_ids = [weights.key_to_index.get(token, 0) for token in tokens[:max_words]]
token_ids += [0] * (max_words - len(token_ids))
print(token_ids)

# Batch of one, placed on the device the model lives on (`device_model`, the
# same device the training and evaluation loops use).
batch = torch.tensor([token_ids], dtype=torch.int32).to(device_model)

# Disable dropout and gradient tracking for inference, then restore the
# model's previous mode so later training is unaffected.
was_training = model.training
model.eval()
with torch.no_grad():
    output = model(batch)
model.train(was_training)
print(output)

# Highest-scoring class of the single sample. `output` holds whatever the
# model's forward returns; apply F.softmax(output, dim=1) first if the model
# emits raw logits and a probability is wanted.
best_score, best_index = output[0].max(dim=0)
print("Category :" , classes[int(best_index)],"➜",best_score.cpu().numpy())


## Word2vec + BiLSTM

In [None]:
from torch import nn
from torch.nn import functional as F

# LSTM input size, i.e. the word-vector dimensionality.
# NOTE(review): presumably matches the loaded embedding file -- confirm.
embed_len = 200
# Hidden units per direction.
hidden_dim = 100
# Number of stacked LSTM layers.
n_layers=3

class LSTMClassifier(nn.Module):
    """Bidirectional LSTM classifier over frozen pretrained word vectors.

    Args:
        embedding_vectors: 2-D array/tensor (vocab_size, embed_dim); defaults
            to the module-level `weights.vectors`.
        num_classes: Number of output classes; defaults to the module-level
            `nb_classes`.
        pad_index: Row used as `padding_idx`; when the default vectors are
            used it defaults to the row of the word 'pad' if that word is in
            the vocabulary, otherwise no padding index is set.
        hidden_size: Hidden units per direction; defaults to the module-level
            `hidden_dim`.
        num_layers: Number of stacked LSTM layers; defaults to the
            module-level `n_layers`.

    `forward` returns raw logits of shape (batch, num_classes).
    """

    def __init__(self, embedding_vectors=None, num_classes=None, pad_index=None,
                 hidden_size=None, num_layers=None):
        super().__init__()
        if embedding_vectors is None:
            embedding_vectors = weights.vectors
            if pad_index is None:
                # `.get` avoids a KeyError when the vocabulary has no 'pad'.
                pad_index = weights.key_to_index.get('pad')
        if num_classes is None:
            num_classes = nb_classes
        if hidden_size is None:
            hidden_size = hidden_dim
        if num_layers is None:
            num_layers = n_layers

        embedding_matrix = torch.as_tensor(embedding_vectors, dtype=torch.float32)
        # `from_pretrained` freezes the embedding weights by default.
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, padding_idx=pad_index)
        # The LSTM input size is read from the matrix instead of a constant.
        self.lstm = nn.LSTM(input_size=embedding_matrix.size(1), hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True,
                            bidirectional=True)
        # Input is 2 x hidden_size: forward and backward final states.
        self.linear = nn.Linear(2*hidden_size, num_classes)

    def forward(self, x):
        #x = x.to(device)
        embeddings = self.embedding(x)
        #embeddings= embeddings.to(device_model)
        # No explicit initial state: the LSTM then starts from zeros. The
        # former random initial state made every forward pass (including
        # inference) non-deterministic.
        _, (hidden, _) = self.lstm(embeddings)
        # Final hidden state of the last layer for each direction. Taking
        # `output[:, -1]` instead would give the backward direction only the
        # last token as context.
        summary = torch.cat((hidden[-2], hidden[-1]), dim=1)
        return self.linear(summary)

In [None]:
from torch.optim import Adam

# Hyper-parameters of the BiLSTM run.
epochs = 20
learning_rate = 1e-3

loss_fn = nn.CrossEntropyLoss()

# Build the classifier and place it on the training device.
lstm_classifier = LSTMClassifier()
lstm_classifier.to(device_model)
#lstm_classifier.embedding.to(device)
optimizer = Adam(lstm_classifier.parameters(), lr=learning_rate)


def freeze_layer(layer):
    """Exclude every parameter of `layer` from gradient updates."""
    for param in layer.parameters():
        param.requires_grad_(False)


# Keep the pretrained word vectors fixed during training.
freeze_layer(lstm_classifier.embedding)

In [None]:
# Train the BiLSTM with the loss defined for this section (`loss_fn`); the
# call used to pass `criterion`, which only exists if the CNN section ran.
# NOTE(review): the epoch count is the literal 10, not `epochs` defined above.
TrainModel(lstm_classifier, loss_fn, optimizer, train_loader, validation_loader, 10)

In [None]:
# True and predicted class ids of the BiLSTM on the test loader
# (overwrites the CNN results held in the same names).
Y_actual, Y_preds = MakePredictions(lstm_classifier, test_loader)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# Test-set metrics of the Word2vec + BiLSTM model.
test_accuracy = accuracy_score(Y_actual, Y_preds)
print(f"Test Accuracy : {test_accuracy}")

print("\nClassification Report : ")
report = classification_report(Y_actual, Y_preds, target_names=classes)
print(report)

print("\nConfusion Matrix : ")
matrix = confusion_matrix(Y_actual, Y_preds)
print(matrix)

ConfusionMatrixDisplay.from_predictions(Y_actual, Y_preds)
plt.title('LSTM', size=15)
plt.show()