In [2]:
import os
import pandas as pd
import numpy as np
import shutil
import sys
import tqdm.notebook as tq
from collections import defaultdict

import torch
import torch.nn as nn

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     

In [3]:
# Root of the Kaggle filesystem; inputs live under input/, outputs under working/.
data_dir = "/kaggle/"
# Tab-separated dump of the 20 Newsgroups posts with one indicator column per label.
dataset_path = os.path.join(data_dir, "input", "20ng-bydata-csv/20ng_bydate.tsv")
df_data = pd.read_csv(dataset_path, sep="\t")

In [4]:
# Preview the first five rows (head() defaults to n=5).
df_data.head()

Unnamed: 0,text,autos,baseball,christian,comp,crypt,electronics,forsale,graphics,hardware,...,rec,religion,sci,soc,space,sport,sys,talk,windows,x
0,From: astein@nysernet.org (Alan Stein)\nSubjec...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,From: gary@colossus.cgd.ucar.edu (Gary Strand)...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,From: mls@panix.com (Michael Siemon)\nSubject:...,0,0,1,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,From: willis@oracle.SCG.HAC.COM (Stan Willis)\...,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
4,From: richard@amc.com (Richard Wernick)\nSubje...,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0


In [5]:
from sklearn.model_selection import train_test_split

# First carve off 30% of the rows as a hold-out pool; the remaining 70% is the training set.
df_train, df_holdout = train_test_split(
    df_data, test_size=0.30, shuffle=True, random_state=77
)
# Then cut the hold-out pool in half: one half for testing, the other for validation.
df_test, df_valid = train_test_split(
    df_holdout, test_size=0.50, shuffle=True, random_state=88
)

In [6]:
# Sanity check: report the (rows, columns) shape of each of the three partitions.
print(f"Train: {df_train.shape}, Test: {df_test.shape}, Valid: {df_valid.shape}")

Train: (12632, 30), Test: (2707, 30), Valid: (2708, 30)


In [7]:
# Hyperparameters
MAX_LEN = 256            # token length every example is padded/truncated to
# The same batch size is used for all three data loaders.
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = TEST_BATCH_SIZE = 32
EPOCHS = 10              # number of passes over the training set
LEARNING_RATE = 1e-5     # optimizer step size
THRESHOLD = 0.5          # threshold for the sigmoid


In [8]:
from transformers import BertTokenizer, BertModel
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint; the same
# checkpoint name is used for the encoder in BERTClass, so vocabulary and model match.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
# Test the tokenizer
test_text = "We are testing BERT tokenizer."
# Encoding options: add the special tokens, cut/pad to exactly 50 tokens and
# return PyTorch tensors together with the attention mask.
encode_options = dict(
    add_special_tokens=True,
    max_length=50,
    truncation=True,
    padding="max_length",
    return_attention_mask=True,
    return_tensors="pt",
)
# generate encodings
encodings = tokenizer.encode_plus(test_text, **encode_options)
# we get a dictionary with three keys (see: https://huggingface.co/transformers/glossary.html)
print(encodings)
     

{'input_ids': tensor([[  101,  2057,  2024,  5604, 14324, 19204, 17629,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])}


In [10]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item for multi-label training.

    Args:
        df: DataFrame holding the raw text column and one column per label.
        tokenizer: object exposing ``encode_plus`` (here a Hugging Face BERT tokenizer).
        max_len: length every encoded example is padded/truncated to.
        target_list: names of the label columns, in the order the model should predict them.
        text_col: name of the column holding the raw text. Defaults to ``'text'``,
            which is the column the original implementation hard-coded.
    """

    def __init__(self, df, tokenizer, max_len, target_list, text_col='text'):
        self.tokenizer = tokenizer
        self.df = df
        # Materialise texts and labels positionally so that __getitem__ is
        # independent of the (possibly shuffled) DataFrame index.
        self.title = list(df[text_col])
        self.targets = self.df[target_list].values
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.title)

    def __getitem__(self, index):
        """Return the encoded text, its label vector and the normalised text.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask`` and
            ``token_type_ids``, a float32 tensor ``targets`` and the
            whitespace-normalised text under ``title``.
        """
        # Collapse every run of whitespace (including newlines) into one space.
        title = " ".join(str(self.title[index]).split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            # The tokenizer returns tensors with a leading batch axis of size 1;
            # flatten so the DataLoader can stack items into a batch.
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            # Float labels, as required by the BCE-with-logits loss used for training.
            'targets': torch.tensor(self.targets[index], dtype=torch.float),
            'title': title
        }

In [11]:
# Start from every column name in the frame; the text column is removed in the next cell.
target_list = df_data.columns.tolist()
target_list
     

['text',
 'autos',
 'baseball',
 'christian',
 'comp',
 'crypt',
 'electronics',
 'forsale',
 'graphics',
 'hardware',
 'hockey',
 'ibm',
 'mac',
 'med',
 'misc',
 'motorcycles',
 'ms-windows',
 'os',
 'pc',
 'politics',
 'rec',
 'religion',
 'sci',
 'soc',
 'space',
 'sport',
 'sys',
 'talk',
 'windows',
 'x']

In [12]:
# Keep only the label columns by dropping the first column (the raw text).
# Built from df_data.columns instead of slicing target_list in place, so that
# re-running this cell always gives the same list rather than dropping one
# more label on every execution.
target_list = list(df_data.columns)[1:]
target_list


['autos',
 'baseball',
 'christian',
 'comp',
 'crypt',
 'electronics',
 'forsale',
 'graphics',
 'hardware',
 'hockey',
 'ibm',
 'mac',
 'med',
 'misc',
 'motorcycles',
 'ms-windows',
 'os',
 'pc',
 'politics',
 'rec',
 'religion',
 'sci',
 'soc',
 'space',
 'sport',
 'sys',
 'talk',
 'windows',
 'x']

In [13]:
# Wrap each partition in a CustomDataset sharing the tokenizer, sequence length and labels.
train_dataset, valid_dataset, test_dataset = (
    CustomDataset(partition, tokenizer, MAX_LEN, target_list)
    for partition in (df_train, df_valid, df_test)
)

In [14]:
# Data loaders
train_data_loader = torch.utils.data.DataLoader(train_dataset, 
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(valid_dataset, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

test_data_loader = torch.utils.data.DataLoader(test_dataset, 
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [15]:
class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear multi-label head.

    Args:
        num_labels: number of output logits. When omitted, the length of the
            notebook-level ``target_list`` is used, which matches the original
            behaviour.

    The submodule names (``bert_model``, ``dropout``, ``linear``) are kept
    unchanged so previously saved state dicts still load.
    """

    def __init__(self, num_labels=None):
        super().__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        if num_labels is None:
            num_labels = len(target_list)
        # Take the input width from the loaded checkpoint's config instead of
        # hard-coding 768, so the head always matches the encoder.
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return raw (pre-sigmoid) logits, one column per label.

        Args:
            input_ids: token ids for a batch of sequences.
            attn_mask: attention mask forwarded to BERT as ``attention_mask``.
            token_type_ids: segment ids forwarded to BERT.
        """
        output = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        # Classify from BERT's pooled output; dropout is only active in train mode.
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

# Instantiate the classifier (BERT encoder + dropout + linear head).
model = BERTClass()

# Optional experiment, left disabled: freeze the BERT encoder so that only the
# linear head is trained. Per the author's note it was tested and converged more weakly.
# for param in model.bert_model.parameters():
#     param.requires_grad = False

# Move the parameters to the selected device; as the last expression of the
# cell this also displays the module summary.
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [16]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits for multi-label targets.

    Args:
        outputs: raw model logits (the sigmoid is applied inside the loss).
        targets: float tensor of 0/1 labels with the same shape as ``outputs``.

    Returns:
        Scalar loss tensor computed with the criterion's default settings.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [17]:
# define the optimizer
# Use PyTorch's AdamW: the AdamW re-exported by transformers is deprecated and
# no longer available in recent releases. eps and weight_decay are set
# explicitly to the values the transformers implementation used by default, so
# the optimisation behaviour stays the same. The learning rate comes from the
# hyperparameter cell instead of a duplicated literal.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-6,
    weight_decay=0.0,
)



In [59]:
# Training of the model for one epoch
def train_model(training_loader, model, optimizer):
    """Train ``model`` for one epoch over ``training_loader``.

    Args:
        training_loader: iterable of batches shaped like CustomDataset items
            (keys 'input_ids', 'attention_mask', 'token_type_ids', 'targets').
        model: network called as ``model(ids, mask, token_type_ids)`` returning logits.
        optimizer: optimizer that updates ``model``'s parameters.

    Returns:
        Tuple ``(model, accuracy, mean_loss)``. Accuracy is element-wise: the
        fraction of individual label decisions (after sigmoid and rounding)
        that equal the targets.
    """
    # training mode enables dropout
    model.train()

    batch_losses = []
    n_correct = 0
    n_total = 0

    # progress bar over the batches of this epoch
    progress = tq.tqdm(enumerate(training_loader), total=len(training_loader),
                       leave=True, colour='steelblue')
    for _, batch in progress:
        input_ids = batch['input_ids'].to(device, dtype=torch.long)
        attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
        segment_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        labels = batch['targets'].to(device, dtype=torch.float)

        # forward pass: logits with one column per label
        logits = model(input_ids, attention_mask, segment_ids)
        loss = loss_fn(logits, labels)
        batch_losses.append(loss.item())

        # running accuracy: sigmoid, then round to a 0/1 decision per label
        decisions = torch.sigmoid(logits).cpu().detach().numpy().round()
        expected = labels.cpu().detach().numpy()
        n_correct += np.sum(decisions == expected)
        n_total += expected.size   # every element of the 2D label array counts

        # backward pass with gradient-norm clipping, then the parameter update
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    # trained model, element-wise accuracy, mean batch loss
    return model, float(n_correct)/n_total, np.mean(batch_losses)

In [18]:
def eval_model(validation_loader, model, optimizer):
    """Evaluate ``model`` on ``validation_loader`` without updating it.

    Args:
        validation_loader: iterable of batches shaped like CustomDataset items.
        model: network called as ``model(ids, mask, token_type_ids)`` returning logits.
        optimizer: accepted for signature symmetry with ``train_model``; not used here.

    Returns:
        Tuple ``(accuracy, mean_loss)`` where accuracy is the fraction of
        individual label decisions (after sigmoid and rounding) that equal the targets.
    """
    # eval mode turns dropout off
    model.eval()

    batch_losses = []
    n_correct = 0
    n_total = 0

    # no gradients are needed for evaluation
    with torch.no_grad():
        for batch in validation_loader:
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
            segment_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            labels = batch['targets'].to(device, dtype=torch.float)

            logits = model(input_ids, attention_mask, segment_ids)
            batch_losses.append(loss_fn(logits, labels).item())

            # the loss applies the sigmoid internally; apply it explicitly
            # here to turn logits into 0/1 decisions
            decisions = torch.sigmoid(logits).cpu().detach().numpy().round()
            expected = labels.cpu().detach().numpy()
            n_correct += np.sum(decisions == expected)
            n_total += expected.size   # every element of the 2D label array counts

    return float(n_correct)/n_total, np.mean(batch_losses)
     

In [1]:
# Per-epoch metrics, keyed by metric name.
history = defaultdict(list)
best_accuracy = 0
# Where the weights of the best model (by validation accuracy) are written.
best_model_path = os.path.join(data_dir,"working","MLTC_model_state.bin")

for epoch in range(1, EPOCHS+1):
    print(f'Epoch {epoch}/{EPOCHS}')
    model, train_acc, train_loss = train_model(train_data_loader, model, optimizer)
    val_acc, val_loss = eval_model(val_data_loader, model, optimizer)

    print(f'train_loss={train_loss:.4f}, val_loss={val_loss:.4f} train_acc={train_acc:.4f}, val_acc={val_acc:.4f}')

    # record this epoch's metrics
    epoch_metrics = {
        'train_acc': train_acc,
        'train_loss': train_loss,
        'val_acc': val_acc,
        'val_loss': val_loss,
    }
    for metric_name, metric_value in epoch_metrics.items():
        history[metric_name].append(metric_value)

    # save the best model
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), best_model_path)
        best_accuracy = val_acc
     

In [None]:
import matplotlib.pyplot as plt

# Default figure size for this and later figures.
plt.rcParams["figure.figsize"] = (10,7)

# Training vs. validation accuracy per epoch, drawn on one explicit Axes.
fig, ax = plt.subplots()
for metric_key, curve_label in (('train_acc', 'train accuracy'),
                                ('val_acc', 'validation accuracy')):
    ax.plot(history[metric_key], label=curve_label)
ax.set_title('Training history')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend()
ax.set_ylim([0, 1])
ax.grid()

# Model Evaluation

In [19]:
# Loading pretrained model (best model)
model = BERTClass()
# model.load_state_dict(torch.load(os.path.join(data_dir,"input/bert_uncased_20ng_classification/pytorch/version_1/1","MLTC_model_state.bin")))
model.load_state_dict(torch.load(
    os.path.join(data_dir, "input/bert_uncased_20ng_classification/pytorch/version_1/1", "MLTC_model_state.bin"),
    map_location=torch.device('cpu')  # Load the model to CPU
))
model = model.to(device)


In [20]:
# Evaluate the model using the test data
test_acc, test_loss = eval_model(test_data_loader, model, optimizer)
# The accuracy looks OK, similar to the validation accuracy
# The model generalizes well !
test_acc

0.9884845165152925

# Predicting on raw text

In [29]:
def inference(text):
    """Predict the labels of one raw text and print them.

    The text is encoded exactly like the training data (padded/truncated to
    MAX_LEN), passed through the notebook-level ``model``, and every label
    whose sigmoid score rounds to 1 is printed.

    Args:
        text: raw document text to classify.

    Returns:
        None. The text and its predicted labels are printed.
    """
    # Encode the argument itself. The previous version read an undefined
    # global `raw_text`, which raises NameError on a fresh kernel.
    encoded_text = tokenizer.encode_plus(
        text,
        max_length=MAX_LEN,
        add_special_tokens=True,
        return_token_type_ids=True,
        padding='max_length',   # replaces the deprecated pad_to_max_length=True
        truncation=True,        # cut texts longer than MAX_LEN explicitly
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_text['input_ids'].to(device)
    attention_mask = encoded_text['attention_mask'].to(device)
    token_type_ids = encoded_text['token_type_ids'].to(device)

    # Make predictions deterministic (dropout off) and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        output = model(input_ids, attention_mask, token_type_ids)
    # add sigmoid, for the training sigmoid is in BCEWithLogitsLoss
    output = torch.sigmoid(output).detach().cpu()
    # thresholding at 0.5
    output = output.flatten().round().numpy()

    print(f"Title: {text}")
    for idx, p in enumerate(output):
        if p == 1:
            print(f"Label: {target_list[idx]}")

In [30]:
# Run the predictor on one complete newsgroup post, passed verbatim (headers included).
inference('''From: mls@panix.com (Michael Siemon)
Subject: Re: Homosexuality issues in Christianity
Organization: PANIX Public Access Unix, NYC
Lines: 25

In <May.7.01.08.16.1993.14381@athos.rutgers.edu> whitsebd@nextwork.rose-hulman.edu (Bryan Whitsell) writes:

>Any one who thinks that Homosexuality and Christianity are compatible should check  
>out:
>	Romans 1:27
>	I Corinthians 6:9
>	I Timothy 1:10
>	Jude 1:7
>	II Peter 2:6-9
>	Gen. 19
>	Lev  18:22
>(to name a few of the verses that pertain to homosexuality)

Homosexual Christians have indeed "checked out" these verses.  Some of
them are used against us only through incredibly perverse interpretations.
Others simply do not address the issues.

You would seem to be more in need of a careful and Spirit-led course
in exegesis than most of the gay Christians I know.  I suggest that
you stop "proof-texting" about things you know nothing about.
-- 
Michael L. Siemon		I say "You are gods, sons of the
mls@panix.com			Most High, all of you; nevertheless
    - or -			you shall die like men, and fall
mls@ulysses.att..com		like any prince."   Psalm 82:6-7
''')

Title: From: mls@panix.com (Michael Siemon)
Subject: Re: Homosexuality issues in Christianity
Organization: PANIX Public Access Unix, NYC
Lines: 25

In <May.7.01.08.16.1993.14381@athos.rutgers.edu> whitsebd@nextwork.rose-hulman.edu (Bryan Whitsell) writes:

>Any one who thinks that Homosexuality and Christianity are compatible should check  
>out:
>	Romans 1:27
>	I Corinthians 6:9
>	I Timothy 1:10
>	Jude 1:7
>	II Peter 2:6-9
>	Gen. 19
>	Lev  18:22
>(to name a few of the verses that pertain to homosexuality)

Homosexual Christians have indeed "checked out" these verses.  Some of
them are used against us only through incredibly perverse interpretations.
Others simply do not address the issues.

You would seem to be more in need of a careful and Spirit-led course
in exegesis than most of the gay Christians I know.  I suggest that
you stop "proof-texting" about things you know nothing about.
-- 
Michael L. Siemon		I say "You are gods, sons of the
mls@panix.com			Most High, all of you; neve

In [2]:
# Re-read the full dataset (tab-separated) in order to cut a small sample from it.
full_tsv_path = "/kaggle/input/20ng-bydata-csv/20ng_bydate.tsv"
df = pd.read_csv(full_tsv_path, sep="\t")

In [3]:
# Display the frame; pandas abbreviates long frames in the rendered output.
df

Unnamed: 0,text,autos,baseball,christian,comp,crypt,electronics,forsale,graphics,hardware,...,rec,religion,sci,soc,space,sport,sys,talk,windows,x
0,From: astein@nysernet.org (Alan Stein)\nSubjec...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,From: gary@colossus.cgd.ucar.edu (Gary Strand)...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,From: mls@panix.com (Michael Siemon)\nSubject:...,0,0,1,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,From: willis@oracle.SCG.HAC.COM (Stan Willis)\...,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
4,From: richard@amc.com (Richard Wernick)\nSubje...,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18042,From: tquinn@heartland.bradley.edu (Terry Quin...,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
18043,From: jrs@zippy.unet.com (John Switzer Frame 3...,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
18044,From: kludge@grissom.larc.nasa.gov (Scott Dors...,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
18045,From: d88-jwa@hemul.nada.kth.se (Jon Wätte)\nS...,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [5]:
# Take the first ten rows as a tiny sample of the dataset.
df_small = df.head(10)

In [6]:
# Write the sample as a TSV with a header row and without the index column,
# i.e. in the same layout as the input file.
df_small.to_csv(
    "/kaggle/working/20ng_bydate_small.tsv",
    sep="\t",
    header=True,
    index=False,
)