# Question Classification

In [2]:
import torch
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    print('No GPU available')
    device=torch.device('cpu')

In [3]:
import pandas as pd
quora= pd.read_csv("/kaggle/input/quora-insincere-questions-classification/train.csv")

In [4]:
quora

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
...,...,...,...
1306117,ffffcc4e2331aaf1e41e,What other technical skills do you need as a c...,0
1306118,ffffd431801e5a2f4861,Does MS in ECE have good job prospects in USA ...,0
1306119,ffffd48fb36b63db010c,Is foam insulation toxic?,0
1306120,ffffec519fa37cf60c78,How can one start a research project based on ...,0


In [5]:
quora.loc[quora.target == 0].sample(5)[['question_text', 'target']]

Unnamed: 0,question_text,target
366923,What is the purpose of life and why are we cre...,0
415930,Can you gain lineage citizenship in Hungary th...,0
607910,Which is your worst selfie with your friends?,0
1289951,Why is the mainstream American media still dis...,0
711983,How much money do I need when I go to Belgrade?,0


In [6]:
questions=quora.question_text.values
target=quora.target.values

In [7]:
len(questions)

1306122

In [8]:
#https://huggingface.co/bert-base-uncased
from transformers import AutoTokenizer, BertTokenizer  #Use the 12-layer BERT model, with an uncased vocab,uncased means lowercase
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) #gpt2, bert-base_uncased

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
print("original", questions[0])
print('tokenize', tokenizer.tokenize(questions[0]))
print("token Id", tokenizer.convert_tokens_to_ids(tokenizer.tokenize(questions[0])))

original How did Quebec nationalists see their province as a nation in the 1960s?
tokenize ['how', 'did', 'quebec', 'nationalists', 'see', 'their', 'province', 'as', 'a', 'nation', 'in', 'the', '1960s', '?']
token Id [2129, 2106, 5447, 17934, 2156, 2037, 2874, 2004, 1037, 3842, 1999, 1996, 4120, 1029]


In [10]:
input_ids =[]
for ques in questions:
    encoded_ques =tokenizer.encode(ques, add_special_tokens=True) #max_length
    input_ids.append(encoded_ques)
print('Original: ', questions[0])
print('Token IDs:', input_ids[0])

Token indices sequence length is longer than the specified maximum sequence length for this model (618 > 512). Running this sequence through the model will result in indexing errors


Original:  How did Quebec nationalists see their province as a nation in the 1960s?
Token IDs: [101, 2129, 2106, 5447, 17934, 2156, 2037, 2874, 2004, 1037, 3842, 1999, 1996, 4120, 1029, 102]


In [11]:
max([len(ques) for ques in input_ids])
min([len(ques) for ques in input_ids])

3

In [12]:
#check how many sentence have greater than 170 token
count_lenth =0
for text in input_ids:
    if len(text) >64:
        count_lenth +=1
print(count_lenth)

1043


In [13]:
%%time
#add padding and truncation on the dataset
from tensorflow.keras.preprocessing.sequence import pad_sequences
# check id and pad token
# tokenizer.pad_token_id,tokenizer.pad_token
max_length= 64
input_ids_wpad= pad_sequences(input_ids, maxlen=max_length, dtype='long', value=0, truncating='post', padding='post') #post means last, opposed mean first, padding val 0



CPU times: user 8.78 s, sys: 927 ms, total: 9.71 s
Wall time: 12.4 s


In [14]:
## atention mask simplify with are actual token and which are padding, bert havenot 0 voca so if 0 then pad otherwise token
attention_mask=[]
for ques in input_ids_wpad:
    att_mask=[int(token_id>0) for token_id in ques]
    attention_mask.append(att_mask)

In [15]:
#split dataset,target and mask on train and valid
from sklearn.model_selection import train_test_split
train_inputs, validation_inputs, train_targets, validation_targets = train_test_split(input_ids_wpad, target, random_state=42, test_size=0.15)
train_mask, validation_mask, _, _ = train_test_split(attention_mask, target, random_state=42, test_size=0.15)

In [16]:
#convert into pytorch data
train_inputs=torch.tensor(train_inputs)
valid_inputs=torch.tensor(validation_inputs)

train_targets = torch.tensor(train_targets)
valid_targets = torch.tensor(validation_targets)

train_mask = torch.tensor(train_mask)
valid_mask = torch.tensor(validation_mask)

In [17]:
train_inputs.shape,train_mask.shape, train_targets.shape

(torch.Size([1110203, 64]), torch.Size([1110203, 64]), torch.Size([1110203]))

In [18]:
## create batche from the dataset 
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size=32

#crete data for training set
train_data = TensorDataset(train_inputs, train_mask, train_targets)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size=batch_size)

# crete data for validation
valid_data = TensorDataset(valid_inputs, valid_mask, valid_targets)
valid_sampler = RandomSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler = valid_sampler, batch_size=batch_size)

In [19]:
## now train the model
# Get all of the model's parameters as a list of tuples.https://www.kaggle.com/code/gazu468/all-about-bert-you-need-to-know?scriptVersionId=115965914&cellId=75
from transformers import BertForSequenceClassification, BertConfig, AdamW
model= BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=2, output_attentions=False, output_hidden_states= False)
model.cuda()

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [20]:
# set the optimizers
optimizers = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs=1



In [23]:
#seed all over to make reproducible
import random
seed_val=42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [24]:
%%time
### train the model
from tqdm import tqdm
loss_values=[]
for epoch_i in range(0, epochs):
    total_loss=0
    model.train()
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    for step, batch in enumerate(tqdm(train_dataloader,desc="Training")):
        b_input_ids=batch[0].to(device)
        b_input_mask =batch[1].to(device)
        b_targets=batch[2].to(device)
        model.zero_grad()

        outputs = model(b_input_mask,token_type_ids=None,
                       attention_mask=b_input_mask,
                       labels=b_targets)
        loss=outputs[0]
        total_loss +=loss.item()
        loss.backward()
    # Clip the norm of the gradients to 1.0.
    # This is to help prevent the "exploding gradients" problem.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizers.step()
    avg_train_loss = total_loss / len(train_dataloader)  
    loss_values.append(avg_train_loss)
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("Running Validation")
                
print("")
print("Training complete!")



Training:  96%|█████████▌| 33316/34694 [1:34:41<03:55,  5.85it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Training: 100%|██████████| 34694/34694 [1:38:37<00:00,  5.86it/s]


  Average training loss: 0.40
Running Validation

Training complete!
CPU times: user 1h 38min 34s, sys: 13.2 s, total: 1h 38min 47s
Wall time: 1h 38min 37s


In [27]:
from sklearn.metrics import accuracy_score, f1_score
eval_accuracy = 0
all_preds = []
all_targets = []

# Iterate through the validation data loader.
for batch in tqdm(valid_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_targets = batch
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0].detach().cpu().numpy()
    target_ids = b_targets.to('cpu').numpy()
    
    # Convert logits to predicted labels (assuming multiclass classification).
    preds = np.argmax(logits, axis=1)
    
    # Accumulate the total accuracy.
    eval_accuracy += accuracy_score(target_ids, preds)
    
    # Append predictions and true labels to lists.
    all_preds.extend(preds)
    all_targets.extend(target_ids)

# Calculate accuracy and F1 score for the entire validation dataset.
nb_eval_steps = len(valid_dataloader)
accuracy = eval_accuracy / nb_eval_steps
f1 = f1_score(all_targets, all_preds, average='weighted')  # Use 'weighted' for multiclass F1 score

print("Accuracy: {0:.2f}".format(accuracy))
print("F1 Score: {0:.2f}".format(f1))

print("\nValidation complete!")


100%|██████████| 6123/6123 [05:54<00:00, 17.28it/s]


Accuracy: 0.94
F1 Score: 0.91

Validation complete!


In [35]:
torch.save(model.state_dict(), "model.pth")