<a href="https://colab.research.google.com/github/AyonSOMADDAR/NLP/blob/main/Document_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf

# Ask TensorFlow which GPU device it can see.
device_name = tf.test.gpu_device_name()

# A usable GPU runtime is expected to report exactly '/device:GPU:0';
# anything else stops the notebook here.
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print(f'Found GPU as: {device_name}')


In [None]:
import torch

# Pick the device the model and tensors should be placed on.
if torch.cuda.is_available():
  # A CUDA device is visible: tell PyTorch to use the GPU.
  device = torch.device("cuda")
  print(f"There are {torch.cuda.device_count()} GPU's available")
  print(f'We will be using the GPU: {torch.cuda.get_device_name(0)}')
else:
  print('No GPU available, USING CPU INSTEAD......')
  # Device-type strings are lowercase: torch.device('CPU') raises a
  # RuntimeError, so the fallback must be spelled 'cpu'.
  device = torch.device('cpu')


In [None]:
#INSTALLING BERT
!pip install transformers

In [None]:
#we are using CoLA DATASET 
!pip install wget

In [None]:
import os
import wget

print('Downloading dataset...')

# Where the CoLA v1.1 zip archive is hosted.
url = 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip'

# Fetch the archive only when it is not already in the working directory.
archive_missing = not os.path.exists('./cola_public_1.1.zip')
if archive_missing:
    wget.download(url, './cola_public_1.1.zip')

In [None]:
# Extract the dataset archive into the working directory, unless the
# 'cola_public' folder is already there. The standard-library zipfile module
# replaces the `unzip` shell command, so the cell needs no external binary and
# fails loudly if the archive is missing.
import zipfile

if not os.path.exists('./cola_public/'):
  with zipfile.ZipFile('./cola_public_1.1.zip') as archive:
    archive.extractall('.')


In [None]:
import pandas as pd

# The training .tsv has no header row. Without header=None pandas would use the
# first training example as the column names and drop it from the data, so the
# four columns are named explicitly instead.
# NOTE(review): 'sentence_source' and 'label_notes' are descriptive names for
# the two columns the notebook does not use - confirm against the CoLA README.
# The path is relative to the working directory, matching where the archive
# was downloaded and extracted.
df = pd.read_csv(
    "./cola_public/raw/in_domain_train.tsv",
    delimiter='\t',
    header=None,
    names=['sentence_source', 'label', 'label_notes', 'sentence'],
)

# Report the number of sentences, then preview the first rows.
print(f'Number of training sentences: {df.shape[0]}\n')
df.head(10)

In [None]:
# Show ten randomly chosen rows of the training frame.
df.sample(n=10)

In [None]:
# Rename the two columns used downstream: the column headed '1' becomes
# 'label', and the column headed by a sentence becomes 'sentence'.
df.rename(
    columns={
        '1': 'label',
        "Our friends won't buy this analysis, let alone the next one we propose.": 'sentence',
    },
    inplace=True,
)


In [None]:
# Preview the first five rows to check the column names.
df.head(n=5)

In [None]:
# Show five random rows whose label is 0, keeping only the sentence and label columns.
df.loc[df.label == 0, ['sentence', 'label']].sample(n=5)

In [None]:
# Pull the sentences and their labels out of the frame as arrays.
sentences = df['sentence'].values
labels = df['label'].values

In [None]:
# Load the pretrained tokenizer for 'bert-base-uncased' (lower-casing enabled).
from transformers import BertTokenizer

print('Loading BERT Tokenizer >>>>>')
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)


In [None]:
# Walk the first sentence through the tokenizer to show each stage.
first_sentence = sentences[0]
first_tokens = tokenizer.tokenize(first_sentence)

# The raw text ...
print(f'Original: {first_sentence}')
# ... split into tokens ...
print(f'Tokenized: {first_tokens}')
# ... and the tokens mapped to their vocabulary ids.
print(f"Token IDs:{tokenizer.convert_tokens_to_ids(first_tokens)}")

In [None]:
# Tokenize every sentence and map its tokens to vocabulary ids.
# encode() could also truncate (max_length=...) or return PyTorch tensors
# (return_tensors='pt'), but it is not used for padding here, so those
# options are left off.
input_ids = [
    tokenizer.encode(sent, add_special_tokens=True)
    for sent in sentences
]

# Show the first sentence next to its list of ids.
print(f'Original: {sentences[0]}')
print(f'Token IDs: {input_ids[0]}')

In [None]:
# Length, in token ids, of the longest encoded sentence.
print(f"Maximum sentence length: {max(map(len, input_ids))}")

In [None]:
# Bring every encoded sentence to one common length with Keras' pad_sequences.
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_LEN = 64

print(f'\nPadding/truncating all sentences to {MAX_LEN} values...' )
print(f'\nPadding token: "{tokenizer.pad_token}", ID: {tokenizer.pad_token_id}')

# Both padding and truncation are applied at the end ('post') of each
# sequence, and the fill value is 0.
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype="long",
    value=0,
    padding='post',
    truncating='post',
)
print("done")

In [None]:
# Build one attention mask per padded sentence: 1 where the token id is
# greater than 0, and 0 where the token id is 0 (the value used for padding).
attention_masks = [
    [int(token_id > 0) for token_id in padded_sentence]
    for padded_sentence in input_ids
]

In [None]:
# Split the data into training and validation sets.
from sklearn.model_selection import train_test_split

# test_size=0.1 holds out 10% of the examples for validation; the other 90% is
# used for training. Token ids, attention masks and labels go through a single
# call so all three share the same shuffled indices, instead of relying on two
# separate calls that happen to use the same random_state.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels,
    random_state=2018, test_size=0.1)


In [None]:
# Convert each split to a PyTorch tensor, one pair (train, validation) at a time.

# Token ids
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)

# Labels
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)

# Attention masks
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

# Number of examples per batch, shared by both loaders. For fine-tuning BERT
# on a specific task, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Training set: batches are drawn through a RandomSampler.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation set: batches are drawn through a SequentialSampler.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    validation_data,
    batch_size=batch_size,
    sampler=validation_sampler,
)


In [None]:
from transformers import BertForSequenceClassification, BertConfig

# transformers deprecated its own AdamW in favour of torch.optim.AdamW; fall
# back to the PyTorch optimizer when the installed release does not export it.
# NOTE(review): the two implementations may differ in their default
# weight_decay - pass it explicitly where the optimizer is created if that matters.
try:
  from transformers import AdamW
except ImportError:
  from torch.optim import AdamW

# Load pretrained 'bert-base-uncased' with a sequence-classification head.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,  # two output labels for binary classification
    output_attentions=False,
    output_hidden_states=False,
)

# Move the model to the device selected in the torch device cell (GPU when
# available) rather than calling .cuda() unconditionally, so the model and
# that device choice always agree.
model.to(device)