<a href="https://colab.research.google.com/github/Adi8885/RecurrentNets/blob/master/text_classification_using_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.datasets as Dataset
import torch.autograd as autograd
from torch.utils.data import Dataset, DataLoader
import torchvision
import transformers
from transformers import AlbertModel , AlbertTokenizer
import pandas  as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import pandas_profiling as pp
import time
import warnings
import logging
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report , accuracy_score , confusion_matrix
warnings.filterwarnings("ignore")

In [0]:
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)

In [18]:
filepath = os.getcwd()+'/'
filename = 'consumer_complaints.zip'
print(' reading from path : {}'.format(filepath + filename))
start_time = time.time()
data = pd.read_csv(filepath + filename , compression = 'zip' , low_memory=False)
print('time taken to load data : {}'.format(time.time() - start_time))

 reading from path : /content/consumer_complaints.zip
time taken to load data : 3.2927610874176025


In [19]:
data.head()

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
0,08/30/2013,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,U.S. Bancorp,CA,95993,,,Referral,09/03/2013,Closed with explanation,Yes,Yes,511074
1,08/30/2013,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,,,Wells Fargo & Company,CA,91104,,,Referral,09/03/2013,Closed with explanation,Yes,Yes,511080
2,08/30/2013,Credit reporting,,Incorrect information on credit report,Account status,,,Wells Fargo & Company,NY,11764,,,Postal mail,09/18/2013,Closed with explanation,Yes,No,510473
3,08/30/2013,Student loan,Non-federal student loan,Repaying your loan,Repaying your loan,,,"Navient Solutions, Inc.",MD,21402,,,Email,08/30/2013,Closed with explanation,Yes,Yes,510326
4,08/30/2013,Debt collection,Credit card,False statements or representation,Attempted to collect wrong amount,,,Resurgent Capital Services L.P.,GA,30106,,,Web,08/30/2013,Closed with explanation,Yes,Yes,511067


In [20]:
data = data[['consumer_complaint_narrative','product']]
data['seq_length'] = data['consumer_complaint_narrative'].apply(lambda x: len(str(x).split(" ")))
data.head()

Unnamed: 0,consumer_complaint_narrative,product,seq_length
0,,Mortgage,1
1,,Mortgage,1
2,,Credit reporting,1
3,,Student loan,1
4,,Debt collection,1


In [21]:
data[(~data.consumer_complaint_narrative.isna())].head()

Unnamed: 0,consumer_complaint_narrative,product,seq_length
190126,XXXX has claimed I owe them {$27.00} for XXXX ...,Debt collection,56
190135,Due to inconsistencies in the amount owed that...,Consumer Loan,249
190155,In XX/XX/XXXX my wages that I earned at my job...,Mortgage,667
190207,I have an open and current mortgage with Chase...,Mortgage,76
190208,XXXX was submitted XX/XX/XXXX. At the time I s...,Mortgage,260


In [22]:
data.seq_length.describe(percentiles = [0.25 , 0.5 , 0.75 , 0.9 , 0.95 , 0.98 , 0.99])

count    555957.000000
mean         23.910802
std          84.791148
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
90%          54.000000
95%         167.000000
98%         333.000000
99%         476.000000
max        1285.000000
Name: seq_length, dtype: float64

In [0]:
#since 90% of data is below 64, keepong seq length at 64
seq_length = 256

In [24]:
data = data[(~data.consumer_complaint_narrative.isna())]
data.reset_index(inplace= True , drop=True)
data.head()

Unnamed: 0,consumer_complaint_narrative,product,seq_length
0,XXXX has claimed I owe them {$27.00} for XXXX ...,Debt collection,56
1,Due to inconsistencies in the amount owed that...,Consumer Loan,249
2,In XX/XX/XXXX my wages that I earned at my job...,Mortgage,667
3,I have an open and current mortgage with Chase...,Mortgage,76
4,XXXX was submitted XX/XX/XXXX. At the time I s...,Mortgage,260


In [25]:
data.shape

(66806, 3)

In [26]:
embeddings_matrix =  np.zeros((data.shape[0] , 768))
embeddings_matrix.shape

(66806, 768)

In [27]:
if torch.cuda.is_available():
  device = torch.device('cuda')
  print('CUDA available using GPU')
else :
   device = torch.device('cpu')
   print('CUDA NOT available using CPU')

CUDA available using GPU


In [0]:
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2' , max_len = seq_length)
model = AlbertModel.from_pretrained('albert-base-v2' , output_hidden_states = True)
model = model.to(device)
model = model.eval()

In [29]:
warnings.filterwarnings("ignore")
print_every =  500
start_time_loop = time.time()
time_list = []
for i in data.index:
  text = data.loc[i].consumer_complaint_narrative
  input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True)[:seq_length]).unsqueeze(0)
  outputs = model(input_ids.to(device))
  last_layer = outputs[2][12][0][0]
  layer_11 = outputs[2][11][0][0]
  layer_10 = outputs[2][10][0][0]
  layer_9 = outputs[2][9][0][0]

  final_embedding = last_layer + layer_11 + layer_10 + layer_9

  embeddings_matrix[i]+=final_embedding.cpu().detach().numpy()

  if i % print_every == 0 :
    print('index {} / {} , {}'.format(i , data.shape[0] , time.time() - start_time))
    time_list.append(time.time() - start_time)
    start_time = time.time()

try:
  np.save('embeddings_matrix' , arr = embeddings_matrix , allow_pickle=True)
  print('Saved matrix locally')
except:
  print('ERROR saving embeddings')
print('time taken to generate embeddings : {}'.format(time.time() - start_time_loop))

index 0 / 66806 , 6.006929159164429
index 500 / 66806 , 8.62108302116394
index 1000 / 66806 , 8.773597717285156
index 1500 / 66806 , 8.832340240478516
index 2000 / 66806 , 8.64569091796875
index 2500 / 66806 , 8.569299936294556
index 3000 / 66806 , 8.725908279418945
index 3500 / 66806 , 8.449679613113403
index 4000 / 66806 , 8.566915273666382
index 4500 / 66806 , 8.685968160629272
index 5000 / 66806 , 8.63992714881897
index 5500 / 66806 , 8.648613691329956
index 6000 / 66806 , 8.468859672546387
index 6500 / 66806 , 8.720704555511475
index 7000 / 66806 , 8.550178527832031
index 7500 / 66806 , 8.635714054107666
index 8000 / 66806 , 8.592604398727417
index 8500 / 66806 , 8.798749685287476
index 9000 / 66806 , 8.514890909194946
index 9500 / 66806 , 8.501818180084229
index 10000 / 66806 , 8.626028537750244
index 10500 / 66806 , 8.740608215332031
index 11000 / 66806 , 8.587806224822998
index 11500 / 66806 , 8.531693696975708
index 12000 / 66806 , 8.618770837783813
index 12500 / 66806 , 8.509

In [0]:
label_encoder = LabelEncoder()
data['encoder_product'] = label_encoder.fit_transform(data['product'])

In [71]:
X = embeddings_matrix
Y = data.encoder_product
x_train , x_test , y_train , y_test = train_test_split(X,Y,test_size = 0.1 , random_state = 7)
print('x_train :{}'.format(x_train.shape))
print('y_train :{}'.format(y_train.shape))
print('x_test :{}'.format(x_test.shape))
print('y_test :{}'.format(y_test.shape))

x_train :(60125, 768)
y_train :(60125,)
x_test :(6681, 768)
y_test :(6681,)


In [0]:
batch_size = 512

In [73]:
#Convert input data to tensors
x_train_data_tensor = torch.tensor(x_train)
y_train_data_tensor = torch.tensor(y_train.values)
print(x_train_data_tensor.shape)

#Convert tensors to type Dataset
train_dataset = torch.utils.data.TensorDataset(x_train_data_tensor ,y_train_data_tensor)
print(type(train_dataset))

#Convert datsets to Dataloader for loading batches
train_dataset = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
print(type(train_dataset))

torch.save(train_dataset, 'train_dataset.dataloader')

torch.Size([60125, 768])
<class 'torch.utils.data.dataset.TensorDataset'>
<class 'torch.utils.data.dataloader.DataLoader'>


In [74]:
#Convert input data to tensors
x_test_data_tensor = torch.tensor(x_test)
y_test_data_tensor = torch.tensor(y_test.values)
print(x_test_data_tensor.shape)

#Convert tensors to type Dataset
test_dataset = torch.utils.data.TensorDataset(x_test_data_tensor ,y_test_data_tensor)
print(type(train_dataset))

#Convert datsets to Dataloader for loading batches
test_dataset = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
type(test_dataset)

torch.save(test_dataset, 'test_dataset.dataloader')

torch.Size([6681, 768])
<class 'torch.utils.data.dataloader.DataLoader'>


In [0]:
output_size = len(label_encoder.classes_)
batch_size = batch_size
drop_out_probability = 0.33

In [0]:
class ALBERT_Classification(nn.Module):
    def __init__(self , input_size , hidden_size , output_size):
        super(ALBERT_Classification ,self).__init__()
        
        self.fc_input_size = input_size
        self.fc_hidden_size = hidden_size
        self.fc_output_size = output_size
        
        
        # Inputs to hidden layer linear transformation
        self.fc1 = nn.Linear(in_features = input_size , out_features = hidden_size)
        self.fc2 = nn.Linear(in_features = input_size , out_features = hidden_size)
        self.dropout = nn.Dropout(p = 0.33)
        
        # Output layer, 10 units - one for each digit
        self.out = nn.Linear(in_features = hidden_size , out_features = output_size)
        
        
    def forward(self,x):
        
        #Pass output of BERT through Fully connected layer
        x = self.fc1(x)
        x = torch.tanh(x)

        x = self.fc2(x)
        x = torch.tanh(x)

        x = self.dropout(x)
        
        #Output layer
        x = self.out(x)
        
        return x
    
albert_classification = ALBERT_Classification(input_size = 768 
                            , hidden_size = 768 
                            , output_size = output_size )

albert_classification = albert_classification.double()

In [115]:
epochs = 10
loss_fn = F.cross_entropy

learning_rate = 0.001

optimizer = torch.optim.Adam(albert_classification.parameters(), lr = learning_rate)
print_every = 100
albert_classification.to(device)


# to track the average training loss per epoch as the model trains
avg_train_losses = []
# to track the average validation loss per epoch as the model trains
avg_valid_losses = [] 
patience_ctr = 0
patience = 5
    
for epoch in range(0,epochs):
    
    # to track the training loss as the model trains
    train_losses = []
    # to track the validation loss as the model trains
    valid_losses = []

    batch = 0
    #loss = 0
    start_time = time.time()
    albert_classification.train()
    for i in range(0,len(train_dataset)) :
        x, y = next(iter(train_dataset))
        x = x.to(device)
        y = y.to(device)
        
        optimizer.zero_grad()
        y_pred = albert_classification(x)
        
        #y_pred = y_pred.unsqueeze(0)
        loss = loss_fn(y_pred , y)

        loss.backward()
        optimizer.step()
        
        train_losses.append(loss.item())
        
        if (batch % print_every == 0):
            print('epoch : {} \t batch number : {} \t train loss : {}'.format(epoch, batch,loss.item()))
            
        batch +=1

        #Calculate Validation loss
    val_acc = 0
    albert_classification = albert_classification.eval()
    for j in test_dataset :
        x_v, y_v = next(iter(test_dataset))
        x_v = x_v.to(device)
        y_v = y_v.to(device)
        
        y_pred = albert_classification(x_v)
        loss = loss_fn(y_pred , y_v)
        
        valid_losses.append(loss.item())
    
    # calculate average loss over an epoch
    train_loss = np.average(train_losses)
    valid_loss = np.average(valid_losses)
    avg_train_losses.append(train_loss)
    avg_valid_losses.append(valid_loss)

    #Early stopping
    if valid_loss > train_loss:
        patience_ctr +=1
        print('patience_ctr : {}'.format(patience_ctr))
        if patience_ctr >= patience:
            print('early stoppping since valid_loss > train_loss')
            break
    else :
        patience_ctr = 0
    
    epoch_len = len(str(epochs))
    print('epoch : {}\t train_loss :{}\t validation loss : {}\n'.format(epoch , train_loss , valid_loss))
    
            
    print('epoch : {} \t train loss : {} \t time required : {}'.format(epoch, loss.item(),(time.time() - start_time)))

epoch : 0 	 batch number : 0 	 train loss : 2.3155243830254695
epoch : 0 	 batch number : 100 	 train loss : 1.5423446951353308
epoch : 0	 train_loss :1.7200105785749682	 validation loss : 1.4811064714352218

epoch : 0 	 train loss : 1.5243617696772496 	 time required : 3.203022003173828
epoch : 1 	 batch number : 0 	 train loss : 1.5499494402899
epoch : 1 	 batch number : 100 	 train loss : 1.4031112471153242
epoch : 1	 train_loss :1.4313884544858229	 validation loss : 1.3907113729507823

epoch : 1 	 train loss : 1.4313038237853366 	 time required : 3.1150460243225098
epoch : 2 	 batch number : 0 	 train loss : 1.3462926197560612
epoch : 2 	 batch number : 100 	 train loss : 1.311357075676757
epoch : 2	 train_loss :1.3468813452025619	 validation loss : 1.3003875300320566

epoch : 2 	 train loss : 1.3450486495347134 	 time required : 2.969135046005249
epoch : 3 	 batch number : 0 	 train loss : 1.2676625590360229
epoch : 3 	 batch number : 100 	 train loss : 1.1651407362729733
epoch : 

In [116]:
st_time = time.time()
ctr = 0
print_every = 5
albert_classification = albert_classification.double()
albert_classification = albert_classification.to(device)
albert_classification = albert_classification.eval()
for j in test_dataset :
    x_v, y_v = next(iter(test_dataset))
    x_v = x_v.to(device)
    #y_v = y_v.to(device)
   
    op = albert_classification(x_v)
    op = torch.argmax(F.softmax(op), dim = 1)
    if ctr == 0:
        y_pred = op
        y_actual = y_v
    else:
        y_pred = torch.cat((y_pred , op))
        y_actual = torch.cat((y_actual , y_v))
   
    ctr += 1
   
    if ctr % print_every == 0:
        print(ctr)
print('time taken for prediction :{} seconds'.format(time.time() - st_time ))

5
10
time taken for prediction :0.23434185981750488 seconds


In [118]:
def my_classification_report(y_actual , y_predicted):
   
    print(classification_report(y_actual, y_pred, target_names=list(label_encoder.classes_)))
    acc = accuracy_score(y_actual , y_predicted)
    print('accuracy : {}'.format(acc))
   
    conf_mat=confusion_matrix(y_actual, y_pred)
    print(conf_mat)
    #plt.figure(figsize=(20,20))
    ax = plt.subplot()
    sns.heatmap(conf_mat, annot=True, ax = ax); #annot=True to annotate cells

    # labels, title and ticks
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(list(label_encoder.classes_))
    ax.yaxis.set_ticklabels(list(label_encoder.classes_))
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    #plt.savefig('conf_matrix.png')
    plt.show()

#y_actual = y_test.cpu().detach().numpy()
#y_pred = y_pred.cpu().detach().numpy()
my_classification_report(y_actual = y_actual , y_predicted = y_pred)

                         precision    recall  f1-score   support

Bank account or service       0.52      0.22      0.31       565
          Consumer Loan       0.66      0.14      0.23       430
            Credit card       0.45      0.52      0.48       887
       Credit reporting       0.65      0.61      0.63      1247
        Debt collection       0.63      0.69      0.66      1933
        Money transfers       0.00      0.00      0.00        77
               Mortgage       0.59      0.82      0.69      1628
Other financial service       0.00      0.00      0.00         9
            Payday loan       0.15      0.06      0.08        68
           Prepaid card       0.85      0.25      0.38        93
           Student loan       0.49      0.43      0.46       231

               accuracy                           0.59      7168
              macro avg       0.45      0.34      0.36      7168
           weighted avg       0.58      0.59      0.56      7168

accuracy : 0.587611607

In [0]:
plt.show()