In [1]:
import os
import math
import random
import time

import numpy as np
import pandas as pd
import pickle

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from sklearn import metrics
from sklearn.metrics import f1_score
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import BertModel, BertTokenizer

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn import model_selection
from tqdm import tqdm

from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings("ignore")
import gc
gc.enable()



In [2]:
# from pathlib import Path
# print(Path.cwd())

In [3]:
class args:
    train_path = r"C:\Users\ajayp\OneDrive\Desktop\Project\airline_sentiment_analysis.csv"
    TOKENIZER_PATH = "bert-base-uncased"
    BERT_PATH = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
    MAX_LEN = 256
    train_batch_size = 1
    valid_batch_size = 1
    epochs = 2
    model_path = r"C:\Users\ajayp\OneDrive\Desktop\Project\Saved_model_weights"
    folds_path = r"C:\Users\ajayp\OneDrive\Desktop\Project\train_folds.csv"
    splits  = 20


In [4]:
def add_sentiment(row):
    if row == "positive":
        return 0
    else:
        return 1

In [5]:
train_df = pd.read_csv(r"C:\Users\ajayp\OneDrive\Desktop\Project\airline_sentiment_analysis.csv")
train_df["airline_sentiment"] = train_df["airline_sentiment"].apply(lambda x : add_sentiment(x))
train_df

Unnamed: 0.1,Unnamed: 0,airline_sentiment,text
0,1,0,@VirginAmerica plus you've added commercials t...
1,3,1,@VirginAmerica it's really aggressive to blast...
2,4,1,@VirginAmerica and it's a really big bad thing...
3,5,1,@VirginAmerica seriously would pay $30 a fligh...
4,6,0,"@VirginAmerica yes, nearly every time I fly VX..."
...,...,...,...
11536,14633,1,@AmericanAir my flight was Cancelled Flightled...
11537,14634,1,@AmericanAir right on cue with the delays👌
11538,14635,0,@AmericanAir thank you we got on a different f...
11539,14636,1,@AmericanAir leaving over 20 minutes Late Flig...


In [6]:
# class Data_class(Dataset):
#     def __init__(self, df,args, inference_only=False):
#         super().__init__()

#         self.df = df        
#         self.inference_only = inference_only
#         self.text = df.text.tolist()
            
    
#         self.encoded = args.tokenizer.batch_encode_plus(
#             self.text,
#             padding = 'max_length',            
#             max_length = args.MAX_LEN,
#             truncation = True,
#             return_attention_mask=True
#         )        
 

#     def __len__(self):
#         return len(self.df)

    
#     def __getitem__(self, index):        
#         input_ids = torch.tensor(self.encoded['input_ids'][index])
#         attention_mask = torch.tensor(self.encoded['attention_mask'][index])
        
#         if self.df["airline_sentiment"].iloc[index] == "positive":
#             target = torch.tensor([0], dtype = torch.long)
            
#         else:
#             target = torch.tensor([1], dtype = torch.long)

        
        
#         if self.inference_only:
#             return {
#                 "input_ids" : input_ids, 
#                 "attention_mask" : attention_mask, 
#             }           
#         else:
#             return {
#                 "input_ids" : input_ids, 
#                 "attention_mask" : attention_mask, 
#                 "target" : target
#             }

In [7]:
class Data_class(Dataset):
    def __init__(self, df,args, inference_only=False):
        super().__init__()
        
        self.df = df      
        df["airline_sentiment"] = df["airline_sentiment"].apply(lambda x : add_sentiment(x))
        self.inference_only = inference_only
        self.text = df.text.tolist()
        
        if not self.inference_only:
            self.target = torch.tensor(df.airline_sentiment.values, dtype=torch.float)        
    
        self.encoded = args.tokenizer.batch_encode_plus(
            self.text,
            padding = 'max_length',            
            max_length = args.MAX_LEN,
            truncation = True,
            return_attention_mask=True
        )        
 

    def __len__(self):
        return len(self.df)

    
    def __getitem__(self, index):        
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])
        
        
        if self.inference_only:
            return {
                "input_ids" : input_ids, 
                "attention_mask" : attention_mask, 
            }           
        else:
            target = self.target[index]
            return {
                "input_ids" : input_ids, 
                "attention_mask" : attention_mask, 
                "target" : target
            }

In [8]:
# train_df = pd.read_csv(args.folds_path)
# # print(train_df)
# temp = Data_class(train_df, args)
# ans = temp.__getitem__(1)
# input_ids, attention_mask, target = ans["input_ids"], ans["attention_mask"], ans["target"]
# print(input_ids, attention_mask, target)

In [9]:
class SEN_Model(nn.Module):
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(args.BERT_PATH)
        config.update({"output_hidden_states":True, 
                       "layer_norm_eps": 1e-7})                       
        self.layer_start = 9
        self.bert = AutoModel.from_pretrained(args.BERT_PATH, config=config)  

        self.attention = nn.Sequential(            
            nn.Linear(768, 512),            
            nn.Tanh(),                       
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )        

        self.linear = nn.Linear(768, 1)
#         self.softmax = nn.Softmax(dim = -1)
        

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask)
        #print(outputs)
        #all_hidden_state = outputs.hidden_states[-1]
       # weighted_pooling_embeddings = self.pooler(all_hidden_state)
#         print(outputs.hidden_states[-1].shape)
        
        weights = self.attention(outputs.hidden_states[-1])
        #[batch_size, max_len, hidden_states]
#         print(weights.shape)
        
       
        context_vector = torch.sum(weights *outputs.hidden_states[-1] , dim=1) 
#         print((weights *outputs.hidden_states[-1]).shape)
#         print(context_vector.shape)
        
        return self.linear(context_vector)

In [10]:
# # temp = Data_class(train_df, args)
# # ans = temp.__getitem__(0)
# # input_ids, attention_mask, target = ans["input_ids"], ans["attention_mask"], ans["target"]


# model = SEN_Model()
# input_id = input_ids.unsqueeze(0)
# attention_mas = attention_mask.unsqueeze(0)
# out = model(input_id, attention_mas)
# tar = target.unsqueeze(0)
# print(out.shape, out, tar)

In [11]:
def loss_fn(out, tar):
#     print(out)
#     print(tar)
    loss = nn.BCEWithLogitsLoss()(out, tar.view(-1, 1))
    return loss

# print(loss_fn(out.flatten(), tar))

In [12]:
def train_fn(data_loader, model, optimizer,device, scheduler):
    model.train()
    
    loss_sum = 0
    
    for i, data in tqdm(enumerate(data_loader), total = len(data_loader)):
        input_ids = data["input_ids"]
        attention_mask = data["attention_mask"]
        target = data["target"]
        
        input_ids = input_ids.to(device, dtype = torch.long)
        attention_mask = attention_mask.to(device, dtype = torch.long)
        target = target.to(device, dtype = torch.float)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        
        loss = loss_fn(outputs, target)
        loss.backward()
        
        loss_sum += loss.item()
        
        optimizer.step()
        scheduler.step()
        
        
    return loss_sum/len(data_loader)
        
        
    

In [13]:
def eval_fn(data_loader, model, device):
    
    model.eval()
    
    final_targets = []
    
    final_outputs = []
    
    with torch.no_grad():
            
        for i, data in tqdm(enumerate(data_loader), total = len(data_loader)):
            input_ids = data["input_ids"]
            attention_mask = data["attention_mask"]
            target = data["target"]

            input_ids = input_ids.to(device, dtype = torch.long)
            attention_mask = attention_mask.to(device, dtype = torch.long)
            target = target.to(device, dtype = torch.float)
            
            outputs = model(input_ids, attention_mask)
            
            output = torch.sigmoid(outputs)
            
#             ans = torch.argmax(output, dim = -1)
            
            targets = target.detach().cpu().numpy().tolist()
            
            ans = output.detach().cpu().numpy().tolist()
            
            final_targets.extend(targets)
            
            final_outputs.extend(ans)
            
    return final_outputs, final_targets

In [14]:

import pandas as pd
from sklearn import model_selection

# Training data is in a csv file called train.csv 
df = pd.read_csv(r"C:\Users\ajayp\OneDrive\Desktop\Project\airline_sentiment_analysis.csv")
# we create a new column called kfold and fill it with -1
df["kfold"] = -1

# fetch targets
y = df.airline_sentiment.values

# initiate the kfold class from model_selection module
kf = model_selection.StratifiedKFold(n_splits=args.splits, shuffle=True, random_state=42)

# fill the new kfold column
for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
    df.loc[v_, 'kfold'] = f

# save the new csv with kfold column
df.to_csv("train_folds.csv", index=False)

In [15]:
# df.airline_sentiment.value_counts()
# new = df[df["kfold"] == 1]
# new.reset_index(inplace = True)
# new["airline_sentiment"].value_counts()

In [16]:
def run(fold):
    df = pd.read_csv(args.folds_path)
    df = df.iloc[:5000]
#     df_train, df_valid = model_selection.train_test_split(df, test_size = 0.1, random_state = 42)
    
    df_train = df[df["kfold"] != fold]
    df_valid = df[df["kfold"] == fold]
    
    
    df_train = df_train.reset_index(drop = True)
    df_valid = df_valid.reset_index(drop = True)
    
    train_dataset = Data_class(df_train, args)
    valid_dataset = Data_class(df_valid, args)
    
    train_loader = DataLoader(train_dataset, batch_size = args.train_batch_size, shuffle = True)
    valid_loader = DataLoader(valid_dataset, batch_size = args.valid_batch_size)
    
    device = torch.device("cuda")
    model = SEN_Model()
    
    
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    
    optimizer_parameters = [
        {'params' : [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params' : [p for n, p in param_optimizer if  any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    
    num_training_steps = int(len(df_train)/args.train_batch_size)*args.epochs
    
    optimizer = AdamW(optimizer_parameters, lr = 3e-5)
    
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = num_training_steps)
    
    model = model.to(device)
    loss_list = []
    
    best_val = -1
    
    for epoch in range(args.epochs):
        loss = train_fn(valid_loader, model, optimizer, device, scheduler)
        print(loss)
        loss_list.append(loss)
        final_out, final_tar = eval_fn(valid_loader, model, device)
        print("================loss============", loss)
#         print(metrics.classification_report(final_out, final_tar))
        
#         print("================validation===========")
    
#         f1_scores = f1_score(final_tar, final_out, average=None, labels=labels)
    
#         print("=============f1_scores======================", f1_scores)
        
#         f1_mean = f1_scores.mean()
#         print("=============f1_scores mean======================", f1_mean)

        outputs = np.array(final_out) >= 0.5
        accuracy = metrics.accuracy_score(final_tar, outputs)
        
        if best_val < accuracy:
            print("======saving model============")
            best_val = accuracy
            model_path = r"\Users\ajayp\OneDrive\Desktop\Project\Saved_model_weights\model_{fold}_.pth".format(fold = fold)
            torch.save(model.state_dict(), model_path)
    loss_path = r"\Users\ajayp\OneDrive\Desktop\Project\Saved_model_weights\model_{fold}_.pickle".format(fold = fold)
    with open(loss_path, 'wb') as f:
        pickle.dump(loss_list, f)
                

In [17]:
run(14)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 252/252 [00:53<00:00,  4.69it/s]
  1%|          | 2/252 [00:00<00:13, 18.69it/s]

0.45532380392597543


100%|██████████| 252/252 [00:12<00:00, 20.08it/s]




100%|██████████| 252/252 [00:53<00:00,  4.75it/s]
  1%|          | 3/252 [00:00<00:11, 20.83it/s]

0.1314756132908062


100%|██████████| 252/252 [00:13<00:00, 18.88it/s]




In [64]:
# df = pd.read_csv(args.folds_path)
# df = df.iloc[:5000]

In [65]:
# df_train = df[df["kfold"] != 0]
# df_valid = df[df["kfold"] == 0]

# df_train = df_train.reset_index(drop = True)
# df_valid = df_valid.reset_index(drop = True)


# train_dataset = Data_class(df_train, args)
# valid_dataset = Data_class(df_valid, args)

# train_loader = DataLoader(train_dataset, batch_size = args.train_batch_size, shuffle = True)
# valid_loader = DataLoader(valid_dataset, batch_size = args.valid_batch_size)

In [18]:
# df = pd.read_csv(args.folds_path)
# df = df.iloc[:5000]
# #     df_train, df_valid = model_selection.train_test_split(df, test_size = 0.1, random_state = 42)

# df_train = df[df["kfold"] != 0]
# df_valid = df[df["kfold"] == 0]

# print(df_train.shape)

# df_train = df_train.reset_index(drop = True)
# df_valid = df_valid.reset_index(drop = True)
# print(df_train.shape)

# train_dataset = Data_class(df_train, args)
# valid_dataset = Data_class(df_valid, args)

# train_loader = DataLoader(train_dataset, batch_size = args.train_batch_size, shuffle = True)
# valid_loader = DataLoader(valid_dataset, batch_size = args.valid_batch_size)

# device = torch.device("cuda")
# model = SEN_Model()

# print(len(valid_loader))

# param_optimizer = list(model.named_parameters())
# no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

# optimizer_parameters = [
#     {'params' : [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
#     {'params' : [p for n, p in param_optimizer if  any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
# ]

# num_training_steps = int(len(df_train)/args.train_batch_size)*args.epochs

# optimizer = AdamW(optimizer_parameters, lr = 3e-5)

# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = num_training_steps)

# model = model.to(device)


In [19]:
# train_fn(train_loader, model, optimizer, device, scheduler)


In [20]:
# fold = 0

In [21]:
# model_path = r"\Users\ajayp\OneDrive\Desktop\Project\Saved_model_weights\model_{fold}_.pth".format(fold = fold)
# torch.save(model.state_dict(), model_path)

In [14]:
model_path = r"\Users\ajayp\OneDrive\Desktop\Project\Saved_model_weights\model_{fold}_.pth".format(fold = 0)
model = SEN_Model()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
model.load_state_dict(torch.load(model_path))
model.eval().cuda()

SEN_Model(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
  

In [13]:
class Data_class(Dataset):
    def __init__(self, df,args, inference_only=False):
        super().__init__()
        
        self.df = df      
        self.inference_only = inference_only
        self.text = df.text.tolist()
        
        if not self.inference_only:
            self.target = torch.tensor(df.airline_sentiment.values, dtype=torch.float)        
    
        self.encoded = args.tokenizer.batch_encode_plus(
            self.text,
            padding = 'max_length',            
            max_length = args.MAX_LEN,
            truncation = True,
            return_attention_mask=True
        )        
 

    def __len__(self):
        return len(self.df)

    
    def __getitem__(self, index):        
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])
        
        
        if self.inference_only:
            return {
                "input_ids" : input_ids, 
                "attention_mask" : attention_mask, 
            }           
        else:
            target = self.target[index]
            return {
                "input_ids" : input_ids, 
                "attention_mask" : attention_mask, 
                "target" : target
            }

In [30]:


# text = str(input()) 
text = "@VirginAmerica and it's a really big bad thing about it."
data = {'Unnamed: 0': [0],
        'text': [text]}
df = pd.DataFrame(data)

temp = Data_class(df, args, inference_only = True)
ans = temp.__getitem__(0)
input_ids, attention_mask = ans["input_ids"], ans["attention_mask"]
input_ids = input_ids.unsqueeze(0).cuda()
attention_mask = attention_mask.unsqueeze(0).cuda()
device = torch.device("cuda")

In [47]:
out_1 = model_1(input_ids, attention_mask)

In [48]:
ans = torch.sigmoid(out_1).item()
if ans >= 0.51:
    print("negative", ans)
else:
    print("positive", ans)

negative 0.9935722351074219


In [15]:
model_1 = SEN_Model()
model_2 = SEN_Model()
model_3 = SEN_Model()
model_4 = SEN_Model()
model_5 = SEN_Model()

model_path_1 = r"\Users\ajayp\OneDrive\Desktop\Project\Saved_model_weights\model_{fold}_.pth".format(fold = 0)
model_path_2 = r"\Users\ajayp\OneDrive\Desktop\Project\Saved_model_weights\model_{fold}_.pth".format(fold = 1)
model_path_3 = r"\Users\ajayp\OneDrive\Desktop\Project\Saved_model_weights\model_{fold}_.pth".format(fold = 2)
model_path_4 = r"\Users\ajayp\OneDrive\Desktop\Project\Saved_model_weights\model_{fold}_.pth".format(fold = 3)
model_path_5 = r"\Users\ajayp\OneDrive\Desktop\Project\Saved_model_weights\model_{fold}_.pth".format(fold = 4)

model_1.load_state_dict(torch.load(model_path_1))
model_1.eval().cuda()

model_2.load_state_dict(torch.load(model_path_2))
model_2.eval().cuda()

model_3.load_state_dict(torch.load(model_path_3))
model_3.eval().cuda()

model_4.load_state_dict(torch.load(model_path_4))
model_4.eval().cuda()

model_5.load_state_dict(torch.load(model_path_5))
model_5.eval().cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.tr

SEN_Model(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
  

In [31]:
out_2 = model_2(input_ids, attention_mask)
out_3 = model_3(input_ids, attention_mask)
out_4 = model_4(input_ids, attention_mask)
out_5 = model_5(input_ids, attention_mask)

out_2 = torch.sigmoid(out_2).item()
out_3 = torch.sigmoid(out_3).item()
out_4 = torch.sigmoid(out_4).item()
out_5 = torch.sigmoid(out_5).item()

final_out = (out_2 + out_3)/2



In [32]:
final_out

0.996575802564621

In [27]:
# predictions
print(torch.sigmoid(out_2).item(), torch.sigmoid(out_3).item(), torch.sigmoid(out_4).item(), torch.sigmoid(out_5).item(), torch.sigmoid(final_out).item())

TypeError: sigmoid(): argument 'input' (position 1) must be Tensor, not float

In [33]:
# ans = torch.sigmoid(final_out).item()
if final_out >= 0.75:
    print("negative", final_out)
else:
    print("positive", 1-final_out)

negative 0.996575802564621


In [19]:
train_df["text"].iloc[11238]

'@AmericanAir My flt AA375 for 02/23 got cncled and i cant get hold of a CSR so i can get alternate arrangement. Plz help'