In [1]:
import json
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import pipeline, AutoTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm

bert_id = "/home/erik/PycharmProjects/Kaggle/Models"
bert_id_1 = "bert-base-cased"
device = 'cuda'
tokenizer = AutoTokenizer.from_pretrained(bert_id_1)
model = BertForSequenceClassification.from_pretrained(bert_id, num_labels=2)
pipe = pipeline("sentiment-analysis", tokenizer=tokenizer, model=model)


In [2]:
class SentimentPolarityDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

class SentimentPolarityDatasetTest(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.labels)

In [3]:
def fetch_datasets():
    """
    Fetches data from the /Data directory. Parses labels, tokenizes inputs. Loads data into a custom pytorch Dataset
        Returns:
            Six SentimentPolarityDataset datasets.
    """

    with open('Data/nlp-getting-started/train.csv') as polarity_data:
        polarity_array = pd.read_csv(polarity_data)
        train_texts = polarity_array['text'].values.tolist()
        train_labels = polarity_array['target'].values.tolist()
        type(train_texts)

    with open('Data/nlp-getting-started/test.csv') as polarity_data:
        polarity_array = pd.read_csv(polarity_data)
        test_texts = polarity_array['text'].values.tolist()

    tokenizer_deberta = AutoTokenizer.from_pretrained(bert_id_1)

    encodings = tokenizer_deberta(train_texts, truncation=True, padding=True,add_special_tokens=True,)
    train_dataset = SentimentPolarityDataset(encodings, train_labels)
    encodings_test = tokenizer_deberta(test_texts, truncation=True, padding=True,       add_special_tokens=True,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt')
    test_dataset = SentimentPolarityDatasetTest(encodings_test)

    return train_dataset, test_dataset


In [28]:
def tune(model, optim, dataset, epochs=15):
    """
    Trains a given model on the given dataset using the given optimizer.
        Parameters:
            model (pytorch model): The model we want to train.
            optim (pytoch optimizer): The optimizer we wish to use.
            dataset (pytorch dataset): The dataset we wish to tune our model to.
    """
    print(len(dataset))
    loader = DataLoader(dataset, batch_size=16, shuffle=False)
    model.train()
    avg_loss = 0
    i = 0
    for epoch in range(epochs):
        for batch in tqdm(loader):
            i += 1
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            loss.backward()
            avg_loss += loss.item()
            optim.step()
        print("Training loss:",avg_loss/i)
    model.eval()

In [5]:
train, test = fetch_datasets()

model_ = pipe.model
model_.to(device)
optim = AdamW(model_.parameters(), lr=1e-5)
tune(model_, optim, train)




7613


100%|██████████| 476/476 [00:57<00:00,  8.28it/s]


Training loss: 0.03703165094566047


100%|██████████| 476/476 [00:57<00:00,  8.29it/s]


Training loss: 0.04060422151356959


100%|██████████| 476/476 [00:57<00:00,  8.24it/s]


Training loss: 0.03929158923468907


100%|██████████| 476/476 [00:57<00:00,  8.31it/s]


Training loss: 0.03661188566325529


100%|██████████| 476/476 [00:58<00:00,  8.17it/s]


Training loss: 0.03515249320169597


100%|██████████| 476/476 [00:57<00:00,  8.22it/s]


Training loss: 0.0339925576297287


100%|██████████| 476/476 [00:57<00:00,  8.24it/s]


Training loss: 0.03368169449377714


100%|██████████| 476/476 [00:56<00:00,  8.35it/s]


Training loss: 0.03243399187568921


100%|██████████| 476/476 [00:59<00:00,  8.03it/s]


Training loss: 0.03200277332189599


100%|██████████| 476/476 [00:59<00:00,  8.01it/s]


Training loss: 0.03164365300785177


100%|██████████| 476/476 [00:57<00:00,  8.21it/s]


Training loss: 0.030831690099735652


100%|██████████| 476/476 [00:58<00:00,  8.11it/s]


Training loss: 0.030342237780407705


100%|██████████| 476/476 [01:01<00:00,  7.73it/s]


Training loss: 0.030208566513572482


100%|██████████| 476/476 [00:59<00:00,  7.98it/s]


Training loss: 0.02937893694615213


100%|██████████| 476/476 [00:58<00:00,  8.14it/s]

Training loss: 0.0290130530768476





In [20]:
with open('Data/nlp-getting-started/test.csv') as polarity_data:
    polarity_array = pd.read_csv(polarity_data)
    test_texts = polarity_array['text'].values.tolist()
def predict(model, dataset):
    model.eval()
    preds = []
    i = 0
    for batch in tqdm(dataset):
        encoded_review = tokenizer.encode_plus(
            batch,
            add_special_tokens=True,
            return_token_type_ids=False,
            truncation=True, padding=True,
            return_attention_mask=True,
            return_tensors='pt',
            )
        input_ids = encoded_review['input_ids'].to(device)
        attention_mask = encoded_review['attention_mask'].to(device)

        output = model(input_ids, attention_mask)
        _, prediction = torch.max(output[0], dim=1)
        preds.append((polarity_array['id'].values.tolist()[i] ,int(prediction)))
        i += 1
    return preds

In [21]:
ans = predict(model_, test_texts)
ans

100%|██████████| 3263/3263 [00:21<00:00, 148.64it/s]


[(0, 1),
 (2, 1),
 (3, 1),
 (9, 1),
 (11, 1),
 (12, 1),
 (21, 0),
 (22, 0),
 (27, 0),
 (29, 0),
 (30, 0),
 (35, 0),
 (42, 0),
 (43, 0),
 (45, 0),
 (46, 1),
 (47, 0),
 (51, 1),
 (58, 0),
 (60, 0),
 (69, 0),
 (70, 0),
 (72, 0),
 (75, 1),
 (84, 0),
 (87, 0),
 (88, 0),
 (90, 0),
 (94, 0),
 (99, 1),
 (101, 0),
 (103, 0),
 (106, 1),
 (108, 0),
 (111, 1),
 (115, 0),
 (116, 1),
 (122, 0),
 (123, 0),
 (124, 1),
 (125, 0),
 (127, 1),
 (140, 0),
 (142, 1),
 (147, 0),
 (148, 0),
 (150, 0),
 (152, 0),
 (154, 1),
 (155, 0),
 (166, 0),
 (167, 0),
 (169, 1),
 (177, 0),
 (179, 0),
 (181, 0),
 (186, 0),
 (188, 0),
 (189, 0),
 (192, 0),
 (200, 1),
 (202, 0),
 (206, 1),
 (207, 1),
 (214, 1),
 (217, 1),
 (223, 0),
 (224, 1),
 (227, 1),
 (228, 1),
 (230, 0),
 (233, 1),
 (234, 1),
 (236, 1),
 (239, 1),
 (250, 1),
 (255, 0),
 (257, 0),
 (259, 1),
 (275, 1),
 (278, 0),
 (282, 0),
 (284, 0),
 (286, 1),
 (288, 1),
 (292, 0),
 (295, 0),
 (300, 0),
 (304, 1),
 (305, 1),
 (306, 1),
 (308, 0),
 (311, 0),
 (317, 0),


In [25]:
with open("Data/nlp-getting-started/results.txt", 'w') as file:
    file.write("id,target")
    for id, y in ans:
        file.write(str(id) + "," + str(y) + "\n")


In [26]:
tune(model_, optim, train)

7613


100%|██████████| 476/476 [00:56<00:00,  8.44it/s]


Training loss: 0.02716784151636159


100%|██████████| 476/476 [00:57<00:00,  8.34it/s]


Training loss: 0.024040912460552764


100%|██████████| 476/476 [00:57<00:00,  8.28it/s]


Training loss: 0.023923481996309358


100%|██████████| 476/476 [00:58<00:00,  8.20it/s]


Training loss: 0.022906982383509394


100%|██████████| 476/476 [00:58<00:00,  8.08it/s]


Training loss: 0.0228723414319139


100%|██████████| 476/476 [00:57<00:00,  8.22it/s]


Training loss: 0.022353497110365352


100%|██████████| 476/476 [00:58<00:00,  8.16it/s]


Training loss: 0.021474640125155535


100%|██████████| 476/476 [00:58<00:00,  8.14it/s]


Training loss: 0.020708145139887158


100%|██████████| 476/476 [00:58<00:00,  8.14it/s]


Training loss: 0.020356172990078126


100%|██████████| 476/476 [00:58<00:00,  8.07it/s]


Training loss: 0.0197650981998624


100%|██████████| 476/476 [01:01<00:00,  7.69it/s]


Training loss: 0.019511855635435334


100%|██████████| 476/476 [01:02<00:00,  7.64it/s]


Training loss: 0.019006742909054776


100%|██████████| 476/476 [00:58<00:00,  8.13it/s]


Training loss: 0.01859222596948216


100%|██████████| 476/476 [00:58<00:00,  8.11it/s]


Training loss: 0.018135466071914182


100%|██████████| 476/476 [00:58<00:00,  8.10it/s]

Training loss: 0.017491735898537236





In [29]:
tune(model_, optim, train, epochs=10)

7613


100%|██████████| 476/476 [00:58<00:00,  8.13it/s]


Training loss: 0.013112363197025934


100%|██████████| 476/476 [01:03<00:00,  7.55it/s]


Training loss: 0.01332581408798085


100%|██████████| 476/476 [01:01<00:00,  7.69it/s]


Training loss: 0.01353422429685709


100%|██████████| 476/476 [01:00<00:00,  7.90it/s]


Training loss: 0.014430892224587362


100%|██████████| 476/476 [01:00<00:00,  7.88it/s]


Training loss: 0.014471119309230538


100%|██████████| 476/476 [01:00<00:00,  7.91it/s]


Training loss: 0.01335484397944457


100%|██████████| 476/476 [01:00<00:00,  7.90it/s]


Training loss: 0.012455896564041464


100%|██████████| 476/476 [01:01<00:00,  7.71it/s]


Training loss: 0.01225980437187706


100%|██████████| 476/476 [01:03<00:00,  7.52it/s]


Training loss: 0.012336094889556958


100%|██████████| 476/476 [01:01<00:00,  7.78it/s]

Training loss: 0.012206876997326633





In [30]:


ans = predict(model_, test_texts)
with open("Data/nlp-getting-started/results.txt", 'w') as file:
    file.write("id,target")
    for id, y in ans:
        file.write(str(id) + "," + str(y) + "\n")



100%|██████████| 3263/3263 [00:21<00:00, 152.24it/s]


In [31]:
with open("Data/nlp-getting-started/results.txt", 'w') as file:
    file.write("id,target")
    for id, y in ans:
        file.write(str(id) + "," + str(y) + "\n")