In [1]:
#@title
!pip install transformers
!pip install datasets
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us

In [2]:
import torch
import evaluate
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, DataCollatorWithPadding, TrainingArguments, AdamW, get_scheduler
from datasets import load_dataset, Dataset
import datasets

import nltk
import re
import string

from tqdm.auto import tqdm

In [3]:
#Get preprocessed comments
df = pd.read_csv("Comments_prep.csv")
eval_dataset = []
for row in df.iterrows():
    one_video = []
    for comment in row:
      if(type(comment) == int):
        continue
      temp_list = [item for item in comment if not(pd.isnull(item)) == True] #remove NaN comments
      for entry in temp_list:
        if type(entry) is not str: 
          temp_list.remove(entry) #remove column numbers from dataFrame
    eval_dataset.append(temp_list)

In [4]:

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

def tokenize(data):
    return tokenizer(data["text"], padding=True, truncation=True, return_tensors='pt')

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [5]:
#device should be cuda, because with cpu the training and testing takes couple of hours and with cuda only few minutes
#(for google colab) bearbeiten - notebook-einstellungen - hardwarebeschleuniger - change to gpu 
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
print(device)


cuda


In [6]:
outputs_before = []

with torch.no_grad():
    for video in eval_dataset:
      output_one_video = []
      for comment in video:
        input = tokenizer(comment, padding=True, truncation=True, return_tensors='pt')
        input.to(device)
        output = model(**input)
        #Get positive or negative evaluation of comment
        label_id = torch.argmax(output.logits).item()
        output_one_video.append(label_id)
        # label = model.config.id2label[label_id]
        # print(label_id, label)

      outputs_before.append(output_one_video)

print(len(outputs_before))
outputs_before_df = pd.DataFrame(outputs_before)
outputs_before_df.to_csv('Evaluation_before_finetuning.csv', encoding='utf-8')

183


In [7]:
dataset_tweets_prep = pd.read_csv("Tweets_prep.csv")
dataset_tweets_prep = dataset_tweets_prep.mask(dataset_tweets_prep.eq('None')).dropna() # remove comments where None is stored, otherwise tokenizer throws error

dataset_tweets_prep = Dataset(pa.Table.from_pandas(dataset_tweets_prep))
training_data2, test_data2 = dataset_tweets_prep.train_test_split(test_size=0.2).values()
dataset_tweets_prep = datasets.DatasetDict({"train":training_data2,"test":test_data2})

tokenized_datasets_tweets_prep = dataset_tweets_prep.map(tokenize, batched=True)
tokenized_datasets_tweets_prep = tokenized_datasets_tweets_prep.remove_columns(['__index_level_0__', 'text', 'Unnamed: 0'])
tokenized_datasets_tweets_prep.set_format("torch")
print(tokenized_datasets_tweets_prep)

collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/22 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 21921
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 5481
    })
})


In [8]:
batch_size = 8

train_dataloader = torch.utils.data.DataLoader(tokenized_datasets_tweets_prep["train"], batch_size=batch_size, collate_fn=collator)
test_dataloader = torch.utils.data.DataLoader(tokenized_datasets_tweets_prep["test"], batch_size=batch_size, collate_fn=collator)

In [9]:
def train(dataloader, model, optimizer, batch_size, progress_bar):
  total_loss = 0
  true_positive = 0
  true_negative = 0
  false_positive = 0
  false_negative = 0
  model.train()
  for batch in list(dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}
    preds = model(**batch)
    loss = preds.loss

    #Backpropagation
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss
    progress_bar.update(1)

    predicted = torch.argmax(preds.logits, -1)
    references= batch["labels"]
    for i in range(len(predicted)):
      if(predicted[i] == 1 and references[i] == 1):
        true_positive += 1
      if(predicted[i] == 1 and references[i] == 0):
        false_negative += 1
      if(predicted[i] == 0 and references[i] == 1):
        false_positive += 1
      if(predicted[i] == 0 and references[i] == 0):
        true_negative += 1  
  loss_value = (total_loss/len(dataloader)).item()
  precision = true_positive/(true_positive + false_positive)
  recall = true_positive/(true_positive + false_negative)
  f_score = 2*precision*recall / (precision + recall)
  accuracy = (true_positive + true_negative)/(true_positive + true_negative + false_positive + false_negative)
  specificity = true_negative / (true_negative + false_positive)
  print("Training - Loss value:", loss_value, "Precision:", precision, 
        "Recall:", recall, "Specificity:", specificity, "F_score:", f_score, "Accuracy:", accuracy)

In [10]:
def test(dataloader, model, batch_size):
  model.eval()
  total_loss = 0 
  true_positive = 0
  true_negative = 0
  false_positive = 0
  false_negative = 0
  for batch in dataloader:
      batch = {k: v.to(device) for k, v in batch.items()}
      with torch.no_grad():
        preds = model(**batch)
      loss = preds.loss
      total_loss += loss
      predicted = torch.argmax(preds.logits, -1)
      references= batch["labels"]
      for i in range(len(predicted)):
        if(predicted[i] == 1 and references[i] == 1):
          true_positive += 1
        if(predicted[i] == 1 and references[i] == 0):
          false_positive += 1
        if(predicted[i] == 0 and references[i] == 1):
          false_negative += 1
        if(predicted[i] == 0 and references[i] == 0):
          true_negative += 1

  print(true_positive, true_negative, false_negative, false_positive)
  loss_value = (total_loss/len(dataloader)).item()
  precision = true_positive/(true_positive + false_positive)
  recall = true_positive/(true_positive + false_negative)
  f_score = 2*precision*recall / (precision + recall)
  accuracy = (true_positive + true_negative)/(true_positive + true_negative + false_positive + false_negative)
  specificity = true_negative / (true_negative + false_positive)
  print("Testing - Loss value:", loss_value, "Precision:", precision, 
        "Recall:", recall, "Specificity:", specificity, "F_score:", f_score, "Accuracy:", accuracy)

In [11]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
progress_bar = tqdm(range(num_epochs * len(train_dataloader)))

for t in range(num_epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, optimizer, batch_size, progress_bar)
    test(test_dataloader, model, batch_size)
    

  0%|          | 0/8223 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 1
-------------------------------
Training - Loss value: 0.3925461769104004 Precision: 0.9227292979441156 Recall: 0.8566954260725682 Specificity: 0.7571514302860572 F_score: 0.8884871142708302 Accuracy: 0.8339947995073217
3650 993 265 573
Testing - Loss value: 0.3586781919002533 Precision: 0.8643144683874023 Recall: 0.9323116219667944 Specificity: 0.6340996168582376 F_score: 0.8970262963873187 Accuracy: 0.8471081919357781
Epoch 2
-------------------------------
Training - Loss value: 0.2891225814819336 Precision: 0.944179237476927 Recall: 0.8999029361805387 Specificity: 0.8386978112929925 F_score: 0.9215095511725424 Accuracy: 0.884722412298709
3517 1091 398 475
Testing - Loss value: 0.4281582534313202 Precision: 0.8810120240480962 Recall: 0.8983397190293742 Specificity: 0.6966794380587484 F_score: 0.889591501201467 Accuracy: 0.8407224958949097
Epoch 3
-------------------------------
Training - Loss value: 0.17943182587623596 Precision: 0.9611736999554452 Recall: 0.943105171121658

In [12]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

temp_X = tokenized_datasets_tweets_prep["train"]["input_ids"]
y = tokenized_datasets_tweets_prep["train"]["labels"].tolist()
max_len_x = 48

#change type of temp_X from tensor to list and make all entries the same length
X = []
for i in range(len(temp_X)):
  temp_list = temp_X[i].tolist()
  if(len(temp_list) != max_len_x):
    for i in range(max_len_x - len(temp_list)):
     temp_list.append(0)
  X.append(temp_list)

print(type(X))
for i in range(5):
  print(X[i])
print(type(y))
for i in range(5):
  print(y[i])

#Implementation of model from https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier
baseline_model = make_pipeline(StandardScaler(), SGDClassifier(max_iter=1000, tol=1e-3))
baseline_model.fit(X, y)

<class 'list'>
[101, 19610, 2361, 8416, 28851, 6854, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[101, 4931, 5580, 5958, 3407, 8823, 2919, 6207, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[101, 5958, 2123, 2102, 3105, 5353, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[101, 2039, 18570, 7760, 9130, 2814, 3334, 24927, 2101, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[101, 2126, 2147, 2175, 2078, 6583, 3335, 2236, 2902, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
<class 'list'>
0
1
1
1
0


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdclassifier', SGDClassifier())])

In [13]:
temp_input = tokenized_datasets_tweets_prep["test"]["input_ids"]
input = []
max_len_x = 48

for i in range(len(temp_input)):
  temp_list = temp_input[i].tolist()
  if(len(temp_list) != max_len_x):
    for i in range(max_len_x - len(temp_list)):
     temp_list.append(0)
  input.append(temp_list)

predicted = baseline_model.predict(input)
print(predicted)
references = tokenized_datasets_tweets_prep["test"]["labels"].tolist()

true_positive = 0
true_negative = 0
false_positive = 0
false_negative = 0
for i in range(len(predicted)):
  if(predicted[i] == 1 and references[i] == 1):
    true_positive += 1
  if(predicted[i] == 1 and references[i] == 0):
    false_positive += 1
  if(predicted[i] == 0 and references[i] == 1):
    false_negative += 1
  if(predicted[i] == 0 and references[i] == 0):
    true_negative += 1

print(true_positive, true_negative, false_negative, false_positive)
precision = true_positive/(true_positive + false_positive)
recall = true_positive/(true_positive + false_negative)
f_score = 2*precision*recall / (precision + recall)
accuracy = (true_positive + true_negative)/(true_positive + true_negative + false_positive + false_negative)
print("Baseline model - Precision:", precision, "Recall:", recall,
      "F_score:", f_score, "Accuracy:", accuracy)

[1 1 1 ... 1 1 1]
3877 12 38 1554
Baseline model - Precision: 0.7138648499355551 Recall: 0.9902937420178799 F_score: 0.8296597474855553 Accuracy: 0.7095420543696406


In [14]:
outputs_after = []

with torch.no_grad():
    for video in eval_dataset:
      output_one_video = []
      for comment in video:
        input = tokenizer(comment, padding=True, truncation=True, return_tensors='pt')
        input.to(device)
        output = model(**input)
        #Get positive or negative evaluation of comment
        label_id = torch.argmax(output.logits).item()
        output_one_video.append(label_id)
        # label = model.config.id2label[label_id]
        # print(label_id, label)

      outputs_after.append(output_one_video)

print(len(outputs_after))
outputs_after_df = pd.DataFrame(outputs_after)
outputs_after_df.to_csv('Evaluation_after_finetuning.csv', encoding='utf-8')

183


In [15]:
total_same_eval = 0
total_diff_eval = 0
total_pos_before = 0
total_neg_before = 0
total_pos_after = 0
total_neg_after = 0

for idx1 in range(len(outputs_before)):
  for idx2 in range(len(outputs_before[idx1])):
    if outputs_before[idx1][idx2] == outputs_after[idx1][idx2]:
      total_same_eval += 1
    else:
      print("Comment:", eval_dataset[idx1][idx2], "Before:", outputs_before[idx1][idx2], "After:", outputs_after[idx1][idx2])
      total_diff_eval += 1
    if(outputs_after[idx1][idx2] == 0):
      total_neg_after += 1
    if(outputs_after[idx1][idx2] == 1):
      total_pos_after += 1
    if(outputs_before[idx1][idx2] == 0):
      total_neg_before += 1
    if(outputs_before[idx1][idx2] == 0):
      total_pos_before += 1

print("Total same evaluation:", total_same_eval)
print("Total different evaluation:", total_diff_eval)
print("Before - positive:", total_pos_before, "negative:", total_neg_before)
print("After - positive:", total_pos_after, "negative:", total_neg_after)

Comment: Wow stadium recipe killed Before: 0 After: 1
Comment: What syrup Before: 0 After: 1
Comment: It might nice make syrup cinnamon bark isnt grit drink could also make ginger syrup shrub place fresh ginger reason Otherwise could double strain drink pour Before: 0 After: 1
Comment: I want thank help recipe Ideas I love AZ Anyway said send message Instagram You said wanted know think But I send message ’ respond anything I send question Instagram never responded That ’ Before: 0 After: 1
Comment: Mmmm I need drink year around lol Before: 0 After: 1
Comment: Pulled wine delicious I say tho rather expensive hardtofind ingredient I feel like u would rather buy grocery store It save much money time I really really love UL lately Ive missing story meal recipe anyways I hope peaceful Christmas Before: 0 After: 1
Comment: Yyyy Sadie short Before: 0 After: 1
Comment: whats fun drinking fake wine doesnt hit Before: 0 After: 1
Comment: I love I thinking day I wanted find nonalcoholic pulled w