In [None]:
!pip install transformers
!pip install datasets
!pip install accelerate -U
!pip install loguru



In [None]:
!pip install tqdm

import tqdm



In [None]:
# Mount Google Drive inside the Colab filesystem at /content/drive/.
# The pickle written at the end of the notebook goes to a path under
# drive/MyDrive, so this mount has to be in place before that step runs.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Download the two zipped tweet files from Google Drive by file ID.
# According to the recorded output, the first ID resolves to
# processed_neg_tweets_08.txt.zip and the second to
# processed_pos_tweets_08.txt.zip.
!gdown 1h74ECRl7Aqb7zZk6WJch-xmqhW6mzH3-
!gdown 1x9BRcMcdobE23K2vyLFpiqVZhHXtFS2S

Downloading...
From: https://drive.google.com/uc?id=1h74ECRl7Aqb7zZk6WJch-xmqhW6mzH3-
To: /content/processed_neg_tweets_08.txt.zip
100% 39.3M/39.3M [00:00<00:00, 159MB/s]
Downloading...
From: https://drive.google.com/uc?id=1x9BRcMcdobE23K2vyLFpiqVZhHXtFS2S
To: /content/processed_pos_tweets_08.txt.zip
100% 30.5M/30.5M [00:00<00:00, 139MB/s]


In [None]:
!unzip processed_neg_tweets_08.txt.zip
!unzip processed_pos_tweets_08.txt.zip

Archive:  processed_neg_tweets_08.txt.zip
replace processed_neg_tweets_08.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: processed_neg_tweets_08.txt  
Archive:  processed_pos_tweets_08.txt.zip
replace processed_pos_tweets_08.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: processed_pos_tweets_08.txt  


In [None]:
import logging
import torch
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import sys
import os
from sklearn.utils import shuffle
from datasets import DatasetDict, Dataset
from sklearn.metrics import accuracy_score, f1_score
from loguru import logger
import pickle

# Append an empty entry to the module search path. Python treats an empty
# sys.path entry as the current working directory.
# NOTE(review): nothing in the code shown here imports a local module, so
# this may be a leftover; confirm before removing.
sys.path.append('')

def save_dictionary_as_pickle(dictionary, filename):
    """Pickle ``dictionary`` and write it to ``filename``.

    The target is opened in binary write mode, so an existing file at that
    path is replaced. Nothing is returned.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(dictionary, sink)

def load_pickle_as_dictionary(filename):
    """Open ``filename`` in binary mode and return the unpickled object.

    NOTE: unpickling can execute arbitrary code, so only call this on files
    that come from a trusted source.
    """
    with open(filename, mode='rb') as source:
        return pickle.load(source)

def load_model_from_checkpoint(path_to_checkpoint):
    """Load a two-label sequence-classification model from a checkpoint.

    Args:
        path_to_checkpoint: Checkpoint location relative to the global
            ``experiment_path``. The two strings are concatenated as-is;
            no path separator is inserted between them.

    Returns:
        The model object returned by
        ``AutoModelForSequenceClassification.from_pretrained``.
    """
    checkpoint_location = experiment_path + path_to_checkpoint
    loaded_model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint_location,
        num_labels=2,
        local_files_only=False,
        ignore_mismatched_sizes=True,
    )
    print(f"Loaded model from: {checkpoint_location}")
    return loaded_model

def numpy_softmax(model_preds):
    """Convert raw model outputs (logits) into softmax probabilities.

    Args:
        model_preds: Array-like of logits. A 2-D input is normalised row by
            row (along axis 1). A 1-D input is treated as a single example
            and normalised over its only axis.

    Returns:
        A ``numpy.ndarray`` with the same shape as the input whose entries
        along the normalised axis are non-negative and sum to 1.
    """
    scores = np.asarray(model_preds)
    axis = 1 if scores.ndim > 1 else 0
    # Subtracting the maximum does not change the softmax result but keeps
    # np.exp from overflowing on large logits.
    peak = np.max(scores, axis=axis, keepdims=True)
    exponentials = np.exp(scores - peak)
    normaliser = np.sum(exponentials, axis=axis, keepdims=True)
    return exponentials / normaliser

def load_tweets(file_path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Each list element is one line of the file with trailing newline
    characters stripped; any other whitespace is kept as-is.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        return [line.rstrip('\n') for line in source]

def preprocess_function(examples, tok_max_length):
    """Tokenize the ``"tweet"`` field of ``examples`` with the global ``tokenizer``.

    Sequences are truncated to at most ``tok_max_length`` tokens and padding
    is enabled (``padding=True``). The tokenizer's return value is passed
    back unchanged.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": tok_max_length,
        "padding": True,
    }
    return tokenizer(examples["tweet"], **tokenizer_options)

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores). The predicted class is the
            argmax of the scores over the last axis.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    gold_labels = pred.label_ids
    predicted_labels = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(gold_labels, predicted_labels),
        "f1": f1_score(gold_labels, predicted_labels, average="weighted"),
    }

# ---- Run configuration ------------------------------------------------------
model_name = "vinai/bertweet-base"  # Hugging Face model identifier
batch_size = 32                     # per-device batch size for train and eval
seed = 12222
# Mixed-precision (fp16) training is only switched on when a CUDA device is
# available, so TrainingArguments does not reject the configuration on a
# CPU-only runtime.
fp16 = torch.cuda.is_available()
out = "./logging"                   # handed to TrainingArguments as output_dir
epochs = 1
lr = 1e-4                           # learning rate
wd = 0.005                          # weight decay
tok_max_length = 128                # max tokens per tweet after truncation
train_val_ratio = 0.99              # NOTE(review): not used in the code shown here

torch.cuda.empty_cache()
time_run = time.time()              # start time for the runtime report at the end

# ---- Output folders ---------------------------------------------------------
project_path = "./"
experiment_path = "./" + "Experiments/"

# NOTE: model_name contains a "/", so this name spans two directory levels.
experiment_date_for_folder_name = "experiment-" + model_name + "_" + "default"

experiments_results_path = experiment_path + experiment_date_for_folder_name
os.makedirs(experiments_results_path, exist_ok=True)
# NOTE(review): the TrainingArguments in this notebook use `out` as
# output_dir, not this path, so the message printed below may not match
# where checkpoints are actually written.
checkpoints_path = experiments_results_path + "/checkpoints/"
print("The project path is: ", project_path)
print("The experiment path is: ", experiment_path)
print("The model checkpoints will be saved at: ", checkpoints_path, "\n")

# for the submission
test_results_path = experiments_results_path + "/test_results/"
os.makedirs(test_results_path, exist_ok=True)

# ---- Device and seeding -----------------------------------------------------
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device {device}')
np.random.seed(seed)
torch.manual_seed(seed)

# Tokenizer matching the configured pretrained model.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Read the pre-processed tweets, one tweet per line.
train_pos_tweets = load_tweets('processed_pos_tweets_08.txt')
# Debug option: uncomment to work on a small subset.
#train_pos_tweets = train_pos_tweets[:2000]
train_neg_tweets = load_tweets('processed_neg_tweets_08.txt')
#train_neg_tweets = train_neg_tweets[:2000]

# Label convention: negative tweets -> 0, positive tweets -> 1.
train_neg_labels = [0 for _ in train_neg_tweets]
train_pos_labels = [1 for _ in train_pos_tweets]

# Positives first, then negatives; labels follow the same order.
train_tweets = [*train_pos_tweets, *train_neg_tweets]
train_labels = [*train_pos_labels, *train_neg_labels]

# Shuffle tweets and labels together with a fixed random state.
train_tweets, train_labels = shuffle(train_tweets, train_labels, random_state=10)
data = pd.DataFrame({'tweet': train_tweets, 'label': train_labels})

# Plain Python lists; X[i] and y[i] describe the same example.
X = list(data["tweet"])
y = list(data["label"])


# Wrap texts and labels in a Hugging Face Dataset.
train_data = dict(tweet=X, label=y)
train_dataset = Dataset.from_dict(train_data)

# Tokenize in batches via Dataset.map.
tokenized_dataset = train_dataset.map(
    lambda batch: preprocess_function(batch, tok_max_length),
    batched=True,
)

# Collator that pads each batch, plus the two-label classification model
# placed on the selected device.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    ignore_mismatched_sizes=True,
)
model = model.to(device)


# Step interval passed to TrainingArguments below and echoed for reference.
# NOTE(review): logging_strategy is "epoch" below; per the Transformers docs
# logging_steps only applies to the "steps" strategy - confirm intent.
logging_steps = 4000
print(logging_steps)
training_args = TrainingArguments(
    # where the Trainer writes its outputs and logs
    output_dir=out,
    logging_dir='./logs',
    save_total_limit=2,
    # optimisation
    learning_rate=lr,
    weight_decay=wd,
    warmup_steps=500,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    fp16=fp16,
    seed=seed,
    # evaluate, log and save once per epoch
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    logging_steps=logging_steps,
    save_strategy="epoch",
    load_best_model_at_end=True,
    # keep progress bars visible
    disable_tqdm=False,
)


# Build the Trainer. The same tokenized dataset is used for both training
# and evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Fine-tune, logging when training starts and ends.
logger.info('Started training')
trainer.train()
logger.info('Ended training')

# Print the current Unix timestamp and a summary of the dataset.
print(time.time())
print(tokenized_dataset)

# Run the trained model over the full tokenized dataset.
results = trainer.predict(tokenized_dataset)


# Softmax over the raw predictions. Despite the variable name these are
# probabilities; column 1 belongs to label 1.
logits = numpy_softmax(results.predictions)


# Per-example distance between the probability of label 1 and the true label.
differences = [np.abs(row[1] - label) for row, label in zip(logits, y)]

# Map each tweet text to its score and label. Identical tweet texts share one
# key, so a later occurrence replaces an earlier one.
scores_dict = {
    tweet: {"score": score, "label": label}
    for tweet, score, label in zip(X, differences, y)
}

# Persist the scores on the mounted Drive.
save_dictionary_as_pickle(scores_dict, 'drive/MyDrive/new_scores_dict.pkl')

# Report the wall-clock time since time_run was recorded.
time_total = time.time() - time_run
print(f"The program took {str(time_total/60/60)[:6]} Hours or {str(time_total/60)[:6]} minutes to run.")

The project path is:  ./
The experiment path is:  ./Experiments/
The model checkpoints will be saved at:  ./Experiments/experiment-vinai/bertweet-base_default/checkpoints/ 

Using device cuda


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/2267098 [00:00<?, ? examples/s]

Downloading pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

4000




Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.2527,0.184062,0.925503,0.925504


[32m2023-07-06 10:47:53.132[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 169>[0m:[36m169[0m - [1mEnded training[0m


1688640473.1338677
Dataset({
    features: ['tweet', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2267098
})
