In [None]:
# Mount Google Drive at /content/drive/ so later cells can write files under it.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Disabled downloads (the files behind these ids are not identified anywhere in this notebook).
#!gdown 1h74ECRl7Aqb7zZk6WJch-xmqhW6mzH3-
#!gdown 1x9BRcMcdobE23K2vyLFpiqVZhHXtFS2S
#!gdown 1KHJpzofsASSa0DfBZINmqsFQ1FgRvj_J
#!gdown 1SbKL3cPZTw8jjq-L_O9mzZHcMCBPEEdR
# Per the recorded output of this cell, the three active ids below were saved as
# /content/test_data.txt, /content/scores_dict.pkl and /content/train_val_folds.pkl (in that order).
!gdown 1zbhaZYGWg5BdjpuofGs01ZXXQ9mQzJKy
!gdown 1DbBDg0tdx1GSRJ0WHgBQTsRwIPyGYCMP
!gdown 1x44fAZLzCFxWaPF7R0MX9V_V744u7Std

Downloading...
From: https://drive.google.com/uc?id=1zbhaZYGWg5BdjpuofGs01ZXXQ9mQzJKy
To: /content/test_data.txt
100% 817k/817k [00:00<00:00, 182MB/s]
Downloading...
From: https://drive.google.com/uc?id=1DbBDg0tdx1GSRJ0WHgBQTsRwIPyGYCMP
To: /content/scores_dict.pkl
100% 247M/247M [00:01<00:00, 153MB/s]
Downloading...
From: https://drive.google.com/uc?id=1x44fAZLzCFxWaPF7R0MX9V_V744u7Std
To: /content/train_val_folds.pkl
100% 3.03G/3.03G [00:26<00:00, 114MB/s]


**Install dependencies**

In [None]:
# Install the third-party packages imported by the training cell.
# NOTE(review): no versions are pinned. The recorded output of this cell shows
# transformers 4.30.2 being downloaded; consider pinning so a fresh run resolves
# the same release the training code was written against.
!pip install transformers
!pip install datasets
# -U upgrades accelerate when an older release is already present.
!pip install accelerate -U
!pip install loguru

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.2-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.5/268.5 kB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m89.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m64.4 MB/s[0m eta [36m0:00:0

### Training

In [None]:
import logging
import torch
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import sys
import os
from sklearn.utils import shuffle
from datasets import DatasetDict, Dataset
from sklearn.metrics import accuracy_score, f1_score
from loguru import logger
import pickle

sys.path.append('')

def load_model_from_checkpoint(path_to_checkpoint):
    '''Load a two-label sequence-classification model from a saved checkpoint.

    path_to_checkpoint is appended verbatim to the module-level `experiment_path`
    (plain string concatenation, no separator is inserted), and the resulting
    location is handed to `AutoModelForSequenceClassification.from_pretrained`.
    The location is printed and the loaded model is returned.
    '''
    checkpoint_location = experiment_path + path_to_checkpoint
    loaded_model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint_location,
        num_labels=2,
        local_files_only=False,
        ignore_mismatched_sizes=True,
    )
    print(f"Loaded model from: {checkpoint_location}")
    return loaded_model

def numpy_softmax(model_preds):
    '''Turn raw model outputs (logits) into per-row probabilities via softmax.

    model_preds is a 2-D array-like with one row per sample; the softmax is taken
    along axis 1, so every returned row is non-negative and sums to 1. The result
    has the same shape as the input.

    Note that the output is a probability distribution, not logits.
    '''
    # Subtracting the row maximum leaves the result unchanged mathematically but
    # keeps np.exp from overflowing on large scores.
    row_max = np.max(model_preds, axis=1, keepdims=True)
    shifted_exp = np.exp(model_preds - row_max)
    row_total = np.sum(shifted_exp, axis=1, keepdims=True)
    return shifted_exp / row_total

def load_tweets(file_path):
    '''Read a UTF-8 text file and return its lines as a list of strings.

    Each line becomes one entry; only trailing newline characters are removed,
    any other whitespace is kept as-is.
    '''
    with open(file_path, 'r', encoding='utf-8') as tweet_file:
        return [line.rstrip('\n') for line in tweet_file]

def preprocess_function(examples, tok_max_length):
    '''Tokenize the "tweet" entries of a batch with the module-level `tokenizer`.

    Sequences are truncated to at most tok_max_length tokens and padded
    (padding=True); the tokenizer's output is returned unchanged.
    '''
    tweet_batch = examples["tweet"]
    encoded_batch = tokenizer(
        tweet_batch,
        truncation=True,
        max_length=tok_max_length,
        padding=True,
    )
    return encoded_batch


def save_dictionary_as_pickle(dictionary, filename):
    '''Serialize `dictionary` with pickle into the file at `filename` (overwriting it).'''
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(dictionary)

def load_pickle_as_dictionary(filename):
    '''Unpickle and return the object stored in the file at `filename`.

    Unpickling can execute arbitrary code, so this should only be pointed at
    files from a trusted source.
    '''
    with open(filename, 'rb') as source:
        return pickle.load(source)


def compute_metrics(pred):
    '''Compute accuracy and weighted F1 for a prediction object.

    Reads the true labels from `pred.label_ids` and takes the argmax over the
    last axis of `pred.predictions` as the predicted class. Returns a dict with
    the keys "accuracy" and "f1".
    '''
    true_labels = pred.label_ids
    predicted_labels = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(true_labels, predicted_labels),
        "f1": f1_score(true_labels, predicted_labels, average="weighted"),
    }

def get_mispredicted_samples(X, y_pred, y_true):
    '''Collect the samples whose predicted label differs from the true label.

    The three arguments are indexed in parallel over range(len(X)). Returns a
    pair (samples, true_labels) holding, in the original order, every X[i] and
    y_true[i] for which y_pred[i] != y_true[i].
    '''
    wrong_positions = [idx for idx in range(len(X)) if y_pred[idx] != y_true[idx]]
    mispredicted_X = [X[idx] for idx in wrong_positions]
    mispredicted_Y = [y_true[idx] for idx in wrong_positions]
    return mispredicted_X, mispredicted_Y

def sort_by_difficulty(dataset, dict_path):
    '''Reorder a dataset by ascending score and return it tokenized.

    dataset must expose parallel 'tweet' and 'label' sequences. dict_path points
    to a pickle that maps each tweet text to a dict containing a 'score' entry;
    a tweet missing from that mapping raises KeyError. Samples are ordered from
    the lowest to the highest score (ties keep their incoming order), wrapped in
    a new `Dataset` and tokenized with `preprocess_function` using the
    module-level `tok_max_length`.
    '''
    tweets = dataset['tweet']
    labels = dataset['label']
    score_lookup = load_pickle_as_dictionary(dict_path)

    # One (tweet, label, score) record per sample, in the incoming order.
    records = [
        (tweet, labels[position], score_lookup[tweet]['score'])
        for position, tweet in enumerate(tweets)
    ]
    # list.sort is stable, so equal scores stay in their incoming order.
    records.sort(key=lambda record: record[2])

    ordered_dataset = Dataset.from_dict({
        "tweet": [record[0] for record in records],
        "label": [record[1] for record in records],
    })
    return ordered_dataset.map(
        lambda examples: preprocess_function(examples, tok_max_length),
        batched=True,
    )

def interleave(train_dataset_sorted):
    '''Merge positive and negative samples so the class ratio stays steady along the sequence.

    train_dataset_sorted must expose parallel 'tweet' and 'label' sequences.
    Samples with label 1 and label 0 are split into two streams (each keeps its
    incoming order; any other label value is dropped). The streams are then
    merged greedily: a positive sample is emitted whenever the fraction of
    positives emitted so far is below the overall positive fraction, otherwise a
    negative one. The merged samples are wrapped in a new `Dataset` and tokenized
    with `preprocess_function` using the module-level `tok_max_length`.

    A dataset that contains only one of the two classes is passed through in its
    incoming order instead of raising.
    '''
    tweets = train_dataset_sorted['tweet']
    labels = train_dataset_sorted['label']

    # Split per class while preserving the incoming order within each class.
    positives = [(tweet, label) for tweet, label in zip(tweets, labels) if label == 1]
    negatives = [(tweet, label) for tweet, label in zip(tweets, labels) if label == 0]

    total = len(positives) + len(negatives)
    true_ratio = len(positives) / total if total else 0.0

    merged = []
    pos_count, neg_count = 0, 0
    running_ratio = 0
    for _ in range(total):
        # Once a stream is exhausted the other one must supply the remainder;
        # with both classes present this matches what the ratio test picks anyway.
        if pos_count == len(positives):
            take_positive = False
        elif neg_count == len(negatives):
            take_positive = True
        else:
            take_positive = running_ratio < true_ratio

        if take_positive:
            merged.append(positives[pos_count])
            pos_count += 1
        else:
            merged.append(negatives[neg_count])
            neg_count += 1
        running_ratio = pos_count / (pos_count + neg_count)

    new_data = {
        "tweet": [tweet for tweet, _ in merged],
        "label": [label for _, label in merged],
    }
    new_dataset = Dataset.from_dict(new_data)
    new_tokenized_dataset = new_dataset.map(lambda examples: preprocess_function(examples, tok_max_length), batched=True)

    return new_tokenized_dataset

# Run configuration: model checkpoint name and training hyper-parameters.
# These module-level names are read by the helper functions above and by the
# TrainingArguments / Trainer set-up further down.
model_name = "bert-base-uncased"
batch_size = 32
seed = 12222
fp16 = True
out = "./logging"  # passed to TrainingArguments as output_dir
epochs = 2
lr = 1e-4
wd = 0.005  # passed to TrainingArguments as weight_decay
tok_max_length = 128  # max_length used by preprocess_function when tokenizing

torch.cuda.empty_cache()
time_run = time.time()  # start time; used at the end to report the total runtime

project_path = "./"
experiment_path = "./" + "Experiments/"

experiment_date_for_folder_name = "experiment-" + model_name + "_" + "default"

experiments_results_path = experiment_path + experiment_date_for_folder_name
os.makedirs(experiments_results_path, exist_ok=True)
# NOTE(review): checkpoints_path is only printed here. TrainingArguments below uses
# `out` ("./logging") as output_dir, so the message about where checkpoints are saved
# does not match the directory actually handed to the Trainer.
checkpoints_path = experiments_results_path + "/checkpoints/"
print("The project path is: ", project_path)
print("The experiment path is: ", experiment_path)
print("The model checkpoints will be saved at: ", checkpoints_path, "\n")

# for the submission
test_results_path = experiments_results_path + "/test_results/"
os.makedirs(test_results_path, exist_ok=True)

# for validation results
val_results_path = experiments_results_path + "/val_results/"
os.makedirs(val_results_path, exist_ok=True)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device {device}')
# Seed NumPy and PyTorch; the same seed is also passed to TrainingArguments.
np.random.seed(seed)
torch.manual_seed(seed)

# Module-level tokenizer used by preprocess_function and by the data collator / Trainer.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Unlabeled test tweets, one per line.
test_tweets = load_tweets('/content/test_data.txt')

# NOTE(review): pickle.load can execute arbitrary code; only load this file from a trusted source.
# train_val_folds is indexed below as [fold][0] for training data and [fold][1] for
# validation data — presumably a sequence of (train, validation) pairs; verify against
# the code that produced the pickle.
with open('/content/train_val_folds.pkl', 'rb') as file:
    train_val_folds = pickle.load(file)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True).to(device)

# NOTE(review): logging_strategy is "epoch" below, so this step interval is presumably
# not what drives logging — confirm against the TrainingArguments documentation.
logging_steps = 4000
training_args = TrainingArguments(
    output_dir=out,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    save_total_limit=2,
    seed=seed,
    weight_decay=wd,
    evaluation_strategy="epoch",
    gradient_accumulation_steps=4,
    disable_tqdm=False,
    fp16=fp16,
    logging_steps=logging_steps,
    logging_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    load_best_model_at_end=True,
    # NOTE(review): the training set is sliced to 20000 samples further down. With batch
    # size 32, gradient accumulation 4 and 2 epochs that is roughly 312 optimizer steps
    # (assuming a single device), i.e. fewer than these 500 warm-up steps — confirm
    # that finishing the run inside the warm-up phase is intended.
    warmup_steps=500
)

# Build the training set: the first 20000 training samples of fold 1, ordered by
# ascending score and then interleaved so both classes stay mixed along the sequence.
dictionary_path = '/content/scores_dict.pkl'
train_dataset_sorted = sort_by_difficulty(train_val_folds[1][0][:20000], dictionary_path)
train_interleave_dataset = interleave(train_dataset_sorted)


class OrderedTrainer(Trainer):
    '''Trainer that feeds the training set in its stored order.

    The stock Trainer draws training samples through a random sampler, which would
    discard the ordering produced by sort_by_difficulty / interleave. Returning a
    sequential sampler keeps that ordering during training.
    '''

    def _get_train_sampler(self, *args, **kwargs):
        # Some transformers releases pass the dataset as an argument, others pass
        # nothing; fall back to the trainer's own training set in the latter case.
        dataset = args[0] if args else kwargs.get("train_dataset")
        if dataset is None:
            dataset = self.train_dataset
        return torch.utils.data.SequentialSampler(dataset)


trainer = OrderedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_interleave_dataset,
    eval_dataset=train_val_folds[1][1],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

logger.info("Started training")
trainer.train()
logger.info("Ended training")

# Tokenize the unlabeled test tweets the same way as the training data.
data_test = pd.DataFrame({'tweet': test_tweets})
test_dataset = Dataset.from_dict(data_test)
test_dataset = test_dataset.map(lambda examples: preprocess_function(examples, tok_max_length), batched=True)

# Predicted class = index of the largest raw score per sample.
results_test = trainer.predict(test_dataset)
y_preds_test = np.argmax(results_test.predictions, axis=1)

results_val = trainer.predict(train_val_folds[1][1])
y_preds_val = np.argmax(results_val.predictions, axis=1)

# The prediction file uses -1 for class index 0 and 1 otherwise.
y_preds_test = [-1 if test == 0 else 1 for test in y_preds_test]

# Keep the validation samples the model got wrong, together with their true labels.
X_val_false, y_val_false = get_mispredicted_samples(train_val_folds[1][1]['tweet'],y_preds_val,train_val_folds[1][1]['label'])

val_data_false = {"tweet": X_val_false, "label": y_val_false}
# Convert the dictionary to a Dataset object
val_false_dataset = Dataset.from_dict(val_data_false)

# Tokenization using map
val_false_tokenized_dataset = val_false_dataset.map(lambda examples: preprocess_function(examples, tok_max_length), batched=True)

# Save val_false locally
with open('/content/val_false_tokenized_dataset2.pkl', 'wb') as file:
    pickle.dump(val_false_tokenized_dataset, file)

# Save val_false on Google Drive (requires the Drive mount at /content/drive/)
with open('/content/drive/MyDrive/val_false_tokenized_dataset2.pkl', 'wb') as file:
    pickle.dump(val_false_tokenized_dataset, file)

# Prediction file: 1-based "Id" index and a single "Prediction" column.
df = pd.DataFrame(y_preds_test, columns=["Prediction"])
df.index.name = "Id"
df.index += 1
df.to_csv(test_results_path + "test_data2.csv")

# Despite the "logits" naming, these hold the softmax-normalised scores per class.
logits_val = numpy_softmax(results_val.predictions)
logits_test = numpy_softmax(results_test.predictions)

# Make sure the target directories exist. The previous calls created directories
# named after the output files, which np.savetxt never wrote into.
os.makedirs(test_results_path, exist_ok=True)
np.savetxt(test_results_path + "logits_test2.txt", logits_test, delimiter=",", header="negative,positive", comments="")

os.makedirs(val_results_path, exist_ok=True)
np.savetxt(val_results_path + "logits_val2.txt", logits_val, delimiter=",", header="negative,positive", comments="")

time_total = time.time() - time_run
print(f"The program took {str(time_total/60/60)[:6]} Hours or {str(time_total/60)[:6]} minutes to run.")


The project path is:  ./
The experiment path is:  ./Experiments/
The model checkpoints will be saved at:  ./Experiments/experiment-bert-base-uncased_default/checkpoints/ 

Using device cuda


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

[32m2023-07-06 15:18:25.217[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 208>[0m:[36m208[0m - [1mStarted training[0m
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.5033,0.368731,0.836844,0.836636
1,0.3339,0.349228,0.844897,0.844893


[32m2023-07-06 15:50:34.364[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 210>[0m:[36m210[0m - [1mEnded training[0m


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

### Download files

In [None]:
# Archive the experiment results, the Trainer output directory ("logging") and the
# Trainer log directory ("logs") so they can be downloaded as single files.
# NOTE(review): assumes the training cell ran with /content as working directory,
# since those directories are created from relative paths — confirm.
!zip -r /content/Experiments.zip /content/Experiments
!zip -r /content/logging.zip /content/logging
!zip -r /content/logs.zip /content/logs

In [None]:
# Download the pickled mispredicted-validation dataset and the zipped output directories.
from google.colab import files
# The training cell writes this file with a "2" suffix; the name here must match it.
files.download("/content/val_false_tokenized_dataset2.pkl")
files.download("/content/Experiments.zip")
files.download("/content/logs.zip")
files.download("/content/logging.zip")
