In [None]:
# Download six files with gdown, each addressed by its file ID.
# NOTE(review): the IDs are opaque here; presumably they include the zipped
# positive/negative tweet files and test_data.txt read by later cells — confirm.
!gdown 1h74ECRl7Aqb7zZk6WJch-xmqhW6mzH3-
!gdown 1x9BRcMcdobE23K2vyLFpiqVZhHXtFS2S
!gdown 1KHJpzofsASSa0DfBZINmqsFQ1FgRvj_J
!gdown 1SbKL3cPZTw8jjq-L_O9mzZHcMCBPEEdR
!gdown 1zbhaZYGWg5BdjpuofGs01ZXXQ9mQzJKy
!gdown 1DbBDg0tdx1GSRJ0WHgBQTsRwIPyGYCMP

**Install dependencies**

In [None]:
!pip install transformers
!pip install datasets
!pip install accelerate -U
!pip install loguru

In [None]:
!unzip /content/processed_neg_tweets_08.txt.zip
!unzip /content/processed_pos_tweets_08.txt.zip

# Save different folds

In [None]:
import logging
import torch
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import sys
import os
from sklearn.utils import shuffle
from datasets import DatasetDict, Dataset
from sklearn.metrics import accuracy_score, f1_score
from loguru import logger
import pickle

# An empty-string entry in sys.path stands for the current working directory;
# appending it lets modules located there be imported.
sys.path.append('')

def load_model_from_checkpoint(path_to_checkpoint):
    '''Load a two-label sequence-classification model from a saved checkpoint.

    path_to_checkpoint is appended as-is to the module-level experiment_path
    (no separator is inserted), the resolved location is printed, and the
    loaded model is returned.
    '''
    checkpoint_location = experiment_path + path_to_checkpoint
    restored_model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint_location,
        num_labels=2,
        local_files_only=False,
        ignore_mismatched_sizes=True,
    )
    print(f"Loaded model from: {checkpoint_location}")
    return restored_model

def numpy_softmax(model_preds):
    '''Convert raw model outputs (logits) into per-row class probabilities.

    A softmax is applied along axis 1, so model_preds must be at least
    two-dimensional (one row per sample, one column per class). The row
    maximum is subtracted before exponentiating so that large logits do not
    overflow; this does not change the result.

    Returns an array of the same shape whose rows each sum to 1.
    '''
    # Local names deliberately avoid shadowing the builtins max() and sum().
    row_max = np.max(model_preds, axis=1, keepdims=True)
    shifted_exp = np.exp(model_preds - row_max)
    row_total = np.sum(shifted_exp, axis=1, keepdims=True)
    return shifted_exp / row_total

def load_tweets(file_path):
    '''Read a UTF-8 text file and return its lines as a list of strings.

    Each line becomes one list entry; only trailing newline characters are
    removed, any other whitespace is kept.
    '''
    with open(file_path, 'r', encoding='utf-8') as source:
        return [line.rstrip('\n') for line in source]

def preprocess_function(examples, tok_max_length):
    '''Tokenize the "tweet" entries of a batch with the module-level tokenizer.

    Sequences are truncated to tok_max_length tokens and padded
    (padding=True); the tokenizer's output is returned unchanged.
    '''
    tweets = examples["tweet"]
    encoded = tokenizer(
        tweets,
        truncation=True,
        max_length=tok_max_length,
        padding=True,
    )
    return encoded

def compute_metrics(pred):
    '''Compute accuracy and weighted F1 for a prediction object.

    pred must expose label_ids (true labels) and predictions (per-class
    scores); the predicted class is the argmax over the last axis.
    Returns a dict with the keys "accuracy" and "f1".
    '''
    expected = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    weighted_f1 = f1_score(expected, predicted, average="weighted")
    return {
        "accuracy": accuracy_score(expected, predicted),
        "f1": weighted_f1,
    }

# Run configuration: model, optimisation hyper-parameters and tokenizer settings.
model_name = "bert-base-uncased"  # Hugging Face model identifier, also used for the tokenizer
batch_size = 32
seed = 12222  # seeds NumPy, torch and the fold split below
fp16 = True
out = "./logging"
epochs = 1
lr = 1e-4
wd = 0.005
tok_max_length = 128  # maximum token length passed to the tokenizer
train_val_ratio = 0.99  # NOTE(review): not referenced by any executed code in this cell

# Release cached GPU memory and start the wall-clock timer.
torch.cuda.empty_cache()
time_run = time.time()

# Output locations, all relative to the current working directory.
project_path = "./"
experiment_path = "./" + "Experiments/"

experiment_date_for_folder_name = "experiment-" + model_name + "_" + "default"

experiments_results_path = experiment_path + experiment_date_for_folder_name
os.makedirs(experiments_results_path, exist_ok=True)
# NOTE(review): checkpoints_path is only printed in this cell; the directory is not created here.
checkpoints_path = experiments_results_path + "/checkpoints/"
print("The project path is: ", project_path)
print("The experiment path is: ", experiment_path)
print("The model checkpoints will be saved at: ", checkpoints_path, "\n")

# for the submission
test_results_path = experiments_results_path + "/test_results/"
os.makedirs(test_results_path, exist_ok=True)

# for validation results
val_results_path = experiments_results_path + "/val_results/"
os.makedirs(val_results_path, exist_ok=True)

# Pick the GPU when available and seed NumPy and torch.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device {device}')
np.random.seed(seed)
torch.manual_seed(seed)

# Module-level tokenizer; preprocess_function reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Load the pre-processed tweets, keeping only the first 10000 of each class.
train_pos_tweets = load_tweets('/content/processed_pos_tweets_08.txt')[:10000]
train_neg_tweets = load_tweets('/content/processed_neg_tweets_08.txt')[:10000]
test_tweets = load_tweets('/content/test_data.txt')

# Labels: 0 for negative tweets, 1 for positive tweets.
train_neg_labels = [0] * len(train_neg_tweets)
train_pos_labels = [1] * len(train_pos_tweets)

# Positives first, then negatives; tweets and labels stay aligned.
train_tweets = train_pos_tweets + train_neg_tweets
train_labels = train_pos_labels + train_neg_labels

# Shuffle tweets and labels together with a fixed random state.
train_tweets, train_labels = shuffle(
    train_tweets,
    train_labels,
    random_state=10,
)
data = pd.DataFrame({'tweet': train_tweets, 'label': train_labels})

# Plain lists handed to the stratified splitter.
X = list(data["tweet"])
y = list(data["label"])

num_folds = 3  # number of stratified cross-validation folds

# Stratified, shuffled splitter seeded with the run seed.
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)
train_val_folds = []  # one (train, validation) pair of tokenized datasets per fold
false_val = []  # NOTE(review): never filled or read in this cell

for fold, (train_index, val_index) in enumerate(skf.split(X, y)):

    # Select this fold's training rows by index.
    train_tweets_fold = [train_tweets[i] for i in train_index]
    train_labels_fold = [train_labels[i] for i in train_index]

    # Select this fold's validation rows by index.
    val_tweets_fold = [train_tweets[i] for i in val_index]
    val_labels_fold = [train_labels[i] for i in val_index]

    train_data_fold = {"tweet": train_tweets_fold, "label": train_labels_fold}
    train_dataset_fold = Dataset.from_dict(train_data_fold)

    val_data_fold = {"tweet": val_tweets_fold, "label": val_labels_fold}
    val_dataset_fold = Dataset.from_dict(val_data_fold)

    data_dict_fold = DatasetDict({"train": train_dataset_fold, "validation": val_dataset_fold})

    # Tokenize both splits in batches with the shared preprocess_function.
    tokenized_dataset_fold = data_dict_fold.map(lambda examples: preprocess_function(examples, tok_max_length), batched=True)

    # Keep this fold's tokenized (train, validation) pair
    train_val_folds.append((tokenized_dataset_fold["train"], tokenized_dataset_fold["validation"]))

# Persist every fold so the training cell can reload them from disk.
with open('/content/train_val_folds.pkl', 'wb') as file:
    pickle.dump(train_val_folds, file)



# The per-fold training/prediction code that was parked here inside a bare
# triple-quoted string has been removed: it was never executed, and as the
# cell's last expression the string was echoed back as cell output.
# Training on a saved fold is done in the "Cross-validation / Stage 1 Training" cell.

The project path is:  ./
The experiment path is:  ./Experiments/
The model checkpoints will be saved at:  ./Experiments/experiment-bert-base-uncased_default/checkpoints/ 

Using device cuda


Map:   0%|          | 0/13333 [00:00<?, ? examples/s]

Map:   0%|          | 0/6667 [00:00<?, ? examples/s]

Map:   0%|          | 0/13333 [00:00<?, ? examples/s]

Map:   0%|          | 0/6667 [00:00<?, ? examples/s]

Map:   0%|          | 0/13334 [00:00<?, ? examples/s]

Map:   0%|          | 0/6666 [00:00<?, ? examples/s]

' \n    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True).to(device)\n\n    logging_steps = 4000\n    training_args = TrainingArguments(\n        output_dir=out,\n        learning_rate=lr,\n        per_device_train_batch_size=batch_size,\n        per_device_eval_batch_size=batch_size,\n        num_train_epochs=epochs,\n        save_total_limit=2,\n        seed=seed,\n        weight_decay=wd,\n        evaluation_strategy="epoch",\n        gradient_accumulation_steps=4,\n        disable_tqdm=False,\n        fp16=fp16,\n        logging_steps=logging_steps,\n        logging_strategy="epoch",\n        save_strategy="epoch",\n        logging_dir=\'./logs\',\n        load_best_model_at_end=True,\n        warmup_steps=500\n    )\n\n    trainer = Trainer(\n        model=model,\n        args=training_args,\n        train_dataset=tokenized_dataset_fold["train"],\n    

# Cross-validation / Stage 1 Training

In [None]:
import logging
import torch
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import sys
import os
from sklearn.utils import shuffle
from datasets import DatasetDict, Dataset
from sklearn.metrics import accuracy_score, f1_score
from loguru import logger
import pickle

# An empty-string entry in sys.path stands for the current working directory;
# appending it lets modules located there be imported.
sys.path.append('')

def load_model_from_checkpoint(path_to_checkpoint):
    '''Load a two-label sequence-classification model from a saved checkpoint.

    path_to_checkpoint is appended as-is to the module-level experiment_path
    (no separator is inserted), the resolved location is printed, and the
    loaded model is returned.
    '''
    checkpoint_location = experiment_path + path_to_checkpoint
    restored_model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint_location,
        num_labels=2,
        local_files_only=False,
        ignore_mismatched_sizes=True,
    )
    print(f"Loaded model from: {checkpoint_location}")
    return restored_model

def numpy_softmax(model_preds):
    '''Convert raw model outputs (logits) into per-row class probabilities.

    A softmax is applied along axis 1, so model_preds must be at least
    two-dimensional (one row per sample, one column per class). The row
    maximum is subtracted before exponentiating so that large logits do not
    overflow; this does not change the result.

    Returns an array of the same shape whose rows each sum to 1.
    '''
    # Local names deliberately avoid shadowing the builtins max() and sum().
    row_max = np.max(model_preds, axis=1, keepdims=True)
    shifted_exp = np.exp(model_preds - row_max)
    row_total = np.sum(shifted_exp, axis=1, keepdims=True)
    return shifted_exp / row_total

def load_tweets(file_path):
    '''Read a UTF-8 text file and return its lines as a list of strings.

    Each line becomes one list entry; only trailing newline characters are
    removed, any other whitespace is kept.
    '''
    with open(file_path, 'r', encoding='utf-8') as source:
        return [line.rstrip('\n') for line in source]

def preprocess_function(examples, tok_max_length):
    '''Tokenize the "tweet" entries of a batch with the module-level tokenizer.

    Sequences are truncated to tok_max_length tokens and padded
    (padding=True); the tokenizer's output is returned unchanged.
    '''
    tweets = examples["tweet"]
    encoded = tokenizer(
        tweets,
        truncation=True,
        max_length=tok_max_length,
        padding=True,
    )
    return encoded

def compute_metrics(pred):
    '''Compute accuracy and weighted F1 for a prediction object.

    pred must expose label_ids (true labels) and predictions (per-class
    scores); the predicted class is the argmax over the last axis.
    Returns a dict with the keys "accuracy" and "f1".
    '''
    expected = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    weighted_f1 = f1_score(expected, predicted, average="weighted")
    return {
        "accuracy": accuracy_score(expected, predicted),
        "f1": weighted_f1,
    }

# Run configuration: model, optimisation hyper-parameters and tokenizer settings.
model_name = "bert-base-uncased"  # Hugging Face model identifier, also used for the tokenizer
batch_size = 32
seed = 12222  # seeds NumPy, torch and the Trainer
fp16 = True
out = "./logging"  # passed to TrainingArguments as output_dir
epochs = 1
lr = 1e-4
wd = 0.005
tok_max_length = 128  # maximum token length passed to the tokenizer
train_val_ratio = 0.99  # NOTE(review): not referenced anywhere else in this cell

# Release cached GPU memory and start the wall-clock timer.
torch.cuda.empty_cache()
time_run = time.time()

# Output locations, all relative to the current working directory.
project_path = "./"
experiment_path = "./" + "Experiments/"

experiment_date_for_folder_name = "experiment-" + model_name + "_" + "default"

experiments_results_path = experiment_path + experiment_date_for_folder_name
os.makedirs(experiments_results_path, exist_ok=True)
# NOTE(review): checkpoints_path is only printed; the Trainer in this cell is
# configured with output_dir=out instead, so the message may be misleading.
checkpoints_path = experiments_results_path + "/checkpoints/"
print("The project path is: ", project_path)
print("The experiment path is: ", experiment_path)
print("The model checkpoints will be saved at: ", checkpoints_path, "\n")

# for the submission
test_results_path = experiments_results_path + "/test_results/"
os.makedirs(test_results_path, exist_ok=True)

# for validation results
# NOTE(review): the directory is created but nothing in this cell writes into it.
val_results_path = experiments_results_path + "/val_results/"
os.makedirs(val_results_path, exist_ok=True)

# Pick the GPU when available and seed NumPy and torch.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device {device}')
np.random.seed(seed)
torch.manual_seed(seed)

# Module-level tokenizer; preprocess_function reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

test_tweets = load_tweets('/content/test_data.txt')

# Reload the (train, validation) fold pairs written by the fold-saving cell.
# Caution: pickle.load can execute arbitrary code, so only load a file this
# notebook produced itself.
with open('/content/train_val_folds.pkl', 'rb') as file:
    train_val_folds = pickle.load(file)


# Collator that pads each batch with the tokenizer, and a fresh two-label
# classification model placed on the selected device.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True).to(device)

logging_steps = 4000
training_args = TrainingArguments(
    # NOTE(review): checkpoints go to `out`, not to the checkpoints_path printed above — confirm intent.
    output_dir=out,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    save_total_limit=2,
    seed=seed,
    weight_decay=wd,
    evaluation_strategy="epoch",
    # Four batches are accumulated per optimizer update.
    gradient_accumulation_steps=4,
    disable_tqdm=False,
    fp16=fp16,
    # NOTE(review): presumably ignored because logging_strategy is "epoch" — confirm against the TrainingArguments docs.
    logging_steps=logging_steps,
    logging_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    load_best_model_at_end=True,
    # NOTE(review): fold 0 has 13333 training rows according to the saved Map output; with
    # batch size 32, 4 accumulation steps and 1 epoch that is roughly 105 optimizer updates,
    # fewer than this warmup length — confirm this is intended.
    warmup_steps=500
)

# Only the first saved fold is used: element [0][0] for training, [0][1] for evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_val_folds[0][0],
    eval_dataset=train_val_folds[0][1],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

logger.info(f"Started training")
trainer.train()
logger.info(f"Ended training")

# Tokenize the test tweets with the same preprocessing as the training folds.
data_test = pd.DataFrame({'tweet': test_tweets})
test_dataset = Dataset.from_dict(data_test)
test_dataset = test_dataset.map(lambda examples: preprocess_function(examples, tok_max_length), batched=True)

# Predicted class indices (0/1) for the test set and for the validation split of fold 0.
results_test = trainer.predict(test_dataset)
y_preds_test = np.argmax(results_test.predictions, axis=1)

results_val = trainer.predict(train_val_folds[0][1])
y_preds_val = np.argmax(results_val.predictions, axis=1)

# The submission file uses -1 for the negative class instead of 0.
y_preds_test = [-1 if test == 0 else 1 for test in y_preds_test]

# get_mispredicted_samples is not defined in this cell. Provide a fallback only
# when no definition exists yet, so the cell also runs on a fresh kernel.
# NOTE(review): the fallback returns the TRUE labels of the mispredicted
# samples — confirm this matches the original helper.
if "get_mispredicted_samples" not in globals():
    def get_mispredicted_samples(samples, predictions, labels):
        '''Return (samples, labels) restricted to entries where prediction != label.'''
        wrong = [
            (sample, label)
            for sample, predicted, label in zip(samples, predictions, labels)
            if predicted != label
        ]
        return [sample for sample, _ in wrong], [label for _, label in wrong]

X_val_false, y_val_false = get_mispredicted_samples(train_val_folds[0][1]['tweet'],y_preds_val,train_val_folds[0][1]['label'])

val_data_false = {"tweet": X_val_false, "label": y_val_false}
# Convert the dictionary to a Dataset object
val_false_dataset = Dataset.from_dict(val_data_false)

# Tokenization using map
val_false_tokenized_dataset = val_false_dataset.map(lambda examples: preprocess_function(examples, tok_max_length), batched=True)

# Save val_false dataset
with open('/content/val_false_tokenized_dataset.pkl', 'wb') as file:
    pickle.dump(val_false_tokenized_dataset, file)


# Submission CSV: 1-based "Id" index and a "Prediction" column of -1/1.
df = pd.DataFrame(y_preds_test, columns=["Prediction"])
df.index.name = "Id"
df.index += 1
df.to_csv(test_results_path + "test_data.csv")

# Despite the names, these hold softmax probabilities, not raw logits.
# logits_val is computed but not written to disk in this cell.
logits_val = numpy_softmax(results_val.predictions)
logits_test = numpy_softmax(results_test.predictions)

# Ensure the output directory exists. The previous call created a stray
# directory named "<model_name>-logits_test.txt" instead.
os.makedirs(test_results_path, exist_ok=True)
np.savetxt(test_results_path + "logits_test.txt", logits_test, delimiter=",", header="negative,positive", comments="")

time_total = time.time() - time_run
print(f"The program took {str(time_total/60/60)[:6]} Hours or {str(time_total/60)[:6]} minutes to run.")


The project path is:  ./
The experiment path is:  ./Experiments/
The model checkpoints will be saved at:  ./Experiments/experiment-bert-base-uncased_default/checkpoints/ 

Using device cuda


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.5532,0.401548,0.819709,0.818159


[32m2023-07-02 13:40:42.254[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 138>[0m:[36m138[0m - [1mEnded training[0m


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1202 [00:00<?, ? examples/s]

The program took 0.0244 Hours or 1.4688 minutes to run.
