## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
drive.mount('/content/drive')
import warnings
import re

# PyTorch
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler, Dataset

# Hugging Face Transformers
from transformers import CamembertTokenizer, CamembertTokenizerFast, CamembertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import TrainerCallback
from transformers import Trainer
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback


# Scikit-learn packages for modeling and evaluation
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.utils import shuffle

# Other
#!pip install datasets
#!pip install wandb
#!pip install Trainer
#!pip install accelerate -U
#!pip install transformers[torch]


from datasets import Dataset
from datasets import load_metric
import wandb
import os

os.environ["WANDB_WATCH"] = "all"


#!pip install GPUtil
'''
import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda
import string
import re

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()
'''

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'\nimport torch\nfrom GPUtil import showUtilization as gpu_usage\nfrom numba import cuda\nimport string\nimport re\n\ndef free_gpu_cache():\n    print("Initial GPU Usage")\n    gpu_usage()\n\n    torch.cuda.empty_cache()\n\n    cuda.select_device(0)\n    cuda.close()\n    cuda.select_device(0)\n\n    print("GPU Usage after emptying the cache")\n    gpu_usage()\n\nfree_gpu_cache()\n'

## Data Loading and Processing


In [None]:
# Load the hand-labelled articles, keeping only the three columns used downstream.
labeled_df = pd.read_csv("labeled_df.csv").loc[:, ['label', 'id', 'text']]
labeled_df.head()  # not the last expression of the cell, so nothing is displayed
num_rows = labeled_df.shape[0]
print(f"Number of rows: {num_rows}")

# Which label values occur, and how often
unique_labels = labeled_df['label'].unique()
print(f"Unique labels: {unique_labels}")

label_counts = labeled_df['label'].value_counts()
print("Counts of each label:")
print(label_counts)

# Every row whose id occurs more than once (all occurrences are kept for inspection)
duplicate_ids = labeled_df[labeled_df.duplicated(subset='id', keep=False)]
print(duplicate_ids)


Number of rows: 339
Unique labels: [0 1]
Counts of each label:
1    174
0    165
Name: label, dtype: int64
     label          id                                               text
5        1  0BV7191EOS  Passer de 75% à 100% de couverture des cantine...
94       1  HAANZ1HGAQ  ----- Aemro Selassie (FMI) : " L’Afrique doit ...
120      1  N3BEDGSJZU  Bénin : l'intégralité du conseil des ministres...
123      0  NDBP5YOL02  Gestion des cantines scolaires au Bénin : Une ...
147      1  U02ORISCP5  Les glaciers du Kilimandjaro, sommet des défis...
304      1  0BV7191EOS  Passer de 75% à 100% de couverture des cantine...
317      1  NDBP5YOL02  Gestion des cantines scolaires au Bénin : Une ...
319      1  N3BEDGSJZU  Bénin : l'intégralité du conseil des ministres...
320      1  U02ORISCP5  Les glaciers du Kilimandjaro, sommet des défis...
324      1  HAANZ1HGAQ  ----- Aemro Selassie (FMI) : " L’Afrique doit ...


In [None]:
# 'NDBP5YOL02' appears with conflicting labels: when a label-1 copy exists, it wins.
special_case = labeled_df.loc[labeled_df['id'].eq('NDBP5YOL02') & labeled_df['label'].eq(1)]
if len(special_case) > 0:
    # Rebuild the frame as "everything except that id" followed by the label-1 row(s);
    # ignore_index renumbers the rows from 0.
    labeled_df = pd.concat(
        [labeled_df.loc[labeled_df['id'].ne('NDBP5YOL02')], special_case],
        ignore_index=True,
    )

# All remaining duplicated ids: keep the first occurrence only.
labeled_df = labeled_df.drop_duplicates(subset=['id'], keep='first')
print(labeled_df.loc[labeled_df['id'] == 'NDBP5YOL02'])

     label          id                                               text
337      1  NDBP5YOL02  Gestion des cantines scolaires au Bénin : Une ...


In [None]:
# Unlabelled corpus: keep 'id' and 'text', and add an all-NaN 'label' column as a placeholder.
df = (
    pd.read_csv("/content/drive/My Drive/df.csv")
    .loc[:, ['id', 'text']]
    .assign(label=np.nan)
)

# Show the first rows to verify the layout
df.head()

Unnamed: 0,id,text,label
0,HO8KNVZ6QF,Le plan d’autonomie est une “solution de compr...,
1,N7HP3S8B9V,Un quatuor béninois pour arbitrer Mali U23 vs ...,
2,ORG3BSXN7V,------------------- distinguée femme leader de...,
3,ULGT4CHQHH,La CCI Bénin signe avec les CCI de Bahreïn et ...,
4,2HTTXAR4Q8,RADARISTES EST TROP !\n\nOncle AGBAYA\nOn vous...,


In [None]:
# Get a list of unique IDs from the modified labeled_df
unique_ids_in_labeled_df = labeled_df['id'].unique()

# Remove rows from df that have IDs matching those in labeled_df
df = df[~df['id'].isin(unique_ids_in_labeled_df)]

In [None]:
def clean_text(text):
    """
    Strip noisy fragments from a raw text string and return the result.

    Three substitutions are applied in order, each replacing its matches with
    the empty string: "http" or "www." followed by non-whitespace characters,
    runs of two or more dashes, and any whitespace-delimited token containing
    "@" (together with one following whitespace character, if present).
    Whitespace around a removed fragment is otherwise left as it was.
    """
    noise_patterns = (
        r'http\S+|www\.\S+',  # URL-like fragments
        r'-{2,}',             # dash rulers such as "-----"
        r'\S*@\S*\s?',        # e-mail-like tokens
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, '', text)
    return text



In [None]:
# Apply the cleaning function to your DataFrame directly on the text column
labeled_df['text'] = labeled_df['text'].apply(clean_text)
df['text'] = df['text'].apply(clean_text)

display(labeled_df.head())
display(df.head())

label_counts = labeled_df['label'].value_counts()
print("Counts of each label:")
print(label_counts)



Unnamed: 0,label,id,text
0,0,02XWR02BCE,"Crise sanitaire, recrutements, succession… Les..."
1,0,03URLSTT7L,Transfert de l’énergie solaire depuis l’espace...
2,1,08XCWWYT57,Des céréales ukrainiennes vers l’Afrique : le ...
3,1,09746LR4F6,Les députés en séance plénière pour l'examen d...
4,1,09QJEPKIPQ,"L’ANSD relève une progression de 0, 8 % de l’i..."


Unnamed: 0,id,text,label
0,HO8KNVZ6QF,Le plan d’autonomie est une “solution de compr...,
1,N7HP3S8B9V,Un quatuor béninois pour arbitrer Mali U23 vs ...,
2,ORG3BSXN7V,distinguée femme leader de l’année 2022\n\nLa...,
3,ULGT4CHQHH,La CCI Bénin signe avec les CCI de Bahreïn et ...,
4,2HTTXAR4Q8,RADARISTES EST TROP !\n\nOncle AGBAYA\nOn vous...,


Counts of each label:
1    170
0    164
Name: label, dtype: int64


In [None]:
'''
plot_df['sequence_length'] = labeled_df['text'].apply(lambda x: len(x.split()))

# Bin text lengths into discrete intervals
plot_df['length_bin'] = pd.cut(plot['sequence_length'], bins=range(0, 1000, 256))  # Adjust bins as needed

# Calculate the frequency of each bin
length_distribution = plot['length_bin'].value_counts().sort_index()


# Plot the distribution of sequence lengths
# Plotting
plt.figure(figsize=(12, 8))
length_distribution.plot(kind='bar')
plt.title('Distribution of Text Lengths')
plt.xlabel('Text Length Intervals')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

'''

"\nplot_df['sequence_length'] = labeled_df['text'].apply(lambda x: len(x.split()))\n\n# Bin text lengths into discrete intervals\nplot_df['length_bin'] = pd.cut(plot['sequence_length'], bins=range(0, 1000, 256))  # Adjust bins as needed\n\n# Calculate the frequency of each bin\nlength_distribution = plot['length_bin'].value_counts().sort_index()\n\n\n# Plot the distribution of sequence lengths\n# Plotting\nplt.figure(figsize=(12, 8))\nlength_distribution.plot(kind='bar')\nplt.title('Distribution of Text Lengths')\nplt.xlabel('Text Length Intervals')\nplt.ylabel('Frequency')\nplt.xticks(rotation=45)\nplt.show()\n\n"

In [None]:
# Fast CamemBERT tokenizer for the 'camembert-base' checkpoint; used below to measure text lengths in tokens.
tokenizer = CamembertTokenizerFast.from_pretrained('camembert-base')


def count_tokens(dataframe, tokenizer):
    """Return, in row order, the number of token ids `tokenizer.encode` yields for each 'text' entry.

    Special tokens are included in the count (add_special_tokens=True).
    """
    return [
        len(tokenizer.encode(document, add_special_tokens=True))
        for document in dataframe['text'].tolist()
    ]

# Token count of every labelled text, in row order
token_lengths = count_tokens(labeled_df, tokenizer)

# ids of the rows whose text tokenizes to more than 512 ids
over_512_ids = labeled_df.loc[np.asarray(token_lengths) > 512, 'id']

print("IDs of texts longer than 512 tokens:")
print(over_512_ids.tolist())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Token indices sequence length is longer than the specified maximum sequence length for this model (636 > 512). Running this sequence through the model will result in indexing errors


IDs of texts longer than 512 tokens:
['08XCWWYT57', '0U1O5H30GI', '1H4GUWS4W9', '1RLAZN7PDQ', '1UZW087ZR0', '2CXF4V292H', '2G9VHL0NWT', '2GPRN3774H', '3Q3MHJSQGZ', '43QYUX3QVD', '4QNMFDXS0L', '4X55AR6KXW', '550IVUW33Y', '55M6311I4W', '5A4W69N3RA', '5CCBUOXZYX', '5PO82IB29L', '5ZECKXNDPJ', '6386DHPUVU', '6EC5SEU0EG', '6TUPRS7ET8', '7F6H8G1I5R', '7F9DX8UAH7', '7O6KYS84BV', '7SNKL2D34R', '885I10ZEWK', '8BFAI6O2QF', '8BH4FTLD5A', '8EFSWYZ74W', '8FROQI2E7I', '8I29R8B7S7', '93DXYG2ND6', '9DM3PPD4HF', 'C4KBV8BF6D', 'CE0Y2VJQQB', 'DKL0YSSRNY', 'DV1Y6YBLB5', 'DZBMVVLTXH', 'E1NDE8PYA2', 'EA08UXRGL8', 'EG8ISZIELL', 'FNBXTKM0XC', 'GAQ6TTU1Z7', 'GI6FYIEIGM', 'GSMXZZ4TJR', 'H35MDXS2LA', 'HAANZ1HGAQ', 'HL53JM5KGT', 'HLF5HAKAKB', 'HZCND4Y3J5', 'IA9KKCRCPI', 'IDP1QI07PA', 'ITAJKENATN', 'IWDSGHQNHQ', 'JDH1VFGJKH', 'JK2DQKMJ14', 'JKOSLD4HKV', 'K662K9SNRK', 'KSRZJS67BK', 'KWMEQ32QMW', 'LKGBH88QXG', 'MAPYZJIK7B', 'MBXDZ4DYF2', 'MR1MDW9BII', 'MX81JWEWYK', 'N3BEDGSJZU', 'NP5Y1LKJXB', 'NRRCIUDO7J', 'NZGHU9723

## Preparing the folds

In [None]:
###########  DIVIDING labeled_df INTO FOLDS  ###########

def _split_into_folds(frame, n_folds=3):
    """Cut `frame` into `n_folds` consecutive row slices whose sizes differ by at most one.

    When the length is not divisible by `n_folds`, the extra rows go to the
    LAST folds (e.g. 170 rows -> 56, 57, 57 and 164 rows -> 54, 55, 55).
    Returns a list of DataFrames in row order.
    """
    base, extra = divmod(len(frame), n_folds)
    folds, start = [], 0
    for i in range(n_folds):
        size = base + (1 if i >= n_folds - extra else 0)
        folds.append(frame.iloc[start:start + size])
        start += size
    return folds


# NOTE: this rebinds labeled_df, so re-running the cell shuffles the already shuffled frame.
labeled_df = shuffle(labeled_df, random_state=1)

df_label_1 = labeled_df[labeled_df['label'] == 1]
df_label_0 = labeled_df[labeled_df['label'] == 0]

# Split each class separately so every fold keeps (almost) the same class balance.
# The fold sizes are derived from the class counts instead of hard-coded slice bounds.
folds_label_1 = _split_into_folds(df_label_1)
folds_label_0 = _split_into_folds(df_label_0)

# Each fold = one slice of each class, concatenated and then shuffled with a fixed seed
fold1, fold2, fold3 = (
    pd.concat([positives, negatives]).sample(frac=1, random_state=42)
    for positives, negatives in zip(folds_label_1, folds_label_0)
)

# Check the distribution in each fold
print("Fold1 distribution:\n", fold1['label'].value_counts())
print("Fold2 distribution:\n", fold2['label'].value_counts())
print("Fold3 distribution:\n", fold3['label'].value_counts())



###########  USE THE FOLD one AND SPLIT IN TO TRAIN AND VALIDATION 2/3 and 1/3  ###########

###########  PERFORM THE NORMAL K-FOLD WITH THE TRAIN AND VALIDATION  ##########

###########  MANUAL PICK OF BEST PARAMETERS AND GO BACK TO TRAIN TEST  ##########


Fold1 distribution:
 1    56
0    54
Name: label, dtype: int64
Fold2 distribution:
 1    57
0    55
Name: label, dtype: int64
Fold3 distribution:
 1    57
0    55
Name: label, dtype: int64


## Fold 1  and  Fold 2 :  TRAIN
---
## Fold 3  :   TEST

In [None]:
# Concatenate fold1 and fold2 to form the train_val set for the first iteration
# fold1 + fold2 form the train/validation pool; fold3 is not included here
# (the section heading above designates it as the test fold).
train_val_df = pd.concat([fold1, fold2])


### CamemBERT Tokenizer

In [None]:
# Re-creates the same 'camembert-base' fast tokenizer that an earlier cell already bound to `tokenizer`.
tokenizer = CamembertTokenizerFast.from_pretrained('camembert-base')

class TextDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset over pre-tokenized encodings and their labels.

    The base class is spelled out as ``torch.utils.data.Dataset`` on purpose:
    in this notebook the bare name ``Dataset`` is rebound by
    ``from datasets import Dataset``, which is a different class.

    Args:
        encodings: mapping from field name to an indexable of per-example
            values (lists or tensors).
        labels: indexable of integer class labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, with the label under 'labels' (dtype long)."""
        item = {}
        for key, val in self.encodings.items():
            value = val[idx]
            if isinstance(value, torch.Tensor):
                # Copy an existing tensor; torch.tensor(tensor) emits a UserWarning.
                item[key] = value.clone().detach()
            else:
                item[key] = torch.tensor(value)
        # Set last so it overrides any 'labels' entry already present in the encodings.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

def tokenize_dataframe(examples, tokenizer, max_length=320):
    """Tokenize the 'text' entries of `examples` and attach their labels.

    Args:
        examples: a pandas DataFrame, or a dict-of-lists batch such as the one
            `datasets.Dataset.map(..., batched=True)` passes in, with 'text'
            and 'label' entries.
        tokenizer: callable tokenizer; invoked with padding=True,
            truncation=True and return_tensors="pt".
        max_length: truncation length passed to the tokenizer (default 320).

    Returns:
        The tokenizer's output with an extra "labels" entry holding the labels
        as a plain list.
    """
    texts = examples["text"]
    labels = examples["label"]
    # pandas Series expose .tolist(); plain lists from a batched map do not.
    texts = texts.tolist() if hasattr(texts, "tolist") else list(texts)
    labels = labels.tolist() if hasattr(labels, "tolist") else list(labels)

    tokenized_inputs = tokenizer(texts,
                                 padding=True,
                                 truncation=True,
                                 max_length=max_length,
                                 return_tensors="pt")
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [None]:
def tokenize_and_create_dataset(df, tokenizer):
    """Tokenize `df` via `tokenize_dataframe` and wrap the result, plus the 'label' column as a list, in a `TextDataset`."""
    return TextDataset(tokenize_dataframe(df, tokenizer), df['label'].tolist())

### Initializing Parameters

In [None]:
import wandb
# Start a Weights & Biases run in project "K-Fold" under the entity "arisoy10".
# NOTE(review): wandb is already imported in the first cell; this import is redundant.
wandb.init(project="K-Fold", entity="arisoy10")

# Hyper-parameter grid for the W&B sweep: 3 learning rates x 1 epoch count x 2 batch sizes.
sweep_config = {
    'method': 'grid',  # exhaustive search over every combination listed under 'parameters'
    'metric': {
        # Must match a key the training code logs with wandb.log; the k-fold
        # training function in this notebook logs the cross-fold mean as "average_accuracy".
        "name": "average_accuracy",
        "goal": "maximize"
    },
    'parameters': {
        'learning_rate': {
            'values': [2e-5, 3e-5, 5e-5]
        },
        'num_train_epochs': {
            'values': [15]
        },
        'per_device_train_batch_size': {
            'values': [16, 32]
        }
    }
}


[34m[1mwandb[0m: Currently logged in as: [33marisoy10[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import TrainingArguments, Trainer, TrainerCallback


# Define compute_metrics function for evaluation
def compute_metrics(pred):
    """Compute accuracy and binary precision/recall/F1 for a Trainer evaluation.

    Reads `pred.label_ids` as the true labels and takes the argmax over the
    last axis of `pred.predictions` as the predicted class.

    Returns:
        dict with keys 'accuracy', 'f1', 'precision', 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Tuple layout: (precision, recall, f1, support)
    scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': scores[2],
        'precision': scores[0],
        'recall': scores[1],
    }


In [None]:


def create_hf_dataset(df, tokenizer):
    """Turn a DataFrame with 'text' and 'label' columns into a tokenized dataset for the Trainer.

    The frame is converted with `Dataset.from_pandas`, tokenized in batches
    (padding=True, truncation=True, max_length=320), its 'label' column is
    renamed to 'labels', and the output format is set to torch tensors
    restricted to 'input_ids', 'attention_mask' and 'labels'.
    """
    def tokenize_function(examples):
        # `examples` is a batch: examples["text"] holds several texts at once
        return tokenizer(examples["text"], padding=True, truncation=True, max_length=320, return_tensors="pt")

    dataset = (
        Dataset.from_pandas(df)
        .map(tokenize_function, batched=True)
        .rename_column("label", "labels")  # the Trainer expects the target under 'labels'
    )
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    return dataset

def model_train_evaluate(train_val_df, config):
    """Run 3-fold stratified cross-validation for one hyper-parameter setting.

    For each fold a fresh CamembertForSequenceClassification is loaded from
    "/content/drive/My Drive/MyModel", trained with early stopping on
    eval_loss, and evaluated on the held-out part. Per-fold results and the
    mean accuracy are logged to the active W&B run.

    Args:
        train_val_df: DataFrame with 'text' and 'label' columns.
        config: mapping with 'learning_rate', 'per_device_train_batch_size'
            and 'num_train_epochs'.

    Returns:
        The mean `eval_accuracy` over the folds, as a float.

    Relies on the notebook-level `tokenizer`.
    """
    labels = train_val_df['label'].values

    # shuffle=False: folds are contiguous slices of the (already shuffled) frame
    kfold = StratifiedKFold(n_splits=3, shuffle=False, random_state=None)
    fold_accuracies = []

    for fold, (train_ids, val_ids) in enumerate(kfold.split(train_val_df, labels)):
        print(f"Starting fold {fold}")
        train_dataset = create_hf_dataset(train_val_df.iloc[train_ids], tokenizer)
        val_dataset = create_hf_dataset(train_val_df.iloc[val_ids], tokenizer)

        # Reload the checkpoint for every fold so no weights leak between folds
        model = CamembertForSequenceClassification.from_pretrained("/content/drive/My Drive/MyModel", num_labels=2)

        training_args = TrainingArguments(
            gradient_accumulation_steps=2,
            output_dir=f'./results_fold_{fold}',
            learning_rate=config['learning_rate'],
            per_device_train_batch_size=config['per_device_train_batch_size'],
            num_train_epochs=config['num_train_epochs'],
            logging_strategy='steps',  # Log after a certain number of steps
            logging_steps=10,  # Number of steps between logging of training loss
            evaluation_strategy='steps',
            eval_steps=10,   # Number of steps to wait before the next evaluation
            # Checkpoint on the same schedule as evaluation. Without this the
            # default save interval is never reached in such a short run, no
            # checkpoint exists, and load_best_model_at_end restores nothing.
            save_strategy='steps',
            save_steps=10,
            save_total_limit=1,  # keep disk usage bounded; the best checkpoint is retained
            logging_dir=f'./logs_fold_{fold}',
            load_best_model_at_end=True,  # Ensures the best model is loaded based on the metric specified
            metric_for_best_model='eval_loss',  # Monitors 'eval_loss' for early stopping
            greater_is_better=False,  # Indicates we want to minimize the metric (loss, in this case)
            report_to='wandb',
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            # Stop after 3 consecutive evaluations without eval_loss improvement
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        )

        trainer.train()
        eval_result = trainer.evaluate()

        # Key without the stray trailing space the original f-string carried
        wandb.log({f'fold_{fold}': eval_result})

        fold_accuracies.append(eval_result['eval_accuracy'])

    # Log the average accuracy across folds to WandB
    average_accuracy = float(np.mean(fold_accuracies))
    wandb.log({"average_accuracy": average_accuracy})
    print(f"Average Accuracy: {average_accuracy}")
    return average_accuracy

In [None]:
def sweep_train():
    """Sweep entry point: open a W&B run, read its config, and cross-validate with those values.

    Only the three swept hyper-parameters are forwarded to
    `model_train_evaluate`, together with the notebook-level `train_val_df`.
    """
    swept = ('learning_rate', 'num_train_epochs', 'per_device_train_batch_size')
    with wandb.init() as run:
        cfg = run.config
        model_train_evaluate(train_val_df, {name: getattr(cfg, name) for name in swept})


In [None]:
# Register the sweep described by sweep_config in project "K-Fold", then run sweep_train
# from this process for at most `count` trials.
# NOTE(review): the grid in sweep_config has 3 learning rates x 1 epoch count x 2 batch sizes
# = 6 combinations, so count=9 looks larger than needed — confirm.
sweep_id = wandb.sweep(sweep_config, project="K-Fold")
wandb.agent(sweep_id, sweep_train, count=9)



Create sweep with ID: pnarttet
Sweep URL: https://wandb.ai/arisoy/K-Fold/sweeps/pnarttet


[34m[1mwandb[0m: Agent Starting Run: dkglizvk with config:
[34m[1mwandb[0m: 	learning_rate: 2e-05
[34m[1mwandb[0m: 	num_train_epochs: 15
[34m[1mwandb[0m: 	per_device_train_batch_size: 16


Starting fold 0


Map:   0%|          | 0/148 [00:00<?, ? examples/s]

Map:   0%|          | 0/74 [00:00<?, ? examples/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at /content/drive/My Drive/MyModel and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
10,0.6874,0.683584,0.621622,0.72,0.571429,0.972973
20,0.6625,0.659652,0.662162,0.736842,0.603448,0.945946
30,0.6221,0.62238,0.72973,0.777778,0.660377,0.945946
40,0.5812,0.581593,0.783784,0.809524,0.723404,0.918919
50,0.5269,0.548611,0.77027,0.78481,0.738095,0.837838
60,0.4819,0.529147,0.783784,0.789474,0.769231,0.810811
70,0.453,0.520369,0.783784,0.789474,0.769231,0.810811


Starting fold 1


Map:   0%|          | 0/148 [00:00<?, ? examples/s]

Map:   0%|          | 0/74 [00:00<?, ? examples/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at /content/drive/My Drive/MyModel and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
10,0.6907,0.67637,0.716216,0.783505,0.644068,1.0
20,0.6695,0.642718,0.635135,0.737864,0.584615,1.0
30,0.6262,0.590204,0.810811,0.844444,0.730769,1.0
40,0.5828,0.534325,0.878378,0.888889,0.837209,0.947368
50,0.5315,0.486834,0.891892,0.897436,0.875,0.921053
60,0.4925,0.457809,0.878378,0.886076,0.853659,0.921053
70,0.4519,0.443096,0.905405,0.909091,0.897436,0.921053


Starting fold 2


Map:   0%|          | 0/148 [00:00<?, ? examples/s]

Map:   0%|          | 0/74 [00:00<?, ? examples/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at /content/drive/My Drive/MyModel and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
10,0.6853,0.678727,0.702703,0.76087,0.648148,0.921053
20,0.6552,0.64969,0.716216,0.778947,0.649123,0.973684
30,0.6114,0.611975,0.756757,0.780488,0.727273,0.842105
40,0.5595,0.575269,0.77027,0.767123,0.8,0.736842
50,0.501,0.536664,0.783784,0.771429,0.84375,0.710526
60,0.457,0.511224,0.77027,0.760563,0.818182,0.710526
70,0.4209,0.503239,0.77027,0.760563,0.818182,0.710526


Average Accuracy: 0.8153153153153153


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
average_accuracy,▁
eval/accuracy,▁▂▄▅▅▅▅▅▃▁▆▇█▇██▃▃▄▅▅▅▅▅
eval/f1,▁▂▃▄▃▄▄▄▃▂▆▇█▇█▇▃▃▃▃▃▃▃▃
eval/loss,█▇▆▅▄▄▃▃█▇▅▄▂▂▁▁█▇▆▅▄▃▃▃
eval/precision,▁▂▃▄▅▅▅▅▃▁▄▇█▇██▃▃▄▆▇▆▆▆
eval/recall,▇▇▇▆▄▃▃▃███▇▆▆▆▅▆▇▄▂▁▁▁▁
eval/runtime,▆▄▆█▁▂▁▂▂▁▃▄▂▁▂▁▄▇▅▂▂▃▅▁
eval/samples_per_second,▃▅▂▁█▇▇▇▇█▆▄▇█▆█▅▂▃▇▆▆▄█
eval/steps_per_second,▃▅▂▁█▇▇▇▇█▆▄▇█▆█▅▂▃▇▆▆▄█
train/epoch,▁▁▂▂▃▄▄▅▅▆▇▇██▁▂▂▃▃▄▅▅▆▆▇██▁▁▂▃▃▄▄▅▆▆▇▇█

0,1
average_accuracy,0.81532
eval/accuracy,0.77027
eval/f1,0.76056
eval/loss,0.50245
eval/precision,0.81818
eval/recall,0.71053
eval/runtime,5.4173
eval/samples_per_second,13.66
eval/steps_per_second,1.846
train/epoch,15.0


[34m[1mwandb[0m: Agent Starting Run: 0pzwxoma with config:
[34m[1mwandb[0m: 	learning_rate: 2e-05
[34m[1mwandb[0m: 	num_train_epochs: 15
[34m[1mwandb[0m: 	per_device_train_batch_size: 32
[34m[1mwandb[0m: Currently logged in as: [33marisoy10[0m ([33marisoy[0m). Use [1m`wandb login --relogin`[0m to force relogin


Starting fold 0


Map:   0%|          | 0/148 [00:00<?, ? examples/s]

Map:   0%|          | 0/74 [00:00<?, ? examples/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at /content/drive/My Drive/MyModel and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
10,0.6816,0.675,0.608108,0.718447,0.560606,1.0
20,0.6462,0.643535,0.662162,0.736842,0.603448,0.945946


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7b6e34b28be0>> (for post_run_cell):


BrokenPipeError: [Errno 32] Broken pipe

# TRAINING


In [None]:
#model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=2)  # Adjust num_labels as per your task
#!pip install trainer


def tokenize_and_create_dataset(df, tokenizer):
    """Tokenize `df` with `tokenize_dataframe` and wrap encodings and labels in a `TextDataset`.

    NOTE(review): this repeats, line for line, the definition of the same name
    in an earlier cell of this notebook; one of the two can go.
    """
    encodings = tokenize_dataframe(df, tokenizer)  # dict-like tokenizer output for every row of df
    labels = df['label'].tolist()
    return TextDataset(encodings, labels)



def model_train_evaluate(hyperparams):
    """Run 5-fold cross-validation on the notebook-level `train_val_df` and log validation losses to W&B.

    Args:
        hyperparams: mapping with 'learning_rate', 'per_device_train_batch_size'
            and 'num_train_epochs'.

    Logs `validation_loss_fold_<k>` for every fold and `avg_validation_loss`
    at the end. Returns None.

    NOTE(review): this definition replaces the earlier
    `model_train_evaluate(train_val_df, config)` once the cell has run; the
    two have different signatures — consider giving this one its own name.
    """
    # When called from a sweep the agent's run is already open: reuse it and
    # leave closing it to the caller. Only a run started here is finished here.
    started_run = wandb.run is None
    if started_run:
        wandb.init()

    def tokenize_batch(batch):
        # `batch` is a dict of lists (batched map), so batch["text"] is a list of strings
        return tokenizer(batch["text"], padding=True, truncation=True, max_length=320)

    # Convert the train_val_df into a Hugging Face Dataset for easier handling with the Trainer API
    full_dataset = Dataset.from_pandas(train_val_df)
    # `Dataset.map` has no `no_cache` argument; `load_from_cache_file=False` forces recomputation.
    full_dataset = full_dataset.map(tokenize_batch, batched=True, load_from_cache_file=False)
    # Targets stay under 'label'; no custom collator is passed to the Trainer below.
    full_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    validation_scores = []

    for fold, (train_ids, val_ids) in enumerate(kfold.split(full_dataset)):
        train_dataset = full_dataset.select(train_ids)
        val_dataset = full_dataset.select(val_ids)

        # Fresh weights for every fold
        model = CamembertForSequenceClassification.from_pretrained("/content/drive/My Drive/MyModel", num_labels=2)

        training_args = TrainingArguments(
            output_dir=f'./results_fold_{fold}',
            learning_rate=hyperparams['learning_rate'],
            per_device_train_batch_size=hyperparams['per_device_train_batch_size'],
            per_device_eval_batch_size=24,
            num_train_epochs=hyperparams['num_train_epochs'],
            weight_decay=0.01,
            evaluation_strategy='epoch',
            logging_steps=4,  # Log metrics every 4 steps
            save_strategy='no',  # Adjust as needed based on your checkpoint saving preference
            report_to='wandb',
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
        )

        trainer.train()
        eval_result = trainer.evaluate()

        validation_scores.append(eval_result["eval_loss"])
        wandb.log({f"validation_loss_fold_{fold}": eval_result["eval_loss"]})

    wandb.log({"avg_validation_loss": np.mean(validation_scores)})
    if started_run:
        wandb.finish()


Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7b6e34b28be0>> (for pre_run_cell):


BrokenPipeError: [Errno 32] Broken pipe

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7b6e34b28be0>> (for post_run_cell):


BrokenPipeError: [Errno 32] Broken pipe

In [None]:
def sweep_train():
    """Sweep entry point: open a W&B run and pass its three swept values to `model_train_evaluate`."""
    swept = ('learning_rate', 'num_train_epochs', 'per_device_train_batch_size')
    with wandb.init() as run:
        cfg = run.config
        model_train_evaluate({name: getattr(cfg, name) for name in swept})

# Register a new sweep from sweep_config in project "K-Fold" and run sweep_train for at most 8 trials.
# NOTE(review): the grid in sweep_config lists 3 x 1 x 2 = 6 combinations — confirm count=8 is intended.
sweep_id = wandb.sweep(sweep_config, project="K-Fold")
wandb.agent(sweep_id, sweep_train, count=8)

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7b6e34b28be0>> (for pre_run_cell):


BrokenPipeError: [Errno 32] Broken pipe



Create sweep with ID: uy7xti5g
Sweep URL: https://wandb.ai/arisoy/K-Fold/sweeps/uy7xti5g


In [None]:
# NOTE(review): `trainer`, `test_dataset` and `test_labels` are not assigned at notebook level in
# any visible cell (`trainer` only exists as a local inside model_train_evaluate), so this cell
# presumably relies on state from cells that are no longer here — confirm before Run All.
predictions = trainer.predict(test_dataset)
# Predicted class = index of the largest score along the last axis
preds = np.argmax(predictions.predictions, axis=-1)

# Create the confusion matrix
cm = confusion_matrix(test_labels, preds)

# Create a custom colormap
cmap = sns.diverging_palette(220, 20, as_cmap=True)

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap=cmap, xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')

# Log the custom colored confusion matrix to wandb
wandb.log({"Confusion Matrix": wandb.Image(plt)})
# Close the figure after logging it to wandb
plt.close()

In [None]:
# NOTE(review): imports are repeated/added here rather than in the first cell; numpy and
# matplotlib are already imported there.
import numpy as np
from scipy.special import softmax
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt


# Apply softmax to the predictions to get probabilities
# (column 1 is kept as the score of class 1; `predictions` comes from an earlier cell)
probabilities = softmax(predictions.predictions, axis=1)[:, 1]

# Now you can continue with calculating the ROC curve and plotting
# NOTE(review): `test_labels` is not assigned in any visible cell — confirm where it comes from.
fpr, tpr, thresholds = roc_curve(test_labels, probabilities)
roc_auc = auc(fpr, tpr)


# Create ROC curve plot
plt.figure()
lw = 2  # line width, reused for the precision-recall plot below
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")

# Log the ROC curve plot
wandb.log({"ROC Curve": wandb.Image(plt)})
plt.close()


precision, recall, _ = precision_recall_curve(test_labels, probabilities)

# Create precision-recall curve plot
plt.figure()
plt.plot(recall, precision, color='blue', lw=lw, label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="upper right")

# Log the precision-recall curve plot
wandb.log({"Precision-Recall Curve": wandb.Image(plt)})
plt.close()

## WandB and Training

In [None]:
#!pip install wandb
import wandb
# Start a W&B run in project "MLM-binary" (a different project from the "K-Fold" sweeps above).
wandb.init(project="MLM-binary", entity="arisoy10")


In [None]:
# NOTE(review): no visible cell assigns `trainer` at notebook level — confirm it exists before this runs.
trainer.train()  # Runs training (not an evaluation-only call)

In [None]:
# Close the W&B run opened for this training session.
wandb.finish()

### Evaluating

In [None]:
# NOTE(review): `trainer` and `val_dataset` are not assigned at notebook level in any visible
# cell — confirm they exist before running this cell.

# Step 1: predict on the SAME dataset the labels and texts below are read from
# (the notebook-level `predictions` was computed on `test_dataset`, not `val_dataset`).
val_predictions = trainer.predict(val_dataset)
predicted_labels = np.argmax(val_predictions.predictions, axis=-1)

# Step 2: Extract actual labels from the validation dataset 'val_dataset'
actual_labels = [val_dataset[i]['labels'].item() for i in range(len(val_dataset))]

# Step 3: Identify indices of false positives, i.e. predicted 1 while the true label is 0
# (the opposite condition, predicted 0 / actual 1, selects false negatives).
false_positives_indices = [
    i
    for i, (pred, actual) in enumerate(zip(predicted_labels, actual_labels))
    if pred == 1 and actual == 0
]

# Step 4: Decode and print false positives for review
for idx in false_positives_indices:
    # The dataset returns PyTorch tensors; convert to numpy before decoding
    input_ids = val_dataset[idx]['input_ids'].numpy()
    decoded_text = tokenizer.decode(input_ids, skip_special_tokens=True)
    print(f"False Positive Text at index {idx}: {decoded_text}")

In [None]:
# Evaluate the model on the test dataset
test_results = trainer.evaluate(test_dataset)

# Print the performance metrics
print("Test Performance:", test_results)