# Fine Tuning Transformer for Text Classification

In [4]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
import torch.nn as nn
from sklearn.model_selection import KFold
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef



In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

# Data

In [12]:
df = pd.read_csv('data/train/dataset_en_train_with_emotions.csv')
df 

Unnamed: 0.1,Unnamed: 0,id,text,category,annotations,spacy_tokens,matched_words,top_3_emotions,max_emotion
0,0,5206,this is massive australian senator malcolm rob...,CONSPIRACY,[{'span_text': 'Australian Senator Malcolm Rob...,WyJUSElTIiwgIklTIiwgIk1BU1NJVkUiLCAiQXVzdHJhbG...,"['malcolm', 'senator', 'first', 'roberts', 'fo...","['inspiration', 'amusement', 'anger']",inspiration
1,1,1387,i m deeply concerned that the push to vaccinat...,CRITICAL,[{'span_text': 'I ’m deeply concerned that the...,WyJcdTIwMWMiLCAiSSIsICJcdTIwMTltIiwgImRlZXBseS...,"['push', 'young', 'texas', 'experiment', 'noth...","['inspiration', 'amusement', 'fear']",inspiration
2,2,13116,they wanted to know your vaccination status an...,CRITICAL,"[{'span_text': 'someone who died suddenly', 'c...",WyIyMDIxIiwgIjoiLCAiVGhleSIsICJ3YW50ZWQiLCAidG...,"['nt', 'be', 'allowed', 'know', 'want', 'who',...","['inspiration', 'indifference', 'amusement']",inspiration
3,3,11439,anthony fauci once again defended brutal chine...,CRITICAL,"[{'span_text': 'brutal Chinese lockdowns', 'ca...",WyJBbnRob255IiwgIkZhdWNpIiwgIm9uY2UiLCAiYWdhaW...,"['communist', 'forcefully', 'okay', 'people', ...","['anger', 'annoyance', 'inspiration']",anger
4,4,98,proof has emerged showing that death from wuha...,CRITICAL,[{'span_text': 'death from Wuhan coronavirus (...,WyJQcm9vZiIsICJoYXMiLCAiZW1lcmdlZCIsICJzaG93aW...,"['alive', 'also', 'creation', 'death', 'protei...","['inspiration', 'amusement', 'fear']",inspiration
...,...,...,...,...,...,...,...,...,...
3995,3995,4829,police in australia are warning that unvaccina...,CRITICAL,"[{'span_text': 'Police in Australia', 'categor...",WyJQb2xpY2UiLCAiaW4iLCAiQXVzdHJhbGlhIiwgImFyZS...,"['will', 'double', 'receive', 'police', 'austr...","['anger', 'inspiration', 'sadness']",anger
3996,3996,10899,i personally do nt believe putin would set off...,CONSPIRACY,"[{'span_text': 'Deep State', 'category': 'AGEN...",WyJJIiwgInBlcnNvbmFsbHkiLCAiZG8iLCAiblx1MjAxOX...,"['also', 'filled', 'know', 'off', 'unchecked',...","['inspiration', 'amusement', 'annoyance']",inspiration
3997,3997,10637,pfizer lied we know that there s no doubt abou...,CRITICAL,"[{'span_text': 'Pfizer', 'category': 'AGENT', ...",WyJQZml6ZXIiLCAibGllZCIsICIuIiwgIldlIiwgImtub3...,"['own', 'health', 'recorded', 'know', 'road', ...","['annoyance', 'amusement', 'inspiration']",annoyance
3998,3998,11338,it is utterly bizarre and inexplicable dr john...,CRITICAL,"[{'span_text': 'Dr. John Campbell', 'category'...",WyJcIiIsICJJdCIsICJpcyIsICJ1dHRlcmx5IiwgImJpem...,"['rollout', 'vaccination', 'inexplicable', 'th...","['inspiration', 'amusement', 'indifference']",inspiration


In [13]:
def emotion(df):
    df['text']=df['text']+ ' :' + df['max_emotion']
    return df

df=emotion(df)
df.loc[0].text

'this is massive australian senator malcolm roberts exposes nanotech found in the covid vaccines and says they are genocide he is the first politician to expose this share me lauraabolichannel :inspiration'

# Balance Data

In [9]:
df= pd.read_csv('data/train/dataset_en_train_augmented.csv')
df= df.iloc[:4000]
df=emotion(df)

df_augmented
df_augmented= pd.read_csv('data/train/dataset_en_train_augmented.csv')
df_augmented= df_augmented.iloc[4000:]
df_augmented=emotion(df_augmented)
df_augmented.head()

KeyError: 'max_emotion'

In [6]:
original_counts = df['category'].value_counts()
df_conspiracy = df[df['category'] == 'CONSPIRACY']
df_conspiracy_augmented = df_augmented[df_augmented['category'] == 'CONSPIRACY']
critical_count = original_counts.get('CRITICAL', 0)  # Asegurarnos que 'CRITICAL' existe
conspiracy_count = original_counts.get('CONSPIRACY', 0)

additional_rows_needed = critical_count - conspiracy_count
additional_rows_needed

1242

In [7]:
df_conspiracy = df_augmented[df_augmented['category'] == 'CONSPIRACY']

# Seleccionar 1242 filas aleatorias
df_conspiracy_sampled = df_conspiracy.sample(n=1242, random_state=42)

# Concatenar df con las filas seleccionadas
df_combined = pd.concat([df, df_conspiracy_sampled])

# Ver el resultado
df_combined.reset_index(drop=True, inplace=True)
df_combined

Unnamed: 0,id,text,category,annotations,spacy_tokens
0,5206,this is massive australian senator malcolm rob...,CONSPIRACY,[{'span_text': 'Australian Senator Malcolm Rob...,WyJUSElTIiwgIklTIiwgIk1BU1NJVkUiLCAiQXVzdHJhbG...
1,1387,i m deeply concerned that the push to vaccinat...,CRITICAL,[{'span_text': 'I ’m deeply concerned that the...,WyJcdTIwMWMiLCAiSSIsICJcdTIwMTltIiwgImRlZXBseS...
2,13116,they wanted to know your vaccination status an...,CRITICAL,"[{'span_text': 'someone who died suddenly', 'c...",WyIyMDIxIiwgIjoiLCAiVGhleSIsICJ3YW50ZWQiLCAidG...
3,11439,anthony fauci once again defended brutal chine...,CRITICAL,"[{'span_text': 'brutal Chinese lockdowns', 'ca...",WyJBbnRob255IiwgIkZhdWNpIiwgIm9uY2UiLCAiYWdhaW...
4,98,proof has emerged showing that death from wuha...,CRITICAL,[{'span_text': 'death from Wuhan coronavirus (...,WyJQcm9vZiIsICJoYXMiLCAiZW1lcmdlZCIsICJzaG93aW...
...,...,...,...,...,...
5237,E0S3P,. zickute . com video ubxufaftemjq uk police c...,CONSPIRACY,,
5238,ffdyY,get a claim in for medical battery. nbcnews. c...,CONSPIRACY,,
5239,BJ3ns,We warn against ... vaccine mandates health pa...,CONSPIRACY,,
5240,qFOON,Vaccination center in leaks Crime against huma...,CONSPIRACY,,


In [8]:
df_combined.category.value_counts()
df=df_combined

### Transform label from categoric to numeric

Critical = 1
Conspirancy = 0

In [14]:
df['class'] = df['category'].apply(lambda x: 1 if x == 'CRITICAL' else 0)

new_df = df[['text', 'class']].copy()
new_df

Unnamed: 0,text,class
0,this is massive australian senator malcolm rob...,0
1,i m deeply concerned that the push to vaccinat...,1
2,they wanted to know your vaccination status an...,1
3,anthony fauci once again defended brutal chine...,1
4,proof has emerged showing that death from wuha...,1
...,...,...
3995,police in australia are warning that unvaccina...,1
3996,i personally do nt believe putin would set off...,0
3997,pfizer lied we know that there s no doubt abou...,1
3998,it is utterly bizarre and inexplicable dr john...,1


In [59]:
new_df.loc[0].text

'this is massive australian senator malcolm roberts exposes nanotech found in the covid vaccines and says they are genocide . he is the first politician to expose this share . me lauraabolichannel'

<a id='section03'></a>
### Preparing the Dataset and Dataloader

#### *CustomDataset* Dataset Class

#### Dataloader


In [15]:
MAX_LEN = 128 # @param {type:"integer"}
TRAIN_BATCH_SIZE = 32 # @param {type:"integer"}
VALID_BATCH_SIZE = 32 # @param {type:"integer"}
EPOCHS = 3 # @param {type:"integer"}
LEARNING_RATE = 1e-5 # @param {type:"number"}
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [22]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['text']
        self.targets = self.data['class']
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        # Tokenize the text
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        ids = inputs['input_ids'].squeeze()
        mask = inputs['attention_mask'].squeeze()
        token_type_ids = inputs["token_type_ids"].squeeze()


        return {
            'ids': ids,
            'mask': mask,
            'token_type_ids': token_type_ids,
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [13]:
# Creating the dataset and dataloader for the neural network
train_size = 0.8
train_dataset=new_df.sample(frac=train_size,random_state=200)
test_dataset=new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

FULL Dataset: (4000, 2)
TRAIN Dataset: (3200, 2)
TEST Dataset: (800, 2)


NameError: name 'CustomDataset' is not defined

In [15]:

training_set = CustomDataset(new_df, tokenizer, MAX_LEN)


train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)


<a id='section04'></a>
### Creating the Neural Network for Fine Tuning


In [16]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        #1 es porque en este caso es una  tarea binaria
        self.l3 = torch.nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2).squeeze(1)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [18]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [19]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

### Fine Tuning the Model


In [63]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [64]:
for epoch in range(EPOCHS):
    train(epoch)


Epoch: 0, Loss:  0.7055326104164124
Epoch: 1, Loss:  0.43821975588798523
Epoch: 2, Loss:  0.08067561686038971


### Validating the Model


In [10]:
from sklearn.metrics import accuracy_score, f1_score

def validation(epoch):
    model.eval()  # Set the model to evaluation mode
    fin_targets = []  # List to store true labels
    fin_outputs = []  # List to store predicted probabilities

    with torch.no_grad():  # Disable gradient calculation
        for _, data in enumerate(testing_loader):
            ids = data['ids'].to(device)
            mask = data['mask'].to(device)
            token_type_ids = data['token_type_ids'].to(device)
            targets = data['targets'].to(device)

            # Get model outputs
            outputs = model(ids, mask, token_type_ids)  # Forward pass
            # Apply sigmoid to get probabilities
            outputs = torch.sigmoid(outputs)

            # Append targets and outputs to the lists
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(outputs.cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets


In [11]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    mcc = metrics.matthews_corrcoef(targets, outputs)  # Calculate MCC

    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    print(f"MCC = {mcc:.4f}")

NameError: name 'testing_loader' is not defined

# ***************************************************************************

### Fine Tuning and Validating the Model (Cross Validation)

In [23]:
def train(epoch, model, train_loader):
    model.train()  # Poner el modelo en modo de entrenamiento
    for _, data in enumerate(train_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        optimizer.zero_grad()  # Limpiar los gradientes previos
        outputs = model(ids, mask, token_type_ids)  # Propagación hacia adelante
        loss = loss_fn(outputs, targets)  # Calcular la pérdida

        if _ % 5000 == 0:  # Mostrar pérdida cada 5000 pasos
            print(f'Epoch: {epoch}, Loss: {loss.item()}')

        loss.backward()  # Propagación hacia atrás
        optimizer.step()  # Actualizar los pesos del modelo

def validation(model, data_loader, device):
    model.eval()  # Set the model to evaluation mode
    fin_targets = []  # List to store true labels
    fin_outputs = []  # List to store predicted probabilities

    with torch.no_grad():  # Disable gradient calculation
        for data in data_loader:
            ids = data['ids'].to(device)
            mask = data['mask'].to(device)
            token_type_ids = data['token_type_ids'].to(device)
            targets = data['targets'].to(device)

            # Get model outputs
            outputs = model(ids, mask, token_type_ids)  # Forward pass
            # Apply sigmoid to get probabilities
            outputs = torch.sigmoid(outputs)

            # Append targets and outputs to the lists
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(outputs.cpu().detach().numpy().tolist())

    return fin_outputs, fin_targets
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def evaluate_metrics(outputs, targets):
    # Convertir las salidas a 0 o 1 (clases predichas) basadas en el umbral 0.5
    outputs = [1 if x > 0.5 else 0 for x in outputs]

    # Calcular las métricas
    accuracy = accuracy_score(targets, outputs)
    precision, recall, f1, _ = precision_recall_fscore_support(targets, outputs, average='binary')
    mcc = matthews_corrcoef(targets, outputs)  # Calcular MCC

    # Imprimir los resultados
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"MCC: {mcc:.4f}")


def cross_validate_model(model, dataframe, tokenizer, epochs=3, batch_size=16, k_folds=5):
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

    for fold, (train_idx, val_idx) in enumerate(kf.split(dataframe)):
        print(f"\nFold {fold + 1}/{k_folds}")
        train_df = dataframe.iloc[train_idx].reset_index(drop=True)
        val_df = dataframe.iloc[val_idx].reset_index(drop=True)

        # Create DataLoader for training and validation
        train_set = CustomDataset(train_df, tokenizer, MAX_LEN)
        val_set = CustomDataset(val_df, tokenizer, MAX_LEN)

        train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

        # Training the model for this fold
        for epoch in range(epochs):
            train(epoch, model, train_loader)  # Define your train function as needed

        # Validate the model for this fold
        outputs, targets = validation(model, val_loader, device)
        evaluate_metrics(outputs, targets)  # Calculate and print metrics for this fold


In [None]:
# Assuming new_df is your DataFrame with text and classes
cross_validate_model(model, new_df, tokenizer, epochs=3, batch_size=TRAIN_BATCH_SIZE, k_folds=5)



Fold 1/5
Epoch: 0, Loss: 0.7394375801086426
Epoch: 1, Loss: 0.282950758934021
Epoch: 2, Loss: 0.3668004870414734
Accuracy: 0.8738
Precision: 0.9071
Recall: 0.9088
F1 Score: 0.9079
MCC: 0.7071

Fold 2/5
Epoch: 0, Loss: 0.1891280561685562


# ************************************************************

### Saving the Trained Model 

In [23]:
# Define the paths to save the model and tokenizer
MODEL_PATH = "/content/drive/MyDrive/MASTER/NLP/model/bert_finetuned_model_en.pth"
TOKENIZER_PATH = "/content/drive/MyDrive/MASTER/NLP/model/bert_tokenizer/"

# Save the trained model
torch.save(model.state_dict(), MODEL_PATH)

# Save the tokenizer
tokenizer.save_pretrained(TOKENIZER_PATH)

print("Model and tokenizer saved for future inference.")


Model and tokenizer saved for future inference.


### Load the Saving Model 

In [46]:
# Define the paths to save the model and tokenizer
MODEL_PATH = "/content/drive/MyDrive/MASTER/NLP/model/bert_finetuned_model_en.pth"
TOKENIZER_PATH = "/content/drive/MyDrive/MASTER/NLP/model/bert_tokenizer/"

loaded_tokenizer = BertTokenizer.from_pretrained(TOKENIZER_PATH)

# Initialize the model and load the saved state dict
loaded_model = BERTClass()
loaded_model.load_state_dict(torch.load(MODEL_PATH))
loaded_model.to(device)
loaded_model.eval()

  loaded_model.load_state_dict(torch.load(MODEL_PATH))


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

# Test

In [18]:
model.eval()
def predict(text, model, tokenizer):
    inputs = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    ids = inputs['input_ids'].to(device)
    mask = inputs['attention_mask'].to(device)
    token_type_ids = inputs['token_type_ids'].to(device)

    with torch.no_grad():
        outputs = model(ids, mask, token_type_ids) 
        probabilities = torch.sigmoid(outputs).cpu().numpy()
        prediction = 1 if probabilities[0] >= 0.5 else 0

    return prediction, probabilities[0]


In [19]:
test_df = pd.read_csv("data/test/dataset_en_test_cleaned.csv")
test_df['class'] = test_df['category'].apply(lambda x: 1 if x == 'CRITICAL' else 0)
test_df = test_df[['text', 'class']].copy()
test_df

Unnamed: 0,text,class
0,elon now confirming what we ve been suspecting...,0
1,keeping the pressure on the police to uphold t...,0
2,safe effective the greatest lie ever told . th...,1
3,cdc report admits . million people in the usa ...,1
4,how to use health to acquire totalitarian cont...,0
...,...,...
995,john d. rockefeller wiped out natural cures to...,0
996,fact check biden white house falsely accuses d...,1
997,w onset acral hand lesions following mrna vacc...,1
998,we will fire unvaccinated workers cohen hadad ...,1


In [20]:
predictions = []
probabilities = []

for index, row in test_df.iterrows():
    text = row['text']  # Use the 'text' column
    prediction, probability = predict(text, model, tokenizer)
    predictions.append(prediction)
    probabilities.append(probability)

# Add predictions and probabilities to the DataFrame
test_df['predictions'] = predictions
test_df['probabilities'] = probabilities

# Calculate metrics
accuracy = accuracy_score(test_df['class'], test_df['predictions'])
f1_micro = f1_score(test_df['class'], test_df['predictions'], average='micro')
f1_macro = f1_score(test_df['class'], test_df['predictions'], average='macro')

# Calculate MCC using the true labels and predicted labels from the DataFrame
mcc = matthews_corrcoef(test_df['class'], test_df['predictions'])

# Print metric results
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_micro}")
print(f"F1 Score (Macro) = {f1_macro}")
print(f"Test MCC = {mcc}")

Accuracy Score = 0.898
F1 Score (Micro) = 0.898
F1 Score (Macro) = 0.8855347323532712
Test MCC = 0.7719496004100165


In [40]:
from sklearn.metrics import classification_report
results = classification_report(test_df['class'], test_df['predictions'], digits=5, output_dict=True)
results

{'0': {'precision': 0.375,
  'recall': 0.9826086956521739,
  'f1-score': 0.542834267413931,
  'support': 345.0},
 '1': {'precision': 0.9375,
  'recall': 0.13740458015267176,
  'f1-score': 0.2396804260985353,
  'support': 655.0},
 'accuracy': 0.429,
 'macro avg': {'precision': 0.65625,
  'recall': 0.5600066379024229,
  'f1-score': 0.39125734675623314,
  'support': 1000.0},
 'weighted avg': {'precision': 0.7434375,
  'recall': 0.429,
  'f1-score': 0.34426850135234677,
  'support': 1000.0}}