In [None]:
import pandas as pd
import numpy as np


import torch
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn

from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer, StandardScaler
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, f1_score, confusion_matrix, classification_report

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

import evaluate
from transformers import AutoModel
from safetensors.torch import load_model


## 8. Classification: predict movie success

### 8.1. Data load and preprocessing

In [152]:
imdb_data = pd.read_csv("imdb_top_1000.csv")

In [153]:
def preprocess_imdb_data(imdb_data):
    imdb_data["Gross"] = imdb_data["Gross"].apply(lambda x: float(x.replace(",", "")) if isinstance(x, str) else x)
    imdb_data["Released_Year"] = pd.to_datetime(imdb_data["Released_Year"], format="%Y", errors="coerce")
    imdb_data["Runtime"] = imdb_data["Runtime"].apply(lambda x: int(x.split()[0]))
    imdb_data["Genre"] = imdb_data["Genre"].apply(lambda x: x.split(", "))
    return imdb_data

In [154]:
imdb_data = preprocess_imdb_data(imdb_data)

### 8.2. Transform for classification model

Movies with a rating greater than 8 will be considered a hit.

In [155]:
transformer_model = SentenceTransformer('all-MiniLM-L6-v2')

In [156]:
def encode_multilabel_column(dataframe, column_name):
    encoder = MultiLabelBinarizer()
    encoded_column = encoder.fit_transform(dataframe[column_name])
    encoded_df = pd.DataFrame(encoded_column, columns=encoder.classes_, index=dataframe.index)
    return encoded_df


def encode_onehot_column(dataframe, column_name):
    encoder = OneHotEncoder(sparse_output=False)
    encoded_column = encoder.fit_transform(dataframe[[column_name]])

    encoded_df = pd.DataFrame(encoded_column, columns=encoder.get_feature_names_out([column_name]), index=dataframe.index)
    return encoded_df


def process_dataset(dataset):
    dataset["Overview_Embedding"] = dataset["Overview"].apply(lambda x: transformer_model.encode(x))
    embeddings = np.vstack(dataset["Overview_Embedding"].to_numpy())
    embeddings_df = pd.DataFrame(embeddings, columns=[f'emb_{i}' for i in range(embeddings.shape[1])])

    feat_data = dataset[["Genre", "Runtime", "Gross" ,"Certificate", "No_of_Votes"]]

    # NaN values will be treated as a separate category
    feat_data["Certificate"] = feat_data["Certificate"].apply(lambda x: "Unknown" if pd.isna(x) else x)

    genre_df = encode_multilabel_column(feat_data, "Genre")
    cert_df = encode_onehot_column(feat_data, "Certificate")

    final_data = pd.concat(
        [
            embeddings_df,
            genre_df,
            cert_df,
            feat_data[["No_of_Votes", "Gross", "Runtime"]],
        ],
        axis=1,
    )
    

    # Success definition: IMDB_Rating > 8.0
    labels = np.array([1 if x > 8.2 else 0 for x in dataset["IMDB_Rating"].to_numpy()])
    return final_data, labels

In [None]:
data, labels = process_dataset(imdb_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat_data["Certificate"] = feat_data["Certificate"].apply(lambda x: "Unknown" if pd.isna(x) else x)


128

In [8]:
data_mod = data.drop(columns=["Gross"])
data_train, data_test, labels_train, labels_test = train_test_split(data_mod, labels, test_size=0.2, random_state=32)

In [9]:
print(len(labels_train))
print(np.sum(labels_train))
print(len(labels_test))
print(np.sum(labels_test))

800
97
200
31


In [10]:
weights = compute_class_weight(class_weight="balanced", classes=np.unique(labels), y=labels)
weights

#class_weights = torch.tensor(weights, dtype=torch.float)
#print(class_weights)


array([0.5733945, 3.90625  ])

### 8.3. Model comparison

#### 8.3.1. Logistic Regression

In [None]:
#define model
clf = LogisticRegression(random_state=0, class_weight="balanced", max_iter=1000).fit(data_train, labels_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
#predictions
preds = clf.predict(data_test)

In [16]:
f1 = f1_score(labels_test, preds)
cm = confusion_matrix(labels_test, preds)
print(f"F1-score:  {f1:.4f}")
print("\nConfusion Matrix:\n", cm)

print(classification_report(labels_test, preds, digits=4))

F1-score:  0.4944

Confusion Matrix:
 [[133  36]
 [  9  22]]
              precision    recall  f1-score   support

           0     0.9366    0.7870    0.8553       169
           1     0.3793    0.7097    0.4944        31

    accuracy                         0.7750       200
   macro avg     0.6580    0.7483    0.6748       200
weighted avg     0.8502    0.7750    0.7994       200



#### 8.3.2. SVC

In [17]:
svc_model = SVC(
    C=0.5,
    kernel='rbf',
    gamma='scale',
    class_weight="balanced",
    max_iter=1000
    #random_state=42
)

svc_model.fit(data_train, labels_train)

0,1,2
,C,0.5
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [18]:
y_pred = svc_model.predict(data_test)

rmse = np.sqrt(mean_squared_error(labels_test, y_pred))
mae = mean_absolute_error(labels_test, y_pred)

In [19]:
f1 = f1_score(labels_test, y_pred)
cm = confusion_matrix(labels_test, y_pred)
print(f"F1-score:  {f1:.4f}")
print("\nConfusion Matrix:\n", cm)

print(classification_report(labels_test, y_pred, digits=4))

F1-score:  0.5128

Confusion Matrix:
 [[142  27]
 [ 11  20]]
              precision    recall  f1-score   support

           0     0.9281    0.8402    0.8820       169
           1     0.4255    0.6452    0.5128        31

    accuracy                         0.8100       200
   macro avg     0.6768    0.7427    0.6974       200
weighted avg     0.8502    0.8100    0.8248       200



#### 8.3.3. Random Forest

In [None]:
rfc_model = BalancedRandomForestClassifier(
    n_estimators=200,
    #class_weight="balanced"
    #random_state=42
)

rfc_model.fit(data_train, labels_train)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False


In [24]:
y_pred = rfc_model.predict(data_test)

rmse = np.sqrt(mean_squared_error(labels_test, y_pred))
mae = mean_absolute_error(labels_test, y_pred)

In [25]:
f1 = f1_score(labels_test, y_pred)
cm = confusion_matrix(labels_test, y_pred)
print(f"F1-score:  {f1:.4f}")
print("\nConfusion Matrix:\n", cm)

print(classification_report(labels_test, y_pred, digits=4))

F1-score:  0.4091

Confusion Matrix:
 [[165   4]
 [ 22   9]]
              precision    recall  f1-score   support

           0     0.8824    0.9763    0.9270       169
           1     0.6923    0.2903    0.4091        31

    accuracy                         0.8700       200
   macro avg     0.7873    0.6333    0.6680       200
weighted avg     0.8529    0.8700    0.8467       200



#### 8.3.4. BERT based text classifier

In [129]:
model_path = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_path)

model = AutoModelForSequenceClassification.from_pretrained(model_path,  num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [130]:
bert_data = imdb_data["Overview"]

# split data for train, test and validation sets

data_train_bert, data_test_bert, labels_train_bert, labels_test_bert = train_test_split(bert_data, labels, test_size=0.2, random_state=8)
data_train_bert, data_valid_bert, labels_train_bert, labels_valid_bert = train_test_split(data_train_bert, labels_train_bert, test_size=0.2, random_state=5)

In [131]:
print(np.sum(labels_train_bert))
print(np.sum(labels_valid_bert))
print(np.sum(labels_test_bert))

80
22
26


In [132]:
train_dataset = Dataset.from_dict({
    "overview": data_train_bert.tolist(),
    "label": labels_train_bert.tolist()
})

eval_dataset = Dataset.from_dict({
    "overview": data_valid_bert.tolist(),
    "label": labels_valid_bert.tolist()
})

In [133]:
def tokenize_dataset(dataset):
    return tokenizer(
        dataset["overview"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

tokenized_train = train_dataset.map(tokenize_dataset, batched=True)
tokenized_eval = eval_dataset.map(tokenize_dataset, batched=True)

Map: 100%|██████████| 640/640 [00:00<00:00, 13333.17 examples/s]
Map: 100%|██████████| 160/160 [00:00<00:00, 11414.43 examples/s]


In [134]:
# freeze all base model parameters
for name, param in model.base_model.named_parameters():
    param.requires_grad = False
    
for name, param in model.base_model.named_parameters():
    if "pooler" in name:
        param.requires_grad = True

In [135]:
# metrics to show during training loop
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

In [136]:
#Define new trainer with weighted loss
class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        
        
        loss_fct = nn.CrossEntropyLoss(weight=self.class_weights.to(model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss

In [137]:
# metrics during training 
def compute_metrics(eval_pred):
    # get predictions
    predictions, labels = eval_pred
    
    # apply softmax to get probabilities
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, keepdims=True)
    # use probabilities of the positive class for ROC AUC
    positive_class_probs = probabilities[:, 1]
    # compute auc
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs,  references=labels)['roc_auc'],3)
    
    # predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes,  references=labels)['accuracy'],3)
    
    return {"Accuracy": acc, "AUC": auc}

In [138]:
# hyperparameters
lr = 7e-4
batch_size = 8
num_epochs = 20

In [None]:
training_args = TrainingArguments(
    output_dir="./bert-movie_classifier",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
)

In [140]:
# create trainer and train
weighted_trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    
)

class_weights = torch.tensor(weights, dtype=torch.float)

weighted_trainer.class_weights = class_weights
weighted_trainer.train()

  weighted_trainer = WeightedLossTrainer(
                                                 
  5%|▌         | 80/1600 [00:03<00:57, 26.25it/s]  

{'loss': 0.8728, 'grad_norm': 18.394929885864258, 'learning_rate': 0.000665, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A

                                                 
[A                                                
  5%|▌         | 80/1600 [00:03<00:57, 26.25it/s]
[A

{'eval_loss': 0.8300922513008118, 'eval_Accuracy': 0.181, 'eval_AUC': 0.545, 'eval_runtime': 0.7124, 'eval_samples_per_second': 224.585, 'eval_steps_per_second': 28.073, 'epoch': 1.0}


                                                  
 10%|█         | 160/1600 [00:06<00:54, 26.56it/s] 

{'loss': 0.6999, 'grad_norm': 2.0175976753234863, 'learning_rate': 0.00063, 'epoch': 2.0}



[A
[A
[A
[A
[A
[A

                                                  
[A                                                
 10%|█         | 160/1600 [00:07<00:54, 26.56it/s]
[A

{'eval_loss': 1.2534143924713135, 'eval_Accuracy': 0.862, 'eval_AUC': 0.521, 'eval_runtime': 0.703, 'eval_samples_per_second': 227.595, 'eval_steps_per_second': 28.449, 'epoch': 2.0}


                                                  
 15%|█▌        | 240/1600 [00:10<00:50, 26.68it/s] 

{'loss': 0.8978, 'grad_norm': 1.7826648950576782, 'learning_rate': 0.0005949999999999999, 'epoch': 3.0}



[A
[A
[A
[A
[A
                                                  
[A                                                

 15%|█▌        | 240/1600 [00:11<00:50, 26.68it/s]
[A
[A

{'eval_loss': 0.8337959051132202, 'eval_Accuracy': 0.325, 'eval_AUC': 0.54, 'eval_runtime': 0.7103, 'eval_samples_per_second': 225.264, 'eval_steps_per_second': 28.158, 'epoch': 3.0}


                                                  
 20%|██        | 320/1600 [00:14<00:48, 26.44it/s] 

{'loss': 0.7023, 'grad_norm': 1.6069915294647217, 'learning_rate': 0.0005600000000000001, 'epoch': 4.0}



[A
[A
[A
[A
[A
                                                  
[A                                                

 20%|██        | 320/1600 [00:15<00:48, 26.44it/s]
[A
[A

{'eval_loss': 0.9623383283615112, 'eval_Accuracy': 0.831, 'eval_AUC': 0.572, 'eval_runtime': 0.7029, 'eval_samples_per_second': 227.62, 'eval_steps_per_second': 28.452, 'epoch': 4.0}


                                                  
 25%|██▌       | 400/1600 [00:18<00:45, 26.59it/s] 

{'loss': 0.905, 'grad_norm': 9.746903419494629, 'learning_rate': 0.000525, 'epoch': 5.0}



[A
[A
[A
[A
[A
                                                  
[A                                                

 25%|██▌       | 400/1600 [00:19<00:45, 26.59it/s]
[A
[A

{'eval_loss': 0.7664982080459595, 'eval_Accuracy': 0.8, 'eval_AUC': 0.599, 'eval_runtime': 0.7101, 'eval_samples_per_second': 225.333, 'eval_steps_per_second': 28.167, 'epoch': 5.0}


                                                  
 30%|███       | 480/1600 [00:22<00:41, 26.72it/s] 

{'loss': 0.7012, 'grad_norm': 15.866985321044922, 'learning_rate': 0.00049, 'epoch': 6.0}



[A
[A
[A
[A
[A
[A

                                                  
[A                                                
 30%|███       | 480/1600 [00:23<00:41, 26.72it/s]
[A

{'eval_loss': 1.3497002124786377, 'eval_Accuracy': 0.844, 'eval_AUC': 0.608, 'eval_runtime': 0.7113, 'eval_samples_per_second': 224.942, 'eval_steps_per_second': 28.118, 'epoch': 6.0}


                                                  
 35%|███▌      | 560/1600 [00:26<00:39, 26.50it/s] 

{'loss': 0.6206, 'grad_norm': 9.323049545288086, 'learning_rate': 0.000455, 'epoch': 7.0}



[A
[A
[A
[A
[A
                                                  
[A                                                

 35%|███▌      | 560/1600 [00:27<00:39, 26.50it/s]
[A
[A

{'eval_loss': 0.9515218734741211, 'eval_Accuracy': 0.812, 'eval_AUC': 0.557, 'eval_runtime': 0.708, 'eval_samples_per_second': 225.976, 'eval_steps_per_second': 28.247, 'epoch': 7.0}


                                                  
 40%|████      | 640/1600 [00:30<00:36, 26.32it/s] 

{'loss': 0.6282, 'grad_norm': 3.1338088512420654, 'learning_rate': 0.00041999999999999996, 'epoch': 8.0}



[A
[A
[A
[A
[A
[A

                                                  
[A                                                
 40%|████      | 640/1600 [00:31<00:36, 26.32it/s]
[A

{'eval_loss': 0.9021803736686707, 'eval_Accuracy': 0.831, 'eval_AUC': 0.627, 'eval_runtime': 0.7059, 'eval_samples_per_second': 226.672, 'eval_steps_per_second': 28.334, 'epoch': 8.0}


                                                  
 45%|████▌     | 720/1600 [00:34<00:33, 26.49it/s] 

{'loss': 0.6348, 'grad_norm': 2.5111677646636963, 'learning_rate': 0.00038500000000000003, 'epoch': 9.0}



[A
[A
[A
[A
[A
                                                  
[A                                                

 45%|████▌     | 720/1600 [00:35<00:33, 26.49it/s]
[A
[A

{'eval_loss': 0.7201927304267883, 'eval_Accuracy': 0.562, 'eval_AUC': 0.603, 'eval_runtime': 0.7119, 'eval_samples_per_second': 224.755, 'eval_steps_per_second': 28.094, 'epoch': 9.0}


                                                  
 50%|█████     | 800/1600 [00:38<00:30, 26.60it/s] 

{'loss': 0.6677, 'grad_norm': 7.40121603012085, 'learning_rate': 0.00035, 'epoch': 10.0}



[A
[A
[A
[A
[A
[A

                                                  
[A                                                
 50%|█████     | 800/1600 [00:39<00:30, 26.60it/s]
[A

{'eval_loss': 0.755588710308075, 'eval_Accuracy': 0.75, 'eval_AUC': 0.605, 'eval_runtime': 0.7152, 'eval_samples_per_second': 223.717, 'eval_steps_per_second': 27.965, 'epoch': 10.0}


                                                  
 55%|█████▌    | 880/1600 [00:42<00:27, 26.43it/s] 

{'loss': 0.6045, 'grad_norm': 4.594843864440918, 'learning_rate': 0.000315, 'epoch': 11.0}



[A
[A
[A
[A
[A
[A

                                                  
[A                                                
 55%|█████▌    | 880/1600 [00:43<00:27, 26.43it/s]
[A

{'eval_loss': 0.9337700009346008, 'eval_Accuracy': 0.806, 'eval_AUC': 0.577, 'eval_runtime': 0.7121, 'eval_samples_per_second': 224.689, 'eval_steps_per_second': 28.086, 'epoch': 11.0}


                                                  
 60%|██████    | 960/1600 [00:46<00:23, 26.69it/s] 

{'loss': 0.6044, 'grad_norm': 1.2174433469772339, 'learning_rate': 0.00028000000000000003, 'epoch': 12.0}



[A
[A
[A
[A
[A
[A

                                                  
[A                                                
 60%|██████    | 960/1600 [00:47<00:23, 26.69it/s]
[A

{'eval_loss': 1.3045237064361572, 'eval_Accuracy': 0.838, 'eval_AUC': 0.59, 'eval_runtime': 0.7171, 'eval_samples_per_second': 223.118, 'eval_steps_per_second': 27.89, 'epoch': 12.0}


                                                   
 65%|██████▌   | 1040/1600 [00:50<00:21, 26.43it/s]

{'loss': 0.675, 'grad_norm': 1.4541630744934082, 'learning_rate': 0.000245, 'epoch': 13.0}



[A
[A
[A
[A
[A
[A

                                                   
[A                                                
 65%|██████▌   | 1040/1600 [00:51<00:21, 26.43it/s]
[A

{'eval_loss': 0.8321579098701477, 'eval_Accuracy': 0.744, 'eval_AUC': 0.62, 'eval_runtime': 0.7159, 'eval_samples_per_second': 223.497, 'eval_steps_per_second': 27.937, 'epoch': 13.0}


                                                   
 70%|███████   | 1120/1600 [00:54<00:18, 26.63it/s]

{'loss': 0.5527, 'grad_norm': 5.128293991088867, 'learning_rate': 0.00020999999999999998, 'epoch': 14.0}



[A
[A
[A
[A
[A
[A

                                                   
[A                                                
 70%|███████   | 1120/1600 [00:55<00:18, 26.63it/s]
[A

{'eval_loss': 0.795167863368988, 'eval_Accuracy': 0.538, 'eval_AUC': 0.59, 'eval_runtime': 0.7117, 'eval_samples_per_second': 224.818, 'eval_steps_per_second': 28.102, 'epoch': 14.0}


                                                   
 75%|███████▌  | 1200/1600 [00:58<00:15, 26.36it/s]

{'loss': 0.5675, 'grad_norm': 5.743371963500977, 'learning_rate': 0.000175, 'epoch': 15.0}



[A
[A
[A
[A
[A
[A

                                                   
[A                                                
 75%|███████▌  | 1200/1600 [00:59<00:15, 26.36it/s]
[A

{'eval_loss': 0.7832334637641907, 'eval_Accuracy': 0.512, 'eval_AUC': 0.628, 'eval_runtime': 0.7161, 'eval_samples_per_second': 223.428, 'eval_steps_per_second': 27.928, 'epoch': 15.0}


                                                   
 80%|████████  | 1280/1600 [01:02<00:12, 26.36it/s]

{'loss': 0.5331, 'grad_norm': 5.878108501434326, 'learning_rate': 0.00014000000000000001, 'epoch': 16.0}



[A
[A
[A
[A
[A
                                                   
[A                                                

 80%|████████  | 1280/1600 [01:03<00:12, 26.36it/s]
[A
[A

{'eval_loss': 0.9728301167488098, 'eval_Accuracy': 0.775, 'eval_AUC': 0.609, 'eval_runtime': 0.7124, 'eval_samples_per_second': 224.589, 'eval_steps_per_second': 28.074, 'epoch': 16.0}


                                                   
 85%|████████▌ | 1360/1600 [01:06<00:09, 26.48it/s]

{'loss': 0.5508, 'grad_norm': 4.014152526855469, 'learning_rate': 0.00010499999999999999, 'epoch': 17.0}



[A
[A
[A
[A
[A
                                                   
[A                                                

 85%|████████▌ | 1360/1600 [01:07<00:09, 26.48it/s]
[A
[A

{'eval_loss': 0.768287181854248, 'eval_Accuracy': 0.619, 'eval_AUC': 0.613, 'eval_runtime': 0.7143, 'eval_samples_per_second': 223.991, 'eval_steps_per_second': 27.999, 'epoch': 17.0}


                                                   
 90%|█████████ | 1440/1600 [01:10<00:06, 26.61it/s]

{'loss': 0.4952, 'grad_norm': 1.6988605260849, 'learning_rate': 7.000000000000001e-05, 'epoch': 18.0}



[A
[A
[A
[A
[A
                                                   
[A                                                

 90%|█████████ | 1440/1600 [01:11<00:06, 26.61it/s]
[A
[A

{'eval_loss': 1.1242461204528809, 'eval_Accuracy': 0.812, 'eval_AUC': 0.613, 'eval_runtime': 0.7154, 'eval_samples_per_second': 223.663, 'eval_steps_per_second': 27.958, 'epoch': 18.0}


                                                   
 95%|█████████▌| 1520/1600 [01:14<00:02, 26.70it/s]

{'loss': 0.5045, 'grad_norm': 9.240407943725586, 'learning_rate': 3.5000000000000004e-05, 'epoch': 19.0}



[A
[A
[A
[A
[A
[A

                                                   
[A                                                
 95%|█████████▌| 1520/1600 [01:15<00:02, 26.70it/s]
[A

{'eval_loss': 0.8580640554428101, 'eval_Accuracy': 0.706, 'eval_AUC': 0.604, 'eval_runtime': 0.715, 'eval_samples_per_second': 223.781, 'eval_steps_per_second': 27.973, 'epoch': 19.0}


                                                   
100%|██████████| 1600/1600 [01:18<00:00, 26.59it/s]

{'loss': 0.486, 'grad_norm': 4.628840446472168, 'learning_rate': 0.0, 'epoch': 20.0}



[A
[A
[A
[A
[A
                                                   
[A                                                

100%|██████████| 1600/1600 [01:19<00:00, 26.59it/s]
[A
[A

{'eval_loss': 0.828974723815918, 'eval_Accuracy': 0.681, 'eval_AUC': 0.602, 'eval_runtime': 0.7131, 'eval_samples_per_second': 224.357, 'eval_steps_per_second': 28.045, 'epoch': 20.0}


                                                   
100%|██████████| 1600/1600 [01:20<00:00, 19.96it/s]

{'train_runtime': 80.1447, 'train_samples_per_second': 159.711, 'train_steps_per_second': 19.964, 'train_loss': 0.6451917147636413, 'epoch': 20.0}





TrainOutput(global_step=1600, training_loss=0.6451917147636413, metrics={'train_runtime': 80.1447, 'train_samples_per_second': 159.711, 'train_steps_per_second': 19.964, 'total_flos': 1683910754304000.0, 'train_loss': 0.6451917147636413, 'epoch': 20.0})

In [141]:
#load best epoch for testing
checkpoint_path = "bert-movie_classifier\checkpoint-800"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

  checkpoint_path = "bert-movie_classifier\checkpoint-800"


In [142]:
# get predictions
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

test_dataset = Dataset.from_dict({
    "overview": data_test_bert.tolist(),
    "label": labels_test_bert.tolist()
})

tokenized_test = test_dataset.map(tokenize_dataset, batched=True)

predictions = trainer.predict(tokenized_test)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids
print(y_pred)

  trainer = Trainer(
Map: 100%|██████████| 200/200 [00:00<00:00, 12501.46 examples/s]
100%|██████████| 25/25 [00:00<00:00, 30.12it/s]

[0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 1
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0 1 0
 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0
 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0]





In [143]:
# calculate Accuracy, F1, percision, recall and confusion matrix
f1 = f1_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)
print(f"F1-score:  {f1:.4f}")
print("\nConfusion Matrix:\n", cm)

print(classification_report(y_true, y_pred, digits=4))

F1-score:  0.2462

Confusion Matrix:
 [[143  31]
 [ 18   8]]
              precision    recall  f1-score   support

           0     0.8882    0.8218    0.8537       174
           1     0.2051    0.3077    0.2462        26

    accuracy                         0.7550       200
   macro avg     0.5467    0.5648    0.5499       200
weighted avg     0.7994    0.7550    0.7747       200



#### 8.3.5. BERT based multimodal classifier 

In [None]:
# CUSTOM MODEL FOR MULTIMODAL CLASSIFICATION WITH BERT AND LINEAR LAYER FOR METADATA (CERTIFICATE, RUNTIME, NUMBER OF VOTES)

class BERTWithMetadata(nn.Module):
    def __init__(self, model_name, num_metadata_features, num_labels=2,  class_weights=None):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.2)
        
        # Linear layer for metadata
        self.metadata_fc = nn.Linear(num_metadata_features, 128)
        
        # Combine [CLS] + metadata
        self.classifier = nn.Linear(self.bert.config.hidden_size + 128, num_labels)
        
        if class_weights is not None:
            self.register_buffer("class_weights", torch.tensor(class_weights, dtype=torch.float))
        else:
            self.class_weights = None
            
    
    def forward(self, input_ids, attention_mask, metadata, labels=None, **kwargs):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        
        meta_output = torch.relu(self.metadata_fc(metadata))
        combined = torch.cat((pooled_output, meta_output), dim=1)
        combined = self.dropout(combined)
        logits = self.classifier(combined)
        
        loss = None
        if labels is not None:
            if self.class_weights is not None:
                loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
            else:
                loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)
        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

In [None]:
#split data into train, test, validation sets

encoder = OneHotEncoder(sparse_output=False)

bert_data_improved = imdb_data[["Overview", "Certificate", "Runtime", "No_of_Votes"]]
bert_data_improved["label"] = labels

train_df, test_df = train_test_split(bert_data_improved, test_size=0.2, random_state=7, stratify=bert_data_improved["label"])
train_df, eval_df = train_test_split(train_df, test_size=0.2, random_state=7, stratify=train_df["label"])




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bert_data_improved["label"] = labels


In [None]:
# create metadata for model
train_encodings = tokenizer(
    train_df["Overview"].tolist(),
    truncation=True,
    padding=True,
    max_length=256
)

eval_encodings = tokenizer(
    eval_df["Overview"].tolist(),
    truncation=True,
    padding=True,
    max_length=256
)

test_encodings = tokenizer(
    test_df["Overview"].tolist(),
    truncation=True,
    padding=True,
    max_length=256
)

certificate_train = train_df[["Certificate"]].fillna("Unknown")
certificate_eval  = eval_df[["Certificate"]].fillna("Unknown")
certificate_test  = test_df[["Certificate"]].fillna("Unknown")

ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
certificate_train = ohe.fit_transform(certificate_train)
certificate_eval  = ohe.transform(certificate_eval)
certificate_test  = ohe.transform(certificate_test)

num_features_train = train_df[["Runtime", "No_of_Votes"]].fillna(0)
num_features_eval  = eval_df[["Runtime", "No_of_Votes"]].fillna(0)
num_features_test  = test_df[["Runtime", "No_of_Votes"]].fillna(0)

scaler = StandardScaler()
num_train = scaler.fit_transform(num_features_train)
num_eval  = scaler.transform(num_features_eval)
num_test  = scaler.transform(num_features_test)

# Combine
train_metadata = np.concatenate([certificate_train, num_train], axis=1)
eval_metadata  = np.concatenate([certificate_eval, num_eval], axis=1)
test_metadata  = np.concatenate([certificate_test, num_test], axis=1)

In [161]:
# get labels
train_labels = train_df["label"].values
eval_labels  = eval_df["label"].values
test_labels  = test_df["label"].values

In [162]:
# custom dataset to use with model

class MovieDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, metadata, labels):
        self.encodings = encodings
        self.metadata = metadata
        self.labels = labels

    def __getitem__(self, idx):
        
        if isinstance(idx, list) or isinstance(idx, np.ndarray):
            batch = [self.__getitem__(i) for i in idx]
            return {key: torch.stack([d[key] for d in batch]) for key in batch[0]}

        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["metadata"] = torch.tensor(self.metadata[idx], dtype=torch.float)
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

In [163]:
train_dataset = MovieDataset(train_encodings, train_metadata, train_labels)
eval_dataset  = MovieDataset(eval_encodings,  eval_metadata,  eval_labels)
test_dataset  = MovieDataset(test_encodings,  test_metadata,  test_labels)

In [164]:
# custom data collator
def collate_fn(batch):
    
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    metadata = [item["metadata"] for item in batch]
    labels = [item["labels"] for item in batch]

    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)
    metadata = torch.stack(metadata)
    labels = torch.stack(labels)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "metadata": metadata,
        "labels": labels
    }

In [None]:
# initiate the model using custom class
model = BERTWithMetadata("bert-base-uncased", num_metadata_features=train_metadata.shape[1], num_labels=2, class_weights=class_weights)


  self.register_buffer("class_weights", torch.tensor(class_weights, dtype=torch.float))


In [None]:
#TRAIN MODEL

training_args = TrainingArguments(
    output_dir="./bert-movie_classifier_with_metadata",
    learning_rate=5e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=30,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    remove_unused_columns=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)


trainer.train()

  trainer = Trainer(
 71%|███████   | 1589/2240 [2:10:10<53:19,  4.92s/it]
  3%|▎         | 80/2400 [00:04<02:08, 18.06it/s]

{'loss': 0.7571, 'grad_norm': 5.260904312133789, 'learning_rate': 0.00048333333333333334, 'epoch': 1.0}


                                                 
  3%|▎         | 80/2400 [00:04<02:08, 18.06it/s]

{'eval_loss': 0.6345117688179016, 'eval_Accuracy': 0.55, 'eval_AUC': 0.844, 'eval_runtime': 0.2469, 'eval_samples_per_second': 647.938, 'eval_steps_per_second': 80.992, 'epoch': 1.0}


  7%|▋         | 160/2400 [00:10<02:02, 18.35it/s]

{'loss': 0.6843, 'grad_norm': 28.46091651916504, 'learning_rate': 0.00046666666666666666, 'epoch': 2.0}



  7%|▋         | 160/2400 [00:10<02:02, 18.35it/s]

{'eval_loss': 1.0118478536605835, 'eval_Accuracy': 0.875, 'eval_AUC': 0.829, 'eval_runtime': 0.2351, 'eval_samples_per_second': 680.53, 'eval_steps_per_second': 85.066, 'epoch': 2.0}


 10%|█         | 240/2400 [00:16<01:57, 18.40it/s]

{'loss': 0.6184, 'grad_norm': 12.798458099365234, 'learning_rate': 0.00045000000000000004, 'epoch': 3.0}


                                                  
 10%|█         | 240/2400 [00:16<01:57, 18.40it/s]

{'eval_loss': 0.49742794036865234, 'eval_Accuracy': 0.9, 'eval_AUC': 0.826, 'eval_runtime': 0.2379, 'eval_samples_per_second': 672.586, 'eval_steps_per_second': 84.073, 'epoch': 3.0}


 13%|█▎        | 320/2400 [00:21<01:57, 17.68it/s]

{'loss': 0.6324, 'grad_norm': 11.25619888305664, 'learning_rate': 0.00043333333333333337, 'epoch': 4.0}


                                                  
 13%|█▎        | 320/2400 [00:22<01:57, 17.68it/s]

{'eval_loss': 0.48079657554626465, 'eval_Accuracy': 0.844, 'eval_AUC': 0.825, 'eval_runtime': 0.2432, 'eval_samples_per_second': 657.954, 'eval_steps_per_second': 82.244, 'epoch': 4.0}


 17%|█▋        | 400/2400 [00:27<01:49, 18.27it/s]

{'loss': 0.6184, 'grad_norm': 1.884811282157898, 'learning_rate': 0.0004166666666666667, 'epoch': 5.0}



 17%|█▋        | 400/2400 [00:27<01:49, 18.27it/s]

{'eval_loss': 0.48202961683273315, 'eval_Accuracy': 0.869, 'eval_AUC': 0.82, 'eval_runtime': 0.2375, 'eval_samples_per_second': 673.677, 'eval_steps_per_second': 84.21, 'epoch': 5.0}


 20%|██        | 480/2400 [00:32<01:45, 18.28it/s]

{'loss': 0.5731, 'grad_norm': 11.358325004577637, 'learning_rate': 0.0004, 'epoch': 6.0}



 20%|██        | 480/2400 [00:33<01:45, 18.28it/s]

{'eval_loss': 0.4934304356575012, 'eval_Accuracy': 0.869, 'eval_AUC': 0.824, 'eval_runtime': 0.231, 'eval_samples_per_second': 692.739, 'eval_steps_per_second': 86.592, 'epoch': 6.0}


 23%|██▎       | 560/2400 [00:38<01:40, 18.27it/s]

{'loss': 0.5837, 'grad_norm': 14.294907569885254, 'learning_rate': 0.00038333333333333334, 'epoch': 7.0}



 23%|██▎       | 560/2400 [00:38<01:40, 18.27it/s]

{'eval_loss': 0.475970596075058, 'eval_Accuracy': 0.856, 'eval_AUC': 0.825, 'eval_runtime': 0.2431, 'eval_samples_per_second': 658.048, 'eval_steps_per_second': 82.256, 'epoch': 7.0}


 27%|██▋       | 640/2400 [00:43<01:34, 18.60it/s]

{'loss': 0.5861, 'grad_norm': 18.777746200561523, 'learning_rate': 0.00036666666666666667, 'epoch': 8.0}


                                                  
 27%|██▋       | 640/2400 [00:44<01:34, 18.60it/s]

{'eval_loss': 0.48037442564964294, 'eval_Accuracy': 0.85, 'eval_AUC': 0.825, 'eval_runtime': 0.2424, 'eval_samples_per_second': 659.951, 'eval_steps_per_second': 82.494, 'epoch': 8.0}


 30%|███       | 720/2400 [00:49<01:32, 18.10it/s]

{'loss': 0.629, 'grad_norm': 18.599761962890625, 'learning_rate': 0.00035, 'epoch': 9.0}



 30%|███       | 720/2400 [00:49<01:32, 18.10it/s]

{'eval_loss': 0.878714382648468, 'eval_Accuracy': 0.906, 'eval_AUC': 0.828, 'eval_runtime': 0.2679, 'eval_samples_per_second': 597.339, 'eval_steps_per_second': 74.667, 'epoch': 9.0}


 33%|███▎      | 800/2400 [00:55<01:26, 18.44it/s]

{'loss': 0.6437, 'grad_norm': 2.6764638423919678, 'learning_rate': 0.0003333333333333333, 'epoch': 10.0}


                                                  
 33%|███▎      | 800/2400 [00:55<01:26, 18.44it/s]

{'eval_loss': 0.48509567975997925, 'eval_Accuracy': 0.844, 'eval_AUC': 0.826, 'eval_runtime': 0.2525, 'eval_samples_per_second': 633.713, 'eval_steps_per_second': 79.214, 'epoch': 10.0}


 37%|███▋      | 880/2400 [01:01<01:22, 18.42it/s]

{'loss': 0.6959, 'grad_norm': 1.0985898971557617, 'learning_rate': 0.00031666666666666665, 'epoch': 11.0}


                                                  
 37%|███▋      | 880/2400 [01:01<01:22, 18.42it/s]

{'eval_loss': 0.5403578877449036, 'eval_Accuracy': 0.875, 'eval_AUC': 0.827, 'eval_runtime': 0.2479, 'eval_samples_per_second': 645.311, 'eval_steps_per_second': 80.664, 'epoch': 11.0}


 40%|████      | 960/2400 [01:06<01:18, 18.32it/s]

{'loss': 0.6033, 'grad_norm': 2.9429965019226074, 'learning_rate': 0.0003, 'epoch': 12.0}


                                                  
 40%|████      | 960/2400 [01:06<01:18, 18.32it/s]

{'eval_loss': 0.5180829763412476, 'eval_Accuracy': 0.862, 'eval_AUC': 0.829, 'eval_runtime': 0.2379, 'eval_samples_per_second': 672.471, 'eval_steps_per_second': 84.059, 'epoch': 12.0}


 43%|████▎     | 1040/2400 [01:12<01:15, 17.96it/s]

{'loss': 0.6262, 'grad_norm': 22.342514038085938, 'learning_rate': 0.00028333333333333335, 'epoch': 13.0}



 43%|████▎     | 1040/2400 [01:12<01:15, 17.96it/s]

{'eval_loss': 0.6443185210227966, 'eval_Accuracy': 0.912, 'eval_AUC': 0.826, 'eval_runtime': 0.2544, 'eval_samples_per_second': 628.956, 'eval_steps_per_second': 78.619, 'epoch': 13.0}


 47%|████▋     | 1120/2400 [01:17<01:14, 17.15it/s]

{'loss': 0.5906, 'grad_norm': 9.38235855102539, 'learning_rate': 0.0002666666666666667, 'epoch': 14.0}



 47%|████▋     | 1120/2400 [01:18<01:14, 17.15it/s]

{'eval_loss': 0.5990273356437683, 'eval_Accuracy': 0.894, 'eval_AUC': 0.826, 'eval_runtime': 0.2743, 'eval_samples_per_second': 583.309, 'eval_steps_per_second': 72.914, 'epoch': 14.0}


 50%|█████     | 1200/2400 [01:23<01:05, 18.31it/s]

{'loss': 0.5862, 'grad_norm': 6.188243865966797, 'learning_rate': 0.00025, 'epoch': 15.0}



 50%|█████     | 1200/2400 [01:23<01:05, 18.31it/s]

{'eval_loss': 0.5422160029411316, 'eval_Accuracy': 0.881, 'eval_AUC': 0.827, 'eval_runtime': 0.2469, 'eval_samples_per_second': 648.062, 'eval_steps_per_second': 81.008, 'epoch': 15.0}


 53%|█████▎    | 1280/2400 [01:29<01:01, 18.34it/s]

{'loss': 0.6293, 'grad_norm': 6.312684535980225, 'learning_rate': 0.00023333333333333333, 'epoch': 16.0}


                                                   
 53%|█████▎    | 1280/2400 [01:29<01:01, 18.34it/s]

{'eval_loss': 0.5088487267494202, 'eval_Accuracy': 0.862, 'eval_AUC': 0.825, 'eval_runtime': 0.2416, 'eval_samples_per_second': 662.23, 'eval_steps_per_second': 82.779, 'epoch': 16.0}


 57%|█████▋    | 1360/2400 [01:35<00:59, 17.49it/s]

{'loss': 0.6323, 'grad_norm': 5.275120735168457, 'learning_rate': 0.00021666666666666668, 'epoch': 17.0}



 57%|█████▋    | 1360/2400 [01:35<00:59, 17.49it/s]

{'eval_loss': 0.49735990166664124, 'eval_Accuracy': 0.856, 'eval_AUC': 0.825, 'eval_runtime': 0.2591, 'eval_samples_per_second': 617.597, 'eval_steps_per_second': 77.2, 'epoch': 17.0}


 60%|██████    | 1440/2400 [01:40<00:52, 18.39it/s]

{'loss': 0.5589, 'grad_norm': 6.47817325592041, 'learning_rate': 0.0002, 'epoch': 18.0}



 60%|██████    | 1440/2400 [01:40<00:52, 18.39it/s]

{'eval_loss': 0.5947973728179932, 'eval_Accuracy': 0.881, 'eval_AUC': 0.825, 'eval_runtime': 0.2399, 'eval_samples_per_second': 666.946, 'eval_steps_per_second': 83.368, 'epoch': 18.0}


 63%|██████▎   | 1520/2400 [01:46<00:48, 18.08it/s]

{'loss': 0.6688, 'grad_norm': 16.966161727905273, 'learning_rate': 0.00018333333333333334, 'epoch': 19.0}


                                                   
 63%|██████▎   | 1520/2400 [01:46<00:48, 18.08it/s]

{'eval_loss': 0.6029297113418579, 'eval_Accuracy': 0.888, 'eval_AUC': 0.825, 'eval_runtime': 0.2572, 'eval_samples_per_second': 621.97, 'eval_steps_per_second': 77.746, 'epoch': 19.0}


 67%|██████▋   | 1600/2400 [01:52<00:45, 17.78it/s]

{'loss': 0.6383, 'grad_norm': 8.557621955871582, 'learning_rate': 0.00016666666666666666, 'epoch': 20.0}



 67%|██████▋   | 1600/2400 [01:52<00:45, 17.78it/s]

{'eval_loss': 0.5009627938270569, 'eval_Accuracy': 0.85, 'eval_AUC': 0.824, 'eval_runtime': 0.2455, 'eval_samples_per_second': 651.739, 'eval_steps_per_second': 81.467, 'epoch': 20.0}


 70%|███████   | 1680/2400 [01:57<00:39, 18.12it/s]

{'loss': 0.6543, 'grad_norm': 1.3210625648498535, 'learning_rate': 0.00015, 'epoch': 21.0}


                                                   
 70%|███████   | 1680/2400 [01:58<00:39, 18.12it/s]

{'eval_loss': 0.6052649617195129, 'eval_Accuracy': 0.888, 'eval_AUC': 0.824, 'eval_runtime': 0.2388, 'eval_samples_per_second': 670.095, 'eval_steps_per_second': 83.762, 'epoch': 21.0}


 73%|███████▎  | 1760/2400 [02:03<00:35, 18.17it/s]

{'loss': 0.6078, 'grad_norm': 4.006030082702637, 'learning_rate': 0.00013333333333333334, 'epoch': 22.0}


                                                   
 73%|███████▎  | 1760/2400 [02:03<00:35, 18.17it/s]

{'eval_loss': 0.5268858671188354, 'eval_Accuracy': 0.881, 'eval_AUC': 0.824, 'eval_runtime': 0.2374, 'eval_samples_per_second': 673.939, 'eval_steps_per_second': 84.242, 'epoch': 22.0}


 77%|███████▋  | 1840/2400 [02:09<00:30, 18.17it/s]

{'loss': 0.589, 'grad_norm': 9.565999984741211, 'learning_rate': 0.00011666666666666667, 'epoch': 23.0}


                                                   
 77%|███████▋  | 1840/2400 [02:09<00:30, 18.17it/s]

{'eval_loss': 0.5217307209968567, 'eval_Accuracy': 0.881, 'eval_AUC': 0.824, 'eval_runtime': 0.2437, 'eval_samples_per_second': 656.564, 'eval_steps_per_second': 82.071, 'epoch': 23.0}


 80%|████████  | 1920/2400 [02:14<00:26, 18.32it/s]

{'loss': 0.6033, 'grad_norm': 6.097932815551758, 'learning_rate': 0.0001, 'epoch': 24.0}



 80%|████████  | 1920/2400 [02:15<00:26, 18.32it/s]

{'eval_loss': 0.5512757897377014, 'eval_Accuracy': 0.881, 'eval_AUC': 0.824, 'eval_runtime': 0.2398, 'eval_samples_per_second': 667.22, 'eval_steps_per_second': 83.403, 'epoch': 24.0}


 83%|████████▎ | 2000/2400 [02:20<00:22, 17.58it/s]

{'loss': 0.6088, 'grad_norm': 0.7419002652168274, 'learning_rate': 8.333333333333333e-05, 'epoch': 25.0}



 83%|████████▎ | 2000/2400 [02:20<00:22, 17.58it/s]

{'eval_loss': 0.5567864179611206, 'eval_Accuracy': 0.881, 'eval_AUC': 0.824, 'eval_runtime': 0.2482, 'eval_samples_per_second': 644.756, 'eval_steps_per_second': 80.595, 'epoch': 25.0}


 87%|████████▋ | 2080/2400 [02:26<00:17, 18.28it/s]

{'loss': 0.6226, 'grad_norm': 22.61118507385254, 'learning_rate': 6.666666666666667e-05, 'epoch': 26.0}


                                                   
 87%|████████▋ | 2080/2400 [02:26<00:17, 18.28it/s]

{'eval_loss': 0.5469404458999634, 'eval_Accuracy': 0.881, 'eval_AUC': 0.824, 'eval_runtime': 0.2366, 'eval_samples_per_second': 676.233, 'eval_steps_per_second': 84.529, 'epoch': 26.0}


 90%|█████████ | 2160/2400 [02:31<00:13, 17.88it/s]

{'loss': 0.6412, 'grad_norm': 1.549011468887329, 'learning_rate': 5e-05, 'epoch': 27.0}



 90%|█████████ | 2160/2400 [02:31<00:13, 17.88it/s]

{'eval_loss': 0.5213174223899841, 'eval_Accuracy': 0.881, 'eval_AUC': 0.824, 'eval_runtime': 0.2444, 'eval_samples_per_second': 654.581, 'eval_steps_per_second': 81.823, 'epoch': 27.0}


 93%|█████████▎| 2240/2400 [02:37<00:08, 18.13it/s]

{'loss': 0.5292, 'grad_norm': 3.2696540355682373, 'learning_rate': 3.3333333333333335e-05, 'epoch': 28.0}


                                                   
 93%|█████████▎| 2240/2400 [02:37<00:08, 18.13it/s]

{'eval_loss': 0.5294851064682007, 'eval_Accuracy': 0.881, 'eval_AUC': 0.824, 'eval_runtime': 0.2401, 'eval_samples_per_second': 666.351, 'eval_steps_per_second': 83.294, 'epoch': 28.0}


 97%|█████████▋| 2320/2400 [02:42<00:04, 18.10it/s]

{'loss': 0.5393, 'grad_norm': 2.0439987182617188, 'learning_rate': 1.6666666666666667e-05, 'epoch': 29.0}


                                                   
 97%|█████████▋| 2320/2400 [02:43<00:04, 18.10it/s]

{'eval_loss': 0.5611902475357056, 'eval_Accuracy': 0.875, 'eval_AUC': 0.823, 'eval_runtime': 0.2445, 'eval_samples_per_second': 654.327, 'eval_steps_per_second': 81.791, 'epoch': 29.0}


100%|██████████| 2400/2400 [02:49<00:00, 17.61it/s]

{'loss': 0.5822, 'grad_norm': 1.0492743253707886, 'learning_rate': 0.0, 'epoch': 30.0}


                                                   
100%|██████████| 2400/2400 [02:49<00:00, 17.61it/s]

{'eval_loss': 0.5653203725814819, 'eval_Accuracy': 0.875, 'eval_AUC': 0.823, 'eval_runtime': 0.2425, 'eval_samples_per_second': 659.688, 'eval_steps_per_second': 82.461, 'epoch': 30.0}


100%|██████████| 2400/2400 [02:51<00:00, 14.01it/s]

{'train_runtime': 171.319, 'train_samples_per_second': 112.072, 'train_steps_per_second': 14.009, 'train_loss': 0.6177941783269246, 'epoch': 30.0}





TrainOutput(global_step=2400, training_loss=0.6177941783269246, metrics={'train_runtime': 171.319, 'train_samples_per_second': 112.072, 'train_steps_per_second': 14.009, 'total_flos': 0.0, 'train_loss': 0.6177941783269246, 'epoch': 30.0})

In [169]:
#load best epoch for evaluation
checkpoint_path = "bert-movie_classifier_with_metadata\checkpoint-560\model.safetensors"

model = BERTWithMetadata("bert-base-uncased", num_metadata_features=eval_metadata.shape[1], num_labels=2, class_weights=class_weights)
load_model(model, checkpoint_path)
tokenizer = AutoTokenizer.from_pretrained("bert-movie_classifier_with_metadata\checkpoint-560")
model.eval()

  checkpoint_path = "bert-movie_classifier_with_metadata\checkpoint-560\model.safetensors"
  tokenizer = AutoTokenizer.from_pretrained("bert-movie_classifier_with_metadata\checkpoint-560")
  self.register_buffer("class_weights", torch.tensor(class_weights, dtype=torch.float))


BERTWithMetadata(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [170]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [171]:
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids
print(y_pred)

100%|██████████| 25/25 [00:00<00:00, 52.53it/s]

[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 1 0 1 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0
 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0]





In [172]:
f1 = f1_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)
print(f"F1-score:  {f1:.4f}")
print("\nConfusion Matrix:\n", cm)

print(classification_report(y_true, y_pred, digits=4))

F1-score:  0.4478

Confusion Matrix:
 [[148  26]
 [ 11  15]]
              precision    recall  f1-score   support

           0     0.9308    0.8506    0.8889       174
           1     0.3659    0.5769    0.4478        26

    accuracy                         0.8150       200
   macro avg     0.6483    0.7137    0.6683       200
weighted avg     0.8574    0.8150    0.8315       200

