In [None]:
import os
import warnings
import pandas as pd
import torch
import numpy as np
import torch.nn.functional as F
from transformers import BertForSequenceClassification, BertTokenizer
warnings.filterwarnings('ignore')
import os
# Move the working directory up one level; presumably this is so the relative
# "./data/..." paths used below resolve (verify against the repo layout).
# Guarded by a sentinel so that re-running this cell in the same kernel does
# not climb yet another directory and silently break those paths.
if "_NOTEBOOK_ROOT" not in globals():
    os.chdir("..")
    _NOTEBOOK_ROOT = os.getcwd()

# JSON file read into the DataFrame that inference runs over.
DATA = "./data/texts.json"

# Pretrained checkpoint name used for the tokenizer and as the default
# architecture for every classifier.
MODEL = "bert-base-uncased"

# Checkpoint key -> DataFrame column whose text that checkpoint classifies.
# The key is also the file stem of the weights file ("./data/<key>.pth").
MODELS = {
    "RESPONSE_1": "1. Your company/vision/mission in a tweet ! *",
    "RESPONSE_4": "4. Which problem does your company solve? *",
    "RESPONSE_5": "5. What sparked the founder(s) to set up the company? *",
    "RESPONSE_14": "14. Which business area in the last year consumed most of your management attention? *",
    "RESPONSE_16": "16. After your next financing round, will the founders jointly hold above or below 50% of the",
    "RESPONSE_18": "18. How do you integrate ESG standards in your business model? *",
    "RESPONSE_19": "19. When scaling the company, how do you ensure that the corporate culture does not suffer and",
    "Description": "Description",
    "Difference": "Difference",
}

In [None]:
def load_model(model_path, model_name=MODEL, num_labels=2):
    """Build a BERT sequence classifier and load fine-tuned weights into it.

    Args:
        model_path: Path to a state dict file readable by ``torch.load``.
        model_name: Pretrained checkpoint name or path used to instantiate
            the architecture before the fine-tuned weights are applied.
        num_labels: Size of the classification head.

    Returns:
        The ``BertForSequenceClassification`` instance, switched to
        evaluation mode.
    """
    model = BertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        output_attentions=False,
        output_hidden_states=False,
    )
    # map_location="cpu" lets a checkpoint saved from a GPU load on a
    # CPU-only machine; without it torch tries to restore tensors onto the
    # device they were saved from and fails when CUDA is unavailable.
    # NOTE(security): torch.load unpickles the file -- only load checkpoints
    # that come from a trusted source.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()  # Disable dropout etc. for inference
    return model

In [None]:
class FTGAInference():
    """Scores DataFrame text columns with one fine-tuned BERT classifier each.

    One model is loaded per key of the module-level ``MODELS`` mapping; the
    mapping's value names the DataFrame column that model reads.
    """

    # Score returned when the classifier predicts class 0, and for any other class.
    SCORE_CLASS_0 = -1
    SCORE_OTHER = 6
    # Every input is padded/truncated to exactly this many tokens.
    MAX_LENGTH = 128

    def __init__(self, model_dir="./data/") -> None:
        """Load the tokenizer and every checkpoint listed in ``MODELS``.

        Args:
            model_dir: Directory containing one ``<key>.pth`` state dict per
                ``MODELS`` key. Defaults to the original hard-coded location.
        """
        self.tokenizer = BertTokenizer.from_pretrained(MODEL)
        self.models = {
            key: load_model(os.path.join(model_dir, key + ".pth"))
            for key in MODELS
        }

    def inference(self, df, label="predict"):
        """Add per-model and aggregate prediction columns to ``df``.

        For every loaded model a ``<label>_<key>`` column is written, followed
        by ``<label>_total_sum`` (row-wise sum of those columns) and
        ``<label>_average`` (the sum divided by the number of models, rounded
        and cast to int).

        Args:
            df: DataFrame containing every column named in ``MODELS`` values.
                It is modified in place.
            label: Prefix for the generated column names.

        Returns:
            The same DataFrame object, with the new columns added.
        """
        score_columns = []
        for key, model in self.models.items():
            column = label + "_" + key
            df[column] = df[MODELS[key]].apply(lambda text: self.predict(text, model))
            score_columns.append(column)

        df[label + "_total_sum"] = df[score_columns].sum(axis=1)
        df[label + "_average"] = (
            df[label + "_total_sum"] / len(self.models)
        ).round().astype(int)
        return df

    def prepare_input(self, text):
        """Tokenize ``text`` into fixed-length tensors.

        Args:
            text: The text to encode.

        Returns:
            Tuple ``(input_ids, attention_mask)`` of PyTorch tensors as
            produced by the tokenizer with ``return_tensors='pt'``.
        """
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.MAX_LENGTH,
            # `pad_to_max_length` is deprecated; `padding="max_length"` is its
            # replacement. Truncation must then be requested explicitly: the
            # implicit "max_length implies truncation" fallback only applies
            # when no padding strategy is given, and an untruncated long text
            # would exceed the fixed input length.
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'  # Return PyTorch tensors
        )
        return inputs['input_ids'], inputs['attention_mask']

    def predict(self, text, model):
        """Classify ``text`` with ``model`` and map the class to a score.

        Args:
            text: The text to classify.
            model: A classifier whose output exposes ``.logits``.

        Returns:
            ``-1`` if the argmax class is 0, otherwise ``6``.
        """
        model.eval()  # Make sure model is in eval mode for inference
        input_ids, attention_mask = self.prepare_input(text)
        with torch.no_grad():  # No need to track gradients for inference
            outputs = model(input_ids, attention_mask=attention_mask)
            # .item() assumes a single prediction, i.e. one text per call.
            predicted_class = torch.argmax(outputs.logits, dim=-1).item()

        if predicted_class == 0:
            return self.SCORE_CLASS_0
        return self.SCORE_OTHER

In [None]:
# Build the inference helper once: its constructor loads the tokenizer and
# one checkpoint per MODELS entry, so this is the expensive step.
Inference = FTGAInference()

In [None]:
# Read the input records and attach the per-model, total-sum and average
# prediction columns in one step (inference() returns the frame it was given).
df = Inference.inference(pd.read_json(DATA))


In [None]:
# Display the rounded average score alongside the Filename column.
df[["Filename", "predict_average"]]