In [None]:
import os

# Work from the parent directory so the relative "./FTGA" and "./data" paths
# used in this notebook resolve. The guard makes the cell safe to re-run:
# without it every execution would climb one directory higher.
if not os.path.exists("./FTGA"):
    os.chdir("..")

# Placeholder value only — never commit a real key. setdefault keeps a key that
# is already exported in the environment instead of overwriting it.
# NOTE(review): the OpenAI client created later takes no explicit key; confirm
# that "OPENAI_KEY" is the variable name it (or the dataset script) reads.
os.environ.setdefault('OPENAI_KEY', 'sk-')

from datasets import load_dataset

# trust_remote_code=True lets the local dataset loading script execute.
dataset = load_dataset("./FTGA", "2024", trust_remote_code=True)


In [None]:
# Select the "valid" split of the loaded dataset and display its summary.
val = dataset["valid"]
val

In [None]:
# Preview only the first few records: printing the whole split floods the
# notebook output and stores every record in the saved notebook.
PREVIEW_ROWS = 5
for i, v in enumerate(val):
    if i >= PREVIEW_ROWS:
        break
    print(v)

In [None]:
import os
import warnings
import pandas as pd
import torch
import numpy as np
import torch.nn.functional as F
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import logging
# Only show transformers log messages at error level.
logging.set_verbosity_error()
# Suppress every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Path to the preprocessed data file.
DATA = "./data/preprocessed.json"
# Name of the pretrained BERT checkpoint used for the tokenizer and as the
# base of every fine-tuned classifier.
MODEL = 'bert-base-multilingual-uncased'
# Maps a model key to the DataFrame column (question text) it rates.
# The key is also the artifact name: "./data/<key>.pth" holds the BERT weights
# and "./data/<key>_rf.joblib" the second classifier.
# Insertion order matters: downstream cells iterate this dict to build columns.
MODELS = {
    "RESPONSE_1" : "Your company/vision/mission in a tweet !",
    "RESPONSE_4" : "Which problem does your company solve?",
    "RESPONSE_5" : "What sparked the founder(s) to set up the company?",
    "RESPONSE_6" : "Please describe the business model of your company, including the main sources of revenue.",
    "RESPONSE_7" : "What is your USP versus traditional competitors or in the digital sphere?",
    "RESPONSE_8" : "Which target group(s) in which markets do you address primarily?",
    "RESPONSE_12" : "Please provide a brief competition landscape for your core markets.",
    "RESPONSE_13" : "Did you pivot in the past and why?",
    "RESPONSE_14" : "Which business area in the last year consumed most of your management attention?",
    "RESPONSE_15" : "What protects your business from new market entrants or copycats?",
    "RESPONSE_16" : "After your next financing round, will the founders jointly hold above or below 50% of the shares?",
    "RESPONSE_18" : "How do you integrate ESG standards in your business model?",
    "RESPONSE_19" : "When scaling the company, how do you ensure that the corporate culture does not suffer and that the shared values and goals remain intact?",
    "Difference" : "Differentiators",
}

In [None]:
def load_bert_model(model_path, model_name=MODEL, num_labels=2):
    """Build a BERT sequence classifier and load fine-tuned weights into it.

    Args:
        model_path: Path to a state dict saved with ``torch.save``.
        model_name: Pretrained checkpoint that provides the architecture.
        num_labels: Number of output classes of the classification head.

    Returns:
        The model with the weights loaded, switched to evaluation mode.
    """
    model = BertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        output_attentions=False,
        output_hidden_states=False,
    )
    # map_location="cpu" lets a checkpoint saved on a GPU load on a machine
    # without CUDA; load_state_dict then copies the values into the model.
    # NOTE: torch.load unpickles the file — only load checkpoints you trust.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()  # Put the model in evaluation mode
    return model

from joblib import load

def load_rf_model(model_path):
    """Deserialise and return the joblib artifact stored at ``model_path``.

    NOTE: joblib files are pickle-based, so only load files from a trusted source.
    """
    return load(model_path)

In [None]:
from dotenv import load_dotenv
from tqdm import tqdm
from openai import OpenAI

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Module-level OpenAI client; FTGAInference.get_embedding sends its embedding requests through it.
# NOTE(review): no key is passed explicitly, so it presumably comes from the environment — confirm the variable name matches what this notebook sets.
client = OpenAI()

class FTGAInference():
    """Rate questionnaire answers with two classifiers per question.

    For every key in ``MODELS`` a fine-tuned BERT classifier
    (``./data/<key>.pth``) and a joblib-serialised classifier
    (``./data/<key>_rf.joblib``, fed with OpenAI text embeddings) are loaded.
    ``inference`` applies each pair to the DataFrame column named by
    ``MODELS[key]`` and adds per-question scores plus aggregate columns.
    """

    # Score returned when a classifier predicts class 0 / any other class.
    NEGATIVE_SCORE = -2
    POSITIVE_SCORE = 7
    # Weights used to blend the two averaged scores into the final rating.
    BERT_WEIGHT = 0.2
    RF_WEIGHT = 0.8
    # Every answer is padded or truncated to this many tokens for BERT.
    MAX_LENGTH = 128

    def __init__(self) -> None:
        """Load the tokenizer and both classifiers for every key in ``MODELS``."""
        self.tokenizer = BertTokenizer.from_pretrained(MODEL)
        self.bert_models = {}
        self.rf_models = {}
        # Iterating the dict itself (not iter(...)) lets tqdm show the total.
        for key in tqdm(MODELS, desc="Loading models"):
            self.bert_models[key] = load_bert_model("./data/" + key + ".pth")
            self.rf_models[key] = load_rf_model("./data/" + key + "_rf.joblib")

    def inference(self, df, label="predict"):
        """Score every question column of ``df`` and add aggregate ratings.

        Args:
            df: DataFrame with one column per question text in ``MODELS``.
                It is modified in place.
            label: Prefix for every column that is added.

        Returns:
            The same DataFrame with these columns added:
            ``<label>_bert_<key>`` and ``<label>_rf_<key>`` per question,
            ``<label>_total_sum_bert`` / ``<label>_total_sum_rf``,
            ``<label>_average_bert`` / ``<label>_average_rf`` (sum divided by
            the number of models, rounded, floored at 1) and
            ``<label>_weighted_average`` (blend of the two averages, rounded,
            floored at 1).
        """
        for key, bert_model in self.bert_models.items():
            df[label + "_bert_" + key] = df[MODELS[key]].apply(lambda x: self.predict_bert(x, bert_model))

        for key, rf_model in self.rf_models.items():
            df[label + "_rf_" + key] = df[MODELS[key]].apply(lambda x: self.predict_rf(x, rf_model))

        bert_model_keys = [label + "_bert_" + key for key in self.bert_models]
        df[label + "_total_sum_bert"] = df[bert_model_keys].sum(axis=1)

        rf_model_keys = [label + "_rf_" + key for key in self.rf_models]
        df[label + "_total_sum_rf"] = df[rf_model_keys].sum(axis=1)

        # The divisor is the number of models, not the number of answered
        # questions, so unanswered questions pull the average down.
        df[label + "_average_bert"] = (df[label + "_total_sum_bert"] / len(self.bert_models)).round().astype(int).clip(lower=1)
        df[label + "_average_rf"] = (df[label + "_total_sum_rf"] / len(self.rf_models)).round().astype(int).clip(lower=1)

        blended = self.BERT_WEIGHT * df[label + "_average_bert"] + self.RF_WEIGHT * df[label + "_average_rf"]
        df[label + "_weighted_average"] = blended.round().astype(int).clip(lower=1)
        return df

    def prepare_input(self, text):
        """Tokenize ``text`` for BERT.

        Returns:
            ``(input_ids, attention_mask)`` as PyTorch tensors, padded or
            truncated to ``MAX_LENGTH`` tokens.
        """
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.MAX_LENGTH,
            # Explicit replacements for the deprecated pad_to_max_length flag;
            # truncation keeps long answers within max_length.
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'  # Return PyTorch tensors
        )
        return inputs['input_ids'], inputs['attention_mask']

    @staticmethod
    def _is_missing(text):
        """Return True for None, NaN or any other non-string, and for ""."""
        return not isinstance(text, str) or len(text) < 1

    def _score(self, predicted_class):
        """Map a predicted class to its score: class 0 -> -2, otherwise 7."""
        return self.NEGATIVE_SCORE if predicted_class == 0 else self.POSITIVE_SCORE

    def predict_bert(self, text, model):
        """Score one answer with a BERT classifier.

        Returns:
            ``None`` when the answer is missing, else -2 for class 0 and 7
            for any other predicted class.
        """
        if self._is_missing(text):
            return None
        model.eval()  # Make sure model is in eval mode for inference
        input_ids, attention_mask = self.prepare_input(text)
        with torch.no_grad():  # No need to track gradients for inference
            outputs = model(input_ids, attention_mask=attention_mask)
            # argmax over the class dimension gives the predicted class index
            predicted_class = torch.argmax(outputs.logits, dim=-1).item()
        return self._score(predicted_class)

    def get_embedding(self, text, model="text-embedding-3-small"):
        """Return the OpenAI embedding of ``text`` (newlines replaced by spaces).

        Uses the module-level ``client``; one API request per call.
        """
        text = text.replace("\n", " ")
        response = client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding

    def predict_rf(self, text, model):
        """Score one answer with an embedding-based classifier.

        Returns:
            ``None`` when the answer is missing, else -2 for class 0 and 7
            for any other predicted class.
        """
        if self._is_missing(text):
            return None
        embedding = np.array(self.get_embedding(text)).reshape(1, -1)
        prediction = model.predict(embedding)
        # predict() returns one label per input row; exactly one row was passed.
        predicted_class = np.asarray(prediction).ravel()[0]
        return self._score(predicted_class)

In [None]:
# Instantiate the scorer; the constructor loads the tokenizer and both models for every key in MODELS.
Inference = FTGAInference()

In [None]:
# Convert the validation split to a DataFrame and add all prediction columns.
# Each non-empty answer triggers one OpenAI embedding request via predict_rf.
df = Inference.inference(pd.DataFrame(val))

In [None]:
# Preview the first rows, including the added prediction columns.
df.head()

In [None]:
# Mean of the blended rating across all rows.
df[["predict_weighted_average"]].mean()

In [None]:
# Lift pandas' row and column display limits; these options are global and stay in effect for later cells.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Show the averaged BERT score, averaged RF score and blended rating per startup.
df[["Startup/Person Name", "predict_average_bert", "predict_average_rf", "predict_weighted_average"]]

In [None]:
# Columns for the website table: the name column, every question column listed
# in MODELS, and every column whose name starts with "predict".
question_columns = list(MODELS.values())
prediction_columns = [p for p in df.columns if p.startswith("predict")]
columns_to_keep = ["Startup/Person Name", *question_columns, *prediction_columns]

df_website = df[columns_to_keep]
df_website.head()

In [None]:
# Turn every kept column into text (this yields a new frame).
df_website = df_website.astype(str)

# Identifiers of the questions that have both an RF and a BERT prediction column.
questions_rated = [1, 4, 5, 6, 7, 8, 12, 13, 14, 15, 16, 18, 19, "Difference"]

rating_columns_to_drop = []
for rating_number in questions_rated:
    # Numeric ids map to "RESPONSE_<n>"; "Difference" is used as it is.
    model_key = rating_number if rating_number == "Difference" else f"RESPONSE_{rating_number}"
    rf_rating = f"predict_rf_{model_key}"
    bert_rating = f"predict_bert_{model_key}"
    # Averaged from the numeric frame `df`, because df_website now holds strings.
    df_website[f"{model_key}_rating"] = (df[rf_rating] + df[bert_rating]) / 2
    rating_columns_to_drop.extend([rf_rating, bert_rating])

# The per-model prediction columns are replaced by their mean, so remove them.
df_website = df_website.drop(columns=rating_columns_to_drop)
df_website.head()

In [None]:
# Copy the "Startup/Person Name" column into a new "Startup Name" column (values are unchanged)
df_website['Startup Name'] = df_website['Startup/Person Name']

# Drop the original "Startup/Person Name" column
df_website.drop(columns=['Startup/Person Name'], inplace=True)

In [None]:
# For every question: move the answers into "Answer <n>" and store the question
# text in "Question <n>". <n> is the part after the first "_" of the key; keys
# without "_" (i.e. "Difference") get number 0.
for key, question in MODELS.items():
    key_parts = key.split("_")
    number = key_parts[1] if len(key_parts) > 1 else "0"
    df_website[f"Answer {number}"] = df_website[question]
    df_website = df_website.drop(columns=[question])
    df_website[f"Question {number}"] = question


In [None]:
# Preview the table after the Answer/Question columns were created.
df_website.head()

In [None]:
# Create new rating columns with renamed names and copy content from the old columns
df_website['Rating 1'] = df_website['RESPONSE_1_rating']
df_website['Rating 4'] = df_website['RESPONSE_4_rating']
df_website['Rating 5'] = df_website['RESPONSE_5_rating']
df_website['Rating 6'] = df_website['RESPONSE_6_rating']
df_website['Rating 7'] = df_website['RESPONSE_7_rating']
df_website['Rating 8'] = df_website['RESPONSE_8_rating']
df_website['Rating 12'] = df_website['RESPONSE_12_rating']
df_website['Rating 13'] = df_website['RESPONSE_13_rating']
df_website['Rating 14'] = df_website['RESPONSE_14_rating']
df_website['Rating 15'] = df_website['RESPONSE_15_rating']
df_website['Rating 16'] = df_website['RESPONSE_16_rating']
df_website['Rating 18'] = df_website['RESPONSE_18_rating']
df_website['Rating 19'] = df_website['RESPONSE_19_rating']
df_website['Rating 0'] = df_website['Difference_rating']
df_website["Rating"] = df_website["predict_weighted_average"]

# Drop the original rating columns
df_website.drop(columns=[
    'RESPONSE_1_rating',
    'RESPONSE_4_rating',
    'RESPONSE_5_rating',
    'RESPONSE_6_rating',
    'RESPONSE_7_rating',
    'RESPONSE_8_rating',
    'RESPONSE_12_rating',
    'RESPONSE_13_rating',
    'RESPONSE_14_rating',
    'RESPONSE_15_rating',
    'RESPONSE_16_rating',
    'RESPONSE_18_rating',
    'RESPONSE_19_rating',
    'Difference_rating'
], inplace=True)

In [None]:
# Preview the table after the rating columns were renamed.
df_website.head()

In [None]:
# Question numbers present in the table, in ascending order.
numbers = sorted(
    int(col.split(" ")[1]) for col in df_website.columns if col.startswith("Question")
)

# Final layout: name, overall rating, then Question/Answer/Rating per number.
desired_order = ["Startup Name", "Rating"]
for n in numbers:
    desired_order += [f"Question {n}", f"Answer {n}", f"Rating {n}"]

# Reorder the columns; columns not named in desired_order are left out.
df_website = df_website.reindex(columns=desired_order)

In [None]:

# Preview the table in its final column order.
df_website.head()

In [None]:
# Write the website table to "ai_rating.xlsx" in the current working directory.
# NOTE(review): no index argument is passed, so the row index is presumably exported as well — confirm that is wanted.
df_website.to_excel("ai_rating.xlsx")

In [None]:
# Count the rows whose overall rating is 1. The comparison uses the string "1"
# because "Rating" was copied from a column converted with astype(str).
len(df_website[df_website["Rating"] == "1"])