In [None]:
import pandas as pd
import numpy as np
import os
# Notebook-wide configuration. All paths are relative to the project root.
DATA = "./data/preprocessed.json"
MODEL = "bert-base-uncased"

# Move up to the parent directory only while the data file is not reachable
# from the current working directory. An unconditional chdir("..") climbs one
# level further every time this cell is re-run, breaking every relative path.
if not os.path.exists(DATA):
    os.chdir("..")

# Maps a short response key (used for output file names) to the name of the
# column in the dataset that holds the answers to that question.
# NOTE(review): the entries for RESPONSE_16 and RESPONSE_19 end mid-sentence —
# presumably they mirror truncated column names in the data; confirm.
responses = {
    "RESPONSE_1" : "1. Your company/vision/mission in a tweet ! *",
    "RESPONSE_4" : "4. Which problem does your company solve? *",
    "RESPONSE_5" : "5. What sparked the founder(s) to set up the company? *",
    "RESPONSE_6" : "6. Please describe the business model of your company, including the main sources of revenue. *",
    "RESPONSE_7" : "7. What is your USP versus traditional competitors or those in the digital world? *",
    "RESPONSE_8" : "8. Which target group(s) in which markets do you address primarily? *",
    "RESPONSE_12" : "12. Please provide a brief competition landscape for your core markets. *",
    "RESPONSE_13" : "13. Did you pivot in the past and why?",
    "RESPONSE_14" : "14. Which business area in the last year consumed most of your management attention? *",
    "RESPONSE_15" : "15. What protects your business from new market entrants or copycats? *",
    "RESPONSE_16" : "16. After your next financing round, will the founders jointly hold above or below 50% of the",
    "RESPONSE_18" : "18. How do you integrate ESG standards in your business model? *",
    "RESPONSE_19" : "19. When scaling the company, how do you ensure that the corporate culture does not suffer and",
    "Description":"Description",
    "Difference":"Difference",
}

In [None]:
import torch
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, logging
from torch.utils.data import DataLoader, random_split, TensorDataset, RandomSampler, SequentialSampler
from tqdm.auto import tqdm, trange
import pandas as pd
import matplotlib.pyplot as plt
# `logging` here is the transformers logging module imported above (not the
# standard-library one); restrict its output to errors only.
logging.set_verbosity_error()

def trainModel(text, labels, MODEL='bert-base-multilingual-uncased', patience=3):
    """Fine-tune a BERT sequence classifier and return its training history.

    The inputs are tokenized (truncated/padded to 128 tokens), split 90/10
    into training and validation subsets, and trained for at most 16 epochs
    with AdamW (lr=2e-5) and a linear schedule without warm-up. After every
    epoch the validation loss is compared with the best value seen so far:
    an improvement writes the weights to ``best_model.pth`` in the current
    working directory, and ``patience`` consecutive epochs without
    improvement stop the training early. Before returning, the best
    checkpoint is loaded back into the model.

    Args:
        text: Iterable of input strings, one per sample.
        labels: Iterable of class labels aligned with ``text``. The model is
            created with ``num_labels=2``, so values are presumably 0/1 —
            confirm against the data.
        MODEL: Name or path of the pretrained BERT checkpoint to load.
            NOTE(review): this default differs from the notebook-level
            ``MODEL`` constant ("bert-base-uncased") — confirm which one is
            intended.
        patience: Number of consecutive epochs without validation-loss
            improvement after which training stops.

    Returns:
        A tuple ``(stats, model)``: ``stats`` is a DataFrame indexed by
        ``epoch`` with the columns ``Training Loss``, ``Validation Loss`` and
        ``Validation Accuracy``; ``model`` is the fine-tuned model carrying
        the weights of the epoch with the lowest validation loss.
    """
    # Resolve the device once so the model and every batch end up on the same
    # one; an unconditional .cuda() fails on machines without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = BertTokenizer.from_pretrained(MODEL, do_lower_case=True)

    # Tokenize all samples in one call. padding='max_length' replaces the
    # deprecated pad_to_max_length=True and yields (n_samples, 128) tensors.
    encoded = tokenizer(
        list(text), add_special_tokens=True, truncation=True, max_length=128,
        padding='max_length', return_attention_mask=True, return_tensors='pt'
    )
    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']
    # The classification loss needs integer class indices; the cast also
    # accepts boolean or 0.0/1.0 float labels.
    labels = torch.as_tensor(labels).long()

    dataset = TensorDataset(input_ids, attention_masks, labels)
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    batch_size = 64
    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
    validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

    model = BertForSequenceClassification.from_pretrained(MODEL, num_labels=2, output_attentions=False, output_hidden_states=False)
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5)
    epochs = 16
    # The scheduler is stepped once per batch, so it needs the total batch count.
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    checkpoint_path = 'best_model.pth'
    checkpoint_saved = False

    training_stats = []
    best_val_loss = float('inf')
    no_improve_epoch = 0

    for epoch_i in range(epochs):
        # ---- training pass ----
        model.train()
        total_train_loss = 0

        for step, batch in enumerate(train_dataloader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()
            res = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            total_train_loss += res.loss.item()
            res.loss.backward()
            # Clip the gradient norm to 1.0 before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)

        # ---- validation pass ----
        model.eval()
        total_eval_loss = 0
        total_eval_accuracy = 0

        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            with torch.no_grad():
                res = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

            total_eval_loss += res.loss.item()
            logits = res.logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            # Per-batch accuracy; averaged over the number of batches below.
            total_eval_accuracy += np.sum(np.argmax(logits, axis=1).flatten() == label_ids.flatten()) / len(label_ids)

        avg_val_loss = total_eval_loss / len(validation_dataloader)
        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)

        training_stats.append({
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Validation Loss': avg_val_loss,
            'Validation Accuracy': avg_val_accuracy
        })

        # Checkpoint model if validation loss improves
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
            no_improve_epoch = 0
        else:
            no_improve_epoch += 1

        # Early stopping
        if no_improve_epoch >= patience:
            break

    # Return the best-scoring weights: without this reload the caller would
    # receive the last epoch's weights, which after early stopping are
    # `patience` epochs past the best checkpoint.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    return pd.DataFrame(training_stats).set_index('epoch'), model

In [None]:
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from openai import OpenAI
from tqdm import tqdm
import numpy as np
from dotenv import load_dotenv
from joblib import dump, load

# Load variables from a .env file into the process environment
# (presumably where the OpenAI API key is kept — confirm).
load_dotenv()

# Shared OpenAI client used by get_embedding; it is constructed without
# explicit arguments, so its configuration is expected to come from the
# environment.
client = OpenAI()

def get_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Every newline in ``text`` is replaced by a single space before the
    request is sent through the module-level ``client``.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response
        (presumably a list of floats — confirm against the client library).
    """
    single_line = " ".join(text.split("\n"))
    result = client.embeddings.create(model=model, input=[single_line])
    first_item = result.data[0]
    return first_item.embedding

def trainRF(data, labels):
    """Train a grid-searched random forest on OpenAI embeddings of ``data``.

    Each text is embedded with ``get_embedding``; texts whose embedding
    request fails are reported and dropped together with their label. The
    remaining samples are split 80/20, a ``RandomForestClassifier`` is tuned
    with a 5-fold ``GridSearchCV`` on the training part, and the accuracy of
    the best estimator on the held-out part is printed.

    Args:
        data: Iterable of input texts.
        labels: Iterable of labels aligned with ``data``.

    Returns:
        The best estimator found by the grid search.

    Raises:
        ValueError: If no text could be embedded.
    """
    print("Starting embedding process...")
    embedded_pairs = []
    for text, label in zip(data, labels):
        try:
            embedding = get_embedding(text)
        except Exception as e:
            # Best effort: report the failure and leave this sample out.
            print(f"Error embedding text: {e}")
        else:
            embedded_pairs.append((np.array(embedding), label))

    # Check for emptiness BEFORE unzipping: zip(*[]) yields nothing, so
    # unpacking it into two names fails with an opaque ValueError and the
    # guard would never be reached.
    if not embedded_pairs:
        raise ValueError("No valid embeddings to train on after filtering.")

    embeddings, labels = (list(column) for column in zip(*embedded_pairs))

    print("Splitting data into train and test sets...")
    X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42)

    print("Training and tuning the model...")
    # Seed the forest like the split above so repeated runs are comparable.
    rf = RandomForestClassifier(random_state=42)
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', 0.5, 1]
    }
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy:.2f}")

    return best_model

In [None]:
def plot_training_stats(df_stats):
    """Show the loss curves and the validation accuracy of a training run.

    Draws a figure with two stacked panels: the upper one plots the
    ``Training Loss`` and ``Validation Loss`` columns of ``df_stats``, the
    lower one plots its ``Validation Accuracy`` column. Values are plotted
    against the frame's index.

    Args:
        df_stats: DataFrame containing the three columns named above.
    """
    fig, (loss_ax, accuracy_ax) = plt.subplots(2, 1, figsize=(10, 10))

    # Upper panel: both loss curves share one axis.
    for loss_column in ('Training Loss', 'Validation Loss'):
        loss_ax.plot(df_stats[loss_column], label=loss_column)
    loss_ax.set(xlabel='Epochs', ylabel='Loss', title='Training and Validation Loss')
    loss_ax.legend()

    # Lower panel: validation accuracy only.
    accuracy_column = 'Validation Accuracy'
    accuracy_ax.plot(df_stats[accuracy_column], label=accuracy_column, color='orange')
    accuracy_ax.set(xlabel='Epochs', ylabel='Accuracy', title=accuracy_column)
    accuracy_ax.legend()

    # Tighten the spacing between the panels, then render.
    plt.tight_layout()
    plt.show()

In [None]:
# Train one BERT classifier and one random forest per question and store both
# under ./data/, named after the response key.

# The dataset does not change between questions, so read it once.
raw_df = pd.read_json(DATA)

for response_key, response_question in responses.items():
    # Announce the question before the (long) training starts.
    print(f"Training on data {response_key} : {response_question}")

    # Work on an explicit copy of the two needed columns so the assignment
    # below does not write into a slice of raw_df. Rows without an answer are
    # dropped: a missing value stays NaN after .str.lower() and is not valid
    # tokenizer input.
    df = raw_df[[response_question, 'label']].dropna(subset=[response_question]).copy()
    df[response_question] = df[response_question].str.lower()

    text = df[response_question].values
    labels = df['label'].values

    df_stats, model = trainModel(text, labels)
    plot_training_stats(df_stats)

    # Persist the BERT weights right away so they are not lost if the
    # random-forest stage fails.
    torch.save(model.state_dict(), "./data/" + response_key + ".pth")

    print("Training RF...")
    rf_model = trainRF(text, labels)
    dump(rf_model, "./data/" + response_key + "_rf.joblib")

# Models to use:

- RESPONSE_1 
- RESPONSE_5 
- RESPONSE_6 
- RESPONSE_7 
- RESPONSE_8 
- RESPONSE_12
- RESPONSE_14
- RESPONSE_15
- RESPONSE_18