In [32]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder


In [33]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [34]:
path = '/Users/aya/Dextract_DL/data/processed_pitch_data.csv'
data = pd.read_csv(path).drop('Unnamed: 0', axis=1)
data.columns = ['stage', 'url', 'industry', 'text']
print(f'There are {data.shape[0]} datapoints in the dataset.')
data.head()

There are 1799 datapoints in the dataset.


Unnamed: 0,stage,url,industry,text
0,Series D,./BestPitchPDFs/bolt.pdf,Tech,bolt bolt growing scaled global network around...
1,Late Stage,./BestPitchPDFs/spotify.pdf,Entertainment,distributed computer science aes cry language ...
2,Late Stage,./BestPitchPDFs/wework.pdf,Real Estate,space service whore fundamental community cyst...
3,Pre-Seed,./BestPitchPDFs/airbnb.pdf,Travel and Hospitality,welcome breakfast book rather problem price im...
4,Early Stage,./BestPitchPDFs/facebook.pdf,Media and Advertising,ers sie aes oes see classes bee directory user...


In [35]:
nan_indices = data[data['text'].isna()].index
print(f"indices of rows with NaN values in the 'text' column: {list(nan_indices)}")
#drop these rows
data = data.drop(nan_indices)


indices of rows with NaN values in the 'text' column: [376, 436, 869, 871, 873, 877, 929, 935, 963, 976, 977, 1075, 1081, 1098, 1099, 1118, 1158, 1208, 1223, 1269, 1278, 1284, 1295, 1320, 1328, 1338, 1575, 1619, 1723]


In [36]:
pitch_count = data.industry.value_counts()
pitch_count
low_count_industries = pitch_count[pitch_count < 50].index
# Remove industries with less than 50 pitches
data = data[~data.industry.isin(low_count_industries)]
print(data.industry.value_counts())
print(data.industry.value_counts().index)

Finance and Banking              237
Health Care                      211
Enterprise                       201
Consumer                         193
Data Analytics and Management    166
Tech                             165
Media and Advertising            108
Entertainment                     82
Other                             70
Education                         67
Cybersecurity                     62
Real Estate                       51
Name: industry, dtype: int64
Index(['Finance and Banking', 'Health Care', 'Enterprise', 'Consumer',
       'Data Analytics and Management', 'Tech', 'Media and Advertising',
       'Entertainment', 'Other', 'Education', 'Cybersecurity', 'Real Estate'],
      dtype='object')


In [42]:
# Split data into train and test sets
x_data = data.text.tolist()
y_data = data.industry.tolist()
train, test, y_train_raw, y_test_raw = train_test_split(x_data, y_data, 
                                                test_size=0.2, 
                                                #random_state=42, 
                                                stratify=data.industry)

In [47]:
label_to_id = {
    'Finance and Banking': 0,
    'Health Care': 1,
    'Enterprise': 2,
    'Consumer': 3,
    'Data Analytics and Management': 4,
    'Tech': 5,
    'Media and Advertising': 6,
    'Entertainment': 7,
    'Other': 8,
    'Education': 9,
    'Cybersecurity': 10,
    'Real Estate': 11
}

y_train = [label_to_id[label] for label in y_train_raw]
y_test = [label_to_id[label] for label in y_test_raw]
num_labels = len(np.unique(y_train))

In [48]:
class PitchDeckDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.long),
        }
        

In [49]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

In [50]:
train_dataset = PitchDeckDataset(train, y_train, tokenizer, max_length=256)
test_dataset = PitchDeckDataset(test, y_test, tokenizer, max_length=256)

In [52]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {"accuracy": (preds == labels).mean()}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [53]:
trainer.train()

# Evaluate model
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)



Epoch,Training Loss,Validation Loss,Accuracy
1,1.3453,1.561682,0.541796
2,1.0582,1.33344,0.585139
3,0.9032,1.284782,0.619195


Evaluation results: {'eval_loss': 1.2847822904586792, 'eval_accuracy': 0.6191950464396285, 'eval_runtime': 183.2949, 'eval_samples_per_second': 1.762, 'eval_steps_per_second': 0.224, 'epoch': 3.0}


In [59]:
predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)


In [60]:
print(classification_report(y_test, predicted_labels))

              precision    recall  f1-score   support

           0       0.69      0.77      0.73        48
           1       0.84      0.86      0.85        42
           2       0.56      0.62      0.59        40
           3       0.85      0.74      0.79        39
           4       0.59      0.48      0.53        33
           5       0.25      0.27      0.26        33
           6       0.50      0.55      0.52        22
           7       0.35      0.41      0.38        17
           8       0.73      0.57      0.64        14
           9       0.50      0.46      0.48        13
          10       0.90      0.75      0.82        12
          11       0.86      0.60      0.71        10

    accuracy                           0.62       323
   macro avg       0.63      0.59      0.61       323
weighted avg       0.63      0.62      0.62       323

