## Import various Packages

In [32]:
import transformers
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from transformers import TextClassificationPipeline

import tensorflow as tf
import pandas as pd
import json
import gc

from sklearn.model_selection import train_test_split

import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopw = stopwords.words('english')

import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import iplot

from tqdm import tqdm

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bennettbishop/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Preprocess Data

### Import Data Set

In [88]:
root_path = 'data/full_dataset.csv'
df = pd.read_csv(root_path)
df.head()

Unnamed: 0,request,category,stratify_col
0,I’m looking for some information regarding pay...,HR,HR__payroll
1,I’m looking for some information regarding lea...,HR,HR__leave policy
2,I’m looking for some information regarding emp...,HR,HR__employee handbook
3,I’m looking for some information regarding job...,HR,HR__job application
4,I’m looking for some information regarding ben...,HR,HR__benefits enrollment


### Split into Train, Validation, Test using Stratified_Col

In [89]:
# Encode the 'category' column into numerical labels
df['encoded_text'] = df['category'].astype('category').cat.codes
print(f"Unique categories: {df['category'].nunique()}")
print(f"Encoded labels: {df['encoded_text'].unique()}")

# Separate columns for splitting
data_texts = df['request'].to_list()  # 'request' is the text data
data_labels = df['encoded_text'].to_list()  # Encoded class labels
stratify_values = df['stratify_col'].to_list()  # Stratification column

# Split the data into Train/Validation sets with stratification
train_texts, val_texts, train_labels, val_labels, train_stratify, val_stratify = train_test_split(
    data_texts, data_labels, stratify_values, 
    test_size=0.2, stratify=stratify_values, random_state=0
)

# Split the Train set further into Train/Test with stratification
train_texts, test_texts, train_labels, test_labels = train_test_split(
    train_texts, train_labels, 
    test_size=0.1, stratify=train_stratify, random_state=0
)

Unique categories: 5
Encoded labels: [2 1 3 4 0]


### Optional: Save test/train/val to csv

In [92]:
# Create DataFrames for each split
train_df = pd.DataFrame({
    'request': train_texts,
    'label': train_labels
})

val_df = pd.DataFrame({
    'request': val_texts,
    'label': val_labels
})

test_df = pd.DataFrame({
    'request': test_texts,
    'label': test_labels
})

# Save DataFrames to CSV files
train_df.to_csv("data/train.csv", index=False)
val_df.to_csv("data/validation.csv", index=False)
test_df.to_csv("data/test.csv", index=False)


Data splits saved as train.csv, validation.csv, and test.csv


### Check Out Split Sets

In [91]:
# Map numerical labels back to category names
label_mapping = dict(enumerate(df['category'].astype('category').cat.categories))
print("\nLabel Mapping (Encoded -> Category):")
for encoded, category in label_mapping.items():
    print(f"{encoded}: {category}")


print(f"Train set size after second split: {len(train_texts)}")
print(f"Test set size: {len(test_texts)}")
print(f"Example train_texts: {train_texts[1]}") 
print(f"Example train_labels: {train_labels[1]}")
print(f"Example val_texts: {val_texts[1]}") 
print(f"Example val_labels: {val_labels[1]}")
print(f"Example test_texts: {test_texts[1]}") 
print(f"Example test_labels: {test_labels[1]}")

# Output dataset information
print("\nFinal dataset information:")
print(f"Train set size: {len(train_texts)}")
print(f"Validation set size: {len(val_texts)}")
print(f"Test set size: {len(test_texts)}")


Label Mapping (Encoded -> Category):
0: Facilities Management
1: Finance
2: HR
3: IT Support
4: Marketing
Train set size after second split: 3596
Test set size: 400
Example train_texts: Do you have the latest version of the diversity and inclusion policies handbook? I need it for a new hire orientation.
Example train_labels: 2
Example val_texts: Need access to server maintenance.
Example val_labels: 3
Example test_texts: Could you review and approve the budget variance analysis for this month? Let me know if there’s anything I need to address.
Example test_labels: 1

Final dataset information:
Train set size: 3596
Validation set size: 1000
Test set size: 400


KeyError: 'count'

<Figure size 800x800 with 0 Axes>