In [1]:
"""
Current 22-Jan-2023

My first proper machine learning project!

Takes as input a list of room names and floor areas. Outputs one of 76 ASHRAE space usage categories per room. Useful in automating
 a crucial step when calculating the loads of a building.

 Some tuning of the hyperparameters is still required; the model successfully makes predictions, but they are not great predictions.
 Having more than 700 data points for training and validation would also likely help.
"""

### Dataframe Preparation

In [2]:
space_usage_cats = ['User-Defined',
                    'CORRECTIONAL FACILITY: Booking/waiting',
                    'CORRECTIONAL FACILITY: Cell',
                    'CORRECTIONAL FACILITY: Dayroom',
                    'CORRECTIONAL FACILITY: Guard stations',
                    'EDUCATION: Art classroom',
                    'EDUCATION: Classroom (age 9 plus)',
                    'EDUCATION: Classroom (ages 5-8)',
                    'EDUCATION: Computer Lab',
                    'EDUCATION: Daycare (through age 4)',
                    'EDUCATION: Daycare Sickroom',
                    'EDUCATION: Lecture Classroom',
                    'EDUCATION: Lecture Hall (fixed seats)',
                    'EDUCATION: Media Center',
                    'EDUCATION: Multiuse Assembly',
                    'EDUCATION: Music/theater/dance',
                    'EDUCATION: Science Laboratory',
                    'EDUCATION: University/college Laboratory',
                    'EDUCATION: Wood/metal Shop',
                    'FOOD AND BEVERAGE SERVICE: Bar, Cocktail Lounge',
                    'FOOD AND BEVERAGE SERVICE: Cafeteria/Fast Food Dining',
                    'FOOD AND BEVERAGE SERVICE: Kitchen (cooking)',
                    'FOOD AND BEVERAGE SERVICE: Restaurant Dining Room',
                    'GENERAL: Break Room',
                    'GENERAL: Coffee Station',
                    'GENERAL: Conference/Meeting',
                    'GENERAL: Corridor',
                    'GENERAL: Occupiable Storage Room (liq/gel)',
                    'HOTEL / MOTEL / RESORT / DORM: Barracks sleeping area',
                    'HOTEL / MOTEL / RESORT / DORM: Bedroom/Living Room',
                    'HOTEL / MOTEL / RESORT / DORM: Laundry Room Within Dwelling Unit', # confirm if correct
                    'HOTEL / MOTEL / RESORT / DORM: Laundry Room, Central',
                    'HOTEL / MOTEL / RESORT / DORM: Lobby/Prefunction',
                    'HOTEL / MOTEL / RESORT / DORM: Multipurpose Assembly',
                    'MISCELLANEOUS: Bank or Bank Lobby',
                    'MISCELLANEOUS: Bank Vault/Safe Deposit',
                    'MISCELLANEOUS: Computer (not printing)',
                    'MISCELLANEOUS: Freezer and Refrigerated Spaces (<50°F)',
                    'MISCELLANEOUS: General Manufacturing (EXCLUDES HEAVY INDUSTRIAL AND PROCESSES USING CHEMICALS)',
                    'MISCELLANEOUS: Pharmacy (prep. area)',
                    'MISCELLANEOUS: Photo Studio',
                    'MISCELLANEOUS: Shipping/Receiving',
                    'MISCELLANEOUS: Sorting, Packing, Light Assembly',
                    'MISCELLANEOUS: Telephone Closet',
                    'MISCELLANEOUS: Transportation Waiting',
                    'MISCELLANEOUS: Warehouse',
                    'OFFICE: Breakroom',
                    'OFFICE: Main Entry Lobby',
                    'OFFICE: Occupiable Storage Room for Dry Materials',
                    'OFFICE: Office Space',
                    'OFFICE: Reception Area',
                    'OFFICE: Telephone/Data Entry',
                    'PUBLIC ASSEMBLY: Auditorium Seating Area',
                    'PUBLIC ASSEMBLY: Courtroom',
                    'PUBLIC ASSEMBLY: Legislative Chamber',
                    'PUBLIC ASSEMBLY: Library',
                    'PUBLIC ASSEMBLY: Lobby',
                    'PUBLIC ASSEMBLY: Museum (Children\'s)',
                    'PUBLIC ASSEMBLY: Museum/Gallery',
                    'PUBLIC ASSEMBLY: Place of Religious Worship',
                    'RESIDENTIAL: Common Corridor',
                    'RESIDENTIAL: Dwelling Unit',
                    'RETAIL: Barbershop',
                    'RETAIL: Beauty and Nail Salon',
                    'RETAIL: Coin-operated laundry',
                    'RETAIL: Mall common area',
                    'RETAIL: Pet shop (animal area)',
                    'RETAIL: Sales (except other categories here)',
                    'RETAIL: Supermarket',
                    'SPORTS: Bowling alley (seating)',
                    'SPORTS: Disco/dance floor',
                    'SPORTS: Gambling casino',
                    'SPORTS: Game arcade',
                    'SPORTS: Gym, sports arena (play area)',
                    'SPORTS: Health club/aerobics room',
                    'SPORTS: Health club/weight room',
                    'SPORTS: Spectator area',
                    'SPORTS: Stage, studio',
                    'SPORTS: Swimming (pool & deck)'
                    ]

SPACE_USAGE_CATS = []
for category in space_usage_cats:
    SPACE_USAGE_CATS.append(category.upper())

In [3]:
import pandas as pd
import numpy as np

df = pd.read_pickle('combined_df.pickle')
df.usage_cat = df.usage_cat.str.upper()
df.name = df.name.str.upper()
df.loc[df.usage_cat == 'EDUCATION: CLASSROOM (AGE 9+)', 'usage_cat'] = 'EDUCATION: CLASSROOM (AGE 9 PLUS)'
df.loc[df.usage_cat == 'FOOD SERVICE: CAFETERIA/FAST FOOD', 'usage_cat'] = 'FOOD AND BEVERAGE SERVICE: CAFETERIA/FAST FOOD DINING'
df.loc[df.usage_cat == 'FOOD SERVICE: KITCHEN (COOKING)', 'usage_cat'] = 'FOOD AND BEVERAGE SERVICE: KITCHEN (COOKING)'
df.loc[df.usage_cat == 'FOOD SERVICE: RESTAURANT DINING ROOM', 'usage_cat'] = 'FOOD AND BEVERAGE SERVICE: RESTAURANT DINING ROOM'
df.loc[df.usage_cat == 'EDUCATION: MULTI-USE ASSEMBLY', 'usage_cat'] = 'EDUCATION: MULTIUSE ASSEMBLY'
df.loc[df.usage_cat == 'PUBLIC ASSEMBLY: AUDITORIUM', 'usage_cat'] = 'PUBLIC ASSEMBLY: AUDITORIUM SEATING AREA'
df.loc[df.usage_cat == 'OFFICE: OCCUPIABLE STORAGE ROOM (DRY)', 'usage_cat'] = 'OFFICE: OCCUPIABLE STORAGE ROOM FOR DRY MATERIALS'
df.loc[df.usage_cat == 'MISCELLANEOUS: GENERAL MANUFACTURING', 'usage_cat'] = 'MISCELLANEOUS: GENERAL MANUFACTURING (EXCLUDES HEAVY INDUSTRIAL AND PROCESSES USING CHEMICALS)'


df[df.usage_cat.isin(['PUBLIC ASSEMBLY: AUDITORIUM'])]

Unnamed: 0,name,area,usage_cat


In [4]:
pd.set_option('display.max_colwidth', None)

# Checking to see if each value in the usage_cat column appears letter for letter in the full list of categories
df[df.usage_cat.isin(SPACE_USAGE_CATS) == False]

Unnamed: 0,name,area,usage_cat


In [5]:
# Lists any categories which appear in our training data
data_cats = df.usage_cat.tolist()
present_cats = list(set(data_cats))
present_cats.sort()

# Lists any categories in the full list which do not appear in our training data
missing_cats = list(set(SPACE_USAGE_CATS) - set(present_cats))
missing_cats.sort()

len(present_cats), len(missing_cats), len(SPACE_USAGE_CATS)

(42, 37, 79)

In [6]:
# Create a column in the dataframe containing an integer corresponding to a category
label_dict = dict(zip(SPACE_USAGE_CATS, range(len(SPACE_USAGE_CATS))))
df['usage_cat_integer'] = df.usage_cat.map(label_dict)
df.usage_cat_integer = df.usage_cat_integer.astype(float)
df

Unnamed: 0,name,area,usage_cat,usage_cat_integer
0,000 BASEMENT,1550,USER-DEFINED,0.0
1,100 MAIN FLOOR,1120,OFFICE: OFFICE SPACE,49.0
2,101 INTERVIEW/OW TRUSTEE,105,OFFICE: OFFICE SPACE,49.0
3,201 APARTMENT,323,RESIDENTIAL: DWELLING UNIT,61.0
4,201C BEDROOM,100,RESIDENTIAL: DWELLING UNIT,61.0
...,...,...,...,...
62,CR-03 CORRIDOR,1000,GENERAL: CORRIDOR,26.0
63,CR-05 CORRIDOR,950,GENERAL: CORRIDOR,26.0
64,V-01 VESTIBULE,50,USER-DEFINED,0.0
65,V-03 VESTIBULE,50,USER-DEFINED,0.0


In [7]:
df.describe(include='object')

Unnamed: 0,name,area,usage_cat
count,738,738,738
unique,737,412,42
top,ELECTRICAL ROOM,100,USER-DEFINED
freq,2,28,269


In [8]:
# Creating our input column
df['input'] = 'TEXT1: ' + df.name + '; TEXT2: ' + df.area

### Transformers Dataset Preparation

In [9]:
# Creating a dataset for Transformers to use
from datasets import Dataset,DatasetDict

ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['name', 'area', 'usage_cat', 'usage_cat_integer', 'input', '__index_level_0__'],
    num_rows: 738
})

In [10]:
# Select a pre-trained model and use it to create a tokenizer
model_nm = 'bert-base-uncased'

from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenize = AutoTokenizer.from_pretrained(model_nm)

In [11]:
tokenize.tokenize('This is an absolutely amazing sentence which is getting tokenized right now!!!')

['this',
 'is',
 'an',
 'absolutely',
 'amazing',
 'sentence',
 'which',
 'is',
 'getting',
 'token',
 '##ized',
 'right',
 'now',
 '!',
 '!',
 '!']

In [12]:
# Function to tokenize the 'input' column of our dataframe
def tokenize_fnc(x):
    return tokenize(x['input'], truncation=True, padding=True)

In [13]:
# Tokenize our dataset!
tokenized_ds = ds.map(tokenize_fnc, batched=False)



  0%|          | 0/738 [00:00<?, ?ex/s]

In [14]:
tokenized_ds[0]

{'name': '000 BASEMENT',
 'area': '1550',
 'usage_cat': 'USER-DEFINED',
 'usage_cat_integer': 0.0,
 'input': 'TEXT1: 000 BASEMENT; TEXT2: 1550',
 '__index_level_0__': 0,
 'input_ids': [101,
  3793,
  2487,
  1024,
  2199,
  8102,
  1025,
  3793,
  2475,
  1024,
  26245,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
# Transformers assumes that our labels column is always named 'labels'
tokenized_ds = tokenized_ds.rename_columns({'usage_cat_integer': 'labels'})
tokenized_ds = tokenized_ds.remove_columns(['input', 'name', 'area', 'usage_cat'])

columns_to_return = ['input_ids', 'labels', 'attention_mask', 'token_type_ids']
tokenized_ds.set_format(type='torch', columns=columns_to_return)
tokenized_ds

Dataset({
    features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 738
})

In [16]:
tokenized_ds.format['type']

'torch'

In [17]:
# Split into training and validation sets. 75% of data will be training data and
# 25% will be validation data
dds = tokenized_ds.train_test_split(0.25)
dds

DatasetDict({
    train: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 553
    })
    test: Dataset({
        features: ['labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 185
    })
})

In [18]:
!pip install evaluate
import evaluate

metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0m

### Training

In [19]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Hyperparameters
batch_size = 4
epochs = 8
learn_rate = 5e-4

In [20]:
args = TrainingArguments('outputs', learning_rate=learn_rate, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    evaluation_strategy="epoch", per_device_train_batch_size=batch_size, per_device_eval_batch_size=batch_size*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none')

In [21]:
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
data_collator = DataCollatorWithPadding(tokenizer=tokenize)

trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'], data_collator=data_collator,
                  tokenizer=tokenize, compute_metrics=compute_metrics)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [22]:
data_collator = trainer.get_train_dataloader().collate_fn
data_collator

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


DataCollatorWithPadding(tokenizer=PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [23]:
import torch
torch.cuda.empty_cache()

import gc
gc.collect()

trainer.train()
# trainer.evaluate()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 553
  Num Epochs = 8
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1112


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,463.718842,0.324324
2,No log,406.585968,0.324324
3,No log,396.069977,0.324324
4,455.493500,358.036743,0.324324
5,455.493500,388.309875,0.324324
6,455.493500,364.519012,0.324324
7,455.493500,360.052948,0.324324
8,279.248300,364.001099,0.324324


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 185
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 185
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Runn

TrainOutput(global_step=1112, training_loss=356.21151711443343, metrics={'train_runtime': 224.6861, 'train_samples_per_second': 19.69, 'train_steps_per_second': 4.949, 'total_flos': 39239681031594.0, 'train_loss': 356.21151711443343, 'epoch': 8.0})

### Testing Outputs of our Trained Model

In [24]:
# Figure out a way to save the model for later use on local computer. torch.save doesn't work??

In [25]:
# Prepping a dataframe with sample inputs
room_info = [['Lobby', 300], ['Staff Office', 220], ['Vestibule', 100]]

pred_df = pd.DataFrame(room_info, columns=['name', 'area'])
pred_df.name = pred_df.name.str.upper()
pred_df.area = pred_df.area.astype(str)
pred_df['input'] = 'TEXT1: ' + pred_df.name + '; TEXT2: ' + pred_df.area
pred_df


Unnamed: 0,name,area,input
0,LOBBY,300,TEXT1: LOBBY; TEXT2: 300
1,STAFF OFFICE,220,TEXT1: STAFF OFFICE; TEXT2: 220
2,VESTIBULE,100,TEXT1: VESTIBULE; TEXT2: 100


In [26]:
# Converting our dataframe to a dataset
pred_ds = Dataset.from_pandas(pred_df).map(tokenize_fnc, batched=False)

pred_ds = pred_ds.remove_columns(['input', 'name', 'area'])
columns_to_return = ['input_ids', 'attention_mask', 'token_type_ids']
pred_ds.set_format(type='torch', columns=columns_to_return)

pred_ds

  0%|          | 0/3 [00:00<?, ?ex/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3
})

In [27]:
# Running our inputs in dataset form through the model
preds = trainer.predict(pred_ds).predictions.astype(int)
preds = np.squeeze(preds)
preds.shape

***** Running Prediction *****
  Num examples = 3
  Batch size = 8


(3,)

In [28]:
# Final outputs!
label_dict
reversed_dict = dict([(value, key) for key, value in label_dict.items()])

pred_categories = []
for prediction in preds:
    category = reversed_dict[prediction]
    pred_categories.append(category)
pred_df['cat_prediction'] = pred_categories
pred_df

Unnamed: 0,name,area,input,cat_prediction
0,LOBBY,300,TEXT1: LOBBY; TEXT2: 300,"FOOD AND BEVERAGE SERVICE: BAR, COCKTAIL LOUNGE"
1,STAFF OFFICE,220,TEXT1: STAFF OFFICE; TEXT2: 220,"FOOD AND BEVERAGE SERVICE: BAR, COCKTAIL LOUNGE"
2,VESTIBULE,100,TEXT1: VESTIBULE; TEXT2: 100,USER-DEFINED
