In [None]:
"""
Current 22-Jan-2023

My first proper machine learning project!

Takes a list of room names and floor areas as input and outputs one ASHRAE space usage category per room (the category list
 below has 79 entries, including 'User-Defined'). Useful for automating a crucial step when calculating the loads of a building.

 Some tuning of the hyperparameters is still required; the model successfully makes predictions, but they are not great predictions.
 Having more than 700 data points for training and validation would also likely help.
"""

### Dataframe Preparation

In [None]:
# Full list of space usage categories, in the order that defines each
# category's integer label further down in the notebook.
space_usage_cats = ['User-Defined',
                    'CORRECTIONAL FACILITY: Booking/waiting',
                    'CORRECTIONAL FACILITY: Cell',
                    'CORRECTIONAL FACILITY: Dayroom',
                    'CORRECTIONAL FACILITY: Guard stations',
                    'EDUCATION: Art classroom',
                    'EDUCATION: Classroom (age 9 plus)',
                    'EDUCATION: Classroom (ages 5-8)',
                    'EDUCATION: Computer Lab',
                    'EDUCATION: Daycare (through age 4)',
                    'EDUCATION: Daycare Sickroom',
                    'EDUCATION: Lecture Classroom',
                    'EDUCATION: Lecture Hall (fixed seats)',
                    'EDUCATION: Media Center',
                    'EDUCATION: Multiuse Assembly',
                    'EDUCATION: Music/theater/dance',
                    'EDUCATION: Science Laboratory',
                    'EDUCATION: University/college Laboratory',
                    'EDUCATION: Wood/metal Shop',
                    'FOOD AND BEVERAGE SERVICE: Bar, Cocktail Lounge',
                    'FOOD AND BEVERAGE SERVICE: Cafeteria/Fast Food Dining',
                    'FOOD AND BEVERAGE SERVICE: Kitchen (cooking)',
                    'FOOD AND BEVERAGE SERVICE: Restaurant Dining Room',
                    'GENERAL: Break Room',
                    'GENERAL: Coffee Station',
                    'GENERAL: Conference/Meeting',
                    'GENERAL: Corridor',
                    'GENERAL: Occupiable Storage Room (liq/gel)',
                    'HOTEL / MOTEL / RESORT / DORM: Barracks sleeping area',
                    'HOTEL / MOTEL / RESORT / DORM: Bedroom/Living Room',
                    'HOTEL / MOTEL / RESORT / DORM: Laundry Room Within Dwelling Unit', # confirm if correct
                    'HOTEL / MOTEL / RESORT / DORM: Laundry Room, Central',
                    'HOTEL / MOTEL / RESORT / DORM: Lobby/Prefunction',
                    'HOTEL / MOTEL / RESORT / DORM: Multipurpose Assembly',
                    'MISCELLANEOUS: Bank or Bank Lobby',
                    'MISCELLANEOUS: Bank Vault/Safe Deposit',
                    'MISCELLANEOUS: Computer (not printing)',
                    'MISCELLANEOUS: Freezer and Refrigerated Spaces (<50°F)',
                    'MISCELLANEOUS: General Manufacturing (EXCLUDES HEAVY INDUSTRIAL AND PROCESSES USING CHEMICALS)',
                    'MISCELLANEOUS: Pharmacy (prep. area)',
                    'MISCELLANEOUS: Photo Studio',
                    'MISCELLANEOUS: Shipping/Receiving',
                    'MISCELLANEOUS: Sorting, Packing, Light Assembly',
                    'MISCELLANEOUS: Telephone Closet',
                    'MISCELLANEOUS: Transportation Waiting',
                    'MISCELLANEOUS: Warehouse',
                    'OFFICE: Breakroom',
                    'OFFICE: Main Entry Lobby',
                    'OFFICE: Occupiable Storage Room for Dry Materials',
                    'OFFICE: Office Space',
                    'OFFICE: Reception Area',
                    'OFFICE: Telephone/Data Entry',
                    'PUBLIC ASSEMBLY: Auditorium Seating Area',
                    'PUBLIC ASSEMBLY: Courtroom',
                    'PUBLIC ASSEMBLY: Legislative Chamber',
                    'PUBLIC ASSEMBLY: Library',
                    'PUBLIC ASSEMBLY: Lobby',
                    'PUBLIC ASSEMBLY: Museum (Children\'s)',
                    'PUBLIC ASSEMBLY: Museum/Gallery',
                    'PUBLIC ASSEMBLY: Place of Religious Worship',
                    'RESIDENTIAL: Common Corridor',
                    'RESIDENTIAL: Dwelling Unit',
                    'RETAIL: Barbershop',
                    'RETAIL: Beauty and Nail Salon',
                    'RETAIL: Coin-operated laundry',
                    'RETAIL: Mall common area',
                    'RETAIL: Pet shop (animal area)',
                    'RETAIL: Sales (except other categories here)',
                    'RETAIL: Supermarket',
                    'SPORTS: Bowling alley (seating)',
                    'SPORTS: Disco/dance floor',
                    'SPORTS: Gambling casino',
                    'SPORTS: Game arcade',
                    'SPORTS: Gym, sports arena (play area)',
                    'SPORTS: Health club/aerobics room',
                    'SPORTS: Health club/weight room',
                    'SPORTS: Spectator area',
                    'SPORTS: Stage, studio',
                    'SPORTS: Swimming (pool & deck)'
                    ]

# Upper-cased copy of the list above, in the same order; the data's usage_cat
# column is upper-cased too, so the two can be compared letter for letter.
SPACE_USAGE_CATS = [label.upper() for label in space_usage_cats]

In [None]:
import pandas as pd
import numpy as np

# NOTE(review): unpickling can execute arbitrary code - only load a pickle
# file that comes from a trusted source.
df = pd.read_pickle('combined_df.pickle')

# Upper-case the category labels and the room names.
for text_col in ('usage_cat', 'name'):
    df[text_col] = df[text_col].str.upper()

# Alternative label spellings -> the spelling used in the category list.
category_renames = {
    'EDUCATION: CLASSROOM (AGE 9+)': 'EDUCATION: CLASSROOM (AGE 9 PLUS)',
    'FOOD SERVICE: CAFETERIA/FAST FOOD': 'FOOD AND BEVERAGE SERVICE: CAFETERIA/FAST FOOD DINING',
    'FOOD SERVICE: KITCHEN (COOKING)': 'FOOD AND BEVERAGE SERVICE: KITCHEN (COOKING)',
    'FOOD SERVICE: RESTAURANT DINING ROOM': 'FOOD AND BEVERAGE SERVICE: RESTAURANT DINING ROOM',
    'EDUCATION: MULTI-USE ASSEMBLY': 'EDUCATION: MULTIUSE ASSEMBLY',
    'PUBLIC ASSEMBLY: AUDITORIUM': 'PUBLIC ASSEMBLY: AUDITORIUM SEATING AREA',
    'OFFICE: OCCUPIABLE STORAGE ROOM (DRY)': 'OFFICE: OCCUPIABLE STORAGE ROOM FOR DRY MATERIALS',
    'MISCELLANEOUS: GENERAL MANUFACTURING': 'MISCELLANEOUS: GENERAL MANUFACTURING (EXCLUDES HEAVY INDUSTRIAL AND PROCESSES USING CHEMICALS)',
}
for old_label, new_label in category_renames.items():
    df.loc[df['usage_cat'] == old_label, 'usage_cat'] = new_label


# Spot check: selects rows still carrying one of the replaced labels.
df[df['usage_cat'].isin(['PUBLIC ASSEMBLY: AUDITORIUM'])]

In [None]:
# Do not truncate long cell contents when displaying dataframes.
pd.set_option('display.max_colwidth', None)

# Rows whose usage_cat value does not appear letter for letter in the full list of categories
df[~df.usage_cat.isin(SPACE_USAGE_CATS)]

In [None]:
# Distinct categories which appear in our training data, sorted
data_cats = df.usage_cat.tolist()
present_cats = sorted(set(data_cats))

# Categories in the full list which do not appear in our training data, sorted
missing_cats = sorted(set(SPACE_USAGE_CATS).difference(present_cats))

# (number present, number missing, size of the full list)
len(present_cats), len(missing_cats), len(SPACE_USAGE_CATS)

In [None]:
# Create a column in the dataframe containing an integer corresponding to a category
# (the category's position in SPACE_USAGE_CATS).
label_dict = {category: index for index, category in enumerate(SPACE_USAGE_CATS)}
df['usage_cat_integer'] = df['usage_cat'].map(label_dict)

# Any usage_cat value that is not a key of label_dict maps to NaN. Stop here
# with a clear message rather than passing NaN labels on to training.
unmapped = df.loc[df['usage_cat_integer'].isna(), 'usage_cat'].unique().tolist()
if unmapped:
    raise ValueError(f'usage_cat values not found in SPACE_USAGE_CATS: {unmapped}')

# Kept as float: the model further down is created with num_labels=1, and the
# labels are fed to it as a single numeric target.
df['usage_cat_integer'] = df['usage_cat_integer'].astype(float)
df

In [None]:
# Summary statistics for the object-dtype (text) columns only.
df.describe(include='object')

In [None]:
# Creating our input column: one text string per room combining its name and area,
# in the form 'TEXT1: <name>; TEXT2: <area>'
df['input'] = 'TEXT1: ' + df['name'] + '; TEXT2: ' + df['area']

### Transformers Dataset Preparation

In [None]:
# Creating a dataset for Transformers to use
from datasets import Dataset,DatasetDict

# Wrap the prepared dataframe in a `datasets.Dataset`; the bare `ds` on the
# last line displays it.
ds = Dataset.from_pandas(df)
ds

In [None]:
# Select a pre-trained model checkpoint and use it to create a tokenizer
model_nm = 'bert-base-uncased'

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# NOTE: `tokenize` is the tokenizer object itself (it is called directly on text
# further down), not a function defined in this notebook.
# NOTE(review): this is an "uncased" checkpoint, so the upper-casing applied to
# the room names presumably has no effect on the tokens - TODO confirm.
tokenize = AutoTokenizer.from_pretrained(model_nm)

In [None]:
# Sanity check: display the tokens the tokenizer produces for a sample sentence.
tokenize.tokenize('This is an absolutely amazing sentence which is getting tokenized right now!!!')

In [None]:
# Function to tokenize the 'input' column of our dataframe
def tokenize_fnc(x):
    """Tokenize the 'input' entry of `x` with the notebook's tokenizer.

    Args:
        x: A mapping that provides the text to encode under the key 'input'.

    Returns:
        Whatever the `tokenize` call returns for that text, requested with
        truncation and padding enabled.
    """
    text = x['input']
    encoded = tokenize(text, truncation=True, padding=True)
    return encoded

In [None]:
# Tokenize our dataset!
# batched=False: tokenize_fnc is applied to one example at a time.
tokenized_ds = ds.map(tokenize_fnc, batched=False)

In [None]:
# Inspect the first tokenized example.
tokenized_ds[0]

In [None]:
# Transformers assumes that our labels column is always named 'labels';
# the raw text columns are dropped in the same step.
tokenized_ds = (
    tokenized_ds
    .rename_columns({'usage_cat_integer': 'labels'})
    .remove_columns(['input', 'name', 'area', 'usage_cat'])
)

# Return these columns as torch tensors when the dataset is indexed.
columns_to_return = ['input_ids', 'labels', 'attention_mask', 'token_type_ids']
tokenized_ds.set_format(type='torch', columns=columns_to_return)
tokenized_ds

In [None]:
# Display the dataset's current output format type (set to 'torch' in the cell above).
tokenized_ds.format['type']

In [None]:
# Split into training and validation sets. 75% of data will be training data and
# 25% will be validation data. The fixed seed makes the shuffle (and therefore
# the split) the same on every run, so results are comparable between runs.
dds = tokenized_ds.train_test_split(test_size=0.25, seed=42)
dds

In [None]:
!pip install evaluate
import evaluate

metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair `(logits, labels)` of model outputs and reference
            labels.

    Returns:
        The result of `metric.compute` for the derived integer predictions
        and the integer reference labels.

    With more than one output per example the prediction is the argmax over
    the last axis. With a single output per example (a model created with
    num_labels=1), argmax over that axis would always be 0, so the output is
    rounded to the nearest integer label instead.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)

    if logits.ndim > 1 and logits.shape[-1] > 1:
        predictions = np.argmax(logits, axis=-1)
    else:
        predictions = np.rint(logits.reshape(-1)).astype(int)

    # Labels may be stored as floats (e.g. 25.0); compare them as integers.
    references = np.rint(np.asarray(labels).reshape(-1)).astype(int)
    return metric.compute(predictions=predictions, references=references)

### Training

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Hyperparameters
batch_size = 4  # per-device training batch size; evaluation uses twice this
epochs = 8  # number of training epochs
# NOTE(review): 5e-4 looks high for fine-tuning a BERT checkpoint (rates around
# 2e-5 to 5e-5 are commonly used); a smaller value may be worth trying - TODO confirm.
learn_rate = 5e-4

In [None]:
# Training configuration; checkpoints and logs are written under 'outputs'.
args = TrainingArguments(
    'outputs',
    learning_rate=learn_rate,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,  # mixed-precision training
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size*2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none',
)

In [None]:
# NOTE(review): num_labels=1 gives the model a single output per room, i.e. the
# category index is fitted as one number rather than as one score per category.
# For picking one of many unordered categories, num_labels=len(SPACE_USAGE_CATS)
# with integer labels is the usual setup; that change would also require
# adjusting the label dtype and the prediction cells - TODO confirm intent.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)

# Pads each batch using the tokenizer when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenize)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    data_collator=data_collator,
    tokenizer=tokenize,
    compute_metrics=compute_metrics,
)

In [None]:
# Fetch the collate function used by the Trainer's training dataloader
# (this rebinds `data_collator`) and display it.
data_collator = trainer.get_train_dataloader().collate_fn
data_collator

In [None]:
import torch
# Release unused cached GPU memory held by PyTorch before training starts.
torch.cuda.empty_cache()

import gc
# Run a garbage-collection pass to free unreferenced Python objects.
gc.collect()

# Fine-tune the model with the Trainer configured above.
trainer.train()
# trainer.evaluate()

### Testing Outputs of our Trained Model

In [None]:
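# TODO: save the fine-tuned model for later use on a local computer (torch.save did not work here). The Hugging Face route is trainer.save_model('<dir>') or model.save_pretrained('<dir>'), reloaded later with from_pretrained('<dir>').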

In [None]:
# Prepping a dataframe with sample inputs: [room name, floor area]
room_info = [['Lobby', 300], ['Staff Office', 220], ['Vestibule', 100]]

# Same preparation as the training data: upper-case names, areas as text,
# and a combined 'input' string per room.
pred_df = pd.DataFrame(room_info, columns=['name', 'area']).assign(
    name=lambda frame: frame['name'].str.upper(),
    area=lambda frame: frame['area'].astype(str),
    input=lambda frame: 'TEXT1: ' + frame['name'] + '; TEXT2: ' + frame['area'],
)
pred_df


In [None]:
# Converting our dataframe to a dataset, then tokenizing it with the same
# function used for the training data
pred_ds = Dataset.from_pandas(pred_df)
pred_ds = pred_ds.map(tokenize_fnc, batched=False)

# Drop the raw text columns and return the tokenizer outputs as torch tensors
# (there is no 'labels' column at prediction time).
pred_ds = pred_ds.remove_columns(['input', 'name', 'area'])
columns_to_return = ['input_ids', 'attention_mask', 'token_type_ids']
pred_ds.set_format(type='torch', columns=columns_to_return)

pred_ds

In [None]:
# Running our inputs in dataset form through the model
raw_preds = np.asarray(trainer.predict(pred_ds).predictions)

if raw_preds.ndim > 1 and raw_preds.shape[-1] > 1:
    # One score per category: take the index of the highest score.
    preds = raw_preds.argmax(axis=-1)
else:
    # One number per room (model created with num_labels=1): round to the
    # nearest label index rather than truncating, and clamp to the valid index
    # range so the label lookup that follows cannot hit a missing key.
    # reshape(-1) keeps preds one-dimensional even for a single room.
    preds = np.rint(raw_preds.reshape(-1))
    preds = np.clip(preds, 0, len(label_dict) - 1).astype(int)
preds.shape

In [None]:
# Final outputs!
# Invert label_dict so that an integer label maps back to its category name.
reversed_dict = {index: name for name, index in label_dict.items()}

# Look up the category name for each predicted label and attach it to the inputs.
pred_categories = [reversed_dict[prediction] for prediction in preds]
pred_df['cat_prediction'] = pred_categories
pred_df