In [None]:
import time

import pandas as pd
import torch
from transformers import pipeline

In [None]:
# Mount Google Drive at /content/drive so the data files under it can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Classifier model

In [None]:
# Zero-shot classification pipeline backed by the facebook/bart-large-mnli checkpoint.
# Use the first CUDA GPU when one is available and fall back to the CPU (device=-1)
# otherwise, so the notebook runs on either kind of runtime without manual edits.
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli",
                      device=0 if torch.cuda.is_available() else -1)

Device set to use cuda:0


In [None]:
# Candidate labels the zero-shot classifier scores each permit description against.
# NOTE: the saved outputs of the example cells below list 'New construction' and
# 'Demolition' instead of these strings, so they appear to come from an earlier
# label set; re-run those cells to refresh them.
LABELS = [
    "Construction of a new building",
    "Existing Structure Modification",
    "Demolition of a building",
]

In [None]:
# Example: score one permit description against LABELS.
sequence = "The construction work involves building a new 13-story mixed-use building with 53 residential units, including a basement, and is being reviewed and inspected by a private provider for Atkins North America."

# Run zero-shot classification; the result holds the input sequence plus
# parallel 'labels' and 'scores' lists (see the printed output).
result = classifier(sequence, LABELS)

print(result)

{'sequence': 'The construction work involves building a new 13-story mixed-use building with 53 residential units, including a basement, and is being reviewed and inspected by a private provider for Atkins North America.', 'labels': ['New construction', 'Existing Structure Modification', 'Demolition'], 'scores': [0.9633380174636841, 0.033094607293605804, 0.0035674471873790026]}


In [None]:
# Example: a description of fire sprinkler system work.
result = classifier("This description indicates that underground fire sprinkler system work is being performed.", LABELS)

print(result)

{'sequence': 'This description indicates that underground fire sprinkler system work is being performed.', 'labels': ['Existing Structure Modification', 'Demolition', 'New construction'], 'scores': [0.8803701996803284, 0.06437518447637558, 0.05525461584329605]}


In [None]:
# Example: a description of an exterior renovation.
result = classifier("The description indicates that construction work involved renovating the exterior of the Farnham Building located at 1012.", LABELS)

print(result)

{'sequence': 'The description indicates that construction work involved renovating the exterior of the Farnham Building located at 1012.', 'labels': ['Existing Structure Modification', 'New construction', 'Demolition'], 'scores': [0.824742317199707, 0.14647892117500305, 0.028778819367289543]}


## Classify with a zero-shot algorithm

In [None]:
# Folder on the mounted Google Drive that holds the building-permit data files
# (the input CSV is read from here and the classified CSV is written back here).
com_directory = "/content/drive/MyDrive/Colab Notebooks/Building_Permit_Data"

In [None]:
# Load the cleaned permit data (gzip-compressed CSV) from the data folder.
df_flo = pd.read_csv(f"{com_directory}/df_flo_clean.csv.gz")

In [None]:
def classify_text(text, candidate_labels, multi = False):
    """Score a single text against ``candidate_labels`` with the zero-shot classifier.

    Args:
        text: The text to classify.
        candidate_labels: Labels handed to the classifier.
        multi: Forwarded to the classifier as ``multi_label`` (default ``False``).

    Returns:
        A dict mapping each entry of the classifier's ``labels`` output to the
        score at the same position in its ``scores`` output.
    """
    prediction = classifier(text, candidate_labels, multi_label=multi)
    # 'labels' and 'scores' are parallel lists; pair them up position by position.
    return dict(zip(prediction['labels'], prediction['scores']))

def parallel_classify(df, text_column, candidate_labels, max_workers=4, multi=False):
    """Classify every text in ``df[text_column]`` using a pool of worker threads.

    Each text is passed to ``classify_text``; the resulting scores are written
    to ``df`` as one new column per candidate label.

    Args:
        df: DataFrame holding the texts. It is modified in place.
        text_column: Name of the column containing the texts.
        candidate_labels: Labels to score; each becomes a column of ``df``.
        max_workers: Number of worker threads (default 4).
        multi: Forwarded to ``classify_text`` (default ``False``).

    Returns:
        The same ``df`` object, with the score columns added.
    """
    # Local import: the notebook's visible import cells do not bring
    # ThreadPoolExecutor into scope, so without it this function raises NameError.
    from concurrent.futures import ThreadPoolExecutor

    def _score(text):
        return classify_text(text, candidate_labels, multi=multi)

    # executor.map yields results in input order, so they line up with df's rows.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(_score, df[text_column]))

    # Adding each probability score as a separate column
    for label in candidate_labels:
        df[label] = [res[label] for res in results]

    return df

In [None]:
def batch_classify(df, text_column, candidate_labels, multi_label=False, batch_size=16):
    """Classify all texts in ``df[text_column]`` with one batched classifier call.

    Args:
        df: DataFrame holding the texts. It is modified in place.
        text_column: Name of the column containing the texts.
        candidate_labels: Labels to score; each becomes a column of ``df``.
        multi_label: Forwarded to the classifier (default ``False``).
        batch_size: Forwarded to the classifier (default 16).

    Returns:
        The same ``df`` object, with one score column per candidate label.
        An empty ``df`` gets empty score columns and the classifier is not called.
    """
    texts = df[text_column].tolist()

    if texts:
        # Run classification in batches
        results = classifier(texts, candidate_labels, multi_label=multi_label, batch_size=batch_size)
        # Defensive: if a single result comes back as a bare dict, wrap it so the
        # code below can always iterate over a list of per-text results.
        if isinstance(results, dict):
            results = [results]
    else:
        # The pipeline is expected to reject an empty sequence list, so skip the call.
        results = []

    # Build one label -> score mapping per text instead of searching the
    # 'labels' list again for every label.
    score_maps = [dict(zip(res['labels'], res['scores'])) for res in results]

    for label in candidate_labels:
        df[label] = [scores[label] for scores in score_maps]

    return df

In [None]:
import os

In [None]:
# Show how many CPUs the runtime reports.
os.cpu_count()

12

In [None]:
def chunk_classify(df, text_column, candidate_labels, chunk_size = 1000, multi_label=False, batch_size=16):
    """Classify ``df`` in consecutive row chunks and return the combined result.

    Each chunk is copied, passed to ``batch_classify``, and the classified
    chunks are concatenated. The elapsed wall-clock time in seconds is printed.

    Args:
        df: DataFrame holding the texts. It is not modified; chunks are copies.
        text_column: Name of the column containing the texts.
        candidate_labels: Labels to score; each becomes a column of the result.
        chunk_size: Number of rows per chunk (default 1000); tune based on
            GPU capacity.
        multi_label: Forwarded to ``batch_classify``.
        batch_size: Forwarded to ``batch_classify``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing the rows of ``df``
        plus one score column per candidate label. An empty ``df`` yields an
        empty frame with the same columns plus empty score columns.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    start = time.time()

    # Honour the caller's chunk_size: step through the rows chunk by chunk.
    df_list = [
        batch_classify(df.iloc[i:i + chunk_size].copy(), text_column, candidate_labels,
                       multi_label=multi_label, batch_size=batch_size)
        for i in range(0, len(df), chunk_size)
    ]

    print(time.time() - start)

    if not df_list:
        # Nothing to classify: pd.concat([]) would raise, so return an empty
        # frame that still carries the score columns.
        empty = df.iloc[0:0].copy().reset_index(drop=True)
        for label in candidate_labels:
            empty[label] = pd.Series(dtype="float64")
        return empty

    return pd.concat(df_list, ignore_index=True)

In [None]:
# Classify the 'Corrected_Text' column of the permit data against LABELS.
# The saved run printed roughly 3472 seconds of elapsed time.
classi = chunk_classify(df_flo, 'Corrected_Text', LABELS)

3472.525295972824


In [None]:
# Save the classified frame next to the input data as a gzip-compressed CSV.
# index=False keeps the row index out of the file, so reloading it does not add
# yet another 'Unnamed: 0' column like the ones this frame already carries.
classi.to_csv(com_directory + "/df_flo_consorrenov.csv.gz", compression = "gzip", index = False)

In [None]:
# Display the classified frame: the input columns plus one score column per label.
classi

Unnamed: 0.2,Unnamed: 0,Index,Full Address,DESCRIPTION,tok,Corrected_Text,Unnamed: 0.1,Construction of a new building,Existing Structure Modification,Demolition of a building
0,520846,4617108,"900 biscayne blvd, miami, fl",WOOD/LAMINATE/TILE|KITCHEN REMODELING/BATHROOM...,38,The description indicates that the constructio...,,0.139578,0.809395,0.051027
1,520847,4617109,"900 biscayne blvd, miami, fl",SETTING FIXTURE: BATHTUB|SETTING FIXTURE: LAVA...,43,This building permit involved installing a bat...,,0.616231,0.370280,0.013489
2,520848,4617110,"2811 s bayshore dr, miami, fl",LIGHT SOCKET|ROUGH WIRING OUTLETS,10,The construction work involved installing ligh...,,0.480307,0.487720,0.031973
3,520849,4617111,"1737 n bayshore dr, miami, fl",WHEELCHAIR LIFT / ADA LIFT,11,The construction work involved installing a wh...,,0.527843,0.450536,0.021621
4,520850,4617112,"400 biscayne blvd, miami, fl",PHASED PERMIT-MECHANICAL,9,The construction work involves obtaining permi...,,0.445386,0.492986,0.061628
...,...,...,...,...,...,...,...,...,...,...
1001,521848,4618110,"3015 grand ave, miami, fl",OWNER IDENTIFICATION SIGNS (ELECTRICAL): NEW S...,15,This description indicates the installation of...,,0.122402,0.826208,0.051390
1002,521849,4618111,"3015 grand ave, miami, fl",BOX SIGN,2,A box sign refers to a type of illuminated sig...,,0.170807,0.752552,0.076641
1003,521850,4618112,"3015 grand ave, miami, fl",OWNER IDENTIFICATION SIGNS (ELECTRICAL): NEW S...,15,This building permit filing is for installing ...,,0.282064,0.703985,0.013950
1004,521851,4618113,"1401 brickell ave, miami, fl",REPLACE SAME LOCATION CENTRAL AC / HEATING SYSTEM,11,The construction work involved replacing the c...,,0.047319,0.928239,0.024442
