<a href="https://colab.research.google.com/github/AdamMohsen4/tenant_issue_tracker/blob/main/issue_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [72]:
# Load into Pandas
import pandas as pd
df = pd.read_csv("/tenant_issue_dataset.csv")
print(df.head())


                                       text   category      urgency  \
0  Basement shower drain partially clogged.   plumbing  semi-urgent   
1                  Oven clock reset needed.  appliance   non-urgent   
2          Several ant spotted in bathroom.       pest  semi-urgent   
3     Toilet in bathroom occasionally runs.   plumbing   non-urgent   
4       Silverfish in dining room bathroom.       pest  semi-urgent   

                    timestamp  has_photo  
0  2025-02-18 09:19:48.674110       True  
1  2025-02-08 07:14:24.465920       True  
2  2025-02-09 15:20:20.825646       True  
3  2025-02-03 09:14:29.568411      False  
4  2025-02-16 23:19:18.026007      False  


In [73]:
# Prepare data for category
from datasets import Dataset
category_dataset = Dataset.from_pandas(df[["text", "category"]])
category_dataset = category_dataset.train_test_split(test_size=0.2)
unique_categories = df["category"].unique()
category_labels = {cat: idx for idx, cat in enumerate(unique_categories)}
print("Category Labels:", category_labels)

# Prepare data for urgency
urgency_dataset = Dataset.from_pandas(df[["text", "urgency"]])
urgency_dataset = urgency_dataset.train_test_split(test_size=0.2)
unique_urgencies = df["urgency"].unique()
urgency_labels = {urg: idx for idx, urg in enumerate(unique_urgencies)}
print("Urgency Labels:", urgency_labels)

Category Labels: {'plumbing': 0, 'appliance': 1, 'pest': 2, 'hvac': 3, 'miscellaneous': 4, 'structural': 5, 'electrical': 6}
Urgency Labels: {'semi-urgent': 0, 'non-urgent': 1, 'urgent': 2}


In [74]:
# Tokenize with BERT
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# Tokenize category dataset
tokenized_category_dataset = category_dataset.map(tokenize_function, batched=True)
tokenized_category_dataset = tokenized_category_dataset.map(lambda x: {"labels": category_labels[x["category"]]})
tokenized_category_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Tokenize urgency dataset
tokenized_urgency_dataset = urgency_dataset.map(tokenize_function, batched=True)
tokenized_urgency_dataset = tokenized_urgency_dataset.map(lambda x: {"labels": urgency_labels[x["urgency"]]})
tokenized_urgency_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/560 [00:00<?, ? examples/s]

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

Map:   0%|          | 0/560 [00:00<?, ? examples/s]

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

Map:   0%|          | 0/560 [00:00<?, ? examples/s]

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

Map:   0%|          | 0/560 [00:00<?, ? examples/s]

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

In [79]:
# Train category model
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import os
os.environ["WANDB_DISABLED"] = "true"  # Disable W&B logging

category_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(category_labels))
training_args = TrainingArguments(
    output_dir="./category_results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=category_model,
    args=training_args,
    train_dataset=tokenized_category_dataset["train"],
    eval_dataset=tokenized_category_dataset["test"],
    compute_metrics=lambda eval_pred: {"accuracy": (eval_pred[0].argmax(-1) == eval_pred[1]).mean()}
)
trainer.train()

# Train urgency model
urgency_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(urgency_labels))
# Create a new TrainingArguments instance with a different output_dir
urgency_training_args = TrainingArguments(
    output_dir="./urgency_results",  # Update output directory
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)
urgency_trainer = Trainer(
    model=urgency_model,
    args=urgency_training_args,  # Use the new TrainingArguments instance
    train_dataset=tokenized_urgency_dataset["train"],
    eval_dataset=tokenized_urgency_dataset["test"],
    compute_metrics=lambda eval_pred: {"accuracy": (eval_pred[0].argmax(-1) == eval_pred[1]).mean()}
)
urgency_trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7036,0.66672,0.85
2,0.1333,0.196885,0.964286
3,0.092,0.170772,0.964286


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5759,0.401916,0.892857
2,0.1132,0.127931,0.964286
3,0.08,0.114852,0.971429


TrainOutput(global_step=210, training_loss=0.3722016895101184, metrics={'train_runtime': 56.2192, 'train_samples_per_second': 29.883, 'train_steps_per_second': 3.735, 'total_flos': 55637299630080.0, 'train_loss': 0.3722016895101184, 'epoch': 3.0})

In [98]:
# Test prediction for both category and urgency
import torch # Import torch

def predict_issue(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)

    # Move inputs to the device of each model explicitly
    inputs_category = {key: val.to(category_model.device) for key, val in inputs.items()}
    inputs_urgency = {key: val.to(urgency_model.device) for key, val in inputs.items()}

    # Category prediction
    with torch.no_grad():
        category_outputs = category_model(**inputs_category)  # Use inputs_category here
    category_pred = category_outputs.logits.argmax().item()
    category = list(category_labels.keys())[list(category_labels.values()).index(category_pred)]

    # Urgency prediction
    with torch.no_grad():
        urgency_outputs = urgency_model(**inputs_urgency)  # Use inputs_urgency here
    urgency_pred = urgency_outputs.logits.argmax().item()
    urgency = list(urgency_labels.keys())[list(urgency_labels.values()).index(urgency_pred)]

    return category, urgency

test_text = "This is not urgent. But I wanted to inform you that the neighbours are loud again..."
category, urgency = predict_issue(test_text)
print(f"Predicted Category: {category}, Predicted Urgency: {urgency}")

Predicted Category: miscellaneous, Predicted Urgency: semi-urgent
