In [None]:
# Mount Google Drive at /content/drive; later cells read from and write to paths under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

# TODO: Change this path to point to CallDispatching-clean.xlsx. The file must contain
# a "descriptions" column (the text) and a "category" column (the label of that text).
df = pd.read_excel("CallDispatching-clean.xlsx")
df.head()

In [None]:
# Map every distinct category label to an integer index (0 .. n-1).
# The labels are sorted by their string form so the mapping is the same on every run;
# iterating a bare set of strings can yield a different order after a kernel restart.
unique_categories = sorted(set(df["category"].values), key=str)
integer_encoding_map = {category: index for index, category in enumerate(unique_categories)}
integer_encoding_map

In [None]:
# Tally how many rows belong to each category, print the tally and plot it as a bar chart.
category_count = dict.fromkeys(integer_encoding_map, 0)

for label in df['category'].values:
    category_count[label] = category_count[label] + 1

print(category_count)

temp_df = pd.DataFrame.from_dict(category_count, orient='index')
temp_df.plot(kind='bar')

In [None]:
# Build a one-hot label vector for every description and store the result in a .csv file.
# The vector length follows the number of categories found in the data instead of a fixed 12,
# so a different category count neither overflows the vector nor leaves unused slots.
num_categories = len(integer_encoding_map)
dataset = []

for _, item in df.iterrows():
    one_hot = [0] * num_categories
    one_hot[integer_encoding_map[item['category']]] = 1
    dataset.append({'text': item['descriptions'], 'label': one_hot})

temp_df = pd.DataFrame(dataset)
temp_df.to_csv('call_dispatching.csv')

# Last expression of the cell, so the preview is actually displayed.
temp_df.head()

In [None]:
!pip install torchtext==0.2.3

# Models

This section defines six candidate models. We eventually adopted the BERT model as the most effective categorization model; see our paper for more details.

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F

class CNN(nn.Module):
	"""Text classifier that runs three convolutions of different heights over frozen pre-trained word embeddings."""

	def __init__(self, batch_size, output_size, in_channels, out_channels, kernel_heights, stride, padding, keep_probab, vocab_size, embedding_length, weights):
		"""
		Arguments
		---------
		batch_size : Size of each batch; stored on the instance only
		output_size : Number of target classes (length of the logits vector)
		in_channels : Number of input channels. forward() adds a single channel dimension, so this is expected to be 1
		out_channels : Number of output channels produced by each convolution
		kernel_heights : A list with 3 kernel heights. One convolution is built per height and the pooled results are concatenated
		stride : Stride passed to every convolution
		padding : Padding passed to every convolution
		keep_probab : Probability of retaining an activation node during the dropout operation
		vocab_size : Size of the vocabulary containing unique words
		embedding_length : Embedding dimension of the word embeddings
		weights : Pre-trained word embeddings used as the (frozen) word-embedding look-up table
		"""
		super(CNN, self).__init__()

		self.batch_size = batch_size
		self.output_size = output_size
		self.in_channels = in_channels
		self.out_channels = out_channels
		self.kernel_heights = kernel_heights
		self.stride = stride
		self.padding = padding
		self.vocab_size = vocab_size
		self.embedding_length = embedding_length

		self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
		self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False)
		# Every kernel spans the full embedding width, so each convolution slides over the token axis only.
		self.conv1 = nn.Conv2d(in_channels, out_channels, (kernel_heights[0], embedding_length), stride, padding)
		self.conv2 = nn.Conv2d(in_channels, out_channels, (kernel_heights[1], embedding_length), stride, padding)
		self.conv3 = nn.Conv2d(in_channels, out_channels, (kernel_heights[2], embedding_length), stride, padding)
		# nn.Dropout expects the probability of DROPPING an element, whereas keep_probab is
		# the probability of KEEPING it, hence the complement.
		self.dropout = nn.Dropout(1 - keep_probab)
		self.label = nn.Linear(len(kernel_heights)*out_channels, output_size)

	def conv_block(self, input, conv_layer):
		"""Apply one convolution, ReLU and max-over-time pooling; returns a (batch_size, out_channels) tensor."""
		conv_out = conv_layer(input)# conv_out.size() = (batch_size, out_channels, dim, 1)
		activation = F.relu(conv_out.squeeze(3))# activation.size() = (batch_size, out_channels, dim1)
		max_out = F.max_pool1d(activation, activation.size()[2]).squeeze(2)# maxpool_out.size() = (batch_size, out_channels)

		return max_out

	def forward(self, input_sentences, batch_size=None):
		"""
		Convolve the embedding matrix of every sentence with kernels of varying height (and a width equal to
		embedding_length), apply ReLU, max-pool each result over the token axis, concatenate the pooled
		tensors, apply dropout and map the result to the output layer.

		Parameters
		----------
		input_sentences: token indices of shape = (batch_size, num_sequences)
		batch_size : default = None. Not used by this model; accepted so all models share the same call signature

		Returns
		-------
		Output of the linear layer containing one logit per class.
		logits.size() = (batch_size, output_size)

		"""

		input = self.word_embeddings(input_sentences)
		# input.size() = (batch_size, num_seq, embedding_length)
		input = input.unsqueeze(1)
		# input.size() = (batch_size, 1, num_seq, embedding_length)
		max_out1 = self.conv_block(input, self.conv1)
		max_out2 = self.conv_block(input, self.conv2)
		max_out3 = self.conv_block(input, self.conv3)

		all_out = torch.cat((max_out1, max_out2, max_out3), 1)
		# all_out.size() = (batch_size, num_kernels*out_channels)
		fc_in = self.dropout(all_out)
		# fc_in.size()) = (batch_size, num_kernels*out_channels)
		logits = self.label(fc_in)

		return logits

In [None]:
# _*_ coding: utf-8 _*_

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F

class LSTMClassifier(nn.Module):
	"""Single-layer LSTM classifier over frozen pre-trained word embeddings."""

	def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):
		"""
		Arguments
		---------
		batch_size : Size of the batch; stored on the instance only
		output_size : Number of target classes (length of the logits vector)
		hidden_size : Size of the hidden_state of the LSTM
		vocab_size : Size of the vocabulary containing unique words
		embedding_length : Embedding dimension of the word embeddings
		weights : Pre-trained word embeddings used as the (frozen) word-embedding look-up table

		"""
		super(LSTMClassifier, self).__init__()

		self.batch_size = batch_size
		self.output_size = output_size
		self.hidden_size = hidden_size
		self.vocab_size = vocab_size
		self.embedding_length = embedding_length

		self.word_embeddings = nn.Embedding(vocab_size, embedding_length)# Initializing the look-up table.
		self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) # Assigning the pre-trained word embeddings to the look-up table.
		self.lstm = nn.LSTM(embedding_length, hidden_size)
		self.label = nn.Linear(hidden_size, output_size)

	def forward(self, input_sentence, batch_size=None):
		"""
		Parameters
		----------
		input_sentence: token indices of shape = (batch_size, num_sequences)
		batch_size : default = None. Kept for backward compatibility; the batch size is read from input_sentence

		Returns
		-------
		Output of the linear layer, which receives the final hidden state of the LSTM.
		final_output.shape = (batch_size, output_size)

		"""

		# Map every index of the input sequence to its pre-trained word vector.
		embedded = self.word_embeddings(input_sentence) # (batch_size, num_sequences, embedding_length)
		embedded = embedded.permute(1, 0, 2) # (num_sequences, batch_size, embedding_length)
		# Zero initial states sized from the actual batch and created on the input's device,
		# so partial batches and CPU-only runs work as well.
		state_shape = (1, embedded.size(1), self.hidden_size)
		h_0 = torch.zeros(state_shape, dtype=embedded.dtype, device=embedded.device) # Initial hidden state of the LSTM
		c_0 = torch.zeros(state_shape, dtype=embedded.dtype, device=embedded.device) # Initial cell state of the LSTM
		output, (final_hidden_state, final_cell_state) = self.lstm(embedded, (h_0, c_0))
		final_output = self.label(final_hidden_state[-1]) # final_hidden_state.size() = (1, batch_size, hidden_size) & final_output.size() = (batch_size, output_size)

		return final_output

In [None]:
# _*_ coding: utf-8 _*_

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import numpy as np

class AttentionModel(torch.nn.Module):
	"""LSTM classifier that attends over all hidden states using the final hidden state as the query."""

	def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):
		"""
		Arguments
		---------
		batch_size : Size of the batch; stored on the instance only
		output_size : Number of target classes (length of the logits vector)
		hidden_size : Size of the hidden_state of the LSTM
		vocab_size : Size of the vocabulary containing unique words
		embedding_length : Embedding dimension of the word embeddings
		weights : Pre-trained word embeddings used as the (frozen) word-embedding look-up table

		"""
		super(AttentionModel, self).__init__()

		self.batch_size = batch_size
		self.output_size = output_size
		self.hidden_size = hidden_size
		self.vocab_size = vocab_size
		self.embedding_length = embedding_length

		self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
		# The attribute nn.Embedding reads is `weight`; assigning to `weights` would only register an
		# unused extra parameter and leave the look-up table randomly initialised.
		self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False)
		self.lstm = nn.LSTM(embedding_length, hidden_size)
		self.label = nn.Linear(hidden_size, output_size)
		#self.attn_fc_layer = nn.Linear()

	def attention_net(self, lstm_output, final_state):
		"""
		Compute a soft alignment score between every hidden state and the last hidden state of the LSTM,
		then use the normalised scores to build a weighted sum of the hidden states (torch.bmm is used for
		the batch matrix multiplications).

		Arguments
		---------

		lstm_output : Output of the LSTM holding the hidden state of every time step, (batch_size, num_seq, hidden_size)
		final_state : Final time-step hidden state (h_n) of the LSTM, (1, batch_size, hidden_size)

		---------

		Returns : The attention-weighted hidden state.

		Tensor Size :
					hidden.size() = (batch_size, hidden_size)
					attn_weights.size() = (batch_size, num_seq)
					soft_attn_weights.size() = (batch_size, num_seq)
					new_hidden_state.size() = (batch_size, hidden_size)

		"""

		hidden = final_state.squeeze(0)
		attn_weights = torch.bmm(lstm_output, hidden.unsqueeze(2)).squeeze(2)
		soft_attn_weights = F.softmax(attn_weights, 1)
		new_hidden_state = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)

		return new_hidden_state

	def forward(self, input_sentences, batch_size=None):
		"""
		Parameters
		----------
		input_sentences: token indices of shape = (batch_size, num_sequences)
		batch_size : default = None. Kept for backward compatibility; the batch size is read from input_sentences

		Returns
		-------
		Output of the linear layer, which receives the new_hidden_state produced by the attention network.
		final_output.shape = (batch_size, output_size)

		"""

		embedded = self.word_embeddings(input_sentences)
		embedded = embedded.permute(1, 0, 2)
		# Zero initial states sized from the actual batch and created on the input's device,
		# so partial batches and CPU-only runs work as well.
		state_shape = (1, embedded.size(1), self.hidden_size)
		h_0 = torch.zeros(state_shape, dtype=embedded.dtype, device=embedded.device)
		c_0 = torch.zeros(state_shape, dtype=embedded.dtype, device=embedded.device)

		output, (final_hidden_state, final_cell_state) = self.lstm(embedded, (h_0, c_0)) # final_hidden_state.size() = (1, batch_size, hidden_size)
		output = output.permute(1, 0, 2) # output.size() = (batch_size, num_seq, hidden_size)

		attn_output = self.attention_net(output, final_hidden_state)
		logits = self.label(attn_output)

		return logits

In [None]:
# _*_ coding: utf-8 _*_

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F

class RCNN(nn.Module):
	"""Recurrent-convolutional classifier: bidirectional LSTM states joined with the word embeddings, then max-pooled."""

	def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):
		"""
		Arguments
		---------
		batch_size : Size of the batch; stored on the instance only
		output_size : Number of target classes (length of the logits vector)
		hidden_size : Size of the hidden_state of the LSTM
		vocab_size : Size of the vocabulary containing unique words
		embedding_length : Embedding dimension of the word embeddings
		weights : Pre-trained word embeddings used as the (frozen) word-embedding look-up table

		"""
		super(RCNN, self).__init__()

		self.batch_size = batch_size
		self.output_size = output_size
		self.hidden_size = hidden_size
		self.vocab_size = vocab_size
		self.embedding_length = embedding_length

		self.word_embeddings = nn.Embedding(vocab_size, embedding_length)# Initializing the look-up table.
		self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) # Assigning the pre-trained word embeddings to the look-up table.
		# NOTE(review): nn.LSTM applies `dropout` between stacked layers; with the single layer used
		# here the value presumably has no effect (PyTorch warns about it) - confirm before tuning it.
		self.dropout = 0.8
		self.lstm = nn.LSTM(embedding_length, hidden_size, dropout=self.dropout, bidirectional=True)
		self.W2 = nn.Linear(2*hidden_size+embedding_length, hidden_size)
		self.label = nn.Linear(hidden_size, output_size)

	def forward(self, input_sentence, batch_size=None):
		"""
		Following "Recurrent Convolutional Neural Networks for Text Classification": the embedded sequence is
		passed through a bidirectional LSTM, and every token is then represented by the concatenation of its
		LSTM output (left and right context) and its own word embedding. A linear layer maps this long vector
		back to hidden_size, a max pooling across all tokens turns the variable-length text into a fixed
		(batch_size, hidden_size) tensor, and the output layer maps that tensor to the logits.

		Parameters
		----------
		input_sentence: token indices of shape = (batch_size, num_sequences)
		batch_size : default = None. Kept for backward compatibility; the batch size is read from input_sentence

		Returns
		-------
		Output of the linear layer containing one logit per class.
		logits.shape = (batch_size, output_size)

		"""
		embedded = self.word_embeddings(input_sentence) # (batch_size, num_sequences, embedding_length)
		embedded = embedded.permute(1, 0, 2) # (num_sequences, batch_size, embedding_length)
		# Zero initial states (2 = number of directions) sized from the actual batch and created on the
		# input's device, so partial batches and CPU-only runs work as well.
		state_shape = (2, embedded.size(1), self.hidden_size)
		h_0 = torch.zeros(state_shape, dtype=embedded.dtype, device=embedded.device) # Initial hidden state of the LSTM
		c_0 = torch.zeros(state_shape, dtype=embedded.dtype, device=embedded.device) # Initial cell state of the LSTM

		output, (final_hidden_state, final_cell_state) = self.lstm(embedded, (h_0, c_0))

		final_encoding = torch.cat((output, embedded), 2).permute(1, 0, 2)
		y = self.W2(final_encoding) # y.size() = (batch_size, num_sequences, hidden_size)
		y = y.permute(0, 2, 1) # y.size() = (batch_size, hidden_size, num_sequences)
		y = F.max_pool1d(y, y.size()[2]) # y.size() = (batch_size, hidden_size, 1)
		y = y.squeeze(2)
		logits = self.label(y)

		return logits

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F

class RNN(nn.Module):
	"""Two-layer bidirectional vanilla RNN classifier over frozen pre-trained word embeddings."""

	def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):
		"""
		Arguments
		---------
		batch_size : Size of the batch; stored on the instance only
		output_size : Number of target classes (length of the logits vector)
		hidden_size : Size of the hidden_state of the RNN
		vocab_size : Size of the vocabulary containing unique words
		embedding_length : Embedding dimension of the word embeddings
		weights : Pre-trained word embeddings used as the (frozen) word-embedding look-up table

		"""
		super(RNN, self).__init__()

		self.batch_size = batch_size
		self.output_size = output_size
		self.hidden_size = hidden_size
		self.vocab_size = vocab_size
		self.embedding_length = embedding_length

		self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
		self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False)
		self.rnn = nn.RNN(embedding_length, hidden_size, num_layers=2, bidirectional=True)
		# 4 = num_layers * num_directions: the final states of all layers/directions are concatenated.
		self.label = nn.Linear(4*hidden_size, output_size)

	def forward(self, input_sentences, batch_size=None):
		"""
		Parameters
		----------
		input_sentences: token indices of shape = (batch_size, num_sequences)
		batch_size : default = None. Kept for backward compatibility; the batch size is read from input_sentences

		Returns
		-------
		Output of the linear layer, which receives the concatenated final hidden states of the RNN.
		logits.size() = (batch_size, output_size)

		"""

		embedded = self.word_embeddings(input_sentences)
		embedded = embedded.permute(1, 0, 2)
		# Zero initial state sized from the actual batch and created on the input's device,
		# so partial batches and CPU-only runs work as well.
		h_0 = torch.zeros((4, embedded.size(1), self.hidden_size), dtype=embedded.dtype, device=embedded.device) # 4 = num_layers*num_directions
		output, h_n = self.rnn(embedded, h_0)
		# h_n.size() = (4, batch_size, hidden_size)
		h_n = h_n.permute(1, 0, 2) # h_n.size() = (batch_size, 4, hidden_size)
		h_n = h_n.contiguous().view(h_n.size()[0], h_n.size()[1]*h_n.size()[2])
		# h_n.size() = (batch_size, 4*hidden_size)
		logits = self.label(h_n) # logits.size() = (batch_size, output_size)

		return logits

In [None]:
# _*_ coding: utf-8 _*_

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F

class SelfAttention(nn.Module):
	"""Bidirectional LSTM with structured self-attention (30 attention rows) followed by two linear layers."""

	def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length, weights):
		"""
		Arguments
		---------
		batch_size : Size of the batch; stored on the instance only
		output_size : Number of target classes (length of the logits vector)
		hidden_size : Size of the hidden_state of the LSTM
		vocab_size : Size of the vocabulary containing unique words
		embedding_length : Embedding dimension of the word embeddings
		weights : Pre-trained word embeddings used as the (frozen) word-embedding look-up table

		"""
		super(SelfAttention, self).__init__()

		self.batch_size = batch_size
		self.output_size = output_size
		self.hidden_size = hidden_size
		self.vocab_size = vocab_size
		self.embedding_length = embedding_length
		self.weights = weights

		self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
		# The attribute nn.Embedding reads is `weight`; assigning to `weights` would only register an
		# unused extra parameter and leave the look-up table randomly initialised.
		self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False)
		# NOTE(review): nn.LSTM applies `dropout` between stacked layers; with the single layer used
		# here the value presumably has no effect (PyTorch warns about it) - confirm before tuning it.
		self.dropout = 0.8
		self.bilstm = nn.LSTM(embedding_length, hidden_size, dropout=self.dropout, bidirectional=True)
		# We will use da = 350, r = 30 & penalization_coeff = 1 as per given in the self-attention original ICLR paper
		self.W_s1 = nn.Linear(2*hidden_size, 350)
		self.W_s2 = nn.Linear(350, 30)
		self.fc_layer = nn.Linear(30*2*hidden_size, 2000)
		self.label = nn.Linear(2000, output_size)

	def attention_net(self, lstm_output):
		"""
		Self-attention producing a matrix embedding of the input sentence: every one of the 30 rows is a
		weighting over the time steps that focuses on a specific part of the sentence.

		Arguments
		---------

		lstm_output = A tensor containing hidden states corresponding to each time step of the LSTM network.
		---------

		Returns : Attention weight matrix with one row of time-step weights (summing to 1) per sentence embedding.

		Tensor size : lstm_output.size() = (batch_size, num_seq, 2*hidden_size)
					  attn_weight_matrix.size() = (batch_size, 30, num_seq)

		"""
		# torch.tanh replaces the deprecated F.tanh; the computation is the same.
		attn_weight_matrix = self.W_s2(torch.tanh(self.W_s1(lstm_output)))
		attn_weight_matrix = attn_weight_matrix.permute(0, 2, 1)
		attn_weight_matrix = F.softmax(attn_weight_matrix, dim=2)

		return attn_weight_matrix

	def forward(self, input_sentences, batch_size=None):
		"""
		Parameters
		----------
		input_sentences: token indices of shape = (batch_size, num_sequences)
		batch_size : default = None. Kept for backward compatibility; the batch size is read from input_sentences

		Returns
		-------
		Output of the linear layer containing one logit per class.
		logits.size() = (batch_size, output_size)

		"""

		embedded = self.word_embeddings(input_sentences)
		embedded = embedded.permute(1, 0, 2)
		# Zero initial states (2 = number of directions) sized from the actual batch and created on the
		# input's device, so partial batches and CPU-only runs work as well.
		state_shape = (2, embedded.size(1), self.hidden_size)
		h_0 = torch.zeros(state_shape, dtype=embedded.dtype, device=embedded.device)
		c_0 = torch.zeros(state_shape, dtype=embedded.dtype, device=embedded.device)

		output, (h_n, c_n) = self.bilstm(embedded, (h_0, c_0))
		output = output.permute(1, 0, 2)
		# output.size() = (batch_size, num_seq, 2*hidden_size)
		attn_weight_matrix = self.attention_net(output)
		# attn_weight_matrix.size() = (batch_size, r, num_seq)
		hidden_matrix = torch.bmm(attn_weight_matrix, output)
		# hidden_matrix.size() = (batch_size, r, 2*hidden_size)
		# Flatten the r sentence embeddings and feed them to the fully connected layer.
		fc_out = self.fc_layer(hidden_matrix.view(-1, hidden_matrix.size()[1]*hidden_matrix.size()[2]))
		logits = self.label(fc_out)
		# logits.size() = (batch_size, output_size)

		return logits

After defining the models, we define the dataset loader and the functions used to train and evaluate a model.

In [None]:
# _*_ coding: utf-8 _*_

import os
import sys
import torch
from torch.nn import functional as F
import numpy as np
from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors, GloVe

from torchtext.data import Field, Dataset, Example
import pandas as pd

class DataFrameDataset(Dataset):
    """torchtext ``Dataset`` whose examples are built from the rows of a pandas DataFrame."""

    def __init__(self, examples, fields, filter_pred=None):
        """
        Parameters
        ----------
        examples : pandas DataFrame; every row is converted with ``SeriesExample.fromSeries``
        fields : dict mapping a column name to its Field (or a tuple of names to a tuple of Fields)
        filter_pred : optional predicate; only examples for which it returns a true value are kept
        """
        self.examples = examples.apply(SeriesExample.fromSeries, args=(fields,), axis=1).tolist()
        if filter_pred is not None:
            # filter() is a one-shot iterator in Python 3; materialise it so the examples can be
            # measured with len(), indexed and iterated more than once.
            self.examples = list(filter(filter_pred, self.examples))
        self.fields = dict(fields)

        # Expand entries whose key is a tuple of names into one entry per name.
        for n, f in list(self.fields.items()):
            if isinstance(n, tuple):
                self.fields.update(zip(n, f))
                del self.fields[n]

class SeriesExample(Example):
    """torchtext ``Example`` that can be created from a pandas Series (one DataFrame row)."""

    @classmethod
    def fromSeries(cls, data, fields):
        """Turn the Series into a plain dict and delegate to ``fromdict``."""
        row = data.to_dict()
        return cls.fromdict(row, fields)

    @classmethod
    def fromdict(cls, data, fields):
        """Build an example with one attribute per entry of ``fields``.

        The value is run through ``field.preprocess`` unless the field is None, in which
        case it is stored as-is. Raises ValueError when a key of ``fields`` is not in ``data``.
        """
        example = cls()
        for name, field in fields.items():
            if name not in data:
                raise ValueError("Specified key {} was not found in "
                     "the input data".format(name))
            value = data[name]
            setattr(example, name, value if field is None else field.preprocess(value))
        return example


def load_dataset(test_sen=None, path=None):
    """
    Load a CSV file with 'text' and 'label' columns and build the vocabularies and batch iterators.

    tokenizer : Breaks sentences into a list of words (whitespace split).
    Field : A class that stores information about the way of preprocessing.
    fix_length : TorchText can pad each sequence dynamically to the longest sequence of its batch; here
                 fix_length is used instead, so every sequence is padded to a fixed length of 200.
    build_vocab : Builds a vocabulary mapping every unique word of the training split to an index and
                  attaches the GloVe embedding of every word.
    vocab.vectors : A torch tensor of shape (vocab_size x embedding_dim) containing the pre-trained word embeddings.
    BucketIterator : An iterator that batches examples of similar lengths together to minimize padding.

    Parameters
    ----------
    test_sen : unused; kept so existing calls keep working
    path : path of the CSV file to load (required)

    Returns
    -------
    TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter, test_ds

    Raises
    ------
    ValueError : if ``path`` is not given
    """
    # `path` needs a default because it follows a defaulted parameter (a bare `path` there is a
    # SyntaxError); None is rejected explicitly so a missing path cannot go unnoticed.
    if path is None:
        raise ValueError("load_dataset() requires `path`: the CSV file with 'text' and 'label' columns")

    tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200)
    LABEL = data.LabelField(tensor_type=torch.FloatTensor)
    datafields = { 'label' : LABEL, 'text' : TEXT }

    frame = pd.read_csv(path)
    torch_dataset = DataFrameDataset(frame, datafields)

    train_ds, test_ds = torch_dataset.split(split_ratio=[0.9, 0.1])

    TEXT.build_vocab(train_ds, vectors=GloVe(name='6B', dim=300))
    # Build the label vocabulary from both splits: using the small test split alone can miss a
    # label that only occurs in the training data.
    LABEL.build_vocab(train_ds, test_ds)

    word_embeddings = TEXT.vocab.vectors
    print ("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print ("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print ("Label Length: " + str(len(LABEL.vocab)))

    train_data, valid_data = train_ds.split() # Further splitting of the training data into training & validation data
    train_iter, valid_iter, test_iter = data.BucketIterator.splits((train_data, valid_data, test_ds), batch_size=32, sort_key=lambda x: len(x.text), repeat=False, shuffle=True)

    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter, test_ds

In [None]:
def clip_gradient(model, clip_value):
    """Clamp, in place, every available parameter gradient of ``model`` to [-clip_value, clip_value].

    Parameters without a gradient (``grad is None``) are skipped.
    """
    for p in model.parameters():
        # Identity test: `!= None` would go through the tensor's own comparison operator.
        if p.grad is not None:
            p.grad.data.clamp_(-clip_value, clip_value)

def train_model(model, train_iter, epoch):
    """Train ``model`` for one pass over ``train_iter``.

    Parameters
    ----------
    model : network called as ``model(text)``; moved to the GPU when CUDA is available
    train_iter : iterable of batches exposing ``batch.text`` (its first element is the token tensor)
                 and ``batch.label``
    epoch : zero-based epoch index, used only in the progress message

    Returns
    -------
    (mean_loss, mean_accuracy_percent), averaged over the batches that were actually trained on.

    Notes
    -----
    Relies on the notebook-level ``loss_fn`` and ``clip_gradient``. A new Adam optimizer with its
    default learning rate is created on every call. Batches whose size is not 32 are skipped.
    """
    use_cuda = torch.cuda.is_available()
    total_epoch_loss = 0
    total_epoch_acc = 0
    # Move the model only when CUDA exists, matching how the batches are handled below.
    if use_cuda:
        model.cuda()
    optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
    steps = 0
    model.train()
    for idx, batch in enumerate(train_iter):
        text = batch.text[0]
        target = batch.label.long()
        if use_cuda:
            text = text.cuda()
            target = target.cuda()
        if (text.size()[0] != 32):# One of the batch returned by BucketIterator has length different than 32.
            continue
        optim.zero_grad()
        prediction = model(text)
        loss = loss_fn(prediction, target)
        num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).float().sum()
        acc = 100.0 * num_corrects/len(batch)
        loss.backward()
        clip_gradient(model, 1e-1)
        optim.step()
        steps += 1

        if steps % 100 == 0:
            print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%')

        total_epoch_loss += loss.item()
        total_epoch_acc += acc.item()

    # Average over the batches that were processed: dividing by len(train_iter) would count the
    # skipped batches as zero loss / zero accuracy. max() avoids a division by zero.
    processed_batches = max(steps, 1)
    return total_epoch_loss/processed_batches, total_epoch_acc/processed_batches

def eval_model(model, val_iter):
    """Evaluate ``model`` on ``val_iter`` without computing gradients.

    Parameters
    ----------
    model : network called as ``model(text)``
    val_iter : iterable of batches exposing ``batch.text`` (its first element is the token tensor)
               and ``batch.label``

    Returns
    -------
    (mean_loss, mean_accuracy_percent, predictions): the means are taken over the batches that were
    evaluated, and ``predictions`` lists the predicted class index of every evaluated example.

    Notes
    -----
    Relies on the notebook-level ``loss_fn``. Batches whose size is not 32 are skipped.
    """
    use_cuda = torch.cuda.is_available()
    total_epoch_loss = 0
    total_epoch_acc = 0
    evaluated_batches = 0
    model.eval()
    predictions = []
    with torch.no_grad():
        for idx, batch in enumerate(val_iter):
            text = batch.text[0]
            if (text.size()[0] != 32):
                continue
            target = batch.label.long()
            if use_cuda:
                text = text.cuda()
                target = target.cuda()
            prediction = model(text)
            loss = loss_fn(prediction, target)
            predicted_labels = torch.max(prediction, 1)[1]
            # Collect the predicted classes so the returned list is no longer always empty.
            predictions.extend(predicted_labels.view(-1).tolist())
            # .float() keeps the accuracy a floating-point value, as in train_model.
            num_corrects = (predicted_labels.view(target.size()).data == target.data).float().sum()
            acc = 100.0 * num_corrects/len(batch)
            total_epoch_loss += loss.item()
            total_epoch_acc += acc.item()
            evaluated_batches += 1

    # Average over the evaluated batches only; max() avoids a division by zero.
    evaluated_batches = max(evaluated_batches, 1)
    return total_epoch_loss/evaluated_batches, total_epoch_acc/evaluated_batches, predictions

Below are the 12 categories. For each category we train a binary classifier that decides whether a description belongs to that category. Since the code for each category is similar, only the first one (Minor Crash or Not) is commented.

# Minor Crash Or Not (Binary Classification)

In [None]:
# Redo the categorization as a binary task: load the crash / not-crash sheet and map each of
# its category labels to an integer index. The labels are sorted so the mapping is the same on
# every run (iterating a bare set of strings can change order after a kernel restart).
crash_df = pd.read_excel("/content/drive/MyDrive/CIVIC-2023/CallDispatching/crash_or_not.xlsx")
integer_encoding_map = {
    category: index
    for index, category in enumerate(sorted(set(crash_df["category"].values), key=str))
}
print(integer_encoding_map)

In [None]:
# Convert to csv with a one-hot label per description. The vector length follows the number of
# encoded categories instead of a fixed 2, so an unexpected extra category cannot overflow it.
num_categories = len(integer_encoding_map)
dataset = []

for _, item in crash_df.iterrows():
    one_hot = [0] * num_categories
    one_hot[integer_encoding_map[item['category']]] = 1
    dataset.append({'text': item['descriptions'], 'label': one_hot})

temp_df = pd.DataFrame(dataset)
temp_df.to_csv('crash_or_not.csv')

# Last expression of the cell, so the preview is actually displayed.
temp_df.head()

In [None]:
# Load the binary crash / not-crash dataset that was just written to crash_or_not.csv.
# The CSV path has to be passed explicitly; without it the loader does not know which file to read.
TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter, test_ds = load_dataset(path='crash_or_not.csv')

In [None]:
# TODO: tune the hyper-parameters below and select the model to train.
# NOTE: learning_rate is defined here but train_model builds its Adam optimizer without it.
learning_rate = 3e-4
batch_size = 32
output_size = 2
hidden_size = 256
embedding_length = 300

# Active model: CNN. To try another architecture, comment it out and enable one of the alternatives.
model = CNN(
    batch_size=batch_size,
    output_size=output_size,
    in_channels=1,
    out_channels=1,
    kernel_heights=[3,6,9],
    stride=1,
    padding=0,
    keep_probab=0.5,
    vocab_size=vocab_size,
    embedding_length=embedding_length,
    weights=word_embeddings,
)
# model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = AttentionModel(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = RCNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = RNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = SelfAttention(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)

# Loss used by train_model / eval_model.
loss_fn = F.cross_entropy

In [None]:
# Train the selected model and report train / validation / test metrics after every epoch.
# TODO: You can adjust the number of epochs here.

for epoch in range(30):
    train_loss, train_acc = train_model(model, train_iter, epoch)
    val_loss, val_acc, _ = eval_model(model, valid_iter)
    test_loss, test_acc, preds = eval_model(model, test_iter)
    # The validation loss uses `.3f` like the other losses (`:3f` would mean "width 3, six decimals").
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:.3f}, Val. Acc: {val_acc:.2f}%, Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

# test_loss, test_acc = eval_model(model, test_iter)
# print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

In [None]:
# Preview the first rows of the crash / not-crash data.
crash_df.head()

In [None]:
# Separate the rows into positive examples (category "crash", label 1) and negative examples
# (every other category, label 0).
positive = []
negative = []

for _, row in crash_df.iterrows():
    is_crash = row["category"] == "crash"
    bucket = positive if is_crash else negative
    bucket.append({'label': 1 if is_crash else 0, "text": row['descriptions']})

In [None]:
from datasets import Dataset

In [None]:
# Combine both classes into one table and wrap it as a Hugging Face Dataset.
all_records = pd.DataFrame(positive + negative)

# Hold out 20% for testing. A fixed seed keeps the split (and anything saved
# from it) identical across kernel restarts.
allDataset = Dataset.from_pandas(all_records).train_test_split(test_size=0.2, seed=2023)

# Sanity check: print the set of labels present in each split.
print(set(allDataset['train']['label']))
print(set(allDataset['test']['label']))

In [None]:
# parse the training dataset
# Persist both splits to Drive as CSV so the exact train/test partition can be reused.
allDataset['train'].to_csv("/content/drive/MyDrive/311_datasets/crash_or_not/train_conf.csv")
allDataset['test'].to_csv("/content/drive/MyDrive/311_datasets/crash_or_not/test_conf.csv")

In [None]:
!pip install transformers[torch]

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
  """Tokenize the "text" field of a batch with max-length padding and truncation."""
  encoded = tokenizer(examples["text"], padding="max_length", truncation=True)
  return encoded


# Add the tokenizer outputs to every split of the dataset.
tokenized_datasets = allDataset.map(tokenize_function, batched=True)

# Two-label sequence classifier; its `dropout` module probability is set to 0.5.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.dropout.p = 0.5

In [None]:
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy

# training_args = TrainingArguments(output_dir="test_trainer", logging_steps=1)

import numpy as np


def compute_metrics(eval_pred):
    """Return {"accuracy": value} for a Trainer evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class is the
    argmax over the last logits axis. Accuracy is computed with NumPy directly
    because `datasets.load_metric` has been removed from current `datasets`
    releases, so importing it fails on a fresh install.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

from transformers import TrainingArguments, Trainer

# Evaluate once per epoch, train for 5 epochs, log every step.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=5, logging_steps=1)

# NOTE(review): despite the "small_" prefix these select every row (range(len(...))),
# so they are shuffled copies of the full splits. Nothing in this cell uses them:
# the Trainer below is given the tokenized_datasets splits directly.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=2023).select(range(len(tokenized_datasets["train"])))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=2023).select(range(len(tokenized_datasets["test"])))


# The "test" split is also used as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
# Depending on the experiment purpose, add/ignore the dropout layer

# model.dropout.train()

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["test"],
#     compute_metrics=compute_metrics,
# )

trainer.evaluate()

In [None]:
trainer.evaluate()

In [None]:
trainer.save_model("/content/drive/MyDrive/bert-crash-classification")

In [None]:
# Run the trainer's model over the held-out split.
preds_report = trainer.predict(tokenized_datasets['test'])

# Predicted class = argmax over each row of the first element of the output.
preds = [np.argmax(row) for row in preds_report[0]]

# Gold labels, read in the same order as the split was predicted.
golds = [example['label'] for example in tokenized_datasets["test"]]

In [None]:
# Generates the confusion matrix
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# Confusion matrix of gold labels (first argument) vs. predicted labels.
cm = confusion_matrix(golds, preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
# Text report computed from the same gold/predicted label lists.
print(classification_report(golds, preds))

In [None]:
def most_frequent(nums):
  """Return (value, count) for the most common element of `nums`.

  Ties go to the value that was seen first. An empty `nums` raises
  ValueError (from `max` on an empty mapping).
  """
  tally = {}
  for value in nums:
    tally[value] = tally.get(value, 0) + 1

  winner = max(tally, key=lambda candidate: tally[candidate])
  return winner, tally[winner]

In [None]:
from tqdm import tqdm

In [None]:
# Number of stochastic forward passes per example (also the confidence denominator).
NUM_MC_SAMPLES = 100

model = model.to('cuda')
# Put the model's `dropout` module in training mode so it stays active during
# the repeated forward passes below.
model.dropout.train()

preds = []
confs = []

# Inference only: without no_grad every one of the forward passes would build
# an autograd graph that is never used.
with torch.no_grad():
    for example in tqdm(tokenized_datasets['test']):
        encoded_input = tokenizer(example['text'], padding=True, truncation=True, return_tensors='pt').to("cuda")

        # Predicted class of each stochastic pass.
        preds_list = [
            int(torch.argmax(model(**encoded_input).logits).item())
            for _ in range(NUM_MC_SAMPLES)
        ]

        # Majority vote; confidence is the fraction of passes that agreed with it.
        finalized_pred, support = most_frequent(preds_list)
        preds.append(finalized_pred)
        confs.append(support / NUM_MC_SAMPLES)

In [None]:
# Gold labels of the test split, in iteration order.
golds = [example['label'] for example in tokenized_datasets["test"]]

In [None]:
print(confs[0])

In [None]:
# Keep only the examples whose confidence is at least 0.9, preserving order.
confident_pairs = [(p, g) for p, c, g in zip(preds, confs, golds) if c >= 0.9]
new_preds = [p for p, _ in confident_pairs]
new_golds = [g for _, g in confident_pairs]

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# Same evaluation as before, restricted to the confidence-filtered examples.
cm = confusion_matrix(new_golds, new_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
print(classification_report(new_golds, new_preds))

# Lost Or Not

In [None]:
crash_df = pd.read_excel("/content/drive/MyDrive/CIVIC-2023/CallDispatching/lost_or_not.xlsx")
# Give each category an integer index in order of first appearance. Iterating
# a `set` of strings is not stable across interpreter runs, so the index of a
# category could otherwise change between kernel restarts.
integer_encoding_map = {
    category: index for index, category in enumerate(crash_df["category"].unique())
}
print(integer_encoding_map)

In [None]:
# Width of the one-hot label: at least 2 (the binary case this notebook uses),
# but wide enough for every mapped category so an extra category cannot raise
# IndexError.
num_classes = max(2, len(integer_encoding_map))

dataset = []
for _, item in crash_df.iterrows():
    one_hot = [0] * num_classes
    one_hot[integer_encoding_map[item['category']]] = 1
    dataset.append({'text': item['descriptions'], 'label': one_hot})

# Write the (text, one-hot label) table to CSV.
temp_df = pd.DataFrame(dataset)
temp_df.to_csv('lost_or_not.csv')

In [None]:
# NOTE(review): load_dataset() is called without arguments, so which file it reads is
# decided inside that function -- confirm it points at the CSV written for this task.
TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter, test_ds = load_dataset()

# Hyperparameters for the baseline model built below.
learning_rate = 3e-4  # not referenced again in this cell; presumably read by train_model -- verify
batch_size = 32
output_size = 2  # two output classes (binary task)
hidden_size = 256  # only passed to the commented-out alternative models below
embedding_length = 300

# Exactly one `model = ...` line should be active; the CNN is the current choice.
# model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
model = CNN(batch_size=batch_size, output_size=output_size, in_channels=1, out_channels=1, kernel_heights=[3,6,9], stride=1, padding=0, keep_probab=0.5, vocab_size=vocab_size, embedding_length=embedding_length, weights=word_embeddings)
# model = AttentionModel(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = RCNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = RNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = SelfAttention(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# Not used in this cell; presumably picked up as a global by the training helpers -- verify.
loss_fn = F.cross_entropy

In [None]:
# Train for 30 epochs; after each epoch evaluate on the validation and test
# iterators and print one summary line.
for epoch in range(30):
    train_loss, train_acc = train_model(model, train_iter, epoch)
    val_loss, val_acc, _ = eval_model(model, valid_iter)
    test_loss, test_acc, preds = eval_model(model, test_iter)
    # All losses use `.3f`; the former `:3f` on Val. Loss was a width spec and printed six decimals.
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:.3f}, Val. Acc: {val_acc:.2f}%, Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
  """Tokenize the "text" field of a batch with max-length padding and truncation."""
  return tokenizer(examples["text"], padding="max_length", truncation=True)

# NOTE(review): in notebook order this cell runs BEFORE the cells that rebuild
# `allDataset` from the lost-or-not data, so on a top-to-bottom run this maps over
# whatever `allDataset` the previous section left behind. Re-run after the split is created.
tokenized_datasets = allDataset.map(tokenize_function, batched=True)
from transformers import AutoModelForSequenceClassification

# Two-label sequence classifier; its `dropout` module probability is then set to 0.5.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.dropout.p=0.5

In [None]:
# Binary relabelling: rows whose category is "lost" become the positive class
# (label 1); every other row becomes the negative class (label 0).
records = list(zip(crash_df["category"], crash_df["descriptions"]))
positive = [{'label': 1, "text": text} for category, text in records if category == "lost"]
negative = [{'label': 0, "text": text} for category, text in records if not category == "lost"]

In [None]:
from datasets import Dataset

# Combine both classes into one table and wrap it as a Hugging Face Dataset.
all_records = pd.DataFrame(positive + negative)

# Hold out 20% for testing. A fixed seed keeps the split (and anything saved
# from it) identical across kernel restarts.
allDataset = Dataset.from_pandas(all_records).train_test_split(test_size=0.2, seed=2023)

# Sanity check: print the set of labels present in each split.
print(set(allDataset['train']['label']))
print(set(allDataset['test']['label']))

In [None]:
# Persist both splits to Drive as CSV so the exact train/test partition can be reused.
allDataset['train'].to_csv("/content/drive/MyDrive/311_datasets/lost_or_not/train_conf.csv")
allDataset['test'].to_csv("/content/drive/MyDrive/311_datasets/lost_or_not/test_conf.csv")

In [None]:
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy

# training_args = TrainingArguments(output_dir="test_trainer", logging_steps=1)

import numpy as np


def compute_metrics(eval_pred):
    """Return {"accuracy": value} for a Trainer evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class is the
    argmax over the last logits axis. Accuracy is computed with NumPy directly
    because `datasets.load_metric` has been removed from current `datasets`
    releases, so importing it fails on a fresh install.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

from transformers import TrainingArguments, Trainer

# Re-tokenize here. The tokenization cell of this section runs before the
# lost-or-not `allDataset` is built, so on a top-to-bottom run the existing
# `tokenized_datasets` still holds the previous section's data. Mapping again
# guarantees the Trainer sees the current split.
tokenized_datasets = allDataset.map(tokenize_function, batched=True)

# Evaluate once per epoch, train for 3 epochs, log every step.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=3, logging_steps=1)

# Despite the "small_" prefix these select every row, i.e. they are shuffled
# copies of the full splits; the Trainer below does not use them.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=2023).select(range(len(tokenized_datasets["train"])))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=2023).select(range(len(tokenized_datasets["test"])))


# The "test" split is also used as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
trainer.save_model("/content/drive/MyDrive/bert-lost-classification-conf")

In [None]:
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

# Run the trainer's model over the held-out split.
preds_report = trainer.predict(tokenized_datasets['test'])

# Predicted class = argmax over each row of the first element of the output.
preds = [np.argmax(row) for row in preds_report[0]]

# Gold labels, read in the same order as the split was predicted.
golds = [example['label'] for example in tokenized_datasets["test"]]

# Confusion-matrix plot followed by the text classification report.
cm = confusion_matrix(golds, preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
print(classification_report(golds, preds))

In [None]:
# Number of stochastic forward passes per example (also the confidence denominator).
NUM_MC_SAMPLES = 100

model = model.to('cuda')
# Put the model's `dropout` module in training mode so it stays active during
# the repeated forward passes below.
model.dropout.train()

preds = []
confs = []

# Inference only: without no_grad every one of the forward passes would build
# an autograd graph that is never used.
with torch.no_grad():
    for example in tqdm(tokenized_datasets['test']):
        encoded_input = tokenizer(example['text'], padding=True, truncation=True, return_tensors='pt').to("cuda")

        # Predicted class of each stochastic pass.
        preds_list = [
            int(torch.argmax(model(**encoded_input).logits).item())
            for _ in range(NUM_MC_SAMPLES)
        ]

        # Majority vote; confidence is the fraction of passes that agreed with it.
        finalized_pred, support = most_frequent(preds_list)
        preds.append(finalized_pred)
        confs.append(support / NUM_MC_SAMPLES)

In [None]:
# Keep only the examples whose confidence is at least 0.9, preserving order.
confident_pairs = [(p, g) for p, c, g in zip(preds, confs, golds) if c >= 0.9]
new_preds = [p for p, _ in confident_pairs]
new_golds = [g for _, g in confident_pairs]

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# Confusion matrix and report restricted to the confidence-filtered examples.
cm = confusion_matrix(new_golds, new_preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
print(classification_report(new_golds, new_preds))

# Aggressive Or Not

In [None]:
crash_df = pd.read_excel("/content/drive/MyDrive/CIVIC-2023/CallDispatching/aggressive_or_not.xlsx")
# Give each category an integer index in order of first appearance. Iterating
# a `set` of strings is not stable across interpreter runs, so the index of a
# category could otherwise change between kernel restarts.
integer_encoding_map = {
    category: index for index, category in enumerate(crash_df["category"].unique())
}
print(integer_encoding_map)

In [None]:
# Width of the one-hot label: at least 2 (the binary case this notebook uses),
# but wide enough for every mapped category so an extra category cannot raise
# IndexError.
num_classes = max(2, len(integer_encoding_map))

dataset = []
for _, item in crash_df.iterrows():
    one_hot = [0] * num_classes
    one_hot[integer_encoding_map[item['category']]] = 1
    dataset.append({'text': item['descriptions'], 'label': one_hot})

# Write the (text, one-hot label) table to CSV.
temp_df = pd.DataFrame(dataset)
temp_df.to_csv('aggressive_or_not.csv')

In [None]:
# NOTE(review): load_dataset() is called without arguments, so which file it reads is
# decided inside that function -- confirm it points at the CSV written for this task.
TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter, test_ds = load_dataset()

# Hyperparameters for the baseline model built below.
learning_rate = 3e-4  # not referenced again in this cell; presumably read by train_model -- verify
batch_size = 32
output_size = 2  # two output classes (binary task)
hidden_size = 256  # only passed to the commented-out alternative models below
embedding_length = 300

# Exactly one `model = ...` line should be active; the CNN is the current choice.
# model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
model = CNN(batch_size=batch_size, output_size=output_size, in_channels=1, out_channels=1, kernel_heights=[3,6,9], stride=1, padding=0, keep_probab=0.5, vocab_size=vocab_size, embedding_length=embedding_length, weights=word_embeddings)
# model = AttentionModel(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = RCNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = RNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = SelfAttention(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# Not used in this cell; presumably picked up as a global by the training helpers -- verify.
loss_fn = F.cross_entropy

In [None]:
# Train for 30 epochs; after each epoch evaluate on the validation and test
# iterators and print one summary line.
for epoch in range(30):
    train_loss, train_acc = train_model(model, train_iter, epoch)
    val_loss, val_acc, _ = eval_model(model, valid_iter)
    test_loss, test_acc, preds = eval_model(model, test_iter)
    # All losses use `.3f`; the former `:3f` on Val. Loss was a width spec and printed six decimals.
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:.3f}, Val. Acc: {val_acc:.2f}%, Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

In [None]:
# Binary relabelling: rows whose category is "aggressive driver" become the
# positive class (label 1); every other row becomes the negative class (label 0).
records = list(zip(crash_df["category"], crash_df["descriptions"]))
positive = [{'label': 1, "text": text} for category, text in records if category == "aggressive driver"]
negative = [{'label': 0, "text": text} for category, text in records if not category == "aggressive driver"]

In [None]:
from datasets import Dataset

# Combine both classes into one table and wrap it as a Hugging Face Dataset.
all_records = pd.DataFrame(positive + negative)

# Hold out 20% for testing. A fixed seed keeps the split (and anything saved
# from it) identical across kernel restarts.
allDataset = Dataset.from_pandas(all_records).train_test_split(test_size=0.2, seed=2023)

# Sanity check: print the set of labels present in each split.
print(set(allDataset['train']['label']))
print(set(allDataset['test']['label']))

In [None]:
# Persist both splits to Drive as CSV so the exact train/test partition can be reused.
allDataset['train'].to_csv("/content/drive/MyDrive/311_datasets/aggressive_or_not/train.csv")
allDataset['test'].to_csv("/content/drive/MyDrive/311_datasets/aggressive_or_not/test.csv")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
  """Tokenize the "text" field of a batch with max-length padding and truncation."""
  encoded = tokenizer(examples["text"], padding="max_length", truncation=True)
  return encoded


# Add the tokenizer outputs to every split of the dataset.
tokenized_datasets = allDataset.map(tokenize_function, batched=True)

# Two-label sequence classifier initialised from the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

In [None]:
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy

# training_args = TrainingArguments(output_dir="test_trainer", logging_steps=1)

import numpy as np


def compute_metrics(eval_pred):
    """Return {"accuracy": value} for a Trainer evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class is the
    argmax over the last logits axis. Accuracy is computed with NumPy directly
    because `datasets.load_metric` has been removed from current `datasets`
    releases, so importing it fails on a fresh install.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

from transformers import TrainingArguments, Trainer

# Evaluate once per epoch, train for 5 epochs, log every step.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=5, logging_steps=1)

# NOTE(review): despite the "small_" prefix these select every row (range(len(...))),
# so they are shuffled copies of the full splits. Nothing in this cell uses them:
# the Trainer below is given the tokenized_datasets splits directly.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=2023).select(range(len(tokenized_datasets["train"])))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=2023).select(range(len(tokenized_datasets["test"])))


# The "test" split is also used as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
trainer.save_model("/content/drive/MyDrive/bert-aggressive-classification")

In [None]:
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

# Run the trainer's model over the held-out split.
preds_report = trainer.predict(tokenized_datasets['test'])

# Predicted class = argmax over each row of the first element of the output.
preds = [np.argmax(row) for row in preds_report[0]]

# Gold labels, read in the same order as the split was predicted.
golds = [example['label'] for example in tokenized_datasets["test"]]

# Confusion-matrix plot followed by the text classification report.
cm = confusion_matrix(golds, preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
print(classification_report(golds, preds))

# Check Welfare or not

In [None]:
crash_df = pd.read_excel("/content/drive/MyDrive/CIVIC-2023/CallDispatching/welfare_or_not.xlsx")
# Give each category an integer index in order of first appearance. Iterating
# a `set` of strings is not stable across interpreter runs, so the index of a
# category could otherwise change between kernel restarts.
integer_encoding_map = {
    category: index for index, category in enumerate(crash_df["category"].unique())
}
print(integer_encoding_map)

# Width of the one-hot label: at least 2 (the binary case this notebook uses),
# but wide enough for every mapped category so an extra category cannot raise
# IndexError.
num_classes = max(2, len(integer_encoding_map))

dataset = []
for _, item in crash_df.iterrows():
    one_hot = [0] * num_classes
    one_hot[integer_encoding_map[item['category']]] = 1
    dataset.append({'text': item['descriptions'], 'label': one_hot})

# Write the (text, one-hot label) table to CSV.
temp_df = pd.DataFrame(dataset)
temp_df.to_csv('welfare_or_not.csv')

# NOTE(review): load_dataset() is called without arguments, so which file it reads is
# decided inside that function -- confirm it points at the CSV written for this task.
TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter, test_ds = load_dataset()

# Hyperparameters for the baseline model built below.
learning_rate = 3e-4  # not referenced again in this block; presumably read by train_model -- verify
batch_size = 32
output_size = 2  # two output classes (binary task)
hidden_size = 256  # only passed to the commented-out alternative models below
embedding_length = 300

# Exactly one `model = ...` line should be active; the CNN is the current choice.
# model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
model = CNN(batch_size=batch_size, output_size=output_size, in_channels=1, out_channels=1, kernel_heights=[3,6,9], stride=1, padding=0, keep_probab=0.5, vocab_size=vocab_size, embedding_length=embedding_length, weights=word_embeddings)
# model = AttentionModel(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = RCNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = RNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = SelfAttention(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# Not passed explicitly to the training helpers; presumably read as a global -- verify.
loss_fn = F.cross_entropy

# Train for 30 epochs; after each epoch evaluate on the validation and test
# iterators and print one summary line.
for epoch in range(30):
    train_loss, train_acc = train_model(model, train_iter, epoch)
    val_loss, val_acc, _ = eval_model(model, valid_iter)
    test_loss, test_acc, preds = eval_model(model, test_iter)
    # All losses use `.3f`; the former `:3f` on Val. Loss was a width spec and printed six decimals.
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:.3f}, Val. Acc: {val_acc:.2f}%, Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

In [None]:
# Binary relabelling: rows whose category is "check welfare" become the
# positive class (label 1); every other row becomes the negative class (label 0).
records = list(zip(crash_df["category"], crash_df["descriptions"]))
positive = [{'label': 1, "text": text} for category, text in records if category == "check welfare"]
negative = [{'label': 0, "text": text} for category, text in records if not category == "check welfare"]

from datasets import Dataset

# Combine both classes into one table and wrap it as a Hugging Face Dataset.
all_records = pd.DataFrame(positive + negative)

# Hold out 20% for testing. A fixed seed keeps the split (and the CSVs written
# below) identical across kernel restarts.
allDataset = Dataset.from_pandas(all_records).train_test_split(test_size=0.2, seed=2023)

# Sanity check: print the set of labels present in each split.
print(set(allDataset['train']['label']))
print(set(allDataset['test']['label']))

# Persist both splits to Drive as CSV.
allDataset['train'].to_csv("/content/drive/MyDrive/311_datasets/welfare_or_not/train.csv")
allDataset['test'].to_csv("/content/drive/MyDrive/311_datasets/welfare_or_not/test.csv")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
  """Tokenize the "text" field of a batch with max-length padding and truncation."""
  encoded = tokenizer(examples["text"], padding="max_length", truncation=True)
  return encoded


# Add the tokenizer outputs to every split of the dataset.
tokenized_datasets = allDataset.map(tokenize_function, batched=True)

# Two-label sequence classifier initialised from the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

from transformers import TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy

# training_args = TrainingArguments(output_dir="test_trainer", logging_steps=1)

import numpy as np


def compute_metrics(eval_pred):
    """Return {"accuracy": value} for a Trainer evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class is the
    argmax over the last logits axis. Accuracy is computed with NumPy directly
    because `datasets.load_metric` has been removed from current `datasets`
    releases, so importing it fails on a fresh install.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

from transformers import TrainingArguments, Trainer

# Evaluate once per epoch, train for 5 epochs, log every step.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=5, logging_steps=1)

# NOTE(review): despite the "small_" prefix these select every row (range(len(...))),
# so they are shuffled copies of the full splits. Nothing in this cell uses them:
# the Trainer below is given the tokenized_datasets splits directly.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=2023).select(range(len(tokenized_datasets["train"])))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=2023).select(range(len(tokenized_datasets["test"])))


# The "test" split is also used as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

# Fine-tune immediately in the same cell.
trainer.train()

In [None]:
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

trainer.evaluate()

# Run the trainer's model over the held-out split.
preds_report = trainer.predict(tokenized_datasets['test'])

# Predicted class = argmax over each row of the first element of the output.
preds = [np.argmax(row) for row in preds_report[0]]

# Gold labels, read in the same order as the split was predicted.
golds = [example['label'] for example in tokenized_datasets["test"]]

# Confusion-matrix plot followed by the text classification report.
cm = confusion_matrix(golds, preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
print(classification_report(golds, preds))

In [None]:
trainer.save_model("/content/drive/MyDrive/bert-welfare-classification")

# Damaged Property or Not

In [None]:
crash_df = pd.read_excel("/content/drive/MyDrive/CIVIC-2023/CallDispatching/damaged_or_not.xlsx")
# Give each category an integer index in order of first appearance. Iterating
# a `set` of strings is not stable across interpreter runs, so the index of a
# category could otherwise change between kernel restarts.
integer_encoding_map = {
    category: index for index, category in enumerate(crash_df["category"].unique())
}
print(integer_encoding_map)

# Width of the one-hot label: at least 2 (the binary case this notebook uses),
# but wide enough for every mapped category so an extra category cannot raise
# IndexError.
num_classes = max(2, len(integer_encoding_map))

dataset = []
for _, item in crash_df.iterrows():
    one_hot = [0] * num_classes
    one_hot[integer_encoding_map[item['category']]] = 1
    dataset.append({'text': item['descriptions'], 'label': one_hot})

# Write the (text, one-hot label) table to CSV.
temp_df = pd.DataFrame(dataset)
temp_df.to_csv('damaged_or_not.csv')

# NOTE(review): load_dataset() is called without arguments, so which file it reads is
# decided inside that function -- confirm it points at the CSV written for this task.
TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter, test_ds = load_dataset()

# Hyperparameters for the baseline model built below.
learning_rate = 3e-4  # not referenced again in this block; presumably read by train_model -- verify
batch_size = 32
output_size = 2  # two output classes (binary task)
hidden_size = 256  # only passed to the commented-out alternative models below
embedding_length = 300

# Exactly one `model = ...` line should be active; the CNN is the current choice.
# model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
model = CNN(batch_size=batch_size, output_size=output_size, in_channels=1, out_channels=1, kernel_heights=[3,6,9], stride=1, padding=0, keep_probab=0.5, vocab_size=vocab_size, embedding_length=embedding_length, weights=word_embeddings)
# model = AttentionModel(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = RCNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = RNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = SelfAttention(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# Not passed explicitly to the training helpers; presumably read as a global -- verify.
loss_fn = F.cross_entropy

# Train for 30 epochs; after each epoch evaluate on the validation and test
# iterators and print one summary line.
for epoch in range(30):
    train_loss, train_acc = train_model(model, train_iter, epoch)
    val_loss, val_acc, _ = eval_model(model, valid_iter)
    test_loss, test_acc, preds = eval_model(model, test_iter)
    # All losses use `.3f`; the former `:3f` on Val. Loss was a width spec and printed six decimals.
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:.3f}, Val. Acc: {val_acc:.2f}%, Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

In [None]:
# Binary relabelling: rows whose category is "damaged property" become the
# positive class (label 1); every other row becomes the negative class (label 0).
records = list(zip(crash_df["category"], crash_df["descriptions"]))
positive = [{'label': 1, "text": text} for category, text in records if category == "damaged property"]
negative = [{'label': 0, "text": text} for category, text in records if not category == "damaged property"]

from datasets import Dataset

# Combine both classes into one table and wrap it as a Hugging Face Dataset.
all_records = pd.DataFrame(positive + negative)

# Hold out 20% for testing. A fixed seed keeps the split (and the CSVs written
# below) identical across kernel restarts.
allDataset = Dataset.from_pandas(all_records).train_test_split(test_size=0.2, seed=2023)

# Sanity check: print the set of labels present in each split.
print(set(allDataset['train']['label']))
print(set(allDataset['test']['label']))

# Persist both splits to Drive as CSV.
allDataset['train'].to_csv("/content/drive/MyDrive/311_datasets/damaged_or_not/train.csv")
allDataset['test'].to_csv("/content/drive/MyDrive/311_datasets/damaged_or_not/test.csv")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
  """Tokenize the "text" field of a batch with max-length padding and truncation."""
  encoded = tokenizer(examples["text"], padding="max_length", truncation=True)
  return encoded


# Add the tokenizer outputs to every split of the dataset.
tokenized_datasets = allDataset.map(tokenize_function, batched=True)

# Two-label sequence classifier initialised from the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

from transformers import TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy

# training_args = TrainingArguments(output_dir="test_trainer", logging_steps=1)

import numpy as np


def compute_metrics(eval_pred):
    """Return {"accuracy": value} for a Trainer evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class is the
    argmax over the last logits axis. Accuracy is computed with NumPy directly
    because `datasets.load_metric` has been removed from current `datasets`
    releases, so importing it fails on a fresh install.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

from transformers import TrainingArguments, Trainer

# Evaluate once per epoch, train for 5 epochs, log every step.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=5, logging_steps=1)

# NOTE(review): despite the "small_" prefix these select every row (range(len(...))),
# so they are shuffled copies of the full splits. Nothing in this cell uses them:
# the Trainer below is given the tokenized_datasets splits directly.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=2023).select(range(len(tokenized_datasets["train"])))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=2023).select(range(len(tokenized_datasets["test"])))


# The "test" split is also used as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

# Fine-tune immediately in the same cell.
trainer.train()

In [None]:
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

trainer.evaluate()

# Run the trainer's model over the held-out split.
preds_report = trainer.predict(tokenized_datasets['test'])

# Predicted class = argmax over each row of the first element of the output.
preds = [np.argmax(row) for row in preds_report[0]]

# Gold labels, read in the same order as the split was predicted.
golds = [example['label'] for example in tokenized_datasets["test"]]

# Confusion-matrix plot followed by the text classification report.
cm = confusion_matrix(golds, preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
print(classification_report(golds, preds))

In [None]:
trainer.save_model("/content/drive/MyDrive/bert-damaged-classification")

# Noise or Not

In [None]:
crash_df = pd.read_excel("/content/drive/MyDrive/CIVIC-2023/CallDispatching/noise_or_not.xlsx")
# Give each category an integer index in order of first appearance. Iterating
# a `set` of strings is not stable across interpreter runs, so the index of a
# category could otherwise change between kernel restarts.
integer_encoding_map = {
    category: index for index, category in enumerate(crash_df["category"].unique())
}
print(integer_encoding_map)

# Width of the one-hot label: at least 2 (the binary case this notebook uses),
# but wide enough for every mapped category so an extra category cannot raise
# IndexError.
num_classes = max(2, len(integer_encoding_map))

dataset = []
for _, item in crash_df.iterrows():
    one_hot = [0] * num_classes
    one_hot[integer_encoding_map[item['category']]] = 1
    dataset.append({'text': item['descriptions'], 'label': one_hot})

# Write the (text, one-hot label) table to CSV.
temp_df = pd.DataFrame(dataset)
temp_df.to_csv('noise_or_not.csv')

# NOTE(review): load_dataset() is called without arguments, so which file it reads is
# decided inside that function -- confirm it points at the CSV written for this task.
TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter, test_ds = load_dataset()

# Hyperparameters for the baseline model built below.
learning_rate = 3e-4  # not referenced again in this block; presumably read by train_model -- verify
batch_size = 32
output_size = 2  # two output classes (binary task)
hidden_size = 256  # only passed to the commented-out alternative models below
embedding_length = 300

# Exactly one `model = ...` line should be active; the CNN is the current choice.
# model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
model = CNN(batch_size=batch_size, output_size=output_size, in_channels=1, out_channels=1, kernel_heights=[3,6,9], stride=1, padding=0, keep_probab=0.5, vocab_size=vocab_size, embedding_length=embedding_length, weights=word_embeddings)
# model = AttentionModel(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = RCNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = RNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = SelfAttention(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# Not passed explicitly to the training helpers; presumably read as a global -- verify.
loss_fn = F.cross_entropy

# Train for 30 epochs; after each epoch evaluate on the validation and test
# iterators and print one summary line.
for epoch in range(30):
    train_loss, train_acc = train_model(model, train_iter, epoch)
    val_loss, val_acc, _ = eval_model(model, valid_iter)
    test_loss, test_acc, preds = eval_model(model, test_iter)
    # All losses use `.3f`; the former `:3f` on Val. Loss was a width spec and printed six decimals.
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:.3f}, Val. Acc: {val_acc:.2f}%, Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

In [None]:
# Binary relabelling: rows whose category is "noise violation" become the
# positive class (label 1); every other row becomes the negative class (label 0).
records = list(zip(crash_df["category"], crash_df["descriptions"]))
positive = [{'label': 1, "text": text} for category, text in records if category == "noise violation"]
negative = [{'label': 0, "text": text} for category, text in records if not category == "noise violation"]

from datasets import Dataset

import os

# Combine both classes into one table and wrap it as a Hugging Face Dataset.
all_records = pd.DataFrame(positive + negative)

# Hold out 20% for testing. A fixed seed keeps the split (and the CSVs written
# below) identical across kernel restarts.
allDataset = Dataset.from_pandas(all_records).train_test_split(test_size=0.2, seed=2023)

# Sanity check: print the set of labels present in each split.
print(set(allDataset['train']['label']))
print(set(allDataset['test']['label']))

# Write to this task's own folder. The previous path pointed at
# ".../damaged_or_not/", which overwrote the damaged-property split.
os.makedirs("/content/drive/MyDrive/311_datasets/noise_or_not", exist_ok=True)
allDataset['train'].to_csv("/content/drive/MyDrive/311_datasets/noise_or_not/train.csv")
allDataset['test'].to_csv("/content/drive/MyDrive/311_datasets/noise_or_not/test.csv")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
  """Tokenize the "text" field of a batch with max-length padding and truncation."""
  encoded = tokenizer(examples["text"], padding="max_length", truncation=True)
  return encoded


# Add the tokenizer outputs to every split of the dataset.
tokenized_datasets = allDataset.map(tokenize_function, batched=True)

# Two-label sequence classifier initialised from the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

from transformers import TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy

# training_args = TrainingArguments(output_dir="test_trainer", logging_steps=1)

import numpy as np


def compute_metrics(eval_pred):
    """Return {"accuracy": value} for a Trainer evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class is the
    argmax over the last logits axis. Accuracy is computed with NumPy directly
    because `datasets.load_metric` has been removed from current `datasets`
    releases, so importing it fails on a fresh install.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

from transformers import TrainingArguments, Trainer

# Evaluate once per epoch, train for 5 epochs, log every step.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=5, logging_steps=1)

# NOTE(review): despite the "small_" prefix these select every row (range(len(...))),
# so they are shuffled copies of the full splits. Nothing in this cell uses them:
# the Trainer below is given the tokenized_datasets splits directly.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=2023).select(range(len(tokenized_datasets["train"])))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=2023).select(range(len(tokenized_datasets["test"])))


# The "test" split is also used as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

# Fine-tune immediately in the same cell.
trainer.train()

In [None]:
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

trainer.evaluate()

# Run the trainer's model over the held-out split.
preds_report = trainer.predict(tokenized_datasets['test'])

# Predicted class = argmax over each row of the first element of the output.
preds = [np.argmax(row) for row in preds_report[0]]

# Gold labels, read in the same order as the split was predicted.
golds = [example['label'] for example in tokenized_datasets["test"]]

# Confusion-matrix plot followed by the text classification report.
cm = confusion_matrix(golds, preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
print(classification_report(golds, preds))

In [None]:
trainer.save_model("/content/drive/MyDrive/bert-noise-classification")

# Roadway or Not

In [None]:
crash_df = pd.read_excel("/content/roadway_or_not.xlsx")
# Give each category an integer index in order of first appearance. Iterating
# a `set` of strings is not stable across interpreter runs, so the index of a
# category could otherwise change between kernel restarts.
integer_encoding_map = {
    category: index for index, category in enumerate(crash_df["category"].unique())
}
print(integer_encoding_map)

# Width of the one-hot label: at least 2 (the binary case this notebook uses),
# but wide enough for every mapped category so an extra category cannot raise
# IndexError.
num_classes = max(2, len(integer_encoding_map))

dataset = []
for _, item in crash_df.iterrows():
    one_hot = [0] * num_classes
    one_hot[integer_encoding_map[item['category']]] = 1
    dataset.append({'text': item['descriptions'], 'label': one_hot})

# Write the (text, one-hot label) table to CSV.
temp_df = pd.DataFrame(dataset)
temp_df.to_csv('roadway_or_not.csv')

# TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter, test_ds = load_dataset()

# learning_rate = 3e-4
# batch_size = 32
# output_size = 2
# hidden_size = 256
# embedding_length = 300

# # model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = CNN(batch_size=batch_size, output_size=output_size, in_channels=1, out_channels=1, kernel_heights=[3,6,9], stride=1, padding=0, keep_probab=0.5, vocab_size=vocab_size, embedding_length=embedding_length, weights=word_embeddings)
# # model = AttentionModel(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# # model = RCNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# # model = RNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# # model = SelfAttention(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# loss_fn = F.cross_entropy

# for epoch in range(30):
#     train_loss, train_acc = train_model(model, train_iter, epoch)
#     val_loss, val_acc, _ = eval_model(model, valid_iter)
#     test_loss, test_acc, preds = eval_model(model, test_iter)
#     print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%, Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

# test_loss, test_acc = eval_model(model, test_iter)
# print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

In [None]:
!pip install datasets

In [None]:
# Binary relabelling: rows whose category is "roadway hazard" become the
# positive class (label 1); every other row becomes the negative class (label 0).
records = list(zip(crash_df["category"], crash_df["descriptions"]))
positive = [{'label': 1, "text": text} for category, text in records if category == "roadway hazard"]
negative = [{'label': 0, "text": text} for category, text in records if not category == "roadway hazard"]

from datasets import Dataset

allDataset = positive + negative
allDataset = pd.DataFrame(allDataset)

allDataset = Dataset.from_pandas(allDataset)
allDataset = allDataset.train_test_split(test_size=0.2)
allDataset['train']

print(set(allDataset['train']['label']))
print(set(allDataset['test']['label']))

allDataset['train'].to_csv("/content/drive/MyDrive/311_datasets/roadway_or_not/train.csv")
allDataset['test'].to_csv("/content/drive/MyDrive/311_datasets/roadway_or_not/test.csv")

In [None]:
!pip install transformers
!pip install transformers[torch]

In [None]:
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    IntervalStrategy,
    Trainer,
    TrainingArguments,
)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch, padded to the model max length and truncated."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = allDataset.map(tokenize_function, batched=True)

# Two output classes: label 1 (target category) vs. label 0 (everything else).
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


def compute_metrics(eval_pred):
    """Return {"accuracy": ...} for a (logits, labels) pair handed over by the Trainer.

    Accuracy is computed directly with numpy: `datasets.load_metric` is deprecated
    and no longer available in recent `datasets` releases, which an unpinned
    install picks up.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == labels))}


# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=5, logging_steps=1)

# Shuffled full-size copies of both splits; note the Trainer below is given the
# unshuffled `tokenized_datasets` splits, not these.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=2023).select(range(len(tokenized_datasets["train"])))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=2023).select(range(len(tokenized_datasets["test"])))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

print(trainer.evaluate())

preds_report = trainer.predict(tokenized_datasets['test'])

# Predicted class = index of the largest logit; gold class = stored label.
preds = [np.argmax(logits) for logits in preds_report[0]]
golds = [example['label'] for example in tokenized_datasets["test"]]

cm = confusion_matrix(golds, preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
print(classification_report(golds, preds))

In [None]:
# Save the model held by `trainer` to the mounted Google Drive.
trainer.save_model("/content/drive/MyDrive/bert-roadway-classification")

# Abandoned or Not

In [None]:
crash_df = pd.read_excel("/content/abandoned_or_not.xlsx")

# Sort the categories before numbering them: iterating a bare set of strings can
# yield a different order in every Python session, which would silently change
# which category gets which index. key=str keeps mixed/missing values sortable.
integer_encoding_map = {
    category: index
    for index, category in enumerate(sorted(set(crash_df["category"].values), key=str))
}
print(integer_encoding_map)

# One-hot width follows the number of categories (never below the original 2),
# so a file with more than two categories no longer raises IndexError.
num_classes = max(2, len(integer_encoding_map))

dataset = []
for _, item in crash_df.iterrows():
    blank = [0] * num_classes
    blank[integer_encoding_map[item['category']]] = 1
    dataset.append({'text': item['descriptions'], 'label': blank})

# Write the text / one-hot label pairs to a CSV in the working directory.
temp_df = pd.DataFrame(dataset)
temp_df.to_csv('abandoned_or_not.csv')

# TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter, test_ds = load_dataset()

# learning_rate = 3e-4
# batch_size = 32
# output_size = 2
# hidden_size = 256
# embedding_length = 300

# # model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = CNN(batch_size=batch_size, output_size=output_size, in_channels=1, out_channels=1, kernel_heights=[3,6,9], stride=1, padding=0, keep_probab=0.5, vocab_size=vocab_size, embedding_length=embedding_length, weights=word_embeddings)
# # model = AttentionModel(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# # model = RCNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# # model = RNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# # model = SelfAttention(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# loss_fn = F.cross_entropy

# for epoch in range(30):
#     train_loss, train_acc = train_model(model, train_iter, epoch)
#     val_loss, val_acc, _ = eval_model(model, valid_iter)
#     test_loss, test_acc, preds = eval_model(model, test_iter)
#     print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%, Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

# test_loss, test_acc = eval_model(model, test_iter)
# print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

In [None]:
from datasets import Dataset

# Binary labels: 1 for "abandoned vehicles" descriptions, 0 for every other category.
is_target = crash_df["category"] == "abandoned vehicles"
positive = [{'label': 1, "text": text} for text in crash_df.loc[is_target, "descriptions"]]
negative = [{'label': 0, "text": text} for text in crash_df.loc[~is_target, "descriptions"]]

# A fixed seed makes the 80/20 split (and the CSVs written below) reproducible;
# without it every run shuffles differently. 2023 is the seed this notebook
# already uses for its dataset shuffles.
allDataset = Dataset.from_pandas(pd.DataFrame(positive + negative))
allDataset = allDataset.train_test_split(test_size=0.2, seed=2023)

# Sanity check: show which labels ended up in each split.
print(set(allDataset['train']['label']))
print(set(allDataset['test']['label']))

allDataset['train'].to_csv("/content/drive/MyDrive/311_datasets/abandoned_or_not/train.csv")
allDataset['test'].to_csv("/content/drive/MyDrive/311_datasets/abandoned_or_not/test.csv")

In [None]:
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    IntervalStrategy,
    Trainer,
    TrainingArguments,
)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch, padded to the model max length and truncated."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = allDataset.map(tokenize_function, batched=True)

# Two output classes: label 1 (target category) vs. label 0 (everything else).
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


def compute_metrics(eval_pred):
    """Return {"accuracy": ...} for a (logits, labels) pair handed over by the Trainer.

    Accuracy is computed directly with numpy: `datasets.load_metric` is deprecated
    and no longer available in recent `datasets` releases, which an unpinned
    install picks up.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == labels))}


# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=5, logging_steps=1)

# Shuffled full-size copies of both splits; note the Trainer below is given the
# unshuffled `tokenized_datasets` splits, not these.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=2023).select(range(len(tokenized_datasets["train"])))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=2023).select(range(len(tokenized_datasets["test"])))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

print(trainer.evaluate())

preds_report = trainer.predict(tokenized_datasets['test'])

# Predicted class = index of the largest logit; gold class = stored label.
preds = [np.argmax(logits) for logits in preds_report[0]]
golds = [example['label'] for example in tokenized_datasets["test"]]

cm = confusion_matrix(golds, preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
print(classification_report(golds, preds))

In [None]:
# Save the model held by `trainer` to the mounted Google Drive.
trainer.save_model("/content/drive/MyDrive/bert-abandoned-classification")

# Drug or Not

In [None]:
crash_df = pd.read_excel("/content/drug_or_not.xlsx")

# Sort the categories before numbering them: iterating a bare set of strings can
# yield a different order in every Python session, which would silently change
# which category gets which index. key=str keeps mixed/missing values sortable.
integer_encoding_map = {
    category: index
    for index, category in enumerate(sorted(set(crash_df["category"].values), key=str))
}
print(integer_encoding_map)

# One-hot width follows the number of categories (never below the original 2),
# so a file with more than two categories no longer raises IndexError.
num_classes = max(2, len(integer_encoding_map))

dataset = []
for _, item in crash_df.iterrows():
    blank = [0] * num_classes
    blank[integer_encoding_map[item['category']]] = 1
    dataset.append({'text': item['descriptions'], 'label': blank})

# Write the text / one-hot label pairs to a CSV in the working directory.
temp_df = pd.DataFrame(dataset)
temp_df.to_csv('drug_or_not.csv')

# TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter, test_ds = load_dataset()

# learning_rate = 3e-4
# batch_size = 32
# output_size = 2
# hidden_size = 256
# embedding_length = 300

# # model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = CNN(batch_size=batch_size, output_size=output_size, in_channels=1, out_channels=1, kernel_heights=[3,6,9], stride=1, padding=0, keep_probab=0.5, vocab_size=vocab_size, embedding_length=embedding_length, weights=word_embeddings)
# # model = AttentionModel(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# # model = RCNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# # model = RNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# # model = SelfAttention(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# loss_fn = F.cross_entropy

# for epoch in range(30):
#     train_loss, train_acc = train_model(model, train_iter, epoch)
#     val_loss, val_acc, _ = eval_model(model, valid_iter)
#     test_loss, test_acc, preds = eval_model(model, test_iter)
#     print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%, Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

# test_loss, test_acc = eval_model(model, test_iter)
# print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

In [None]:
from datasets import Dataset

# Binary labels: 1 for "drug or prostitution activity" descriptions, 0 for every other category.
is_target = crash_df["category"] == "drug or prostitution activity"
positive = [{'label': 1, "text": text} for text in crash_df.loc[is_target, "descriptions"]]
negative = [{'label': 0, "text": text} for text in crash_df.loc[~is_target, "descriptions"]]

# A fixed seed makes the 80/20 split (and the CSVs written below) reproducible;
# without it every run shuffles differently. 2023 is the seed this notebook
# already uses for its dataset shuffles.
allDataset = Dataset.from_pandas(pd.DataFrame(positive + negative))
allDataset = allDataset.train_test_split(test_size=0.2, seed=2023)

# Sanity check: show which labels ended up in each split.
print(set(allDataset['train']['label']))
print(set(allDataset['test']['label']))

allDataset['train'].to_csv("/content/drive/MyDrive/311_datasets/drug_or_not/train.csv")
allDataset['test'].to_csv("/content/drive/MyDrive/311_datasets/drug_or_not/test.csv")

In [None]:
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    IntervalStrategy,
    Trainer,
    TrainingArguments,
)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch, padded to the model max length and truncated."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = allDataset.map(tokenize_function, batched=True)

# Two output classes: label 1 (target category) vs. label 0 (everything else).
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


def compute_metrics(eval_pred):
    """Return {"accuracy": ...} for a (logits, labels) pair handed over by the Trainer.

    Accuracy is computed directly with numpy: `datasets.load_metric` is deprecated
    and no longer available in recent `datasets` releases, which an unpinned
    install picks up.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == labels))}


# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=5, logging_steps=1)

# Shuffled full-size copies of both splits; note the Trainer below is given the
# unshuffled `tokenized_datasets` splits, not these.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=2023).select(range(len(tokenized_datasets["train"])))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=2023).select(range(len(tokenized_datasets["test"])))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

# Print the metrics: a bare `trainer.evaluate()` in the middle of a cell is
# evaluated but never displayed.
print(trainer.evaluate())

preds_report = trainer.predict(tokenized_datasets['test'])

# Predicted class = index of the largest logit; gold class = stored label.
preds = [np.argmax(logits) for logits in preds_report[0]]
golds = [example['label'] for example in tokenized_datasets["test"]]

cm = confusion_matrix(golds, preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
print(classification_report(golds, preds))

In [None]:
# Save the model held by `trainer` to the mounted Google Drive.
trainer.save_model("/content/drive/MyDrive/bert-drug-classification")

# Animal or Not

In [None]:
crash_df = pd.read_excel("/content/animal_or_not.xlsx")

# Sort the categories before numbering them: iterating a bare set of strings can
# yield a different order in every Python session, which would silently change
# which category gets which index. key=str keeps mixed/missing values sortable.
integer_encoding_map = {
    category: index
    for index, category in enumerate(sorted(set(crash_df["category"].values), key=str))
}
print(integer_encoding_map)

# One-hot width follows the number of categories (never below the original 2),
# so a file with more than two categories no longer raises IndexError.
num_classes = max(2, len(integer_encoding_map))

dataset = []
for _, item in crash_df.iterrows():
    blank = [0] * num_classes
    blank[integer_encoding_map[item['category']]] = 1
    dataset.append({'text': item['descriptions'], 'label': blank})

# Write the text / one-hot label pairs to a CSV in the working directory.
temp_df = pd.DataFrame(dataset)
temp_df.to_csv('animal_or_not.csv')

# TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter, test_ds = load_dataset()

# learning_rate = 3e-4
# batch_size = 32
# output_size = 2
# hidden_size = 256
# embedding_length = 300

# # model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = CNN(batch_size=batch_size, output_size=output_size, in_channels=1, out_channels=1, kernel_heights=[3,6,9], stride=1, padding=0, keep_probab=0.5, vocab_size=vocab_size, embedding_length=embedding_length, weights=word_embeddings)
# # model = AttentionModel(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# # model = RCNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# # model = RNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# # model = SelfAttention(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# loss_fn = F.cross_entropy

# for epoch in range(30):
#     train_loss, train_acc = train_model(model, train_iter, epoch)
#     val_loss, val_acc, _ = eval_model(model, valid_iter)
#     test_loss, test_acc, preds = eval_model(model, test_iter)
#     print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%, Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

# test_loss, test_acc = eval_model(model, test_iter)
# print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

In [None]:
from datasets import Dataset

# Binary labels: 1 for "animal" descriptions, 0 for every other category.
is_target = crash_df["category"] == "animal"
positive = [{'label': 1, "text": text} for text in crash_df.loc[is_target, "descriptions"]]
negative = [{'label': 0, "text": text} for text in crash_df.loc[~is_target, "descriptions"]]

# A fixed seed makes the 80/20 split (and the CSVs written below) reproducible;
# without it every run shuffles differently. 2023 is the seed this notebook
# already uses for its dataset shuffles.
allDataset = Dataset.from_pandas(pd.DataFrame(positive + negative))
allDataset = allDataset.train_test_split(test_size=0.2, seed=2023)

# Sanity check: show which labels ended up in each split.
print(set(allDataset['train']['label']))
print(set(allDataset['test']['label']))

allDataset['train'].to_csv("/content/drive/MyDrive/311_datasets/animal_or_not/train.csv")
allDataset['test'].to_csv("/content/drive/MyDrive/311_datasets/animal_or_not/test.csv")

In [None]:
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    IntervalStrategy,
    Trainer,
    TrainingArguments,
)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch, padded to the model max length and truncated."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = allDataset.map(tokenize_function, batched=True)

# Two output classes: label 1 (target category) vs. label 0 (everything else).
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


def compute_metrics(eval_pred):
    """Return {"accuracy": ...} for a (logits, labels) pair handed over by the Trainer.

    Accuracy is computed directly with numpy: `datasets.load_metric` is deprecated
    and no longer available in recent `datasets` releases, which an unpinned
    install picks up.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == labels))}


# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=5, logging_steps=1)

# Shuffled full-size copies of both splits; note the Trainer below is given the
# unshuffled `tokenized_datasets` splits, not these.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=2023).select(range(len(tokenized_datasets["train"])))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=2023).select(range(len(tokenized_datasets["test"])))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

# Print the metrics: a bare `trainer.evaluate()` in the middle of a cell is
# evaluated but never displayed.
print(trainer.evaluate())

preds_report = trainer.predict(tokenized_datasets['test'])

# Predicted class = index of the largest logit; gold class = stored label.
preds = [np.argmax(logits) for logits in preds_report[0]]
golds = [example['label'] for example in tokenized_datasets["test"]]

cm = confusion_matrix(golds, preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
print(classification_report(golds, preds))

In [None]:
# Save the model held by `trainer` to the mounted Google Drive.
trainer.save_model("/content/drive/MyDrive/bert-animal-classification")

# Illegal Parking or Not

In [None]:
crash_df = pd.read_excel("/content/illegalparking_or_not.xlsx")

# Sort the categories before numbering them: iterating a bare set of strings can
# yield a different order in every Python session, which would silently change
# which category gets which index. key=str keeps mixed/missing values sortable.
integer_encoding_map = {
    category: index
    for index, category in enumerate(sorted(set(crash_df["category"].values), key=str))
}
print(integer_encoding_map)

# One-hot width follows the number of categories (never below the original 2),
# so a file with more than two categories no longer raises IndexError.
num_classes = max(2, len(integer_encoding_map))

dataset = []
for _, item in crash_df.iterrows():
    blank = [0] * num_classes
    blank[integer_encoding_map[item['category']]] = 1
    dataset.append({'text': item['descriptions'], 'label': blank})

# Write the text / one-hot label pairs to a CSV in the working directory.
temp_df = pd.DataFrame(dataset)
temp_df.to_csv('illegalparking_or_not.csv')

# TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter, test_ds = load_dataset()

# learning_rate = 3e-4
# batch_size = 32
# output_size = 2
# hidden_size = 256
# embedding_length = 300

# # model = LSTMClassifier(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# model = CNN(batch_size=batch_size, output_size=output_size, in_channels=1, out_channels=1, kernel_heights=[3,6,9], stride=1, padding=0, keep_probab=0.5, vocab_size=vocab_size, embedding_length=embedding_length, weights=word_embeddings)
# # model = AttentionModel(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# # model = RCNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# # model = RNN(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# # model = SelfAttention(batch_size, output_size, hidden_size, vocab_size, embedding_length, word_embeddings)
# loss_fn = F.cross_entropy

# for epoch in range(30):
#     train_loss, train_acc = train_model(model, train_iter, epoch)
#     val_loss, val_acc, _ = eval_model(model, valid_iter)
#     test_loss, test_acc, preds = eval_model(model, test_iter)
#     print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%, Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

# test_loss, test_acc = eval_model(model, test_iter)
# print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')

In [None]:
from datasets import Dataset

# Binary labels: 1 for "illegal parking" descriptions, 0 for every other category.
is_target = crash_df["category"] == "illegal parking"
positive = [{'label': 1, "text": text} for text in crash_df.loc[is_target, "descriptions"]]
negative = [{'label': 0, "text": text} for text in crash_df.loc[~is_target, "descriptions"]]

# A fixed seed makes the 80/20 split (and the CSVs written below) reproducible;
# without it every run shuffles differently. 2023 is the seed this notebook
# already uses for its dataset shuffles.
allDataset = Dataset.from_pandas(pd.DataFrame(positive + negative))
allDataset = allDataset.train_test_split(test_size=0.2, seed=2023)

# Sanity check: show which labels ended up in each split.
print(set(allDataset['train']['label']))
print(set(allDataset['test']['label']))

allDataset['train'].to_csv("/content/drive/MyDrive/311_datasets/illegalparking_or_not/train.csv")
allDataset['test'].to_csv("/content/drive/MyDrive/311_datasets/illegalparking_or_not/test.csv")

In [None]:
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    IntervalStrategy,
    Trainer,
    TrainingArguments,
)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch, padded to the model max length and truncated."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = allDataset.map(tokenize_function, batched=True)

# Two output classes: label 1 (target category) vs. label 0 (everything else).
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


def compute_metrics(eval_pred):
    """Return {"accuracy": ...} for a (logits, labels) pair handed over by the Trainer.

    Accuracy is computed directly with numpy: `datasets.load_metric` is deprecated
    and no longer available in recent `datasets` releases, which an unpinned
    install picks up.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == labels))}


# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=5, logging_steps=1)

# Shuffled full-size copies of both splits; note the Trainer below is given the
# unshuffled `tokenized_datasets` splits, not these.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=2023).select(range(len(tokenized_datasets["train"])))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=2023).select(range(len(tokenized_datasets["test"])))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

# Print the metrics: a bare `trainer.evaluate()` in the middle of a cell is
# evaluated but never displayed.
print(trainer.evaluate())

preds_report = trainer.predict(tokenized_datasets['test'])

# Predicted class = index of the largest logit; gold class = stored label.
preds = [np.argmax(logits) for logits in preds_report[0]]
golds = [example['label'] for example in tokenized_datasets["test"]]

cm = confusion_matrix(golds, preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
print(classification_report(golds, preds))

In [None]:
# Save the model held by `trainer` to the mounted Google Drive.
trainer.save_model("/content/drive/MyDrive/bert-illegalparking-classification")

# Others-Major

In [None]:
# Read the "others-major" spreadsheet from the mounted Google Drive and preview its first rows.
major_df = pd.read_excel("/content/drive/MyDrive/CIVIC-2023/CallDispatching/others-major.xlsx")
major_df.head()

In [None]:
# Map each category name to an integer id. The categories are sorted first:
# iterating a bare set of strings can yield a different order in every Python
# session, which would change the label ids the model is trained on between runs.
# key=str keeps mixed/missing values sortable.
integer_encoding_map = {
    category: index
    for index, category in enumerate(sorted(set(major_df["category"].values), key=str))
}
integer_encoding_map

In [None]:
# One empty record list per major category; the loop over `major_df` fills them.
aggressive_driver, check_welfare, damaged_property = [], [], []

In [None]:
# Route every row to the list that matches its category; rows whose category is
# not one of the three below are skipped.
records_by_category = {
    "aggressive driver": aggressive_driver,
    "check welfare": check_welfare,
    "damaged property": damaged_property,
}

for _, row in major_df.iterrows():
    category = row["category"]
    if category in records_by_category:
        records_by_category[category].append(
            {"label": integer_encoding_map[category], "text": row["descriptions"]}
        )

In [None]:
from datasets import Dataset

# Combine the per-category records into one dataset.
allDataset = Dataset.from_pandas(pd.DataFrame(aggressive_driver + check_welfare + damaged_property))

# A fixed seed makes the 80/20 split reproducible; without it every run shuffles
# differently. 2023 is the seed this notebook already uses for its dataset shuffles.
allDataset = allDataset.train_test_split(test_size=0.2, seed=2023)

# Sanity check: show which labels ended up in each split.
print(set(allDataset['train']['label']))
print(set(allDataset['test']['label']))

In [None]:
# Write the train and test splits as CSV files to the mounted Google Drive.
allDataset['train'].to_csv("/content/drive/MyDrive/311_datasets/others-major/train.csv")
allDataset['test'].to_csv("/content/drive/MyDrive/311_datasets/others-major/test.csv")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch, padded to the model max length and truncated."""
    encoded = tokenizer(examples["text"], padding="max_length", truncation=True)
    return encoded


tokenized_datasets = allDataset.map(tokenize_function, batched=True)

# Three output classes, one per major category.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

In [None]:
import numpy as np
from transformers import EarlyStoppingCallback, IntervalStrategy, Trainer, TrainingArguments


def compute_metrics(eval_pred):
    """Return {"accuracy": ...} for a (logits, labels) pair handed over by the Trainer.

    Accuracy is computed directly with numpy: `datasets.load_metric` is deprecated
    and no longer available in recent `datasets` releases, which an unpinned
    install picks up.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == labels))}


# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=3, logging_steps=1)

# Shuffled full-size copies of both splits; note the Trainer below is given the
# unshuffled `tokenized_datasets` splits, not these.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=2023).select(range(len(tokenized_datasets["train"])))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=2023).select(range(len(tokenized_datasets["test"])))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset (the test split); as the cell's last
# expression, the result is displayed.
trainer.evaluate()

In [None]:
# Save the model held by `trainer` to the mounted Google Drive.
trainer.save_model("/content/drive/MyDrive/bert-others-major-classification")

In [None]:
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

preds_report = trainer.predict(tokenized_datasets['test'])

# Predicted class = index of the largest logit; gold class = stored label.
preds = [np.argmax(logits) for logits in preds_report[0]]
golds = [example['label'] for example in tokenized_datasets["test"]]

cm = confusion_matrix(golds, preds)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
print(classification_report(golds, preds))

# Others-Minor

In [None]:
# Read the "others-minor" spreadsheet from the mounted Google Drive and preview its first rows.
minor_df = pd.read_excel("/content/drive/MyDrive/CIVIC-2023/CallDispatching/others-minor.xlsx")
minor_df.head()

In [None]:
# Map each category name to an integer id. The categories are sorted first:
# iterating a bare set of strings can yield a different order in every Python
# session, which would change the label ids the model is trained on between runs.
# key=str keeps mixed/missing values sortable.
integer_encoding_map = {
    category: index
    for index, category in enumerate(sorted(set(minor_df["category"].values), key=str))
}
integer_encoding_map

In [None]:
# One empty record list per minor category; the loop over `minor_df` fills them.
drug_pros, animal, illegal_parking, abandoned_vehicle = [], [], [], []
found_property, roadway_harzard, noise_violation = [], [], []

In [None]:
# Route every row to the list that matches its category; rows whose category is
# not one of the seven below are skipped.
records_by_category = {
    "drug or prostitution activity": drug_pros,
    "animal": animal,
    "illegal parking": illegal_parking,
    "abandoned vehicles": abandoned_vehicle,
    "found property": found_property,
    "roadway hazard": roadway_harzard,
    "noise violation": noise_violation,
}

for _, row in minor_df.iterrows():
    category = row["category"]
    if category in records_by_category:
        records_by_category[category].append(
            {"label": integer_encoding_map[category], "text": row["descriptions"]}
        )

In [None]:
from datasets import Dataset

# Combine the per-category records into one dataset.
allDataset = Dataset.from_pandas(
    pd.DataFrame(
        drug_pros + animal + illegal_parking + abandoned_vehicle + found_property + roadway_harzard + noise_violation
    )
)

# A fixed seed makes the 80/20 split reproducible; without it every run shuffles
# differently. 2023 is the seed this notebook already uses for its dataset shuffles.
allDataset = allDataset.train_test_split(test_size=0.2, seed=2023)

# Sanity check: show which labels ended up in each split.
print(set(allDataset['train']['label']))
print(set(allDataset['test']['label']))

In [None]:
# Write the train and test splits as CSV files to the mounted Google Drive.
allDataset['train'].to_csv("/content/drive/MyDrive/311_datasets/others-minor/train.csv")
allDataset['test'].to_csv("/content/drive/MyDrive/311_datasets/others-minor/test.csv")

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch, padded to the model max length and truncated."""
    encoded = tokenizer(examples["text"], padding="max_length", truncation=True)
    return encoded


tokenized_datasets = allDataset.map(tokenize_function, batched=True)

In [None]:
from transformers import AutoModelForSequenceClassification

# Load bert-base-uncased with a 7-class classification head.
# NOTE(review): 7 presumably mirrors the seven category lists merged into the
# dataset -- confirm it equals the number of distinct labels.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=7)

In [None]:
# model.dropout

In [None]:
import numpy as np
from transformers import EarlyStoppingCallback, IntervalStrategy, Trainer, TrainingArguments


def compute_metrics(eval_pred):
    """Return {"accuracy": ...} for a (logits, labels) pair handed over by the Trainer.

    Accuracy is computed directly with numpy: `datasets.load_metric` is deprecated
    and no longer available in recent `datasets` releases, which an unpinned
    install picks up.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == labels))}


# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=10, logging_steps=1)

# Shuffled full-size copies of both splits; note the Trainer below is given the
# unshuffled `tokenized_datasets` splits, not these.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=2023).select(range(len(tokenized_datasets["train"])))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=2023).select(range(len(tokenized_datasets["test"])))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the Trainer configured in the previous cell.
trainer.train()