# Tag Conversations
Given a text transcript of a conversation between two people, we want to assign it a topic that they were most likely talking about.

### Importing Libraries

In [1]:
import os
import io
import re
import glob
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

A callback class is created to stop training once the model reaches the desired accuracy.

In [2]:
class Endtrain(tf.keras.callbacks.Callback):
    """Stop training once the training accuracy exceeds a threshold.

    Args:
        threshold: Accuracy value (as a fraction) that the epoch's
            ``'accuracy'`` metric must exceed for training to stop.
            Defaults to 0.995, i.e. 99.5%.
    """

    def __init__(self, threshold=0.995):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Flag the model to stop when this epoch's accuracy is above the threshold.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric dictionary for the epoch; may be ``None`` or may not
                contain ``'accuracy'``, in which case nothing happens.
        """
        # Guard against a missing metric: comparing None with a float raises TypeError.
        accuracy = (logs or {}).get('accuracy')
        if accuracy is not None and accuracy > self.threshold:
            self.model.stop_training = True


### Data Extraction from Transcription .txt file
For each given file, the entire conversation is extracted as a single string by removing the timestamps and a few filler tokens such as SILENCE and NOISE. Finally, it is converted into a dataframe with a conversation ID and its associated label.

In [3]:
def sentences(texts):
    """Collapse the lines of one transcript file into a single conversation string.

    For every line, everything up to and including the last decimal number
    (the timestamps) is removed, as are newline characters. Duplicate cleaned
    lines are dropped, keeping the first occurrence, and the remaining lines
    are concatenated in their original order. Finally the ``[noise]`` and
    ``[silence]`` markers are replaced by a single space.

    Args:
        texts: Iterable of raw transcript lines (e.g. from ``readlines()``).

    Returns:
        The cleaned conversation as one string; ``''`` for empty input.
    """
    time_pattern = r'.*([0-9]\.[0-9]+)'   # removes time from conversations
    newline_pattern = r'\n'               # removes line breaks from conversations

    # De-duplicate while preserving file order. Going through set() would
    # shuffle the utterances differently on every kernel start (string hashes
    # are randomised), making the text seen after truncation irreproducible.
    seen = set()
    ordered = []
    for raw_line in texts:
        cleaned = re.sub(newline_pattern, '', re.sub(time_pattern, '', raw_line))
        if cleaned not in seen:
            seen.add(cleaned)
            ordered.append(cleaned)

    conversations = ''.join(ordered)
    conversations = conversations.replace(' [noise] ', ' ')
    conversations = conversations.replace(' [silence] ', ' ')
    return conversations

# Directory holding one transcript .txt file per conversation.
work_dir = "/home/enian/Desktop/IDFY/tagging_test/"
contents = {}

for path in glob.glob(os.path.join(work_dir, "*.txt")):
    # Build the conversation ID from the digits of the file name only, so that
    # digits appearing in a parent directory name cannot leak into the ID.
    conversation_id = ''.join(filter(str.isdigit, os.path.basename(path)))
    with io.open(path, mode="r", encoding="utf-8") as fd:
        contents[conversation_id] = sentences(fd.readlines())

# Order the conversations by ID (string sort) before building the frame.
contents = {k: v for k, v in sorted(contents.items())}
dataset = pd.DataFrame(list(contents.items()), columns=['Conversation_ID', 'Conversation'])

label = pd.read_csv("/home/enian/Desktop/IDFY/tagging_test/metadata/dataset.csv", index_col=False)
# NOTE(review): labels are attached positionally, so this assumes the CSV rows
# are in the same ID-sorted order as `dataset` - confirm against the metadata file.
dataset['Label'] = label['Label'].values

### Loading Dataset
Out of 240 text blobs, 230 text blobs are used to train/validate the model and 10 text blobs are used to test the model.  

In [4]:
# Preview the first rows of the assembled conversation/label table.
dataset.head()

Unnamed: 0,Conversation_ID,Conversation,Label
0,2023,that's hard yeah you know uh is he going to y...,Credit Card
1,2061,and uh you know unless the Navy funds it i pr...,Credit Card
2,2092,are a[re]- are you working anywhere while you...,Family Finance
3,2129,so anyway uh uh you know on no what is that u...,Job Benefits
4,2163,yeah that's one way to do it because that tha...,Credit Card


### Assigning Feature and Label sets

In [5]:
# Features are the raw conversation strings; targets are the topic labels.
X, y = dataset['Conversation'], dataset['Label']

### Transforming label set with LabelEncoder
LabelEncoder() assigns an integer value to each label after sorting them.
- 0 - Bank Bailout
- 1 - Budget
- 2 - Credit Card
- 3 - Family Finance
- 4 - Job Benefits
- 5 - Taxes  

In [6]:
# Encode from the raw label column instead of from `y` itself: the cell is then
# idempotent. With `y = le.fit_transform(y)`, re-running the cell would refit
# the encoder on the already-encoded integers and lose the class names.
le = LabelEncoder()
y = le.fit_transform(dataset['Label'])
y

array([2, 2, 3, 4, 2, 0, 5, 1, 3, 4, 5, 1, 4, 3, 4, 3, 2, 3, 5, 2, 5, 3,
       4, 2, 2, 2, 3, 5, 2, 0, 3, 0, 1, 3, 3, 1, 2, 0, 4, 5, 2, 5, 2, 3,
       4, 1, 2, 2, 4, 3, 4, 2, 0, 3, 5, 1, 1, 5, 2, 3, 5, 5, 2, 3, 4, 4,
       4, 5, 5, 2, 5, 4, 1, 2, 4, 2, 2, 1, 2, 5, 3, 3, 3, 4, 4, 1, 5, 2,
       5, 3, 2, 3, 3, 1, 2, 1, 2, 3, 4, 5, 2, 3, 5, 2, 1, 5, 5, 2, 4, 4,
       5, 1, 2, 2, 4, 2, 5, 3, 3, 3, 2, 3, 5, 3, 4, 1, 2, 2, 5, 4, 4, 2,
       5, 3, 2, 4, 2, 2, 1, 1, 5, 4, 5, 3, 4, 5, 1, 4, 1, 4, 3, 3, 4, 5,
       3, 5, 5, 3, 4, 3, 5, 4, 3, 3, 4, 4, 5, 4, 3, 5, 5, 3, 5, 3, 3, 3,
       4, 4, 5, 5, 3, 5, 5, 5, 3, 3, 4, 5, 4, 3, 4, 4, 3, 5, 1, 4, 5, 3,
       5, 4, 3, 3, 5, 3, 3, 1, 3, 3, 3, 3, 4, 4, 4, 4, 3, 5, 3, 5, 4, 4,
       4, 5, 3, 3, 4, 4, 5, 5, 3, 4])

### Splitting the train and testing sets

In [7]:
# Hold out 20% of the conversations; the fixed seed keeps the split repeatable.
(training_sentences, testing_sentences,
 training_labels, testing_labels) = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Model Training Parameters 

In [8]:
# --- Tokenisation / padding settings ---
vocab_size = 10000       # vocabulary size shared by the Tokenizer and the Embedding layer
oov_tok = "<OOV>"        # placeholder token for out-of-vocabulary words
max_length = 1500        # every sequence is padded/truncated to this many tokens
padding_type = 'post'
trunc_type = 'post'

# --- Network / training settings ---
embedding_dim = 32
num_epochs = 150
callback = Endtrain()    # stops training early once the accuracy target is passed

### Data Pre-processing for the model

In [9]:
# Fit the vocabulary on the training split only, then reuse it for both splits.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

# Training split: words -> integer ids -> fixed-length arrays.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Held-out split goes through the same transformation.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Word -> id lookup learned by the tokenizer.
word_index = tokenizer.word_index

In [10]:
# Need this block to get it to work with TensorFlow 2.x
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

### Model

In [11]:
# Embedding -> average over the sequence -> small dense head -> 6-way softmax.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(6, activation='softmax'))

# Labels are integer-encoded, hence the sparse variant of cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

### Model Training

In [12]:
# Train for up to `num_epochs`; the Endtrain callback may stop earlier.
# `callbacks` is documented as a list of Callback instances, so wrap the single callback.
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    callbacks=[callback],
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)

Epoch 1/150
6/6 - 2s - loss: 1.7878 - accuracy: 0.2500 - val_loss: 1.7895 - val_accuracy: 0.0870
Epoch 2/150
6/6 - 0s - loss: 1.7777 - accuracy: 0.2609 - val_loss: 1.7839 - val_accuracy: 0.0870
Epoch 3/150
6/6 - 0s - loss: 1.7689 - accuracy: 0.2609 - val_loss: 1.7790 - val_accuracy: 0.0870
Epoch 4/150
6/6 - 0s - loss: 1.7594 - accuracy: 0.2609 - val_loss: 1.7746 - val_accuracy: 0.0870
Epoch 5/150
6/6 - 0s - loss: 1.7481 - accuracy: 0.2609 - val_loss: 1.7688 - val_accuracy: 0.0870
Epoch 6/150
6/6 - 0s - loss: 1.7391 - accuracy: 0.2609 - val_loss: 1.7633 - val_accuracy: 0.0870
Epoch 7/150
6/6 - 0s - loss: 1.7274 - accuracy: 0.2609 - val_loss: 1.7569 - val_accuracy: 0.0870
Epoch 8/150
6/6 - 0s - loss: 1.7161 - accuracy: 0.2609 - val_loss: 1.7517 - val_accuracy: 0.0870
Epoch 9/150
6/6 - 0s - loss: 1.7039 - accuracy: 0.2609 - val_loss: 1.7459 - val_accuracy: 0.0870
Epoch 10/150
6/6 - 0s - loss: 1.6926 - accuracy: 0.2609 - val_loss: 1.7401 - val_accuracy: 0.0870
Epoch 11/150
6/6 - 0s - loss:

Epoch 85/150
6/6 - 0s - loss: 0.3641 - accuracy: 0.9511 - val_loss: 0.7075 - val_accuracy: 0.8696
Epoch 86/150
6/6 - 0s - loss: 0.3531 - accuracy: 0.9511 - val_loss: 0.7001 - val_accuracy: 0.8696
Epoch 87/150
6/6 - 0s - loss: 0.3420 - accuracy: 0.9511 - val_loss: 0.6808 - val_accuracy: 0.8696
Epoch 88/150
6/6 - 0s - loss: 0.3311 - accuracy: 0.9511 - val_loss: 0.6724 - val_accuracy: 0.8696
Epoch 89/150
6/6 - 0s - loss: 0.3210 - accuracy: 0.9511 - val_loss: 0.6659 - val_accuracy: 0.8696
Epoch 90/150
6/6 - 0s - loss: 0.3109 - accuracy: 0.9511 - val_loss: 0.6547 - val_accuracy: 0.8696
Epoch 91/150
6/6 - 0s - loss: 0.3021 - accuracy: 0.9511 - val_loss: 0.6405 - val_accuracy: 0.8696
Epoch 92/150
6/6 - 0s - loss: 0.2925 - accuracy: 0.9511 - val_loss: 0.6326 - val_accuracy: 0.8696
Epoch 93/150
6/6 - 0s - loss: 0.2839 - accuracy: 0.9511 - val_loss: 0.6273 - val_accuracy: 0.8696
Epoch 94/150
6/6 - 0s - loss: 0.2753 - accuracy: 0.9620 - val_loss: 0.6181 - val_accuracy: 0.8696
Epoch 95/150
6/6 - 0

The model is built with training accuracy = 1 and testing accuracy = 0.89.

### Loading Testset

In [13]:
# Read the separate hold-out set of labelled conversations and display it.
testset = pd.read_csv(
    '/home/enian/Desktop/IDFY/tagging_test/test/testset.csv',
    index_col=False,
)
testset

Unnamed: 0,Conversation_ID,Conversation,Label
0,2022,i guess that's most of my um financial plans ...,Family Finance
1,2067,yes everybody in the country is preapproved i...,Credit Card
2,2085,right because you know i[f]- once you get tha...,Job Benefits
3,2298,right i don't think there is a i don't know i...,Bank Bailout
4,3001,didn't get that far you normally just try to ...,Job Benefits
5,3040,uh both those centers of of business plus eve...,Taxes
6,3409,right so i guess uh we've kind of exhausted t...,Credit Card
7,4000,we both have done that um we both work at com...,Family Finance
8,4044,and we weren't a debtor nation in those days ...,Taxes
9,4843,uh you know i mean they're treating it as a t...,Budget


### Data Pre-processing of TestSet

In [14]:
# Apply the tokenizer fitted on the training data to the hold-out conversations.
X_testset = testset['Conversation']
X_testset_sequences = tokenizer.texts_to_sequences(X_testset)

# Pad/truncate with the training settings and hand the model a NumPy array.
X_testset_padded = np.array(
    pad_sequences(
        X_testset_sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
)

### Model Prediction for TestSet

In [15]:
# Scale the softmax outputs to percentages.
prediction = 100 * model.predict(X_testset_padded)

Mapping all predicted values to corresponding labels

In [16]:
# Label for each output column (column 0 -> 'Bank Bailout', ..., column 5 -> 'Taxes').
label_names = [
    'Bank Bailout',
    'Budget',
    'Credit Card',
    'Family Finance',
    'Job Benefits',
    'Taxes',
]

# One {label: score} mapping per predicted row. Iterating over `prediction`
# itself (rather than a fixed range(10)) handles any test-set size, and no
# variable named `dict` is created, so the builtin stays usable in later cells.
predicted_values = [
    {name: row[column] for column, name in enumerate(label_names)}
    for row in prediction
]

Retrieving the predicted label for each instance of the testset

In [17]:
# For every row, pick the label whose score is the largest.
pred_list = [max(scores, key=scores.get) for scores in predicted_values]

Creating a dataframe with input and output values of the prediction of testset

In [18]:
# Start from the predicted labels, attach the IDs, true labels and per-label
# scores, then put the columns into their reporting order.
prediction_DF = (
    pd.DataFrame(pred_list, columns=['Predicted Label'])
    .assign(**{
        'ID': testset['Conversation_ID'],
        'Actual Label': testset['Label'],
        'Label Probabilities': predicted_values,
    })
    [['ID', 'Actual Label', 'Predicted Label', 'Label Probabilities']]
)

In [19]:
pd.set_option('display.max_colwidth', None) # show each cell's full contents instead of truncating wide columns

In [20]:
# Display actual vs. predicted labels together with the per-label scores.
prediction_DF 

Unnamed: 0,ID,Actual Label,Predicted Label,Label Probabilities
0,2022,Family Finance,Family Finance,"{'Bank Bailout': 0.011287683, 'Budget': 0.014482645, 'Credit Card': 0.4860656, 'Family Finance': 99.08468, 'Job Benefits': 0.18233491, 'Taxes': 0.22114274}"
1,2067,Credit Card,Credit Card,"{'Bank Bailout': 3.41153, 'Budget': 8.062574, 'Credit Card': 67.28123, 'Family Finance': 8.516489, 'Job Benefits': 3.656557, 'Taxes': 9.071617}"
2,2085,Job Benefits,Job Benefits,"{'Bank Bailout': 1.0361885, 'Budget': 7.8174734, 'Credit Card': 1.7788537, 'Family Finance': 14.409399, 'Job Benefits': 65.41844, 'Taxes': 9.539645}"
3,2298,Bank Bailout,Taxes,"{'Bank Bailout': 6.401846, 'Budget': 29.143078, 'Credit Card': 10.029702, 'Family Finance': 3.4768014, 'Job Benefits': 9.856279, 'Taxes': 41.092297}"
4,3001,Job Benefits,Job Benefits,"{'Bank Bailout': 0.043779835, 'Budget': 0.52578384, 'Credit Card': 0.31250897, 'Family Finance': 19.525118, 'Job Benefits': 79.14093, 'Taxes': 0.45187894}"
5,3040,Taxes,Taxes,"{'Bank Bailout': 2.368572, 'Budget': 13.672005, 'Credit Card': 1.306475, 'Family Finance': 1.2994086, 'Job Benefits': 5.5026546, 'Taxes': 75.85088}"
6,3409,Credit Card,Credit Card,"{'Bank Bailout': 1.3731953, 'Budget': 2.0076926, 'Credit Card': 76.40156, 'Family Finance': 12.715876, 'Job Benefits': 1.4372394, 'Taxes': 6.064428}"
7,4000,Family Finance,Family Finance,"{'Bank Bailout': 0.484196, 'Budget': 1.106208, 'Credit Card': 7.45833, 'Family Finance': 81.95137, 'Job Benefits': 4.5150476, 'Taxes': 4.4848385}"
8,4044,Taxes,Taxes,"{'Bank Bailout': 2.284034, 'Budget': 12.104473, 'Credit Card': 5.1391473, 'Family Finance': 3.7382405, 'Job Benefits': 8.386948, 'Taxes': 68.34715}"
9,4843,Budget,Taxes,"{'Bank Bailout': 4.0894427, 'Budget': 16.026861, 'Credit Card': 4.833179, 'Family Finance': 2.060556, 'Job Benefits': 3.0100923, 'Taxes': 69.97987}"
