# To do: 
    
    -> Create dataframe (Done)
     
    -> Make a class object for transcript data (Done)
    
    -> Use spaCy or NLTK to parse information (Done)
    
    -> Construct additional column(s) (Done)
    
    -> Convert df to csv (Done)
    

In [2]:
# Load the raw call records from calls.json

import json

# JSON text is UTF-8 by specification. Naming the encoding keeps the read from
# depending on the platform's locale default (e.g. cp1252 on Windows), which
# can mis-decode non-ASCII characters in the transcripts.
with open("calls.json", "r", encoding="utf-8") as f:
    calls = json.load(f)
    

In [None]:
# Build the calls dataframe

import pandas as pd

# Column labels are the keys of the second call record.
titles = list(calls[1])

df = pd.DataFrame(columns=titles)

# Add one row per call record, each at the next free integer label.
for record in calls:
    df.loc[len(df.index)] = record

df

In [4]:
# Create Transcripts class

class Transcripts: 
    """
    Transcripts object for more efficient API and scalability.

    Index corresponds to dataframe.
    """
    def __init__(self, transcript: list[dict]):

        self._scripts = []
        for i in transcript:
            temp_dict = {}
            for j in range(len(i)):
                temp_dict[i[j]['user']] = temp_dict.get(i[j]['user'], []) + [(i[j]['text'], j)]
            self._scripts.append(temp_dict)

    def get(self, idx: int, user: None | str = None) -> str:
        """
        If user is not specified will return whole transcript with USER and ASSISTANT labels. Runtime will be O(n + m)

        Otherwise will return the transcript of specified user. Runtime will be O(n) or O(m) respectively.

        """
        
        script = self._scripts[idx]
        res = ''
        
        if not user:
            # Merge two sorted arrays
            arr1 = list(script.get('user', ''))
            arr2 = list(script.get('assistant', ''))
            
            merged = []

            i, j = 0, 0

            while i < len(arr1) and j < len(arr2):
                if arr1[i][1] <= arr2[j][1]:
                    merged.append((arr1[i], 'user'))
                    i += 1
                else:
                    merged.append((arr2[j], 'assistant'))
                    j += 1

            for k in arr1[i:]:
                merged.append((k, 'user'))
            for k in arr2[j:]:
                merged.append((k, 'assistant'))

            for i in merged:
                res += f"{i[1]}: ".upper() + i[0][0] + ' '
        
        else:
            # User is specified
            for i in script.get(user, ''):
                res += i[0] + ' '
        return res
    

In [5]:
# Demonstrate the Transcripts API on a single call

transcript = Transcripts(df['transcripts'])

# Whole conversation first, then the user's side only; each is followed by a
# blank line.
for speaker in (None, 'user'):
    print(transcript.get(2, speaker), '\n')

# Finally the assistant's side.
print(transcript.get(2, 'assistant'))


USER: Morning. PKF. This is Alicia. ASSISTANT: Yea, are you still located at 1 Bligh Street?
 USER: No. Sorry. We're at 1 O'Connell Street. ASSISTANT: Great, thanks! [Finish]
  

Morning. PKF. This is Alicia. No. Sorry. We're at 1 O'Connell Street.  

Yea, are you still located at 1 Bligh Street?
 Great, thanks! [Finish]
 


# 4 categories:
 
         -> Yes, No:     
               * The most simple determiner of verification status.

         -> Assistant address:     
               * The address as dictated by the 'assistant'.      
                   $ In other words, the registered address prior to calling.

         -> User address:     
               * The address as dictated by the 'user'.     
                   $ Either a mention or a variation from Assistant address indicates an Address change.

         -> Level:

               * The level (if mentioned) as dictated by the 'user'.
                   $ Mentioning the level without uttering a 'no' indicates a likely address confirmation.
                   $ Verification_status is sparse, meaning that correcting false positives is easier than correcting false negatives.

In [26]:
"""
    Using en_core_web_sm: the small version of spAcy's english model for time efficiency.
"""

# Requires the spaCy package and the en_core_web_sm model; the install
# commands are noted next to each line.
import spacy                            # !pip install spacy
# `nlp` is the loaded pipeline that the later cells call on transcript text.
nlp = spacy.load("en_core_web_sm")      # !python -m spacy download en_core_web_sm


In [7]:
# One entry per call: 'Yes' or 'No' when the user's words contain only that
# kind of answer, otherwise None.
user_response = []

for row in range(len(df)):
    # Lower-cased lemma of every token the user said in this call.
    lemmas = [token.lemma_.lower() for token in nlp(transcript.get(row, 'user'))]

    said_yes = any(lemma in ('yes', 'yeah', 'yep') for lemma in lemmas)
    said_no = any(lemma in ('no', 'nah') for lemma in lemmas)

    if said_yes == said_no:
        # Neither answer was given, or both were: leave it undecided.
        answer = None
    elif said_yes:
        answer = 'Yes'
    else:
        answer = 'No'

    user_response.append(answer)

In [56]:
import re

# Regex used to pull a street address out of a short piece of text.

address_endings = ['street', 'avenue', 'road', 'lane', 'drive', 'boulevard',
                    'court', 'place', 'square', 'alley']

# Shape of a match: optional house number, optional whitespace, a run of
# words, then one of the address endings (case-insensitive).
#
# The word run is written as "a letter, followed by any mix of letters,
# whitespace, or an apostrophe that is immediately followed by a letter".
# That accepts exactly the same text as the earlier nested form
# ([a-zA-Z]+(\s|'[a-zA-Z]+)*)+\s*, but every character can be consumed in only
# one way, so a failed search no longer backtracks exponentially (the cause of
# re.search appearing to hang on some inputs).
#
# NOTE(review): the trailing "s*" sits inside the alternation, so it only
# applies to the last ending ("alley"). Kept as-is to preserve the extracted
# strings; confirm whether plural forms of every ending were intended.
pattern = re.compile(
    r"\d*\s*[a-zA-Z](?:[a-zA-Z\s]|'[a-zA-Z])*(?:" + '|'.join(address_endings) + r"s*)",
    re.IGNORECASE,
)  # Regex to determine if_address


In [57]:
# Address spoken by the assistant in each call (None when nothing is found).
assistant_address = []

# The original author noted that re.search can appear to hang on some text,
# so the regex is only run on noun chunks that already contain one of the
# address endings.
for row in range(len(df)):
    parsed = nlp(transcript.get(row, 'assistant'))

    # Noun chunks, as plain strings, that mention at least one address ending.
    candidates = [
        chunk
        for chunk in map(str, parsed.noun_chunks)
        if any(ending in chunk.lower() for ending in address_endings)
    ]

    # Search the candidates in order and stop at the first hit.
    hits = (pattern.search(chunk) for chunk in candidates)
    assistant_address.append(next((hit.group(0) for hit in hits if hit), None))

'25 Martin Place'

In [58]:
# Address spoken by the user in each call (None when nothing is found).
user_address = []

for row in range(len(df)):
    parsed = nlp(transcript.get(row, 'user'))

    # Keep only the noun chunks that contain one of the address endings, so
    # the regex runs on as few strings as possible.
    candidates = [
        chunk
        for chunk in map(str, parsed.noun_chunks)
        if any(ending in chunk.lower() for ending in address_endings)
    ]

    # The first candidate the regex accepts wins; later ones are not searched.
    found = None
    for chunk in candidates:
        hit = pattern.search(chunk)
        if hit:
            found = hit.group(0)
            break

    user_address.append(found)

In [59]:
# Building level mentioned by the user in each call, e.g. 'Level 32'
# (None when the user never says "level <number>").
level = []

# Deliberately not named `pattern`: that name holds the address regex used by
# the address-extraction cells. Rebinding it here made those cells search with
# the level regex when re-run after this one.
level_pattern = re.compile(r'\blevel\s+(\d+)\b', re.IGNORECASE)

for i in range(len(df)):
    match = level_pattern.search(transcript.get(i, 'user'))
    # Keep the matched text, normalised to a leading capital ("Level 32").
    level.append(match.group(0).capitalize() if match else None)


In [61]:
# Attach each derived list to the dataframe as a new column.
for column_name, values in (
    ('user_response', user_response),
    ('assistant_address', assistant_address),
    ('user_address', user_address),
    ('level', level),
):
    df[column_name] = values

In [41]:
# Verification status

# Every call starts out as 'No Result'.
verification_status = ['No Result'] * len(df)

df['verification_status'] = verification_status

# Snapshot the four signals per call before writing anything back.
signals = list(zip(df['user_response'], df['assistant_address'],
                   df['user_address'], df['level']))

for row, (response, registered, stated, floor) in enumerate(signals):
    status = 'No Result'

    # 1. An unambiguous yes / no from the user.
    if response == 'Yes':
        status = 'Address Confirmed'
    elif response == 'No':
        status = 'Address Changed'

    # 2. Both sides named an address and the two differ: treat as a change.
    #    (Matching addresses leave the status untouched.)
    if registered and stated and registered != stated:
        status = 'Address Changed'

    # 3. A level was mentioned and nothing so far signalled a change.
    if status != 'Address Changed' and floor:
        status = 'Address Confirmed'

    df.at[row, 'verification_status'] = status

df

Unnamed: 0,status,corrected_duration,end_at,c_id,to,from,completed,created_at,queue_status,endpoint_url,max_duration,error_message,request_data,transcripts,call_length,user_response,assistant_address,user_address,level,verification_status
0,completed,7,2023-12-07T22:03:14.000Z,d8f84e2d-c7e8-48db-85bf-d2904235e4a8,61285027935,+61344209195,True,2023-12-07T22:02:25.126766+00:00,,,5,,"{'phone_number': '+61285027935', 'reduce_laten...","[{'id': 1109683, 'created_at': '2023-12-07T22:...",4.903,,1 Bligh Street,,,No Result
1,completed,28,2023-12-07T22:02:58.000Z,1f9c15fa-8ef0-4b81-b71e-47f84a393584,61283791854,+61340601087,True,2023-12-07T22:02:24.884641+00:00,,,5,,"{'phone_number': '+61283791854', 'reduce_laten...","[{'id': 1109572, 'created_at': '2023-12-07T22:...",17.699,No,25 Martin Place,,,Address Changed
2,completed,78,2023-12-07T22:04:14.000Z,6af2fc64-88c7-45a4-8615-7dabbecbf407,61283466000,+61876663531,True,2023-12-07T22:02:23.703461+00:00,,,5,,"{'phone_number': '+61283466000', 'reduce_laten...","[{'id': 1109728, 'created_at': '2023-12-07T22:...",10.053,No,1 Bligh Street,1 O'Connell Street,,Address Changed
3,completed,10,2023-12-07T22:02:42.000Z,ce3b1694-fbcb-4540-be9d-5b2d041ae700,61283073595,+61860173635\n,True,2023-12-07T22:02:22.166738+00:00,,,5,,"{'phone_number': '+61283073595', 'reduce_laten...","[{'id': 1109580, 'created_at': '2023-12-07T22:...",0.59,,,,,No Result
4,completed,35,2023-12-07T22:03:06.000Z,1cdc3339-a335-407d-a112-bcebb3eabc18,61282599999,+61876663531,True,2023-12-07T22:02:21.716667+00:00,,,5,,"{'phone_number': '+61282599999', 'reduce_laten...","[{'id': 1109555, 'created_at': '2023-12-07T22:...",30.963,,2 Chifley Square,,,No Result
5,completed,18,2023-12-07T22:02:51.000Z,8b0a4ac6-44b2-4b3a-8f32-eae913ea06ef,61282415600,+61860173635\n,True,2023-12-07T22:02:20.289195+00:00,,,5,,"{'phone_number': '+61282415600', 'reduce_laten...","[{'id': 1109567, 'created_at': '2023-12-07T22:...",12.882,,25 Martin Place,,Level 32,Address Confirmed
6,completed,80,2023-12-07T22:03:45.000Z,28913206-c58c-4324-95dc-befb271bf253,61282393555,+61876663531,True,2023-12-07T22:02:19.661156+00:00,,,5,,"{'phone_number': '+61282393555', 'reduce_laten...","[{'id': 1109658, 'created_at': '2023-12-07T22:...",41.438,Yes,25 Martin Place,,Level 51,Address Confirmed
7,completed,85,2023-12-07T22:03:54.000Z,4452a426-a72c-4863-bfb9-4a47636f7b7d,61282393555,+61344209195,True,2023-12-07T22:02:19.171644+00:00,,,5,,"{'phone_number': '+61282393555', 'reduce_laten...","[{'id': 1109689, 'created_at': '2023-12-07T22:...",0.599,,,,,No Result
8,completed,13,2023-12-07T22:02:48.000Z,1b53f9ec-134e-4ce2-b05b-9566ead5e0ef,61282366000,+61340601087,True,2023-12-07T22:02:18.718847+00:00,,,5,,"{'phone_number': '+61282366000', 'reduce_laten...","[{'id': 1109566, 'created_at': '2023-12-07T22:...",7.86,,2 Chifley Square,,,No Result
9,completed,3,2023-12-07T22:02:45.000Z,580b3701-7c39-4eda-b0c9-e1baca5b1759,61282279600,+61344209195,True,2023-12-07T22:02:17.276351+00:00,,,5,,"{'phone_number': '+61282279600', 'reduce_laten...","[{'id': 1109591, 'created_at': '2023-12-07T22:...",1.975,,52 Martin Place,,,No Result


In [105]:
# Convert to CSV

# Write next to the notebook, without the index column. A bare relative
# filename resolves against the working directory on every platform, whereas
# '.\output.csv' is only a relative path on Windows; elsewhere the backslash
# is an ordinary filename character.
df.to_csv('output.csv', index=False)