# Analysing dialogues between speakers of different ages

## Task 1: Description of Data and Methods (10/50) – (~max 600 words)

In [215]:
import os
import random
from collections import defaultdict
from pprint import pprint

import numpy as np
import pandas as pd

from lxml import etree

import altair as alt

The aim is to create a dataframe that deals with:
- Speaker
- age-range
- utterances
- individual tokens in that utterances
- word classes for those tokens (check if that provides any insights during classification)
- other utterance, dialogue, word variables

Variables like speaker, agerange, utterances and other related statistics can be collected from untagged. But inorder to get the tokens and their respective classes, tagged files should also be included.
- dir_corpus : untagged
- dir_corpus2 : tagged

In [265]:
dir_corpus = 'bnc2014spoken-xml/spoken/untagged/'
print(dir_corpus)
f_names = os.listdir(dir_corpus)
f_paths = [f"{dir_corpus}{f_name}" for f_name in f_names]

bnc2014spoken-xml/spoken/untagged/


In [266]:
dir_corpus2 = 'bnc2014spoken-xml/spoken/tagged/'
print(dir_corpus2)
f_names2 = os.listdir(dir_corpus2)
f_paths2 = [f"{dir_corpus2}{f_name}" for f_name in f_names2]

# f_paths2 = ['bnc2014spoken-xml/spoken/tagged/S2A5-tgd.xml']

print(f_paths2)

bnc2014spoken-xml/spoken/tagged/
['bnc2014spoken-xml/spoken/tagged/S23A-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S24A-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S24D-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S24E-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S263-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S26N-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S27D-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S28F-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S29Q-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S29X-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S2A5-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S2AJ-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S2AX-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S2B5-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S2C9-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S2CY-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S2DD-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S2E2-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S2EF-tgd.xml', 'bnc2014spoken-xml/spoken/tagged/S2FQ-tgd.xml', 'bnc20

In [267]:
# delete cell

# f_paths = ['bnc2014spoken-xml/spoken/untagged/S2A5.xml']

f_paths

['bnc2014spoken-xml/spoken/untagged/S23A.xml',
 'bnc2014spoken-xml/spoken/untagged/S24A.xml',
 'bnc2014spoken-xml/spoken/untagged/S24D.xml',
 'bnc2014spoken-xml/spoken/untagged/S24E.xml',
 'bnc2014spoken-xml/spoken/untagged/S263.xml',
 'bnc2014spoken-xml/spoken/untagged/S26N.xml',
 'bnc2014spoken-xml/spoken/untagged/S27D.xml',
 'bnc2014spoken-xml/spoken/untagged/S28F.xml',
 'bnc2014spoken-xml/spoken/untagged/S29Q.xml',
 'bnc2014spoken-xml/spoken/untagged/S29X.xml',
 'bnc2014spoken-xml/spoken/untagged/S2A5.xml',
 'bnc2014spoken-xml/spoken/untagged/S2AJ.xml',
 'bnc2014spoken-xml/spoken/untagged/S2AX.xml',
 'bnc2014spoken-xml/spoken/untagged/S2B5.xml',
 'bnc2014spoken-xml/spoken/untagged/S2C9.xml',
 'bnc2014spoken-xml/spoken/untagged/S2CY.xml',
 'bnc2014spoken-xml/spoken/untagged/S2DD.xml',
 'bnc2014spoken-xml/spoken/untagged/S2E2.xml',
 'bnc2014spoken-xml/spoken/untagged/S2EF.xml',
 'bnc2014spoken-xml/spoken/untagged/S2FQ.xml',
 'bnc2014spoken-xml/spoken/untagged/S2FT.xml',
 'bnc2014spok

Now that we have the xml files ready, lets start organizing the data into a dataframe. Start with the untagged files. It can be split into 2 parts. The speakers are extracted to df_speakers and the utterances are extracted to df_utts. A list is created to append all the data to a single variable and then is later converted to 2 dataframes each for total speakers and total utterances.

In [299]:
Total_data_to_process = 500

In [300]:
dialogue_data = []
df_speakers_data = []

for _, path in enumerate(f_paths):  # for each of the dialogues
    if _ == Total_data_to_process:
        break
    
    # Read the XML data using pd.read_xml
    df_xml = pd.read_xml(path, xpath="//u")
    df_speakers = pd.read_xml(path, xpath="//speaker")  # get the speakers
    df_speakers_data.append(df_speakers)
    
    for index, row in df_xml.iterrows():  # for each utterance
        u = row['u']
        n = row['n']
        who = row['who']
        try:
            trans = row['trans']
        except KeyError:
            trans = None
        pause = row['pause.dur'] if 'pause.dur' in row else None
        vocal = row['vocal.desc'] if 'vocal.desc' in row else None
        anonType = row['anon.type'] if 'anon.type' in row else None
        anon_nameType = row['anon.nameType'] if 'anon.nameType' in row else None
        unclear = True if 'unclear' in row else False
        trunc = row['trunc'] if 'trunc' in row else None
        event = row['event.desc'] if 'event.desc' in row else None

        df_utts = {
            'u': u,
            'n': n,
            'who': who,
            'trans': trans,
            'pause': pause,
            'vocal': vocal,
            'anonType': anonType,
            'anon_nameType': anon_nameType,
            'unclear': unclear,
            'trunc': trunc,
            'event': event,
            'dialogue': path.split('/')[-1].replace(".xml", "")
        }

        dialogue_data.append(df_utts)

In [301]:
df_all_utts = pd.DataFrame(dialogue_data)
df_all_speakers = pd.concat(df_speakers_data)

In [302]:
df_all_utts

Unnamed: 0,u,n,who,trans,pause,vocal,anonType,anon_nameType,unclear,trunc,event,dialogue
0,words,1,S0094,,,,,,True,,,S23A
1,it's a games word? like a computer games word?,2,S0095,,,,,,True,,,S23A
2,yeah yeah,3,S0032,,,,,,True,,,S23A
3,oh,4,S0095,,,,,,True,,,S23A
4,I it's something I,5,S0032,overlap,,,,,True,,,S23A
...,...,...,...,...,...,...,...,...,...,...,...,...
510493,yeah,1104,S0464,,,,,,True,,,SDMJ
510494,erm,1105,S0456,,,,,,True,,,SDMJ
510495,oh have we? god,1106,S0464,,,,,,True,,,SDMJ
510496,I think I'll stop it there I'll never be able ...,1107,S0456,,,,,,True,,,SDMJ


In [303]:
df_all_speakers

Unnamed: 0,id,exactage,age1994,agerange,gender,nat,birthplace,birthcountry,l1,lingorig,...,dialect_l2,dialect_l3,dialect_l4,edqual,occupation,socgrade,nssec,l2,fls,in_core
0,S0021,27,25_34,19_29,F,British,Swindon,England,English,England,...,england,south,southwest,5_postgrad,Teacher,B,2,,,y
1,S0032,28,25_34,19_29,M,British,Yoevil,England,English,England,...,england,south,southwest,4_graduate,Software developer,A,1_2,,,y
2,S0094,33,25_34,30_39,F,British,Swindon,England,English,England,...,england,south,southwest,5_postgrad,PhD student,A,1_2,German,Welsh -- Beginner,y
3,S0095,33,25_34,30_39,M,British,Camarthen,Scotland,English,England,...,wales,wales,wales,5_postgrad,Self employed maker,E,uncat,,,y
0,S0261,41,35_44,40_49,M,British/New Zealand,Wellington,New Zealand,English,England/NZ,...,non_uk,non_uk,non_uk,4_graduate,Entrepreneur,A,1_2,,,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,S0230,57,45_59,50_59,F,British,"Harlow, Essex",England,English,England,...,england,south,unspecified,3_sixthform,care assistant,D,6,,,y
0,S0326,20,15_24,19_29,M,British,London,England,English,England,...,england,south,london,4_graduate,student,E,uncat,,,y
1,S0329,50,45_59,50_59,F,British,London,England,English,English,...,england,unspecified,unspecified,4_graduate,Literary Editor,B,2,French,,n
0,S0456,44,35_44,40_49,M,British/ German,Salzkotten,Germany,German,English,...,england,north,liverpool,5_postgrad,Lecturer (English Language and Literature),A,1_2,,,y


The tagged data follows the same process, except we include the words and their classes in the utterances. All the words and their related variables are extracted to df_word_data.

- df_all_speakers : contains all the speakers and their related variables.
- df_all_utts : contains all the utterances and their related variables.
- df_word_data : contains all the words and their related variables.

In [304]:
def get_xml(f_path):
    with open(f_path, 'r') as f:
        f = f.read()
    xml = etree.fromstring(f)
    return xml

In [305]:
dialogue_data2 = []
for _, path in enumerate(f_paths2):  # for each of the dialogues
    if _ == Total_data_to_process:
        break
    for i in get_xml(path).xpath('//u'):
        u_n = i.xpath('./@n')[0]
        u_who = i.xpath('./@who')[0]
        u_trans = i.xpath('./@trans')[0]
        u_confidence = i.xpath('./@whoConfidence')[0]

        full_data = []
        w_pos_data = []
        w_lemma_data = []
        w_class_data = []
        w_usas_data = []
        w_words_data = []
        for w in i.xpath('.//w'):
            
            w_pos_data.append(w.xpath('./@pos')[0])
            w_lemma_data.append(w.xpath('./@lemma')[0])
            w_class_data.append(w.xpath('./@class')[0])
            w_usas_data.append(w.xpath('./@usas')[0])
            w_words_data.append(w.xpath('./text()')[0])
    

        full_data.append({
                'u_n': u_n,
                'u_who': u_who,
                'u_trans': u_trans,
                'u_confidence': u_confidence,
                'w_pos': ', '.join(w_pos_data),
                'w_lemma': ', '.join(w_lemma_data),
                'w_class': ', '.join(w_class_data),
                'w_usas': ', '.join(w_usas_data),
                'w_words': ', '.join(w_words_data),
                'dialogue': path.split('/')[-1].replace("-tgd.xml", "")
        })

        dialogue_data2.extend(full_data)

df_word_data = pd.DataFrame(dialogue_data2)

In [306]:
df_word_data

Unnamed: 0,u_n,u_who,u_trans,u_confidence,w_pos,w_lemma,w_class,w_usas,w_words,dialogue
0,1,S0094,nonoverlap,high,NN2,word,SUBST,Q3,words,S23A
1,2,S0095,nonoverlap,high,"PPH1, VBZ, AT1, NN2, NN1, YQUE, II, AT1, NN1, ...","it, be, a, game, word, PUNC, like, a, computer...","PRON, VERB, ART, SUBST, SUBST, STOP, PREP, ART...","Z8, A3, Z5, K5:1, Q3, , Z5, Z5, K5:2, K5:2, Q3,","it, 's, a, games, word, ?, like, a, computer, ...",S23A
2,3,S0032,nonoverlap,high,"UH, UH","yeah, yeah","INTERJ, INTERJ","Z4, Z4","yeah, yeah",S23A
3,4,S0095,nonoverlap,high,"UH, UH, DD1, VBZ, JJ","oh, oh, that, be, nice","INTERJ, INTERJ, ADJ, VERB, ADJ","Z4, Z4, Z8, A3, O4:2","oh, oh, that, 's, nice",S23A
4,5,S0032,overlap,high,"PPIS1, PPH1, VBZ, PN1, PPIS1, VH0, RR, VVN, JJ...","i, it, be, something, i, have, really, hear, z...","PRON, PRON, VERB, PRON, PRON, VERB, ADV, VERB,...","Z8, Z8, A3, Z8, Z8, Z5, A13:3, X3:2, Z99, S8, ...","I, it, 's, something, I, have, really, heard, ...",S23A
...,...,...,...,...,...,...,...,...,...,...
510493,1104,S0464,nonoverlap,high,UH,yeah,INTERJ,Z4,yeah,SDMJ
510494,1105,S0456,nonoverlap,high,"UH, VM21, VM22, VVI, RT, PPH1, RR, VVZ, PPIS1,...","erm, let, 's, see, now, it, still, work, i, do...","INTERJ, VERB, VERB, VERB, ADV, PRON, ADV, VERB...","Z4, Z5, Z5, X3:4, T1:1:2, Z8, T2, I3:1, Z8, Z5...","erm, let, 's, see, now, it, still, works, I, d...",SDMJ
510495,1106,S0464,nonoverlap,high,"UH, VH0, PPIS2, YQUE, NN1","oh, have, we, PUNC, god","INTERJ, VERB, PRON, STOP, SUBST","Z4, Z5, Z8, , Z4","oh, have, we, ?, god",SDMJ
510496,1107,S0456,nonoverlap,high,"PPIS1, VV0, PPIS1, VM, VVI, PPH1, RL, PPIS1, V...","i, think, i, will, stop, it, there, i, will, n...","PRON, VERB, PRON, VERB, VERB, PRON, ADV, PRON,...","Z8, X2:1, Z8, T1:1:3, T2, Z8, M6, Z8, T1:1:3, ...","I, think, I, 'll, stop, it, there, I, 'll, nev...",SDMJ


Next step would be to merge these 3 dataframes to a final dataframe that contains the required data for classification and analysis.

- df_all_utts
    - u_len : a variable that can be used to calculate the length of each utterances for a particular speaker.
    - avg_u_len_per_speaker : a variable that can be used to calculate the average length of utterances for a particular speaker.

- df_word_data

In [307]:
# create a new column called 'u_len' and get the count of total utterances for each speaker
df_all_utts['u_len'] = df_all_utts['u'].str.len()
# create a new column called 'avg_u_len_per_speaker' and get the average
df_all_utts['avg_u_len_per_speaker'] = df_all_utts.groupby('who')['u_len'].transform('mean')

In [308]:
# create a python script to drop pause, vocal, anon, unclear, trans, trunc, event from df_all_utts
df_all_utts = df_all_utts.drop(columns=['pause', 'vocal', 'anonType', 'anon_nameType', 'unclear', 'trans', 'trunc', 'event'])

In [309]:
df_all_utts

Unnamed: 0,u,n,who,dialogue,u_len,avg_u_len_per_speaker
0,words,1,S0094,S23A,5.0,20.775337
1,it's a games word? like a computer games word?,2,S0095,S23A,46.0,23.362179
2,yeah yeah,3,S0032,S23A,9.0,28.102171
3,oh,4,S0095,S23A,2.0,23.362179
4,I it's something I,5,S0032,S23A,18.0,28.102171
...,...,...,...,...,...,...
510493,yeah,1104,S0464,SDMJ,4.0,18.602386
510494,erm,1105,S0456,SDMJ,3.0,27.865338
510495,oh have we? god,1106,S0464,SDMJ,15.0,18.602386
510496,I think I'll stop it there I'll never be able ...,1107,S0456,SDMJ,69.0,27.865338


In [310]:
df_word_data

Unnamed: 0,u_n,u_who,u_trans,u_confidence,w_pos,w_lemma,w_class,w_usas,w_words,dialogue
0,1,S0094,nonoverlap,high,NN2,word,SUBST,Q3,words,S23A
1,2,S0095,nonoverlap,high,"PPH1, VBZ, AT1, NN2, NN1, YQUE, II, AT1, NN1, ...","it, be, a, game, word, PUNC, like, a, computer...","PRON, VERB, ART, SUBST, SUBST, STOP, PREP, ART...","Z8, A3, Z5, K5:1, Q3, , Z5, Z5, K5:2, K5:2, Q3,","it, 's, a, games, word, ?, like, a, computer, ...",S23A
2,3,S0032,nonoverlap,high,"UH, UH","yeah, yeah","INTERJ, INTERJ","Z4, Z4","yeah, yeah",S23A
3,4,S0095,nonoverlap,high,"UH, UH, DD1, VBZ, JJ","oh, oh, that, be, nice","INTERJ, INTERJ, ADJ, VERB, ADJ","Z4, Z4, Z8, A3, O4:2","oh, oh, that, 's, nice",S23A
4,5,S0032,overlap,high,"PPIS1, PPH1, VBZ, PN1, PPIS1, VH0, RR, VVN, JJ...","i, it, be, something, i, have, really, hear, z...","PRON, PRON, VERB, PRON, PRON, VERB, ADV, VERB,...","Z8, Z8, A3, Z8, Z8, Z5, A13:3, X3:2, Z99, S8, ...","I, it, 's, something, I, have, really, heard, ...",S23A
...,...,...,...,...,...,...,...,...,...,...
510493,1104,S0464,nonoverlap,high,UH,yeah,INTERJ,Z4,yeah,SDMJ
510494,1105,S0456,nonoverlap,high,"UH, VM21, VM22, VVI, RT, PPH1, RR, VVZ, PPIS1,...","erm, let, 's, see, now, it, still, work, i, do...","INTERJ, VERB, VERB, VERB, ADV, PRON, ADV, VERB...","Z4, Z5, Z5, X3:4, T1:1:2, Z8, T2, I3:1, Z8, Z5...","erm, let, 's, see, now, it, still, works, I, d...",SDMJ
510495,1106,S0464,nonoverlap,high,"UH, VH0, PPIS2, YQUE, NN1","oh, have, we, PUNC, god","INTERJ, VERB, PRON, STOP, SUBST","Z4, Z5, Z8, , Z4","oh, have, we, ?, god",SDMJ
510496,1107,S0456,nonoverlap,high,"PPIS1, VV0, PPIS1, VM, VVI, PPH1, RL, PPIS1, V...","i, think, i, will, stop, it, there, i, will, n...","PRON, VERB, PRON, VERB, VERB, PRON, ADV, PRON,...","Z8, X2:1, Z8, T1:1:3, T2, Z8, M6, Z8, T1:1:3, ...","I, think, I, 'll, stop, it, there, I, 'll, nev...",SDMJ


Merge the columns from df_all_utts to df_word_data to get the required data for classification and analysis.

In [311]:
# Merge df_word_data and df_all_utts on 'who' and 'u_who'
df_word_utterance = df_all_utts.merge(df_word_data, left_on='who', right_on='u_who', how='inner')

# Drop the duplicate ['u_n', 'u_who','dialogue_x'] column from the merge
df_word_utterance = df_word_utterance.drop(['u_n', 'u_who','dialogue_x'], axis=1)

# Print the columns of the merged dataframe
print(df_word_utterance.columns)

MemoryError: Unable to allocate 14.0 GiB for an array with shape (1885192822,) and data type int64

In [None]:
df_word_utterance

Unnamed: 0,u,n,who,u_len,avg_u_len_per_speaker,u_trans,u_confidence,w_pos,w_lemma,w_class,w_usas,w_words,dialogue_y
0,"an hour later , hope she stays down , rather...",1,S0024,51,47.752137,nonoverlap,high,"AT1, NNT1, RRR, VV0, PPHS1, VVZ, RP, RG, JJ","a, hour, later, hope, she, stay, down, rather,...","ART, SUBST, ADV, VERB, PRON, VERB, ADV, ADV, ADJ","Z5, T1:3, T4, X2:6, Z8, M8, Z5, A13:5, T4","an, hour, later, hope, she, stays, down, rathe...",S2A5
1,"an hour later , hope she stays down , rather...",1,S0024,51,47.752137,nonoverlap,high,"UH, PPIS1, VV0, CCB, DD1, VBZ, RRQ, PPIS2, VBR...","yeah, i, know, but, that, be, why, we, be, a, ...","INTERJ, PRON, VERB, CONJ, ADJ, VERB, ADV, PRON...","Z4, Z8, X2:2, Z5, Z8, A3, A2:2, Z8, A3, Z5, T1...","yeah, I, know, but, that, 's, why, we, 're, an...",S2A5
2,"an hour later , hope she stays down , rather...",1,S0024,51,47.752137,nonoverlap,high,"VDD, PPY, NN1, NP1","do, you, text, --anonnamem","VERB, PRON, SUBST, SUBST","A1:1:1, Z8, Q1:2, Z1","did, you, text, --ANONnameM",S2A5
3,"an hour later , hope she stays down , rather...",1,S0024,51,47.752137,nonoverlap,high,UH,oh,INTERJ,Z4,oh,S2A5
4,"an hour later , hope she stays down , rather...",1,S0024,51,47.752137,nonoverlap,high,"DD, NN, VV0, NN1, NN1, RRR, CSN, NN2, UH","some, people, get, jet, lag, longer, than, oth...","ADJ, SUBST, VERB, SUBST, SUBST, ADV, CONJ, SUB...","N5, S2, A9, M5, M5, T1:3, Z5, A6:1, Z4","some, people, get, jet, lag, longer, than, oth...",S2A5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27140,"oh yeah ooh , ooh I got ready for bed as wel...",232,S0144,131,22.517241,nonoverlap,high,UH,mm,INTERJ,Z4,mm,S2A5
27141,"oh yeah ooh , ooh I got ready for bed as wel...",232,S0144,131,22.517241,nonoverlap,high,"PPHS1, VBZ, JJ, YQUE","she, be, asleep, PUNC","PRON, VERB, ADJ, STOP","Z8, A3, B1,","she, 's, asleep, ?",S2A5
27142,"oh yeah ooh , ooh I got ready for bed as wel...",232,S0144,131,22.517241,nonoverlap,high,"RR, RR, PPIS1, VVD, AT, NN1, VVI, RR21, RR22, RR","okay, well, i, hear, the, oven, ping, as, well...","ADV, ADV, PRON, VERB, ART, SUBST, VERB, ADV, A...","A5:1, A5:1, Z8, X3:2, Z5, O2, Y2, N5, N5, Z5","okay, well, I, heard, the, oven, ping, as, wel...",S2A5
27143,"oh yeah ooh , ooh I got ready for bed as wel...",232,S0144,131,22.517241,nonoverlap,high,"DDQ, VDZ, DD1, VVI, YQUE","what, do, that, mean, PUNC","PRON, VERB, ADJ, VERB, STOP","Z8, Z5, Z8, Q1:1,","what, does, that, mean, ?",S2A5


This dataset now has all the columns necessary for our analysis. But its based on speakers. Lets convert it to 2 groups based on agerange and then get the stats.

In [None]:
# display exactage and agerange of each speaker
df1 = df_all_speakers[['id', 'exactage', 'agerange','gender']]
df1

Unnamed: 0,id,exactage,agerange,gender
0,S0024,36,30_39,F
1,S0144,36,30_39,M


In [None]:
# get u, avg_u_len_per_speaker, dialogue, u_confidence, w_pos, w_lemma, w_class, w_usas, w_words as well as the u columns from df_word_utterance and group by 'who'
df2 = df_word_utterance.groupby('who').first()[['u', 'avg_u_len_per_speaker', 'dialogue_y', 'u_confidence', 'w_pos', 'w_lemma', 'w_class', 'w_usas', 'w_words']].reset_index()

Unnamed: 0,who,u,avg_u_len_per_speaker,dialogue_y,u_confidence,w_pos,w_lemma,w_class,w_usas,w_words
0,S0024,"an hour later , hope she stays down , rather...",47.752137,S2A5,high,"AT1, NNT1, RRR, VV0, PPHS1, VVZ, RP, RG, JJ","a, hour, later, hope, she, stay, down, rather,...","ART, SUBST, ADV, VERB, PRON, VERB, ADV, ADV, ADJ","Z5, T1:3, T4, X2:6, Z8, M8, Z5, A13:5, T4","an, hour, later, hope, she, stays, down, rathe..."
1,S0144,well she had those two hours earlier,22.517241,S2A5,high,"RR, PPHS1, VHD, DD2, MC, NNT2, RRR","well, she, have, those, two, hour, earlier","ADV, PRON, VERB, ADJ, ADJ, SUBST, ADV","A5:1, Z8, A9, Z5, N1, T1:3, N4","well, she, had, those, two, hours, earlier"


In [None]:
df2 = df_word_utterance.groupby('who').agg({
    'u': ' '.join,
    'avg_u_len_per_speaker': 'first',
    'dialogue_y': 'count',
    # 'u_confidence': ' '.join,
    # 'w_pos': ' '.join,
    # 'w_lemma': ' '.join,
    'w_class': ' '.join,
    # 'w_usas': ' '.join,
    'w_words': ' '.join
}).reset_index()

df2.columns = ['who', 'joined_utterances', 'avg_u_len_per_speaker', 'dialogue_count', 'word_class', 'joined_words']

df2

Unnamed: 0,who,joined_utterances,avg_u_len_per_speaker,dialogue_count,word_class,joined_words
0,S0024,"an hour later , hope she stays down , rather...",47.752137,13689,"ART, SUBST, ADV, VERB, PRON, VERB, ADV, ADV, A...","an, hour, later, hope, she, stays, down, rathe..."
1,S0144,well she had those two hours earlier well she ...,22.517241,13456,"ADV, PRON, VERB, ADJ, ADJ, SUBST, ADV INTERJ,...","well, she, had, those, two, hours, earlier ye..."


In [None]:
# merge df1 and df_utts on the column 'who' and 'id' respectively
df_final = pd.merge(df1, df2, left_on='id', right_on='who')

# drop the duplicate column 'who'
df_final = df_final.drop(['who'], axis=1)

# remove duplicate rows in id column
df_final = df_final.drop_duplicates(subset=['id'])
df_final

Unnamed: 0,id,exactage,agerange,gender,joined_utterances,avg_u_len_per_speaker,dialogue_count,word_class,joined_words
0,S0024,36,30_39,F,"an hour later , hope she stays down , rather...",47.752137,13689,"ART, SUBST, ADV, VERB, PRON, VERB, ADV, ADV, A...","an, hour, later, hope, she, stays, down, rathe..."
1,S0144,36,30_39,M,well she had those two hours earlier well she ...,22.517241,13456,"ADV, PRON, VERB, ADJ, ADJ, SUBST, ADV INTERJ,...","well, she, had, those, two, hours, earlier ye..."


While converting the exactage to integer, I encountered a problem where the age was contained a character. So I had to remove it inorder to convert it to integer.

In [None]:
# get the exactage of each speaker '60s'
print(df_final[df_final['exactage'] == '60s'])

# convert that '60s' to 60
df_final.loc[df_final['exactage'] == '60s', 'exactage'] = 60

print(df_final[df_final['exactage'] == '60s'])

Empty DataFrame
Columns: [id, exactage, agerange, gender, joined_utterances, avg_u_len_per_speaker, dialogue_count, word_class, joined_words]
Index: []
Empty DataFrame
Columns: [id, exactage, agerange, gender, joined_utterances, avg_u_len_per_speaker, dialogue_count, word_class, joined_words]
Index: []


In [None]:
# remove rows with missing values
df_final = df_final.dropna()
# convert the column 'exactage' to integer
df_final['exactage'] = df_final['exactage'].astype(int)
# convert the column 'dialogue_count' to integer
df_final['dialogue_count'] = df_final['dialogue_count'].astype(int)

df_final.shape

(2, 9)

In [None]:
df_final.dtypes

id                        object
exactage                   int32
agerange                  object
gender                    object
joined_utterances         object
avg_u_len_per_speaker    float64
dialogue_count             int32
word_class                object
joined_words              object
dtype: object

In [None]:
df_final.describe()

Unnamed: 0,exactage,avg_u_len_per_speaker,dialogue_count
count,2.0,2.0,2.0
mean,36.0,35.134689,13572.5
std,0.0,17.843766,164.75588
min,36.0,22.517241,13456.0
25%,36.0,28.825965,13514.25
50%,36.0,35.134689,13572.5
75%,36.0,41.443413,13630.75
max,36.0,47.752137,13689.0


In [None]:
df_final['agerange'].value_counts().sort_index()

agerange
30_39    2
Name: count, dtype: int64

In [None]:
df_final = df_final.sort_values('agerange')
df_final

Unnamed: 0,id,exactage,agerange,gender,joined_utterances,avg_u_len_per_speaker,dialogue_count,word_class,joined_words
0,S0024,36,30_39,F,"an hour later , hope she stays down , rather...",47.752137,13689,"ART, SUBST, ADV, VERB, PRON, VERB, ADV, ADV, A...","an, hour, later, hope, she, stays, down, rathe..."
1,S0144,36,30_39,M,well she had those two hours earlier well she ...,22.517241,13456,"ADV, PRON, VERB, ADJ, ADJ, SUBST, ADV INTERJ,...","well, she, had, those, two, hours, earlier ye..."


In [None]:
import plotly.graph_objects as go

# create a pie chart of agerange distribution
fig = go.Figure(data=[go.Pie(labels=df_final['agerange'].value_counts().index,
                             values=df_final['agerange'].value_counts().values)])

# update the layout of the chart
fig.update_layout(title='Distribution of agerange')

# show the chart
fig.show()


In [None]:
df_final.head()

Unnamed: 0,id,exactage,agerange,gender,joined_utterances,avg_u_len_per_speaker,dialogue_count,word_class,joined_words
0,S0024,36,30_39,F,"an hour later , hope she stays down , rather...",47.752137,13689,"ART, SUBST, ADV, VERB, PRON, VERB, ADV, ADV, A...","an, hour, later, hope, she, stays, down, rathe..."
1,S0144,36,30_39,M,well she had those two hours earlier well she ...,22.517241,13456,"ADV, PRON, VERB, ADJ, ADJ, SUBST, ADV INTERJ,...","well, she, had, those, two, hours, earlier ye..."


In [None]:
# show the number of speakers in each age group
df_final['agerange'].value_counts()

agerange
30_39    2
Name: count, dtype: int64

In [None]:
df_agerange = df_final.groupby('agerange').agg({'avg_u_len_per_speaker': 'mean', 'dialogue_count': 'sum', 'id': 'count'}).reset_index()
df_agerange.rename(columns={'avg_u_len_per_speaker': 'avg_u_len_per_agerange'}, inplace=True)
df_agerange.rename(columns={'dialogue_count': 'num_of_dialogues'}, inplace=True)
df_agerange.rename(columns={'id': 'num_of_speakers'}, inplace=True)

# add a new column 'avg_utterances_per_speaker'
# df_agerange['avg_u_per_speaker'] = df_agerange['total_utterances'] / df_agerange['num_of_speakers']

In [None]:
df_agerange

Unnamed: 0,agerange,avg_u_len_per_agerange,num_of_dialogues,num_of_speakers
0,30_39,35.134689,27145,2


In [None]:
df_agerange.describe()

Unnamed: 0,avg_u_len_per_agerange,num_of_dialogues,num_of_speakers
count,1.0,1.0,1.0
mean,35.134689,27145.0,2.0
std,,,
min,35.134689,27145.0,2.0
25%,35.134689,27145.0,2.0
50%,35.134689,27145.0,2.0
75%,35.134689,27145.0,2.0
max,35.134689,27145.0,2.0


In [None]:
df_agerange.iloc[[2, 5, 6, 7, 8, 9], :]

In [None]:
# create a new dataframe that contains the same number of columns but only 2 rows. one with 19_29 and the other with 50_59, 60_69 to 90_99
df_agerange_2 = df_agerange.iloc[[2, 5, 6, 7, 8, 9], :]

# sum the values of the rows 5, 6, 7, 8, 9 and assign it to the row 3
df_agerange_2.iloc[1, 1:] = df_agerange.iloc[5:10, 1:].sum()

# drop the rows 5, 6, 7, 8, 9
df_agerange_2 = df_agerange_2.drop([6, 7, 8, 9])

# now add a new column 'agegroup' to the dataframe that contains two values 'younger' and 'older'
df_agerange_2['agegroup'] = ['19-29', '50+']

# drop the column 'agerange'
df_agerange_2 = df_agerange_2.drop(columns=['agerange'])

# bring the column 'agegroup' to the first position
cols = df_agerange_2.columns.tolist()
cols = cols[-1:] + cols[:-1]
df_agerange_2 = df_agerange_2[cols]

# reset the index of the dataframe
df_agerange_2 = df_agerange_2.reset_index(drop=True)

df_agerange_2

In [None]:
df_agerange_2.describe()

In [2]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')