# Analysing dialogues between speakers of different ages

## Task 1: Description of Data and Methods (10/50) – (~max 600 words)

In [1]:
import os
import random
from collections import defaultdict
from pprint import pprint

import numpy as np
import pandas as pd

from lxml import etree

import altair as alt

The aim is to create a dataframe that deals with:
- Speaker
- age-range
- utterances
- individual tokens in that utterances
- word classes for those tokens (check if that provides any insights during classification)
- other utterance, dialogue, word variables

Variables like speaker, agerange, utterances and other related statistics can be collected from untagged. But inorder to get the tokens and their respective classes, tagged files should also be included.
- dir_corpus : untagged
- dir_corpus2 : tagged

In [2]:
dir_corpus = 'bnc2014spoken-xml/spoken/untagged/'
print(dir_corpus)
f_names = os.listdir(dir_corpus)
f_paths = [f"{dir_corpus}{f_name}" for f_name in f_names]

bnc2014spoken-xml/spoken/untagged/


In [3]:
dir_corpus2 = 'bnc2014spoken-xml/spoken/tagged/'
print(dir_corpus2)
f_names2 = os.listdir(dir_corpus2)
f_paths2 = [f"{dir_corpus2}{f_name}" for f_name in f_names2]

f_paths2 = ['bnc2014spoken-xml/spoken/tagged/S2A5-tgd.xml']

print(f_paths2)

bnc2014spoken-xml/spoken/tagged/
['bnc2014spoken-xml/spoken/tagged/S2A5-tgd.xml']


In [4]:
# delete cell

f_paths = ['bnc2014spoken-xml/spoken/untagged/S2A5.xml']

f_paths

['bnc2014spoken-xml/spoken/untagged/S2A5.xml']

Now that we have the xml files ready, lets start organizing the data into a dataframe. Start with the untagged files. It can be split into 2 parts. The speakers are extracted to df_speakers and the utterances are extracted to df_utts. A list is created to append all the data to a single variable and then is later converted to 2 dataframes each for total speakers and total utterances.

In [5]:
dialogue_data = []
for path in f_paths: # for each of the dialogues
    print(path)
    df_utts = pd.read_xml(path, xpath="//u")
    # add the diaogue file name to the dataframe
    df_utts['dialogue'] = path.split('/')[-1].replace(".xml", "")
    df_speakers = pd.read_xml(path, xpath="//speaker")
    dialogue_data.append((df_utts, df_speakers))

bnc2014spoken-xml/spoken/untagged/S2A5.xml


The tagged data follows the same process, except we include the words and their classes in the utterances. All the words and their related variables are extracted to df_word_data.

- df_all_speakers : contains all the speakers and their related variables.
- df_all_utts : contains all the utterances and their related variables.
- df_word_data : contains all the words and their related variables.

In [6]:
def get_xml(f_path):
    with open(f_path, 'r') as f:
        f = f.read()
    xml = etree.fromstring(f)
    return xml

In [7]:
dialogue_data2 = []
for path in f_paths2:  # for each of the dialogues
    for i in get_xml(path).xpath('//u'):
        u_n = i.xpath('./@n')[0]
        u_who = i.xpath('./@who')[0]
        u_trans = i.xpath('./@trans')[0]
        u_confidence = i.xpath('./@whoConfidence')[0]

        full_data = []
        w_pos_data = []
        w_lemma_data = []
        w_class_data = []
        w_usas_data = []
        w_words_data = []
        for w in i.xpath('.//w'):
            w_pos_data.append(w.xpath('./@pos')[0])
            w_lemma_data.append(w.xpath('./@lemma')[0])
            w_class_data.append(w.xpath('./@class')[0])
            w_usas_data.append(w.xpath('./@usas')[0])
            w_words_data.append(w.xpath('./text()')[0])
        
        full_data.append({
                'u_n': u_n,
                'u_who': u_who,
                'u_trans': u_trans,
                'u_confidence': u_confidence,
                'w_pos': w_pos_data,
                'w_lemma': w_lemma_data,
                'w_class': w_class_data,
                'w_usas': w_usas_data,
                'w_words': w_words_data,
                'dialogue': path.split('/')[-1].replace("-tgd.xml", "")
        })

        dialogue_data2.extend(full_data)

df_word_data = pd.DataFrame(dialogue_data2)

In [8]:
df_word_data

Unnamed: 0,u_n,u_who,u_trans,u_confidence,w_pos,w_lemma,w_class,w_usas,w_words,dialogue
0,1,S0024,nonoverlap,high,"[AT1, NNT1, RRR, VV0, PPHS1, VVZ, RP, RG, JJ]","[a, hour, later, hope, she, stay, down, rather...","[ART, SUBST, ADV, VERB, PRON, VERB, ADV, ADV, ...","[Z5, T1:3, T4, X2:6, Z8, M8, Z5, A13:5, T4]","[an, hour, later, hope, she, stays, down, rath...",S2A5
1,2,S0144,nonoverlap,high,"[RR, PPHS1, VHD, DD2, MC, NNT2, RRR]","[well, she, have, those, two, hour, earlier]","[ADV, PRON, VERB, ADJ, ADJ, SUBST, ADV]","[A5:1, Z8, A9, Z5, N1, T1:3, N4]","[well, she, had, those, two, hours, earlier]",S2A5
2,3,S0024,nonoverlap,high,"[UH, PPIS1, VV0, CCB, DD1, VBZ, RRQ, PPIS2, VB...","[yeah, i, know, but, that, be, why, we, be, a,...","[INTERJ, PRON, VERB, CONJ, ADJ, VERB, ADV, PRO...","[Z4, Z8, X2:2, Z5, Z8, A3, A2:2, Z8, A3, Z5, T...","[yeah, I, know, but, that, 's, why, we, 're, a...",S2A5
3,4,S0144,nonoverlap,high,[],[],[],[],[],S2A5
4,5,S0024,nonoverlap,high,"[VDD, PPY, NN1, NP1]","[do, you, text, --anonnamem]","[VERB, PRON, SUBST, SUBST]","[A1:1:1, Z8, Q1:2, Z1]","[did, you, text, --ANONnameM]",S2A5
...,...,...,...,...,...,...,...,...,...,...
228,229,S0024,nonoverlap,high,[UH],[yeah],[INTERJ],[Z4],[yeah],S2A5
229,230,S0144,nonoverlap,high,"[DDQ, VDZ, DD1, VVI, YQUE]","[what, do, that, mean, PUNC]","[PRON, VERB, ADJ, VERB, STOP]","[Z8, Z5, Z8, Q1:1, ]","[what, does, that, mean, ?]",S2A5
230,231,S0024,nonoverlap,high,"[VVZ, AT, NN2, VBR, JJ, UH]","[mean, the, potato, be, ready, yay]","[VERB, ART, SUBST, VERB, ADJ, INTERJ]","[Q1:1, Z5, L3, A3, O4:1, Z2]","[means, the, potatoes, are, ready, yay]",S2A5
231,232,S0144,nonoverlap,high,"[UH, UH, UH, UH, PPIS1, VVD, JJ, IF, NN1, RR21...","[oh, yeah, ooh, ooh, i, get, ready, for, bed, ...","[INTERJ, INTERJ, INTERJ, INTERJ, PRON, VERB, A...","[Z4, Z4, Z4, Z4, Z8, A2:1, O4:1, Z5, H5, N5, N...","[oh, yeah, ooh, ooh, I, got, ready, for, bed, ...",S2A5


In [9]:
# Concatenate all speakers dataframes into one
df_all_speakers = pd.concat([speaker_data[1] for speaker_data in dialogue_data])
df_all_speakers

Unnamed: 0,id,exactage,age1994,agerange,gender,nat,birthplace,birthcountry,l1,lingorig,...,dialect_l2,dialect_l3,dialect_l4,edqual,occupation,socgrade,nssec,l2,fls,in_core
0,S0024,36,35_44,30_39,F,British,Norwich,England,English,England,...,england,south,unspecified,5_postgrad,lecturer,A,1_2,,,n
1,S0144,36,35_44,30_39,M,British,London,England,English,England,...,england,south,unspecified,5_postgrad,Lecturer,A,1_2,,,y


In [10]:
# Concatenate all utterances dataframes into one
df_all_utts = pd.concat([utt_data[0] for utt_data in dialogue_data])
df_all_utts

Unnamed: 0,n,who,u,pause,vocal,anon,unclear,trans,trunc,event,dialogue
0,1,S0024,an hour later,,,,,,,,S2A5
1,2,S0144,well she had those two hours earlier,,,,,,,,S2A5
2,3,S0024,yeah I know but that's why we're an hour late ...,,,,,,,,S2A5
3,4,S0144,,,,,,,,,S2A5
4,5,S0024,did you text,,,,,,,,S2A5
...,...,...,...,...,...,...,...,...,...,...,...
228,229,S0024,yeah,,,,,,,,S2A5
229,230,S0144,what does that mean?,,,,,,,,S2A5
230,231,S0024,means the potatoes are ready,,,,,,,,S2A5
231,232,S0144,,,,,,,,,S2A5


Next step would be to merge these 3 dataframes to a final dataframe that contains the required data for classification and analysis.

- df_all_utts
    - u_len : a variable that can be used to calculate the length of each utterances for a particular speaker.
    - avg_u_len_per_speaker : a variable that can be used to calculate the average length of utterances for a particular speaker.

- df_word_data

In [11]:
# create a new column called 'u_len' and get the count of total utterances for each speaker
df_all_utts['u_len'] = df_all_utts['u'].str.len()
# create a new column called 'avg_u_len_per_speaker' and get the average
df_all_utts['avg_u_len_per_speaker'] = df_all_utts.groupby('who')['u_len'].transform('mean')

In [12]:
# create a python script to drop pause, vocal, anon, unclear, trans, trunc, event from df_all_utts
df_all_utts = df_all_utts.drop(columns=['pause', 'vocal', 'anon', 'unclear', 'trans', 'trunc', 'event'])

In [13]:
df_word_data

Unnamed: 0,u_n,u_who,u_trans,u_confidence,w_pos,w_lemma,w_class,w_usas,w_words,dialogue
0,1,S0024,nonoverlap,high,"[AT1, NNT1, RRR, VV0, PPHS1, VVZ, RP, RG, JJ]","[a, hour, later, hope, she, stay, down, rather...","[ART, SUBST, ADV, VERB, PRON, VERB, ADV, ADV, ...","[Z5, T1:3, T4, X2:6, Z8, M8, Z5, A13:5, T4]","[an, hour, later, hope, she, stays, down, rath...",S2A5
1,2,S0144,nonoverlap,high,"[RR, PPHS1, VHD, DD2, MC, NNT2, RRR]","[well, she, have, those, two, hour, earlier]","[ADV, PRON, VERB, ADJ, ADJ, SUBST, ADV]","[A5:1, Z8, A9, Z5, N1, T1:3, N4]","[well, she, had, those, two, hours, earlier]",S2A5
2,3,S0024,nonoverlap,high,"[UH, PPIS1, VV0, CCB, DD1, VBZ, RRQ, PPIS2, VB...","[yeah, i, know, but, that, be, why, we, be, a,...","[INTERJ, PRON, VERB, CONJ, ADJ, VERB, ADV, PRO...","[Z4, Z8, X2:2, Z5, Z8, A3, A2:2, Z8, A3, Z5, T...","[yeah, I, know, but, that, 's, why, we, 're, a...",S2A5
3,4,S0144,nonoverlap,high,[],[],[],[],[],S2A5
4,5,S0024,nonoverlap,high,"[VDD, PPY, NN1, NP1]","[do, you, text, --anonnamem]","[VERB, PRON, SUBST, SUBST]","[A1:1:1, Z8, Q1:2, Z1]","[did, you, text, --ANONnameM]",S2A5
...,...,...,...,...,...,...,...,...,...,...
228,229,S0024,nonoverlap,high,[UH],[yeah],[INTERJ],[Z4],[yeah],S2A5
229,230,S0144,nonoverlap,high,"[DDQ, VDZ, DD1, VVI, YQUE]","[what, do, that, mean, PUNC]","[PRON, VERB, ADJ, VERB, STOP]","[Z8, Z5, Z8, Q1:1, ]","[what, does, that, mean, ?]",S2A5
230,231,S0024,nonoverlap,high,"[VVZ, AT, NN2, VBR, JJ, UH]","[mean, the, potato, be, ready, yay]","[VERB, ART, SUBST, VERB, ADJ, INTERJ]","[Q1:1, Z5, L3, A3, O4:1, Z2]","[means, the, potatoes, are, ready, yay]",S2A5
231,232,S0144,nonoverlap,high,"[UH, UH, UH, UH, PPIS1, VVD, JJ, IF, NN1, RR21...","[oh, yeah, ooh, ooh, i, get, ready, for, bed, ...","[INTERJ, INTERJ, INTERJ, INTERJ, PRON, VERB, A...","[Z4, Z4, Z4, Z4, Z8, A2:1, O4:1, Z5, H5, N5, N...","[oh, yeah, ooh, ooh, I, got, ready, for, bed, ...",S2A5


In [14]:
df_all_utts[df_all_utts['who'] == 'S0024']['u'].unique()

array(['an hour later',
       "yeah I know but that's why we're an hour late isn't it?",
       'did you text', 'oh', 'some people get jet lag', 'mm',
       'everybody says this direction', 'yeah', 'we were both', 'we no',
       None, 'I hope she sleep tonight', "I'm fed up of her getting up",
       'no', 'she slept in my in our bed',
       "again and I've got and I'm not allowed to do that cos I wake her up hi and I'm like oh",
       'and they say you', "but she just wouldn't settle down",
       'and she settled down when she was being cuddled',
       'a bit worried that her',
       'her change happened from cuddling I think',
       "right at the end of the holiday which means that she's jet-lagged and she's gets",
       'and so', "can't like can't be mean",
       "gotta be confused as a baby haven't you?", 'all of that',
       'wanna go through the new bag actually', "and there's two pairs",
       'so', "so we'll see how she goes",
       "I mean she's fine in all her d

In [15]:
# merge df_all_utts and df_word_data on 'who' from df_all_utts and 'u_who' from df_word_data
df_all_utts = df_all_utts.merge(df_word_data, left_on='who', right_on='u_who')

df_all_utts

Unnamed: 0,n,who,u,dialogue_x,u_len,avg_u_len_per_speaker,u_n,u_who,u_trans,u_confidence,w_pos,w_lemma,w_class,w_usas,w_words,dialogue_y
0,1,S0024,an hour later,S2A5,13.0,27.085714,1,S0024,nonoverlap,high,"[AT1, NNT1, RRR, VV0, PPHS1, VVZ, RP, RG, JJ]","[a, hour, later, hope, she, stay, down, rather...","[ART, SUBST, ADV, VERB, PRON, VERB, ADV, ADV, ...","[Z5, T1:3, T4, X2:6, Z8, M8, Z5, A13:5, T4]","[an, hour, later, hope, she, stays, down, rath...",S2A5
1,1,S0024,an hour later,S2A5,13.0,27.085714,3,S0024,nonoverlap,high,"[UH, PPIS1, VV0, CCB, DD1, VBZ, RRQ, PPIS2, VB...","[yeah, i, know, but, that, be, why, we, be, a,...","[INTERJ, PRON, VERB, CONJ, ADJ, VERB, ADV, PRO...","[Z4, Z8, X2:2, Z5, Z8, A3, A2:2, Z8, A3, Z5, T...","[yeah, I, know, but, that, 's, why, we, 're, a...",S2A5
2,1,S0024,an hour later,S2A5,13.0,27.085714,5,S0024,nonoverlap,high,"[VDD, PPY, NN1, NP1]","[do, you, text, --anonnamem]","[VERB, PRON, SUBST, SUBST]","[A1:1:1, Z8, Q1:2, Z1]","[did, you, text, --ANONnameM]",S2A5
3,1,S0024,an hour later,S2A5,13.0,27.085714,7,S0024,nonoverlap,high,[UH],[oh],[INTERJ],[Z4],[oh],S2A5
4,1,S0024,an hour later,S2A5,13.0,27.085714,9,S0024,nonoverlap,high,"[DD, NN, VV0, NN1, NN1, RRR, CSN, NN2, UH]","[some, people, get, jet, lag, longer, than, ot...","[ADJ, SUBST, VERB, SUBST, SUBST, ADV, CONJ, SU...","[N5, S2, A9, M5, M5, T1:3, Z5, A6:1, Z4]","[some, people, get, jet, lag, longer, than, ot...",S2A5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27140,232,S0144,,S2A5,,21.988889,224,S0144,nonoverlap,high,[UH],[mm],[INTERJ],[Z4],[mm],S2A5
27141,232,S0144,,S2A5,,21.988889,226,S0144,nonoverlap,high,"[PPHS1, VBZ, JJ, YQUE]","[she, be, asleep, PUNC]","[PRON, VERB, ADJ, STOP]","[Z8, A3, B1, ]","[she, 's, asleep, ?]",S2A5
27142,232,S0144,,S2A5,,21.988889,228,S0144,nonoverlap,high,"[RR, RR, PPIS1, VVD, AT, NN1, VVI, RR21, RR22,...","[okay, well, i, hear, the, oven, ping, as, wel...","[ADV, ADV, PRON, VERB, ART, SUBST, VERB, ADV, ...","[A5:1, A5:1, Z8, X3:2, Z5, O2, Y2, N5, N5, Z5]","[okay, well, I, heard, the, oven, ping, as, we...",S2A5
27143,232,S0144,,S2A5,,21.988889,230,S0144,nonoverlap,high,"[DDQ, VDZ, DD1, VVI, YQUE]","[what, do, that, mean, PUNC]","[PRON, VERB, ADJ, VERB, STOP]","[Z8, Z5, Z8, Q1:1, ]","[what, does, that, mean, ?]",S2A5


In [16]:
# display exactage and agerange of each speaker
df1 = df_all_speakers[['id', 'exactage', 'agerange','gender']]
df1

Unnamed: 0,id,exactage,agerange,gender
0,S0024,36,30_39,F
1,S0144,36,30_39,M


In [17]:
df_all_utts

Unnamed: 0,n,who,u,dialogue_x,u_len,avg_u_len_per_speaker,u_n,u_who,u_trans,u_confidence,w_pos,w_lemma,w_class,w_usas,w_words,dialogue_y
0,1,S0024,an hour later,S2A5,13.0,27.085714,1,S0024,nonoverlap,high,"[AT1, NNT1, RRR, VV0, PPHS1, VVZ, RP, RG, JJ]","[a, hour, later, hope, she, stay, down, rather...","[ART, SUBST, ADV, VERB, PRON, VERB, ADV, ADV, ...","[Z5, T1:3, T4, X2:6, Z8, M8, Z5, A13:5, T4]","[an, hour, later, hope, she, stays, down, rath...",S2A5
1,1,S0024,an hour later,S2A5,13.0,27.085714,3,S0024,nonoverlap,high,"[UH, PPIS1, VV0, CCB, DD1, VBZ, RRQ, PPIS2, VB...","[yeah, i, know, but, that, be, why, we, be, a,...","[INTERJ, PRON, VERB, CONJ, ADJ, VERB, ADV, PRO...","[Z4, Z8, X2:2, Z5, Z8, A3, A2:2, Z8, A3, Z5, T...","[yeah, I, know, but, that, 's, why, we, 're, a...",S2A5
2,1,S0024,an hour later,S2A5,13.0,27.085714,5,S0024,nonoverlap,high,"[VDD, PPY, NN1, NP1]","[do, you, text, --anonnamem]","[VERB, PRON, SUBST, SUBST]","[A1:1:1, Z8, Q1:2, Z1]","[did, you, text, --ANONnameM]",S2A5
3,1,S0024,an hour later,S2A5,13.0,27.085714,7,S0024,nonoverlap,high,[UH],[oh],[INTERJ],[Z4],[oh],S2A5
4,1,S0024,an hour later,S2A5,13.0,27.085714,9,S0024,nonoverlap,high,"[DD, NN, VV0, NN1, NN1, RRR, CSN, NN2, UH]","[some, people, get, jet, lag, longer, than, ot...","[ADJ, SUBST, VERB, SUBST, SUBST, ADV, CONJ, SU...","[N5, S2, A9, M5, M5, T1:3, Z5, A6:1, Z4]","[some, people, get, jet, lag, longer, than, ot...",S2A5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27140,232,S0144,,S2A5,,21.988889,224,S0144,nonoverlap,high,[UH],[mm],[INTERJ],[Z4],[mm],S2A5
27141,232,S0144,,S2A5,,21.988889,226,S0144,nonoverlap,high,"[PPHS1, VBZ, JJ, YQUE]","[she, be, asleep, PUNC]","[PRON, VERB, ADJ, STOP]","[Z8, A3, B1, ]","[she, 's, asleep, ?]",S2A5
27142,232,S0144,,S2A5,,21.988889,228,S0144,nonoverlap,high,"[RR, RR, PPIS1, VVD, AT, NN1, VVI, RR21, RR22,...","[okay, well, i, hear, the, oven, ping, as, wel...","[ADV, ADV, PRON, VERB, ART, SUBST, VERB, ADV, ...","[A5:1, A5:1, Z8, X3:2, Z5, O2, Y2, N5, N5, Z5]","[okay, well, I, heard, the, oven, ping, as, we...",S2A5
27143,232,S0144,,S2A5,,21.988889,230,S0144,nonoverlap,high,"[DDQ, VDZ, DD1, VVI, YQUE]","[what, do, that, mean, PUNC]","[PRON, VERB, ADJ, VERB, STOP]","[Z8, Z5, Z8, Q1:1, ]","[what, does, that, mean, ?]",S2A5


In [18]:
df_word_data

Unnamed: 0,u_n,u_who,u_trans,u_confidence,w_pos,w_lemma,w_class,w_usas,w_words,dialogue
0,1,S0024,nonoverlap,high,"[AT1, NNT1, RRR, VV0, PPHS1, VVZ, RP, RG, JJ]","[a, hour, later, hope, she, stay, down, rather...","[ART, SUBST, ADV, VERB, PRON, VERB, ADV, ADV, ...","[Z5, T1:3, T4, X2:6, Z8, M8, Z5, A13:5, T4]","[an, hour, later, hope, she, stays, down, rath...",S2A5
1,2,S0144,nonoverlap,high,"[RR, PPHS1, VHD, DD2, MC, NNT2, RRR]","[well, she, have, those, two, hour, earlier]","[ADV, PRON, VERB, ADJ, ADJ, SUBST, ADV]","[A5:1, Z8, A9, Z5, N1, T1:3, N4]","[well, she, had, those, two, hours, earlier]",S2A5
2,3,S0024,nonoverlap,high,"[UH, PPIS1, VV0, CCB, DD1, VBZ, RRQ, PPIS2, VB...","[yeah, i, know, but, that, be, why, we, be, a,...","[INTERJ, PRON, VERB, CONJ, ADJ, VERB, ADV, PRO...","[Z4, Z8, X2:2, Z5, Z8, A3, A2:2, Z8, A3, Z5, T...","[yeah, I, know, but, that, 's, why, we, 're, a...",S2A5
3,4,S0144,nonoverlap,high,[],[],[],[],[],S2A5
4,5,S0024,nonoverlap,high,"[VDD, PPY, NN1, NP1]","[do, you, text, --anonnamem]","[VERB, PRON, SUBST, SUBST]","[A1:1:1, Z8, Q1:2, Z1]","[did, you, text, --ANONnameM]",S2A5
...,...,...,...,...,...,...,...,...,...,...
228,229,S0024,nonoverlap,high,[UH],[yeah],[INTERJ],[Z4],[yeah],S2A5
229,230,S0144,nonoverlap,high,"[DDQ, VDZ, DD1, VVI, YQUE]","[what, do, that, mean, PUNC]","[PRON, VERB, ADJ, VERB, STOP]","[Z8, Z5, Z8, Q1:1, ]","[what, does, that, mean, ?]",S2A5
230,231,S0024,nonoverlap,high,"[VVZ, AT, NN2, VBR, JJ, UH]","[mean, the, potato, be, ready, yay]","[VERB, ART, SUBST, VERB, ADJ, INTERJ]","[Q1:1, Z5, L3, A3, O4:1, Z2]","[means, the, potatoes, are, ready, yay]",S2A5
231,232,S0144,nonoverlap,high,"[UH, UH, UH, UH, PPIS1, VVD, JJ, IF, NN1, RR21...","[oh, yeah, ooh, ooh, i, get, ready, for, bed, ...","[INTERJ, INTERJ, INTERJ, INTERJ, PRON, VERB, A...","[Z4, Z4, Z4, Z4, Z8, A2:1, O4:1, Z5, H5, N5, N...","[oh, yeah, ooh, ooh, I, got, ready, for, bed, ...",S2A5


In [20]:
# get u, avg_u_len_per_speaker, dialogue for each speaker
df_counts = df_all_utts.groupby('who').agg({'u': 'count', 'avg_u_len_per_speaker': 'first', 'dialogue_y': 'count'})
df_counts.columns = ['total_utterances', 'avg_u_len_per_speaker', 'dialogue_count']
df_counts = df_counts.reset_index()
print(df_counts)

     who  total_utterances  avg_u_len_per_speaker  dialogue_count
0  S0024             12285              27.085714           13689
1  S0144             10440              21.988889           13456


In [None]:
# merge df1 and df_utts on the column 'who' and 'id' respectively
df_final = pd.merge(df1, df_counts, left_on='id', right_on='who')

# drop the column 'who'
df_final = df_final.drop(columns=['who'])

# remove duplicate rows in id column
df_final = df_final.drop_duplicates(subset=['id'])
df_final

In [None]:
df_final.describe()

In [None]:
# get the exactage of each speaker '60s'
print(df_final[df_final['exactage'] == '60s'])

# convert that '60s' to 60
df_final.loc[df_final['exactage'] == '60s', 'exactage'] = 60

print(df_final[df_final['exactage'] == '60s'])

In [None]:
# remove rows with missing values
df_final = df_final.dropna()

df_final.shape

In [None]:
# convert the column 'exactage' to integer
df_final['exactage'] = df_final['exactage'].astype(int)

In [None]:
# convert the gender column to categorical
# df_final['gender'] = pd.Categorical(df_final['gender']).codes
# df_final

In [None]:
df_final.describe()

In [None]:
df_final['agerange'].value_counts().sort_index()

In [None]:
df_final = df_final.sort_values('agerange')
df_final

In [None]:
import plotly.graph_objects as go

# create a pie chart of agerange distribution
fig = go.Figure(data=[go.Pie(labels=df_final['agerange'].value_counts().index,
                             values=df_final['agerange'].value_counts().values)])

# update the layout of the chart
fig.update_layout(title='Distribution of agerange')

# show the chart
fig.show()


In [None]:
df_final.dtypes

In [None]:
df_all_speakers

In [None]:
df_final.head()

In [None]:
# show the number of speakers in each age group
df_final['agerange'].value_counts()

In [None]:
df_agerange = df_final.groupby('agerange').agg({'total_utterances': 'sum', 'avg_u_len_per_speaker': 'mean', 'dialogue_count': 'sum', 'id': 'count'}).reset_index()
df_agerange.rename(columns={'avg_u_len_per_speaker': 'avg_u_len_per_agerange'}, inplace=True)
df_agerange.rename(columns={'dialogue_count': 'num_of_dialogues'}, inplace=True)
df_agerange.rename(columns={'id': 'num_of_speakers'}, inplace=True)

# add a new column 'avg_utterances_per_speaker'
df_agerange['avg_u_per_speaker'] = df_agerange['total_utterances'] / df_agerange['num_of_speakers']

In [None]:
df_agerange

In [None]:
df_agerange.describe()

In [None]:
df_agerange.iloc[[2, 5, 6, 7, 8, 9], :]

In [None]:
# create a new dataframe that contains the same number of columns but only 2 rows. one with 19_29 and the other with 50_59, 60_69 to 90_99
df_agerange_2 = df_agerange.iloc[[2, 5, 6, 7, 8, 9], :]

# sum the values of the rows 5, 6, 7, 8, 9 and assign it to the row 3
df_agerange_2.iloc[1, 1:] = df_agerange.iloc[5:10, 1:].sum()

# drop the rows 5, 6, 7, 8, 9
df_agerange_2 = df_agerange_2.drop([6, 7, 8, 9])

# now add a new column 'agegroup' to the dataframe that contains two values 'younger' and 'older'
df_agerange_2['agegroup'] = ['19-29', '50+']

# drop the column 'agerange'
df_agerange_2 = df_agerange_2.drop(columns=['agerange'])

# bring the column 'agegroup' to the first position
cols = df_agerange_2.columns.tolist()
cols = cols[-1:] + cols[:-1]
df_agerange_2 = df_agerange_2[cols]

# reset the index of the dataframe
df_agerange_2 = df_agerange_2.reset_index(drop=True)

df_agerange_2

In [None]:
df_agerange_2.describe()