In [1]:
import re
from nltk.corpus import stopwords
from datasets import load_dataset
from collections import defaultdict

# DialogSum

## Description
DialogSum dataset was created using various dialogue datasets. The dataset merged the source datasets with a common format. There may exist more than 2 people in the dialogues. In order to use the dataset for a chatbot, we have some constraints beforehand:

1. A dialogue has only 2 people.
1. A dialogue can be made up of any daily life conversation.
1. A dialogue may consists of technical terms, only if the dialogue can be used in a daily conversatiob

In [2]:
dataset = 'knkarthick/dialogsum'
trainset = load_dataset(dataset, split='train')

trainset

Using custom data configuration knkarthick--dialogsum-cd575843ad07bb63
Found cached dataset csv (/Users/bugrahamzagundog/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd575843ad07bb63/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 12460
})

In [3]:
def filter_func(x):
    dialogue = x['dialogue']
    persons = set(filter(lambda x: '#person' in x, dialogue.lower().split()))

    return len(persons) == 2

trainset = trainset.filter(filter_func)
trainset[0]

Loading cached processed dataset at /Users/bugrahamzagundog/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd575843ad07bb63/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-5a300de32c77547b.arrow


{'id': 'train_0',
 'dialogue': "#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?\n#Person2#: I found it would be a good idea to get a check-up.\n#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.\n#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?\n#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.\n#Person2#: Ok.\n#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?\n#Person2#: Yes.\n#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.\n#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.\n#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.\n#Person2#: Ok, thanks doctor.",
 'summary': "Mr. Smith'

### Dialogue Length Analysis

In [4]:
def analyze_length(x):
    dialog = x['dialogue']
    dialog = re.sub("\n", "", dialog)
    dialogs = re.split('#Person[\d]#: ', dialog)[1:]
    
    char_length = list(map(len, dialogs))
    
    person1 = char_length[::2]
    person2 = char_length[1::2]
    

    x['Person1 Char Avg'] = sum(person1) / len(person1)
    x['Person1 Char Max'] = max(person1)
    
    x['Person2 Char Avg'] = sum(person2) / len(person2)
    x['Person2 Char Max'] = max(person2)
    
    x['Total Char Avg'] = sum(char_length) / len(char_length)
    x['Total Char Max'] = max(char_length)
    
    word_count = list(map(lambda x: x.count(' ')+1, dialogs))
    
    person1 = word_count[::2]
    person2 = word_count[1::2]
    
    x['Person1 Word Avg'] = sum(person1) / len(person1)
    x['Person1 Word Max'] = max(person1)
    
    x['Person2 Word Avg'] = sum(person2) / len(person2)
    x['Person2 Word Max'] = max(person2)
    
    x['Total Word Avg'] = sum(word_count) / len(word_count)
    x['Total Word Max'] = max(word_count)
    
    return x
    
trainset = trainset.map(analyze_length)
trainset

Loading cached processed dataset at /Users/bugrahamzagundog/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd575843ad07bb63/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-03973742634ac395.arrow


Dataset({
    features: ['id', 'dialogue', 'summary', 'topic', 'Person1 Char Avg', 'Person1 Char Max', 'Person2 Char Avg', 'Person2 Char Max', 'Total Char Avg', 'Total Char Max', 'Person1 Word Avg', 'Person1 Word Max', 'Person2 Word Avg', 'Person2 Word Max', 'Total Word Avg', 'Total Word Max'],
    num_rows: 12333
})

In [5]:
def preprocess_fn(x):
    # preprocess_fn removes the non-alphanumeric symbols and stopwords
    # stopwords can be found in nltk english stopwords
    tokenizer = TreebankWordTokenizer()
    non_punctuation = re.compile(r'[^A-Za-z0-9\s]')
    stops = set([re.sub(non_punctuation, ' ', x) for x in stopwords.words('english')])
    
    dialogue = x['dialogue']

    dialogue = re.sub('[^A-Za-z0-9\s]', ' ', dialogue)
    dialogue = re.sub('Person[\d]', '', dialogue)
    dialogue = ' '.join(list(filter(lambda x: x not in stops, dialogue.lower().split())))
    
    x['dialogue'] = dialogue
    return x
    
trainset = trainset.map(preprocess_fn)

trainset[0]



  0%|          | 0/12333 [00:00<?, ?ex/s]

{'id': 'train_0',
 'dialogue': 'hi mr smith doctor hawkins today found would good idea get check yes well one 5 years one every year know figure long nothing wrong go see doctor well best way avoid serious illnesses find early try come least year good ok let see eyes ears look fine take deep breath please smoke mr smith yes smoking leading cause lung cancer heart disease know really quit tried hundreds times seem kick habit well classes medications might help give information leave ok thanks doctor',
 'summary': "Mr. Smith's getting a check-up, and Doctor Hawkins advises him to have one every year. Hawkins'll give some information about their classes and medications to help Mr. Smith quit smoking.",
 'topic': 'get a check-up',
 'Person1 Char Avg': 94.83333333333333,
 'Person1 Char Max': 133,
 'Person2 Char Avg': 36.5,
 'Person2 Char Max': 74,
 'Total Char Avg': 65.66666666666667,
 'Total Char Max': 133,
 'Person1 Word Avg': 18.166666666666668,
 'Person1 Word Max': 28,
 'Person2 Word Av

### Word Frequency Analysis

In [6]:
word_freqs = defaultdict(lambda: 0)

for dialog in trainset['dialogue']:
    for word in dialog.split():
        word_freqs[word] += 1
    
most_freq = sorted(list((k, v) for k, v in word_freqs.items()), key=lambda x: x[1], reverse=True)
most_freq[:30]

[('like', 8794),
 ('yes', 7548),
 ('well', 7462),
 ('think', 5981),
 ('good', 5969),
 ('know', 5726),
 ('get', 5450),
 ('go', 5320),
 ('would', 4919),
 ('one', 4883),
 ('see', 4799),
 ('really', 4699),
 ('oh', 4685),
 ('time', 4582),
 ('right', 4379),
 ('want', 4227),
 ('going', 3825),
 ('take', 3620),
 ('much', 3579),
 ('need', 3365),
 ('ok', 3295),
 ('let', 3128),
 ('please', 3118),
 ('sure', 3097),
 ('could', 2834),
 ('work', 2651),
 ('people', 2629),
 ('thank', 2608),
 ('look', 2515),
 ('got', 2489)]