In [22]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, deto

from datasets import load_dataset
from collections import defaultdict
from matplotlib import pyplot as plt

# DialogSum

## Description
DialogSum dataset was created using various dialogue datasets. The dataset merged the source datasets with a common format. There may exist more than 2 people in the dialogues. In order to use the dataset for a chatbot, we have some constraints beforehand:

1. A dialogue has only 2 people.
1. A dialogue can be made up of any daily life conversation.
1. A dialogue may consists of technical terms, only if the dialogue can be used in a daily conversatiob

In [16]:
dataset = 'knkarthick/dialogsum'
trainset = load_dataset(dataset, split='train')

trainset

Using custom data configuration knkarthick--dialogsum-cd575843ad07bb63
Found cached dataset csv (/Users/bugrahamzagundog/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd575843ad07bb63/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 12460
})

In [17]:
def filter_func(x):
    dialogue = x['dialogue']
    persons = set(filter(lambda x: '#person' in x, dialogue.lower().split()))

    return len(persons) == 2

trainset = trainset.filter(filter_func)
trainset[0]

Loading cached processed dataset at /Users/bugrahamzagundog/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd575843ad07bb63/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-5a300de32c77547b.arrow


{'id': 'train_0',
 'dialogue': "#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?\n#Person2#: I found it would be a good idea to get a check-up.\n#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.\n#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?\n#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.\n#Person2#: Ok.\n#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?\n#Person2#: Yes.\n#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.\n#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.\n#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.\n#Person2#: Ok, thanks doctor.",
 'summary': "Mr. Smith'

### Dialogue Length Analysis

In [4]:
def analyze_length(x):
    dialog = x['dialogue']
    dialog = re.sub("\n", "", dialog)
    dialogs = re.split('#Person[\d]#: ', dialog)[1:]
    
    char_length = list(map(len, dialogs))
    
    person1 = char_length[::2]
    person2 = char_length[1::2]
    

    x['Person1 Char Avg'] = sum(person1) / len(person1)
    x['Person1 Char Max'] = max(person1)
    
    x['Person2 Char Avg'] = sum(person2) / len(person2)
    x['Person2 Char Max'] = max(person2)
    
    x['Total Char Avg'] = sum(char_length) / len(char_length)
    x['Total Char Max'] = max(char_length)
    
    word_count = list(map(lambda x: x.count(' ')+1, dialogs))
    
    person1 = word_count[::2]
    person2 = word_count[1::2]
    
    x['Person1 Word Avg'] = sum(person1) / len(person1)
    x['Person1 Word Max'] = max(person1)
    
    x['Person2 Word Avg'] = sum(person2) / len(person2)
    x['Person2 Word Max'] = max(person2)
    
    x['Total Word Avg'] = sum(word_count) / len(word_count)
    x['Total Word Max'] = max(word_count)
    
    return x
    
trainset = trainset.map(analyze_length)
trainset

Loading cached processed dataset at /Users/bugrahamzagundog/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd575843ad07bb63/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-777551ed5820dafe.arrow


Dataset({
    features: ['id', 'dialogue', 'summary', 'topic', 'Person1 Char Avg', 'Person1 Char Max', 'Person2 Char Avg', 'Person2 Char Max', 'Total Char Avg', 'Total Char Max', 'Person1 Word Avg', 'Person1 Word Max', 'Person2 Word Avg', 'Person2 Word Max', 'Total Word Avg', 'Total Word Max'],
    num_rows: 12333
})

In [None]:
def preprocess_fn(x):
    # preprocess_fn removes the non-alphanumeric symbols and stopwords
    # stopwords can be found in nltk english stopwords
    
    stops = set(stopwords.words('english'))
    
    dialogue = x['dialogue']
        
    print(list(filter(lambda x: x.lower() not in stops, word_tokenize(dialogue))))
    #dialogue = ' '.join(list(filter(lambda x: x.lower() not in stops, dialogue.split())))

    input()
    dialogue = re.sub('[^A-Za-z0-9\s]', '', dialogue)
    dialogue = re.sub('\s', ' ', dialogue)
    x['dialogue'] = dialogue
    return x
    
trainset = trainset.map(preprocess_fn)

trainset[0]

  0%|          | 0/12333 [00:00<?, ?ex/s]

['#', 'Person1', '#', ':', 'Hi', ',', 'Mr.', 'Smith', '.', "'m", 'Doctor', 'Hawkins', '.', 'today', '?', '#', 'Person2', '#', ':', 'found', 'would', 'good', 'idea', 'get', 'check-up', '.', '#', 'Person1', '#', ':', 'Yes', ',', 'well', ',', "n't", 'one', '5', 'years', '.', 'one', 'every', 'year', '.', '#', 'Person2', '#', ':', 'know', '.', 'figure', 'long', 'nothing', 'wrong', ',', 'go', 'see', 'doctor', '?', '#', 'Person1', '#', ':', 'Well', ',', 'best', 'way', 'avoid', 'serious', 'illnesses', 'find', 'early', '.', 'try', 'come', 'least', 'year', 'good', '.', '#', 'Person2', '#', ':', 'Ok.', '#', 'Person1', '#', ':', 'Let', 'see', '.', 'eyes', 'ears', 'look', 'fine', '.', 'Take', 'deep', 'breath', ',', 'please', '.', 'smoke', ',', 'Mr.', 'Smith', '?', '#', 'Person2', '#', ':', 'Yes', '.', '#', 'Person1', '#', ':', 'Smoking', 'leading', 'cause', 'lung', 'cancer', 'heart', 'disease', ',', 'know', '.', 'really', 'quit', '.', '#', 'Person2', '#', ':', "'ve", 'tried', 'hundreds', 'times', '

In [15]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

### Word Frequency Analysis

In [14]:
word_freqs = defaultdict(lambda: 0)

for dialog in trainset['dialogue']:
    for word in dialog.split():
        word_freqs[word] += 1
    
most_freq = sorted(list((k, v) for k, v in word_freqs.items()), key=lambda x: x[1], reverse=True)
most_freq[:30]

[('person1', 60690),
 ('person2', 55892),
 ('like', 8779),
 ('well', 8169),
 ('im', 8121),
 ('yes', 7520),
 ('think', 5975),
 ('good', 5886),
 ('know', 5715),
 ('get', 5436),
 ('thats', 5324),
 ('go', 5308),
 ('would', 4908),
 ('see', 4788),
 ('one', 4713),
 ('really', 4695),
 ('oh', 4644),
 ('time', 4397),
 ('right', 4358),
 ('want', 4226),
 ('going', 3809),
 ('dont', 3789),
 ('ill', 3739),
 ('take', 3600),
 ('much', 3577),
 ('need', 3365),
 ('ok', 3293),
 ('please', 3110),
 ('sure', 3095),
 ('could', 2822)]

In [9]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each