In [1]:
# Optional one-time setup: uncomment to install the `datasets` package imported below.
# !pip install datasets

In [2]:
from datasets import load_dataset
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
# Load all splits of the "knkarthick/dialogsum" dataset; the result is indexed by split name below.
dataset = load_dataset("knkarthick/dialogsum")

In [4]:
# Show the available splits with their column names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})


In [5]:
# Materialise each split as its own pandas DataFrame (train, validation, test — in that order).
train_df, val_df, test_df = (
    pd.DataFrame(dataset[split_name])
    for split_name in ("train", "validation", "test")
)

In [6]:
# Preview the first rows of the raw training frame.
train_df.head()


Unnamed: 0,id,dialogue,summary,topic
0,train_0,"#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...","Mr. Smith's getting a check-up, and Doctor Haw...",get a check-up
1,train_1,"#Person1#: Hello Mrs. Parker, how have you bee...",Mrs Parker takes Ricky for his vaccines. Dr. P...,vaccines
2,train_2,"#Person1#: Excuse me, did you see a set of key...",#Person1#'s looking for a set of keys and asks...,find keys
3,train_3,#Person1#: Why didn't you tell me you had a gi...,#Person1#'s angry because #Person2# didn't tel...,have a girlfriend
4,train_4,"#Person1#: Watsup, ladies! Y'll looking'fine t...",Malik invites Nikki to dance. Nikki agrees if ...,dance


In [7]:
# Column dtypes and non-null counts for the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12460 entries, 0 to 12459
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        12460 non-null  object
 1   dialogue  12460 non-null  object
 2   summary   12460 non-null  object
 3   topic     12460 non-null  object
dtypes: object(4)
memory usage: 389.5+ KB


In [8]:
# Missing-value count per column in the training split.
train_df.isna().sum()

id          0
dialogue    0
summary     0
topic       0
dtype: int64

In [9]:
# Missing-value count per column in the validation split.
val_df.isna().sum()

id          0
dialogue    0
summary     0
topic       0
dtype: int64

In [10]:
# Missing-value count per column in the test split.
test_df.isna().sum()

id          0
dialogue    0
summary     0
topic       0
dtype: int64

In [11]:
def replace_speaker_tags(text):
    """Rewrite ``#PersonN#:`` turn markers as ``Speaker N:`` and flatten whitespace.

    Every turn marker of the form ``#PersonN#:`` (any run of digits ``N``, not
    only 1 and 2) is replaced by ``Speaker N:``. Mentions without the trailing
    colon (e.g. ``#Person1#`` inside a sentence) are left unchanged. All runs of
    whitespace, including the newlines between turns, are then collapsed to a
    single space and the result is stripped.

    Args:
        text: Raw dialogue string.

    Returns:
        The dialogue on a single line with ``Speaker N:`` labels.
    """
    # A single back-referencing pattern also handles a third or later speaker,
    # which two hard-coded substitutions for Person1/Person2 would leave untouched.
    text = re.sub(r'#Person(\d+)#:', r'Speaker \1:', text)
    return re.sub(r'\s+', ' ', text).strip()

In [12]:
# Derive `cleaned_dialogue` from the raw `dialogue` column in every split.
for split_df in (train_df, val_df, test_df):
    split_df['cleaned_dialogue'] = split_df['dialogue'].apply(replace_speaker_tags)

In [13]:
# Inspect the cleaned dialogue stored under index label 0.
train_df['cleaned_dialogue'][0]

"Speaker 1: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today? Speaker 2: I found it would be a good idea to get a check-up. Speaker 1: Yes, well, you haven't had one for 5 years. You should have one every year. Speaker 2: I know. I figure as long as there is nothing wrong, why go see the doctor? Speaker 1: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good. Speaker 2: Ok. Speaker 1: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith? Speaker 2: Yes. Speaker 1: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit. Speaker 2: I've tried hundreds of times, but I just can't seem to kick the habit. Speaker 1: Well, we have classes and some medications that might help. I'll give you more information before you leave. Speaker 2: Ok, thanks doctor."

In [14]:
# Number of training rows whose cleaned dialogue repeats an earlier row.
train_df['cleaned_dialogue'].duplicated().sum()

597

In [15]:
# Remove rows whose cleaned dialogue repeats an earlier row, keeping the first occurrence.
# NOTE(review): rows that share a dialogue but carry different summaries are collapsed too.
# Test ids such as test_1_1 suggest several reference summaries per dialogue, in which
# case the extra references are discarded here — confirm this is intended.
for split_df in (train_df, test_df, val_df):
    split_df.drop_duplicates(subset=['cleaned_dialogue'], inplace=True)

In [16]:
# (rows, columns) of the training frame after de-duplication.
train_df.shape

(11863, 5)

In [17]:
# Frequency of each distinct topic string in the training split.
train_df['topic'].value_counts()

topic
job interview        158
shopping             155
daily casual talk    119
phone call            88
interview             75
                    ... 
room is cold           1
Wake up Call           1
Products               1
Order                  1
baggage pack           1
Name: count, Length: 7193, dtype: int64

In [18]:
# Count distinct topic strings in the training split (a missing value, if any, counts once).
num_unique_topics = train_df["topic"].nunique(dropna=False)
print(num_unique_topics)

7193


In [19]:
# Preview the training frame now that `cleaned_dialogue` has been added.
train_df.head()

Unnamed: 0,id,dialogue,summary,topic,cleaned_dialogue
0,train_0,"#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...","Mr. Smith's getting a check-up, and Doctor Haw...",get a check-up,"Speaker 1: Hi, Mr. Smith. I'm Doctor Hawkins. ..."
1,train_1,"#Person1#: Hello Mrs. Parker, how have you bee...",Mrs Parker takes Ricky for his vaccines. Dr. P...,vaccines,"Speaker 1: Hello Mrs. Parker, how have you bee..."
2,train_2,"#Person1#: Excuse me, did you see a set of key...",#Person1#'s looking for a set of keys and asks...,find keys,"Speaker 1: Excuse me, did you see a set of key..."
3,train_3,#Person1#: Why didn't you tell me you had a gi...,#Person1#'s angry because #Person2# didn't tel...,have a girlfriend,Speaker 1: Why didn't you tell me you had a gi...
4,train_4,"#Person1#: Watsup, ladies! Y'll looking'fine t...",Malik invites Nikki to dance. Nikki agrees if ...,dance,"Speaker 1: Watsup, ladies! Y'll looking'fine t..."


In [20]:
def basic_clean(text):
    """Lower-case ``text`` and delete every character that is not a-z or whitespace.

    Removed characters are dropped outright rather than replaced by a space, so
    digits, punctuation and apostrophes vanish: ``"Speaker 1:"`` becomes
    ``"speaker "`` (the speaker number is lost) and ``"I'm"`` becomes ``"im"``.

    Args:
        text: Input string.

    Returns:
        The lower-cased string containing only ``a``-``z`` and whitespace.
    """
    disallowed = re.compile(r'[^a-z\s]')
    return disallowed.sub('', text.lower())

In [21]:
# Lower-case and strip non-letter characters from `cleaned_dialogue` in every split.
for split_df in (train_df, test_df, val_df):
    split_df['cleaned_dialogue'] = split_df['cleaned_dialogue'].apply(basic_clean)

In [22]:
# Split each cleaned dialogue into a list of word tokens, stored in a new `tokens` column.
# NOTE(review): word_tokenize presumably needs NLTK tokenizer data to be present locally;
# no download call is visible in this notebook — confirm for fresh environments.
for split_df in (train_df, test_df, val_df):
    split_df['tokens'] = split_df['cleaned_dialogue'].apply(word_tokenize)

In [23]:
# English stop-word list as a set, so the membership tests in the filter below are fast.
stop_words = set(stopwords.words('english'))

In [24]:
# Drop stop-words from every token list.
# NOTE(review): apostrophes were already stripped, so contractions arrive as "im",
# "didnt", "youre" and pass this filter (they are visible in the recorded outputs) —
# confirm whether they should be removed as well.
for split_df in (train_df, test_df, val_df):
    split_df['tokens'] = split_df['tokens'].apply(
        lambda words: [token for token in words if token not in stop_words]
    )

In [25]:
# Single lemmatizer instance reused for all splits in the next cell.
lemmatizer = WordNetLemmatizer()

In [26]:
# Replace every token by its lemma (lemmatizer called with its default arguments).
for split_df in (train_df, test_df, val_df):
    split_df['tokens'] = split_df['tokens'].apply(
        lambda words: list(map(lemmatizer.lemmatize, words))
    )

In [27]:
# Rebuild `cleaned_dialogue` as the space-joined, filtered and lemmatized tokens.
for split_df in (train_df, test_df, val_df):
    split_df['cleaned_dialogue'] = split_df['tokens'].apply(
        lambda words: ' '.join(words)
    )

In [28]:
# Whitespace-delimited word counts for the cleaned dialogue and the raw summary (train).
for text_col, length_col in (
    ('cleaned_dialogue', 'dialogue_length'),
    ('summary', 'summary_length'),
):
    train_df[length_col] = train_df[text_col].apply(lambda text: len(text.split()))

In [29]:
# Whitespace-delimited word counts for the cleaned dialogue and the raw summary (test).
for text_col, length_col in (
    ('cleaned_dialogue', 'dialogue_length'),
    ('summary', 'summary_length'),
):
    test_df[length_col] = test_df[text_col].apply(lambda text: len(text.split()))

In [30]:
# Whitespace-delimited word counts for the cleaned dialogue and the raw summary (validation).
for text_col, length_col in (
    ('cleaned_dialogue', 'dialogue_length'),
    ('summary', 'summary_length'),
):
    val_df[length_col] = val_df[text_col].apply(lambda text: len(text.split()))

In [31]:
# Fit the label encoder on the training topics only and store each row's integer id.
# NOTE(review): topics are free-text strings (7193 distinct values in 11863 rows per the
# recorded outputs, with case variants such as "Order"), so most classes have a single
# example — confirm an integer class id per raw topic string is what downstream code needs.
encoder = LabelEncoder()
train_df['topic_encoded'] = encoder.fit_transform(train_df['topic'])

In [32]:
# Keep only validation rows whose topic the encoder saw during fitting, because
# LabelEncoder.transform rejects labels it was not fitted on.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not write into a slice (no SettingWithCopyWarning).
# NOTE(review): this discards every row with an unseen topic string (the recorded
# output shows validation shrinking to 230 rows) — confirm the loss is acceptable.
val_df = val_df[val_df['topic'].isin(encoder.classes_)].copy()
val_df['topic_encoded'] = encoder.transform(val_df['topic'])

In [33]:
# Keep only test rows whose topic the encoder saw during fitting, because
# LabelEncoder.transform rejects labels it was not fitted on.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not write into a slice (no SettingWithCopyWarning).
# NOTE(review): this discards every row with an unseen topic string (the recorded
# output shows test shrinking to 186 rows) — confirm the loss is acceptable.
test_df = test_df[test_df['topic'].isin(encoder.classes_)].copy()
test_df['topic_encoded'] = encoder.transform(test_df['topic'])

In [34]:
# Preview the training frame with the token, length and encoded-topic columns added.
train_df.head()

Unnamed: 0,id,dialogue,summary,topic,cleaned_dialogue,tokens,dialogue_length,summary_length,topic_encoded
0,train_0,"#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...","Mr. Smith's getting a check-up, and Doctor Haw...",get a check-up,speaker hi mr smith im doctor hawkins today sp...,"[speaker, hi, mr, smith, im, doctor, hawkins, ...",99,30,3544
1,train_1,"#Person1#: Hello Mrs. Parker, how have you bee...",Mrs Parker takes Ricky for his vaccines. Dr. P...,vaccines,speaker hello mr parker speaker hello dr peter...,"[speaker, hello, mr, parker, speaker, hello, d...",76,18,6890
2,train_2,"#Person1#: Excuse me, did you see a set of key...",#Person1#'s looking for a set of keys and asks...,find keys,speaker excuse see set key speaker kind key sp...,"[speaker, excuse, see, set, key, speaker, kind...",51,15,3360
3,train_3,#Person1#: Why didn't you tell me you had a gi...,#Person1#'s angry because #Person2# didn't tel...,have a girlfriend,speaker didnt tell girlfriend speaker sorry th...,"[speaker, didnt, tell, girlfriend, speaker, so...",43,16,3816
4,train_4,"#Person1#: Watsup, ladies! Y'll looking'fine t...",Malik invites Nikki to dance. Nikki agrees if ...,dance,speaker watsup lady yll lookingfine tonight ma...,"[speaker, watsup, lady, yll, lookingfine, toni...",52,16,2708


In [35]:
# (rows, columns) of the fully processed training frame.
train_df.shape

(11863, 9)

In [36]:
# Shortest training summary, measured in whitespace-separated words.
train_df["summary_length"].min()

5

In [38]:
# Longest training summary, measured in whitespace-separated words.
train_df["summary_length"].max()

126

In [37]:
# Preview the validation frame after topic filtering; the original row labels are retained.
val_df.head()

Unnamed: 0,id,dialogue,summary,topic,cleaned_dialogue,tokens,dialogue_length,summary_length,topic_encoded
0,dev_0,"#Person1#: Hello, how are you doing today?\n#P...",#Person2# has trouble breathing. The doctor as...,see a doctor,speaker hello today speaker trouble breathing ...,"[speaker, hello, today, speaker, trouble, brea...",45,18,5834
4,dev_4,#Person1#: Did you go to school today?\n#Perso...,#Person1# didn't go to school today. #Person2#...,go to school,speaker go school today speaker course speaker...,"[speaker, go, school, today, speaker, course, ...",50,17,3697
5,dev_5,"#Person1#: Honey, I think you should quit smok...",#Person1# asks #Person2# to quit smoking for h...,quit smoking,speaker honey think quit smoking speaker said ...,"[speaker, honey, think, quit, smoking, speaker...",51,14,5458
7,dev_7,"#Person1#: Hey, Karen. Look like you got some ...",#Person1# asks Karen where Karen stayed and ho...,holidays,speaker hey karen look like got sun weekend sp...,"[speaker, hey, karen, look, like, got, sun, we...",75,24,3918
8,dev_8,#Person1#: How do you usually spend your leisu...,#Person1# asks about #Person2#'s hobbies. #Per...,hobby,speaker usually spend leisure time mean specia...,"[speaker, usually, spend, leisure, time, mean,...",47,13,3906


In [38]:
# (rows, columns) of the validation frame after de-duplication and topic filtering.
val_df.shape

(230, 9)

In [39]:
# Preview the test frame after topic filtering; the original row labels are retained.
test_df.head()

Unnamed: 0,id,dialogue,summary,topic,cleaned_dialogue,tokens,dialogue_length,summary_length,topic_encoded
3,test_1_1,#Person1#: You're finally here! What took so l...,#Person2# arrives late because of traffic jam....,public transportation,speaker youre finally took long speaker got st...,"[speaker, youre, finally, took, long, speaker,...",113,22,5405
6,test_2_1,"#Person1#: Kate, you never believe what's happ...",#Person1# tells Kate that Masha and Hero get d...,divorce,speaker kate never believe whats happened spea...,"[speaker, kate, never, believe, whats, happene...",73,19,2948
9,test_3_1,"#Person1#: Happy Birthday, this is for you, Br...",#Person1# and Brian are at the birthday party ...,birthday party,speaker happy birthday brian speaker im happy ...,"[speaker, happy, birthday, brian, speaker, im,...",62,18,1731
21,test_7_1,#Person1#: Good coming. What can I do for you?...,#Person2# is checking out and asks #Person1# f...,bill,speaker good coming speaker im room im checkin...,"[speaker, good, coming, speaker, im, room, im,...",64,21,1718
30,test_10_1,#Person1#: Where are you going for your trip?\...,#Person2# plans to have a trip in Hebei but #P...,sandstorms,speaker going trip speaker think hebei good pl...,"[speaker, going, trip, speaker, think, hebei, ...",58,16,5761


In [40]:
# (rows, columns) of the test frame after de-duplication and topic filtering.
test_df.shape

(186, 9)

In [48]:
# Length, in characters, of the raw dialogue in the first remaining test row.
len(test_df["dialogue"].iloc[0])

1209

In [41]:
# Narrow each split to the cleaned dialogue, its summary and its topic.
# NOTE(review): train_data / val_data / test_data are not referenced again in the visible
# cells, and the CSV export writes the full frames instead — confirm which is intended.
train_data, val_data, test_data = (
    split_df[['cleaned_dialogue', 'summary', 'topic']]
    for split_df in (train_df, val_df, test_df)
)

In [42]:
# Write every processed split (all columns, no index column) to a CSV in the working directory.
# NOTE(review): the `tokens` column holds Python lists, which are written as their string
# form; reading the CSV back yields strings rather than lists — verify consumers expect that.
for csv_path, split_df in (
    ('cleaned_train.csv', train_df),
    ('cleaned_val.csv', val_df),
    ('cleaned_test.csv', test_df),
):
    split_df.to_csv(csv_path, index=False)