In [1]:
import json

# Read json file
with open('wizard_of_wikipedia/data.json') as f:
    data = json.load(f)

In [2]:
print('The data is a list of conversations.')
print('Each conversation contains the attributes:')
for att in data[0].keys():
    print('-', att)
print('The dialog object contains a list of objects, each one with the following attributes:')
for att in data[0]['dialog'][0].keys():
    print('-', att)
print('Each dialog contains at least 4-5 utterances.')

The data is a list of conversations.
Each conversation contains the attributes:
- chosen_topic
- persona
- wizard_eval
- dialog
- chosen_topic_passage
The dialog object contains a list of objects, each one with the following attributes:
- speaker
- text
- checked_sentence
- checked_passage
- retrieved_passages
- retrieved_topics
Each dialog contains at least 4-5 utterances.


In [3]:
n_dialogues = len(data)
print('There are a total of', n_dialogues, 'dialogues in the dataset.')

There are a total of 22311 dialogues in the dataset.


In [4]:
import numpy as np
n_utterances = np.sum(np.array([len(d['dialog']) for d in data]))
print('There are a total of', n_utterances, 'utterances in the dataset.')

There are a total of 201999 utterances in the dataset.


In [5]:
print('The average number of utterances per dialogue is', n_utterances/n_dialogues)

The average number of utterances per dialogue is 9.053785128411993


In [6]:
import re
dialogues = [d['dialog'] for d in data]
texts = []
for dialog in dialogues:
    for obj in dialog:
        texts.append(obj['text'])
full_text = " ".join(texts).lower()

In [7]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/giacomocartechini/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/giacomocartechini/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
c = nltk.Counter(full_text.split())

In [9]:
print(f'The number of words in the dataset is {sum(c.values())}.')
print(f'The vocabulary size is {len(c)}.')

The number of words in the dataset is 3305777.
The vocabulary size is 106894.


In [10]:
from nltk.corpus import stopwords
stopwords.words('english')

# Get the text without stopwords
c_no_stopwords = nltk.Counter({key: c[key] for key in c if key not in stopwords.words('english')})

print(f'The most common words excluding stopwords are: {c_no_stopwords.most_common(10)}')

The most common words excluding stopwords are: [('like', 25439), ('know', 16293), ('love', 14229), ('think', 11007), ('really', 10816), ('one', 10177), ('would', 9869), ("that's", 8427), ('also', 8170), ("i'm", 8114)]


In [11]:
n_topics = len(set([d['chosen_topic'] for d in data]))
print(f'The number of different topics is {n_topics}.')

The number of different topics is 1365.


In [12]:
# Create the dataset with full dialogues and topics
from nltk.tokenize import RegexpTokenizer

dataset = []
tokenizer = RegexpTokenizer(r'\w+') # This tokenizer removes punctuation
for d in data:
    topic = d['chosen_topic']
    dialogue = [obj['text'] for obj in d['dialog']]
    dialogue = " ".join(dialogue).lower()
    tokens = tokenizer.tokenize(dialogue)
    # Remove stopwords
    tokens = [t for t in tokens if t not in stopwords.words('english')]
    c = nltk.Counter(tokens)
    dataset.append((c, topic))

In [13]:
import pickle

# def dump_dataset_clustering():
#     with open('dumps/clustering/clustering_dataset.pkl', 'wb') as f:
#         pickle.dump(dataset, f)

def load_dataset_clustering():
    with open('dumps/clustering/clustering_dataset.pkl', 'rb') as f:
        dataset = pickle.load(f)
    return dataset

In [14]:
# Dump the dataset
# dump_dataset_clustering()

In [15]:
# Load the dataset
dataset = load_dataset_clustering()

In [16]:
# # Train a word2vec model
from gensim.models import Word2Vec

# # Create a list containing all the text pieces in each dialogue of the dataset
# texts = []
# for d in data:
#     dialogue = [obj['text'] for obj in d['dialog']]
#     texts += dialogue

# # Remove numbers and special characters
# texts = [re.sub('[^A-Za-z]+', ' ', text) for text in texts]

# # Tokenize the text
# tokenizer = RegexpTokenizer(r'\w+')
# texts = [tokenizer.tokenize(text) for text in texts]

# # Lower case
# texts = [[word.lower() for word in text] for text in texts]

# # Remove stopwords
# stopwords = set(stopwords.words('english'))
# texts = [[word for word in text if word not in stopwords] for text in texts]

In [17]:
# Train and dump the model
# model = Word2Vec(texts, vector_size=50, window=5, min_count=1, workers=8)
# model.save('dumps/clustering/word2vec.model')

In [19]:
# Load the model
model = Word2Vec.load('dumps/clustering/word2vec.model')

In [20]:
# Create tf-idf matrices

# def calculate_tf(counter, word):
#     return np.log(1 + counter[word] / sum(counter.values()))

# def calculate_idf(dataset, word):
#     return np.log(len(dataset) / (1 + sum(([1 for d in dataset if word in d[0].keys()])))) + 1

In [21]:
# Calculate tf
# tf = {(i, word): calculate_tf(dataset[i][0], word) for i in range(len(dataset)) for word in dataset[i][0].keys()}

In [22]:
# Calculate idf
# counter = nltk.Counter()
# for sample in dataset:
#     counter = counter + sample[0]
# idf = {word: calculate_idf(dataset, word) for word in counter.keys()}

In [24]:
# Dump tf and idf matrices
# with open('dumps/clustering/tf.pkl', 'wb') as f:
#     pickle.dump(tf, f)
# with open('dumps/clustering/idf.pkl', 'wb') as f:
#     pickle.dump(idf, f)

# Load tf and idf matrices
with open('dumps/clustering/tf.pkl', 'rb') as f:
    tf = pickle.load(f)
with open('dumps/clustering/idf.pkl', 'rb') as f:
    idf = pickle.load(f)

In [26]:
cluster = []

for i in range(len(dataset)):
    # Get the tf-idf vector for each word in the dialogue
    tf_idf = np.array([tf[(i, word)] * idf[word] for word in dataset[i][0].keys()])
    # Get the word embedding for each word in the dialogue
    embeddings = np.array([model.wv[word] for word in dataset[i][0].keys()])
    # Calculate the weighted average of the word embeddings
    weighted_average = np.average(embeddings, axis=0, weights=tf_idf)
    # Append the weighted average to the cluster
    cluster.append((weighted_average, dataset[i][1]))

cluster[:1]

KeyError: "Key '1995' not present"

In [None]:
dataset[0][0]