all imports

In [18]:
import spacy
from spacy.en import English
from spacy.tokens.doc import Doc

import os
import pandas as pd
import json
import pkg_resources

import itertools
from collections import defaultdict

from sklearn.externals import joblib

from scipy import sparse
from ast import literal_eval as make_tuple

from sklearn.cluster import KMeans
from sklearn.externals import joblib
from sklearn.preprocessing import Normalizer
from sklearn.metrics.pairwise import pairwise_distances

import numpy as np

utils.spacy_utils.py

In [19]:
def load_vocab(verbose=False):
	if verbose:
		print('loading spacy vocab')
	return English().vocab

def iterate_spacy(path, vocab):
	with open(path + '.bin', 'rb') as spacy_file:
		with open(path + '.txt') as key_file:
			for doc_bytes in Doc.read_bytes(spacy_file):
				try:
					key = next(key_file)
					doc = Doc(vocab).from_bytes(doc_bytes)
					yield key.strip(), doc
				except:
					continue

def get_spacy_dict(path, vocab=None, verbose=5000):
	'''
		gets a dict of (key --> spacy object) from a path (as generated by the spacify function).
		can pass pre-loaded vocabulary to avoid the terrible load time.

		currently this is super-slow anyways, probably because it's reading in the entire dataset.
		in the ideal world, the dataset would be stored in separate chunks, and we could read in parallel.
	'''
	if not vocab:
		vocab = load_vocab(verbose)
	spacy_dict = {}
	iterable_docs = enumerate(iterate_spacy(path,vocab))
	for idx, (key, doc) in iterable_docs:
		if verbose and (idx > 0) and (idx % verbose == 0):
			print('\t%03d' % idx)
		spacy_dict[key] = doc
	return spacy_dict

def spacify(text_iter, outfile_name, spacy_NLP=None, verbose=5000):
	'''
		spacifies, writes a spacy object = file w/ spacy objects + other files w/ keys to said objects
		text_iter: iterates over text to spacify, yielding index and text
		outfile_name: where to write the spacy file. will write outfile_name.bin, outfile_name.txt
		if you don't want to keep loading spacy NLP objects (which takes a while) then can
			pass an existing spacy_NLP.
	'''
	if not spacy_NLP:
		if verbose:
			print('loading spacy NLP')
		spacy_NLP = spacy.load('en')
	spacy_keys = []
	spacy_objs = []
	for idx,(text_idx, text) in enumerate(text_iter):
		if verbose and (idx > 0) and (idx % verbose == 0):
			print('\t%03d' % idx)
		spacy_keys.append(text_idx)
		spacy_objs.append(spacy_NLP(text).to_bytes())
	with open(outfile_name + '.bin','wb') as f:
		[f.write(byte_val) for byte_val in spacy_objs]
	with open(outfile_name + '.txt','w') as f:
		f.write('\n'.join(spacy_keys))


utils.tree_utils.py

In [20]:
def read_arcs(arc_file, verbose=5000):
	arc_sets = {}
	with open(arc_file) as f:
		for idx,line in enumerate(f.readlines()):
			if (idx > 0) and (idx % verbose == 0):
				print('\t%03d' % idx)
			entry = json.loads(line)
			arc_sets[entry['idx']] = entry['arcs']
	return arc_sets

def read_uplinks(uplink_file):
	uplinks = {}
	with open(uplink_file) as f:
		for line in f.readlines():
			entry = json.loads(line)
			uplinks[tuple(entry['child'])] = [(tuple(x),y) for x,y in entry['parents']]
	return uplinks

def read_downlinks(downlink_file):
	downlinks = {}
	with open(downlink_file) as f:
		for line in f.readlines():
			entry = json.loads(line)
			downlinks[tuple(entry['parent'])] = [(tuple(x),y) for x,y in entry['children']]
	return downlinks

def read_nodecounts(nodecount_file):

	node_counts = {}
	with open(nodecount_file) as f:
		for line in f:
			split = line.split('\t')
			count = int(split[0])
			set_size = int(split[1])
			itemset = tuple([x.strip() for x in split[2:]])
			node_counts[itemset] = count
	return node_counts

motifs.extract_arcs

In [21]:
from spacy.symbols import *
NP_LABELS = set([nsubj, nsubjpass, dobj, iobj, pobj, attr])

def is_noun_ish(word):
	return (word.dep in NP_LABELS) or (word.tag_.startswith('NN') or word.tag_.startswith('PRP')) or (word.tag_.endswith('DT'))

def has_w_det(token):
	if token.tag_.startswith('W'): return token.text
	first_tok = next(token.subtree)
	if (first_tok.tag_.startswith('W')): return first_tok.text
	return False

def get_tok(token):
	if is_noun_ish(token):
		has_w = has_w_det(token)
		if has_w:
			return has_w.lower(), True
		else:
			return 'NN', True
	else:
		return token.text.lower(), False

def get_clean_tok(tok):
	out_tok, is_noun = get_tok(tok)
	return out_tok.replace('--','').strip(), is_noun

def is_alpha_ish(text):
	return text.isalpha() or text[1:].isalpha()

def is_usable(text):
	return is_alpha_ish(text) and (text != 'NN')


def get_arcs(root, follow_conj):

	# todo: could imagine version where nouns allowed
	arcs = set()
	root_tok, _ = get_clean_tok(root)
	if not is_usable(root_tok): return arcs

	arcs.add(root_tok + '_*')
	conj_elems = []
	for idx, kid in enumerate(root.children):
		if kid.dep_ in ['punct','cc']:
			continue
		elif kid.dep_ == 'conj':
			if follow_conj:
				conj_elems.append(kid)
		else:
			kid_tok, _ = get_clean_tok(kid)
			if is_usable(kid_tok):
				arcs.add(root_tok + '_' + kid_tok)

	first_elem = next(root.subtree)
	first_tok, _ = get_clean_tok(first_elem)
	if is_usable(first_tok):
		arcs.add(first_tok + '>*')
		try:
			second_elem = first_elem.nbor()
			second_tok, _ = get_clean_tok(second_elem)
			if is_usable(second_tok):
				arcs.add(first_tok + '>' + second_tok)
		except:
			pass

	for conj_elem in conj_elems:
		arcs.update(get_arcs(conj_elem, follow_conj))
	return arcs

def is_question(span):
	span_text = span.text.strip()
	return span_text[-1] == '?'

def extract_arcs(text_iter, spacy_filename, outfile, vocab, use_span=is_question ,
	follow_conj=True, verbose=5000):

	'''
		extracts all arcs going out of the root in a sentence. used to find question motifs.

		text_iter: iterates over text for which arcs are extracted
		spacy_filename: location of spacy objects (from spacy_utils.py)
		outfile: where to write the arcs.
		vocab: pre-loaded spacy vocabulary. if you pass None it will load vocab for you, but that's slow.
		use_span: filter to decide which sentences to use. the function takes in a spacy sentence object.
		follow_conj: whether to follow conjunctions and treat subtrees as sentences too.

	'''

	if verbose:
		print('reading spacy')
	spacy_dict = get_spacy_dict(spacy_filename, vocab)

	arc_entries = []
	for idx, (text_idx,text) in enumerate(text_iter):
		if verbose and (idx > 0) and (idx % verbose == 0):
			print('\t%03d' % idx)
		spacy_obj = spacy_dict[text_idx]
		for span_idx, span in enumerate(spacy_obj.sents):
			if use_span(span):
				curr_arcset = get_arcs(span.root, follow_conj)
				arc_entries.append({'idx': '%s_%d' % (text_idx, span_idx), 'arcs': list(curr_arcset)})
	if verbose:
		print('\twriting arcs')
	with open(outfile, 'w') as f:
		f.write('\n'.join(json.dumps(arc_entry) for arc_entry in arc_entries))

motifs.make_arc_tree.py

In [22]:
def get_sorted_combos(itemset, k):
	combos = set()
	for set_ in itertools.combinations(itemset,k):
		combos.add(tuple(sorted(set_)))
	return combos
def get_mini_powerset(itemset,k=5):
	powerset = set()
	for k in range(1,min(k+1,len(itemset)+1)):
		powerset.update(get_sorted_combos(itemset,k))
	return powerset

def count_frequent_itemsets(arc_sets,min_support,k=5, verbose=5000):
	itemset_counts = defaultdict(lambda: defaultdict(int))
	span_to_itemsets = defaultdict(lambda: defaultdict(set))
	if verbose:
		print('\tfirst pass')
	for idx, (span_idx,arcs) in enumerate(arc_sets.items()):
		if verbose and (idx > 0) and (idx % verbose == 0):
			print('\t%03d' % idx)
		for itemset in get_mini_powerset(arcs,k):
			itemset_counts[len(itemset)][itemset] += 1
			span_to_itemsets[span_idx][len(itemset)].add(itemset)
	
	for span_idx, count_dicts in span_to_itemsets.items():
		for i in range(1,k+1):
			count_dicts[i] = [arcset for arcset in count_dicts[i] if itemset_counts[i][arcset] >= min_support]
	if verbose:
		print('\tand then the rest')
	setsize = k+1
	spans_to_check = [span_idx for span_idx,span_dict in span_to_itemsets.items() if len(span_dict[k]) > 0]
	while len(spans_to_check) > 0:
		if verbose:
			print('\t',setsize,len(spans_to_check))
		for idx, span_idx in enumerate(spans_to_check):
			if verbose and (idx > 0) and (idx % verbose == 0):
				print('\t%03d' % idx)
			arcs = arc_sets[span_idx]
			if len(arcs) < setsize: continue
			sets_to_check = [set_ for set_ in span_to_itemsets[span_idx].get(setsize-1,[]) 
								if itemset_counts[setsize-1].get(set_,0) >= min_support]
			if len(sets_to_check) == 0: continue
			
			newsets = set()
			for arc in arcs:
				if itemset_counts[1].get((arc,),0) >= min_support:
					for set_ in sets_to_check:
						newset = tuple(sorted(set(set_+ (arc,))))
						if len(newset) == setsize:
							newsets.add(newset)
			for newset in newsets:
				itemset_counts[setsize][newset] += 1
				span_to_itemsets[span_idx][setsize].add(newset)
		for span_idx, count_dicts in span_to_itemsets.items():
			count_dicts[setsize] = [arcset for arcset in count_dicts[setsize] if itemset_counts[setsize][arcset] >= min_support]
		spans_to_check = [span_idx for span_idx,span_dict in span_to_itemsets.items() if len(span_dict[setsize]) > 0]
		setsize+=1
	return itemset_counts, span_to_itemsets

def make_arc_tree(arc_file, outname, min_support=5, verbose=5000):
	'''
		makes the tree of motifs. (G in the paper.)
	'''

	if verbose:
		print('\treading arcs')
	arc_sets = read_arcs(arc_file, verbose)

	if verbose:
		print('\tcounting itemsets')
	itemset_counts, span_to_itemsets = count_frequent_itemsets(arc_sets,min_support)
	new_itemset_counts = {}
	for setsize, size_dict in itemset_counts.items():
		for k,v in size_dict.items():
			if v >= min_support:
				new_itemset_counts[k] = v
	itemset_counts = new_itemset_counts
	itemset_counts[('*',)] = len(arc_sets)
	if verbose:
		print('\twriting itemsets')
	sorted_counts = sorted(itemset_counts.items(),key=lambda x: (-x[1],len(x[0]),x[0][0]))
	with open(outname + '_arc_set_counts.tsv', 'w') as f:
		for k,v in sorted_counts:
			f.write('%d\t%d\t%s\n' % (v, len(k), '\t'.join(k)))
	
	if verbose:
		print('\tbuilding tree')
	edges = []
	uplinks = defaultdict(dict)
	downlinks = defaultdict(dict)

	for itemset,count in itemset_counts.items():
		parents = []
		set_size = len(itemset)
		if set_size == 1:
			arc = itemset[0]
			if arc.endswith('*'):
				parents.append(('*',))
			elif '_' in arc:
				parents.append((arc.split('_')[0] + '_*',))
			elif '>' in arc:
				parents.append((arc.split('>')[0] + '>*',))

		else:
			for idx in range(set_size):
				parents.append(itemset[:idx] + itemset[idx+1:])
		for parent in parents:
			parent_count = itemset_counts[parent]
			pr_child = count / itemset_counts[parent]
			edges.append({'child': itemset, 'child_count': count,
						'parent': parent, 'parent_count': parent_count,
						'pr_child': pr_child})
			uplinks[itemset][parent] = {'pr_child': pr_child, 'parent_count': parent_count}
			downlinks[parent][itemset] = {'pr_child': pr_child, 'child_count': count}

	with open(outname + '_edges.json', 'w') as f:
		f.write('\n'.join(json.dumps(edge) for edge in edges))
	with open(outname + '_uplinks.json', 'w') as f:
		uplink_list = []
		for child, parent_dict in uplinks.items():
			uplink_list.append({'child': child, 'parents': sorted(parent_dict.items(),key=lambda x: x[1]['pr_child'])})
		uplink_list = sorted(uplink_list, key=lambda x: itemset_counts[x['child']], reverse=True)
		f.write('\n'.join(json.dumps(up) for up in uplink_list))
	with open(outname + '_downlinks.json', 'w') as f:
		downlink_list = []
		for parent, child_dict in downlinks.items():
			downlink_list.append({'parent': parent, 'children': sorted(child_dict.items(),key=lambda x: x[1]['pr_child'])})
		downlink_list = sorted(downlink_list, key=lambda x: itemset_counts[x['parent']], reverse=True)
		f.write('\n'.join(json.dumps(down) for down in downlink_list))

motifs.fit_questions.py

In [23]:
def contains_candidate(container, candidate):
	return set(candidate).issubset(container)

def fit_question(arc_set, downlinks, node_counts):
	fit_nodes = {}
	node_stack = [('*',)]
	while len(node_stack) > 0:
		next_node = node_stack.pop()
		node_count = node_counts.get(next_node,None)
		if node_count:
			entry = {'arcset': next_node, 'arcset_count': node_count}
			children = downlinks.get(next_node, [])
			valid_children = [child for child,_ in children if contains_candidate(arc_set, child)]

			if len(valid_children) == 0:
				entry['max_valid_child_count'] = 0
			else:
				entry['max_valid_child_count'] = max(node_counts.get(child,0) for child in valid_children)
			node_stack += valid_children
			fit_nodes[next_node] = entry
	return fit_nodes

def get_text_idx(span_idx):
	return '.'.join(span_idx.split('.')[:-1])

def fit_all(arc_file, tree_file, outfile, verbose=5000):
	'''
		figures out which motifs occur in each piece of text. 
		arc_file: listing of arcs per text, from extract_arcs
		tree_file: the motif graph, from make_arc_tree. note that
			this doesn't have to come from the same dataset as arc_file, in which case you're basically fitting a new dataset to motifs extracted elsewhere.
		outfile: where to put things.
	'''
	if verbose:
		print('\treading tree')
	arc_sets = read_arcs(arc_file, verbose)	

	downlinks = read_downlinks(tree_file + '_downlinks.json')
	node_counts = read_nodecounts(tree_file + '_arc_set_counts.tsv')


	if verbose:
		print('\tfitting arcsets')
	span_fit_entries = []
	for idx, (span_idx,arcs) in enumerate(arc_sets.items()):
		if verbose and (idx > 0) and (idx % verbose == 0):
			print('\t%03d' % idx)
		text_idx = get_text_idx(span_idx)
		fit_nodes = fit_question(set(arcs), downlinks, node_counts)
		for fit_info in fit_nodes.values():
			fit_info['span_idx'] = span_idx
			fit_info['text_idx'] = text_idx
			span_fit_entries.append(fit_info)
	if verbose:
		print('\twriting fits')
	with open(outfile, 'w') as f:
		f.write('\n'.join(json.dumps(entry) for entry in span_fit_entries))

motifs.deduplicate_motifs

In [24]:
def deduplicate_motifs(question_fit_file, outfile, threshold=.9, verbose=5000):

	if verbose:
		print('\treading raw fits')
	span_to_fits = defaultdict(set)
	arcset_counts = defaultdict(int)
	with open(question_fit_file) as f:
		for idx,line in enumerate(f.readlines()):
			if verbose and (idx > 0) and (idx % verbose == 0):
				print('\t%03d' % idx)
			entry = json.loads(line)
			span_to_fits[entry['span_idx']].add(tuple(entry['arcset']))
			arcset_counts[tuple(entry['arcset'])] += 1
	if verbose:
		print('\tcounting cooccs')
	coocc_counts = defaultdict(lambda: defaultdict(int))
	for idx, (span_idx, fit_arcs) in enumerate(span_to_fits.items()):
		if verbose and (idx > 0) and (idx % verbose == 0):
			print('\t%03d' % idx)
		fit_arcs = list(fit_arcs)
		for i in range(len(fit_arcs)):
			for j in range(i+1,len(fit_arcs)):
				arc1 = fit_arcs[i]
				arc2 = fit_arcs[j]
				coocc_counts[arc1][arc2] += 1
				coocc_counts[arc2][arc1] += 1
	if verbose:
		print('\tdeduplicating')
	superset_idx = 0
	supersets = defaultdict(set)
	arcset_to_superset = {}
	for arcset, count in arcset_counts.items():
		if arcset in arcset_to_superset: continue
		arcset_to_superset[arcset] = superset_idx
		supersets[superset_idx].add(arcset)
		stack = [arc2 for arc2,count2 in coocc_counts.get(arcset,{}).items()
					if (count2/count >= threshold) and (count2/arcset_counts[arc2] >= threshold)]
		while len(stack) > 0:
			neighbour = stack.pop()
			neighbour_count = arcset_counts[neighbour]
			arcset_to_superset[neighbour] = superset_idx
			supersets[superset_idx].add(neighbour)
			stack += [arc2 for arc2,count2 in coocc_counts.get(neighbour,{}).items()
					if (count2/neighbour_count >= threshold) and (count2/arcset_counts[arc2] >= threshold) and (arc2 not in arcset_to_superset)]
		superset_idx += 1
	superset_ids = {}
	for idx, superset in supersets.items():
		superset_ids[idx] = sorted(superset, key=lambda x: (arcset_counts[x],len(x)), reverse=True)[0]
	arcset_to_ids = {k: superset_ids[v] for k,v in arcset_to_superset.items()}
	supersets_by_id = [{'idx': k, 'id': superset_ids[k], 'items': list(v)} for k,v in supersets.items()]

	if verbose:
		print('\twriting')
	with open(outfile + '_arcset_to_super.json', 'w') as f:
		f.write('\n'.join(json.dumps({'arcset': k, 'super': v}) for k,v in arcset_to_ids.items()))
	with open(outfile + '_sets.json', 'w') as f:
		f.write('\n'.join(json.dumps(entry) for entry in supersets_by_id))
def get_text_idx(span_idx):
	return '.'.join(span_idx.split('.')[:-1])

def postprocess_fits(question_fit_file, question_tree_file, question_superset_file, verbose=5000):
	'''
		this entire file consists of two quite hacky scripts to remove 
		redundant motifs (i.e. p(m1|m2), p(m2|m1) > threshold)

	'''
	downlinks = read_downlinks(question_tree_file + '_downlinks.json')
	super_mappings = {}
	with open(question_superset_file) as f:
		for line in f.readlines():
			entry = json.loads(line)
			super_mappings[tuple(entry['arcset'])] = tuple(entry['super'])
	super_counts = defaultdict(int)
	span_to_fits = defaultdict(set)
	with open(question_fit_file) as f:
		for idx,line in enumerate(f.readlines()):
			if verbose and (idx > 0) and (idx % verbose == 0):
				print('\t%03d' % idx)
			entry = json.loads(line)
			span_to_fits[entry['span_idx']].add(tuple(entry['arcset']))
	for span_idx, fit_set in span_to_fits.items():
		super_fit_set = set([super_mappings[x] for x in fit_set if x != ('*',)])
		for x in super_fit_set:
			super_counts[x] += 1
		#span_to_super_fits[span_idx] = super_fit_set 
	if verbose:
		print('\tmaking new entries')
	new_entries = []
	for idx, (span_idx, fit_set) in enumerate(span_to_fits.items()):
		if verbose and (idx > 0) and (idx % verbose == 0):
			print('\t%03d' % idx)
		text_idx = get_text_idx(span_idx)
		super_to_superchildren = defaultdict(set)
		for set_ in fit_set:
			if set_ == ('*',): continue
			superset = super_mappings[set_]
			super_to_superchildren[superset].update([super_mappings[child] for child,_ in downlinks.get(set_, []) if child in fit_set])
		for superset, superchildren in super_to_superchildren.items():
			entry = {'arcset': superset, 'arcset_count': super_counts[superset],
					'text_idx': text_idx, 'span_idx': span_idx}
			if len(superchildren) == 0:
				entry['max_child_count'] = 0
			else:
				entry['max_child_count'] = max(super_counts.get(child,0) for child in superchildren)
			new_entries.append(entry)
	with open(question_fit_file + '.super', 'w') as f:
		f.write('\n'.join(json.dumps(entry) for entry in new_entries))

motifs.extract_motifs

In [25]:
def is_uppercase_question(x):
	'''
		for reasonably well-formatted datasets like transcripts of some proceedings, i've included this filter that questions start w/ uppercase and end in a question mark. this filter can be varied/swapped out.
	'''
	text = x.text.strip()
	return (text[-1] == '?') and (text[0].isupper())

def is_uppercase(x):
	'''
		mainly because we otherwise get a bunch of badly parsed half-lines,
		enforce that answer sentences have to start in uppercase (reliable
		provided your data is well-formatted...)
	'''
	text = x.text.strip()
	return text[0].isupper()


def extract_question_motifs(question_text_iter, spacy_filename, motif_dir,
	question_filter_fn = is_uppercase_question,
	follow_conj=True,
	min_question_itemset_support=5,
	deduplicate_threshold=.9,
	verbose=5000):
	'''
		convenience pipeline to get question motifs. (see pipelines/extract_*_motifs for examples)
		question_text_iter: iterates over all questions
		spacy_filename: location of spacy objects
		motif_dir: directory where all motifs written
		question_filter_fn: only uses sentences in a question which corresponds to a question. can redefine.
		follow_conj: follows conjunctions to compound questions ("why...and how")
		min_question_itemset_support: the minimum number of times an itemset has to show up for the frequent itemset counter to consider it.
		deduplicate_threshold: how often two motifs co-occur (i.e. p(x|y) and p(y|x) for us to consider them redundant)
	'''
	print('running motif extraction pipeline')

	try:
		os.mkdir(motif_dir)
	except:
		print('\tmotif dir %s exists!' % motif_dir)

	print('loading spacy vocab')
	vocab = load_vocab()

	print('getting question arcs')
	question_arc_outfile = os.path.join(motif_dir, 'question_arcs.json')
	extract_arcs(question_text_iter, spacy_filename, question_arc_outfile, vocab, use_span=question_filter_fn, follow_conj=follow_conj, verbose=verbose)

	print('making motif tree')
	question_tree_outfile = os.path.join(motif_dir, 'question_tree')
	make_arc_tree(question_arc_outfile, question_tree_outfile, min_question_itemset_support, verbose=verbose)

	print('fitting motifs to questions')
	question_fit_outfile = os.path.join(motif_dir, 'question_fits.json')
	fit_all(question_arc_outfile, question_tree_outfile, question_fit_outfile, verbose=verbose)

	print('handling redundant motifs')
	question_super_outfile = os.path.join(motif_dir, 'question_supersets')
	deduplicate_motifs(question_fit_outfile, question_super_outfile, deduplicate_threshold, verbose=verbose)
	postprocess_fits(question_fit_outfile, question_tree_outfile, question_super_outfile + '_arcset_to_super.json')

	print('done motif extraction')

def extract_answer_arcs(answer_text_iter, spacy_filename, motif_dir, answer_filter_fn=is_uppercase, follow_conj=True, verbose=5000):
	'''
		convenience pipeline to get answer motifs
	'''

	print('running answer arc pipeline')
	try:
		os.mkdir(motif_dir)
	except:
		print('\tmotif dir %s exists!' % motif_dir)

	print('loading spacy vocab')
	vocab = load_vocab()

	print('getting answer arcs')
	answer_arc_outfile = os.path.join(motif_dir, 'answer_arcs.json')
	extract_arcs(answer_text_iter, spacy_filename, answer_arc_outfile, vocab, use_span=answer_filter_fn, follow_conj=follow_conj, verbose=verbose)

	print('done answer arc extraction')


clusters.build_joint_matrices.py

In [26]:
def get_text_idx(span_idx):
	return '.'.join(span_idx.split('.')[:-1])

def get_motifs_per_question(question_fit_file, answer_arc_file, superset_file,question_threshold, answer_threshold,  verbose=5000):
	question_to_fits = defaultdict(set)
	question_to_leaf_fits = defaultdict(set)
	motif_counts = defaultdict(int)


	super_mappings = {}
	with open(superset_file) as f:
		for line in f.readlines():
			entry = json.loads(line)
			super_mappings[tuple(entry['arcset'])] = tuple(entry['super'])

	with open(question_fit_file) as f:
		for idx, line in enumerate(f.readlines()):
			if verbose and (idx > 0) and (idx % verbose == 0):
				print('\t%03d' % idx)
			entry = json.loads(line)
			motif = tuple(entry['arcset'])
			super_motif = super_mappings[motif]
			if entry['arcset_count'] < question_threshold: continue
			if entry['max_child_count'] < question_threshold:
				question_to_leaf_fits[entry['text_idx']].add(super_motif)
				#if leaves_only: continue
			question_to_fits[entry['text_idx']].add(super_motif)
			motif_counts[super_motif] += 1
	question_to_fits = {k: [x for x in v if motif_counts[x] >= question_threshold] for k,v in question_to_fits.items()}
	motif_counts = {k:v for k,v in motif_counts.items() if v >= question_threshold}
	question_to_leaf_fits = {k: [x for x in v if motif_counts.get(x,0) >= question_threshold] for k,v in question_to_leaf_fits.items()}

	question_to_arcs = defaultdict(set)
	arc_sets = read_arcs(answer_arc_file)
	arc_counts = defaultdict(int)
	for span_idx, arcs in arc_sets.items():
		question_to_arcs[get_text_idx(span_idx)].update(arcs)
		for arc in arcs:
			arc_counts[arc] += 1
	question_to_arcs = {k: [x for x in v if arc_counts[x] >= answer_threshold] for k,v in question_to_arcs.items()}
	arc_counts = {k:v for k,v in arc_counts.items() if v >= answer_threshold}
	return question_to_fits, question_to_leaf_fits, motif_counts, question_to_arcs, arc_counts 

def build_joint_matrix(question_fit_file, answer_arc_file, superset_file, outfile, question_threshold, answer_threshold, verbose=5000):
	if verbose:
		print('\treading arcs and motifs')

	question_to_fits, question_to_leaf_fits, motif_counts, question_to_arcs, arc_counts =\
		 get_motifs_per_question(question_fit_file, answer_arc_file, superset_file, question_threshold, answer_threshold, verbose)
	question_term_list = list(motif_counts.keys())
	answer_term_list = list(arc_counts.keys())

	question_term_to_idx = {k:idx for idx,k in enumerate(question_term_list)}
	answer_term_to_idx = {k:idx for idx,k in enumerate(answer_term_list)}

	if verbose:
		print('\tbuilding matrices')
	question_term_idxes = []
	question_leaves = []
	question_doc_idxes = []
	answer_term_idxes = []
	answer_doc_idxes = []

	pair_idxes = list(set(question_to_fits.keys()).intersection(set(question_to_arcs.keys())))

	for idx, p_idx in enumerate(pair_idxes):
		if verbose and (idx > 0) and (idx % verbose == 0):
			print('\t%03d' % idx)

		question_terms = question_to_fits[p_idx]
		answer_terms = question_to_arcs[p_idx]

		for term in question_terms:
			term_idx = question_term_to_idx[term]
			question_term_idxes.append(term_idx)
			question_doc_idxes.append(idx)
			question_leaves.append(term in question_to_leaf_fits.get(p_idx,[]))
		for term in answer_terms:
			term_idx = answer_term_to_idx[term]
			answer_term_idxes.append(term_idx)
			answer_doc_idxes.append(idx)
	if verbose:
		print('\twriting stuff')
	
	np.save(outfile + '.q.tidx.npy', question_term_idxes)
	np.save(outfile + '.q.leaves.npy', question_leaves)
	np.save(outfile + '.a.tidx.npy', answer_term_idxes)
	np.save(outfile + '.q.didx.npy', question_doc_idxes)
	np.save(outfile + '.a.didx.npy', answer_doc_idxes)
	with open(outfile + '.q.terms.txt', 'w') as f:
		f.write('\n'.join('%d\t%s' % (motif_counts[term],term) for term in question_term_list))
	with open(outfile + '.a.terms.txt', 'w') as f:
		f.write('\n'.join('%d\t%s' % (arc_counts[term],term) for term in answer_term_list))
	with open(outfile + '.docs.txt', 'w') as f:
		f.write('\n'.join(pair_idxes))

clusters.cluster_functions

In [27]:
def load_joint_mtx(rootname):
	mtx_obj = {}
	#rootname = os.path.join(LATENT_DIR, data_name, feature_name)

	print('reading question tidxes')
	mtx_obj['q_tidxes'] = np.load(rootname + '.q.tidx.npy')
	print('reading question leaves')
	mtx_obj['q_leaves'] = np.load(rootname + '.q.leaves.npy')
	print('reading answer tidxes')
	mtx_obj['a_tidxes'] = np.load(rootname + '.a.tidx.npy')

	print('reading question didxes')
	mtx_obj['q_didxes'] = np.load(rootname + '.q.didx.npy')
	print('reading answer didxes')
	mtx_obj['a_didxes'] = np.load(rootname + '.a.didx.npy')

	print('reading question terms')
	mtx_obj['q_terms'] = []
	mtx_obj['q_term_to_idx'] = {}
	mtx_obj['q_term_counts'] = []
	fname = rootname + '.q.terms.txt'
	with open(fname) as f:
		for idx, line in enumerate(f.readlines()):
			count,term = line.split('\t')
			term = term.strip()
			term = make_tuple(term)
			mtx_obj['q_term_counts'].append(int(count))
			mtx_obj['q_terms'].append(term)
			mtx_obj['q_term_to_idx'][term] = idx
	mtx_obj['q_terms'] = np.array(mtx_obj['q_terms'])
	mtx_obj['q_term_counts'] = np.array(mtx_obj['q_term_counts'])

	print('reading answer terms')
	mtx_obj['a_terms'] = []
	mtx_obj['a_term_to_idx'] = {}
	mtx_obj['a_term_counts'] = []
	fname = rootname + '.a.terms.txt'
	with open(fname) as f:
		for idx, line in enumerate(f.readlines()):
			count,term = line.split('\t')
			term = term.strip()
			mtx_obj['a_term_counts'].append(int(count))
			mtx_obj['a_terms'].append(term)
			mtx_obj['a_term_to_idx'][term] = idx
	mtx_obj['a_terms'] = np.array(mtx_obj['a_terms'])
	mtx_obj['a_term_counts'] = np.array(mtx_obj['a_term_counts'])

	print('reading docs')
	mtx_obj['docs'] = []
	mtx_obj['doc_to_idx'] = {}
	with open(rootname + '.docs.txt') as f:
		for idx, line in enumerate(f.readlines()):
			doc_id = line.strip()
			mtx_obj['docs'].append(doc_id)
			mtx_obj['doc_to_idx'][doc_id] = idx 
	mtx_obj['docs'] = np.array(mtx_obj['docs'])

	print('done!')
	return mtx_obj

def build_mtx(mtx_obj, data_type, norm='l2', idf=False, leaves_only=False):
	N_terms = len(mtx_obj[data_type + '_terms'])
	N_docs = len(mtx_obj['docs'])
	if idf:
		data = np.log(N_docs) - np.log(mtx_obj[data_type + '_term_counts'][mtx_obj[data_type + '_tidxes']])
	else:
		data = np.ones_like(mtx_obj[data_type + '_tidxes'])
		if leaves_only:
			data[~mtx_obj[data_type + '_leaves']] = 0
	mtx = sparse.csr_matrix((data, (mtx_obj[data_type + '_tidxes'], mtx_obj[data_type + '_didxes'])), shape=(N_terms,N_docs))
	print(mtx)
	if norm:
# 		mtx = Normalizer(norm=norm).fit_transform(mtx)
		mtx = Normalizer(norm=norm).fit_transform(mtx.astype(np.double))
		print(mtx)
	
	return mtx

def run_simple_pipe(rootname, verbose=True):
	mtx_obj = load_joint_mtx(rootname)
	q_mtx = build_mtx(mtx_obj, 'q')
	a_mtx = build_mtx(mtx_obj, 'a', idf=True)
	return q_mtx, a_mtx, mtx_obj
def do_sparse_svd(mtx, k=50):
	u,s,v = sparse.linalg.svds(mtx, k=k) # ugh, right order dammit
	return u[:,::-1],s[::-1],v[::-1,:]
def run_lowdim_pipe(q_mtx, a_mtx, k=50, snip=True):
	a_u, a_s, a_v = do_sparse_svd(a_mtx,k + int(snip))
	lq = q_mtx * (a_v.T * a_s**-1)
	if snip:
		return snip_first_dim(lq, a_u, a_s, a_v)
	else:
		return lq, a_u, a_s, a_v

def inspect_latent_space(mtx, names, dim_iter=None, num_dims=5, num_egs=10, which_end=None, skip_first=True, dim_names={},s=None):
	mtx = Normalizer().fit_transform(mtx).T
	if dim_iter is None:
		dim_iter = range(int(skip_first), num_dims + int(skip_first))
	for dim in dim_iter:
		if s is not None:
			print(dim,s[dim])
		else:
			print(dim)
		row = mtx[dim]
		argsorted = np.argsort(row)
		if (not which_end) or (which_end == -1):
			print('\tbottom',dim_names.get((dim,-1), ''))
			for i in range(num_egs):
				print('\t\t',names[argsorted[i]], '%+.4f' % row[argsorted[i]])
		if (not which_end) or (which_end == 1):
			print('\ttop',dim_names.get((dim,1), ''))
			for i in range(num_egs):
				print('\t\t',names[argsorted[-1-i]], '%+.4f' % row[argsorted[-1-i]])
		print()


def run_kmeans(X, in_dim, k):
	print(k)
	km = KMeans(n_clusters=k, max_iter=1000, n_init=1000, random_state=164)
	km.fit(X)
	thing = [0 for i in range(k)]
	for label in km.labels_:
		thing[label] += 1
	print(thing)
	print(sum(thing))
	print(len(km.labels_))
	return km


def inspect_kmeans_run(q_mtx, a_mtx, num_svd_dims, num_clusters, q_terms, a_terms, km=None, remove_first=False, num_egs=10):
	if remove_first:
		q_mtx = q_mtx[:,1:(num_svd_dims + 1)]
		a_mtx = a_mtx[:,1:(num_svd_dims + 1)]
	else:
		q_mtx = q_mtx[:,:num_svd_dims]
		a_mtx = a_mtx[:,:num_svd_dims]
	q_mtx = Normalizer().fit_transform(q_mtx)
	a_mtx = Normalizer().fit_transform(a_mtx)
	if km:
		q_km = km
	else:
		q_km = run_kmeans(q_mtx, num_svd_dims, num_clusters)
	if num_egs > 0:
		q_dists = q_km.transform(q_mtx)
		q_assigns = q_km.labels_
		a_dists = q_km.transform(a_mtx)
		a_assigns = q_km.predict(a_mtx)
		for cl in range(num_clusters):
			print('cluster',cl)
			q_assigned = q_assigns == cl
			median_qdist = np.median(q_dists[:,cl][q_assigned])
			print('\tq assigns:',q_assigned.sum(),  'median dist:', '%.4f' % median_qdist)
			a_assigned = a_assigns == cl
			median_adist = np.median(a_dists[:,cl][a_assigned])
			print('\ta assigns:',a_assigned.sum(),  'median dist:', '%.4f' % median_adist)
			if num_egs == 0: continue
			argsorted_qdists = np.argsort(q_dists[:,cl])
			argsorted_qdists = argsorted_qdists[np.in1d(argsorted_qdists, np.where(q_assigned)[0])]
			print('\tqs:')
			for i in range(min(num_egs,q_assigned.sum())):
				curr_qdist = q_dists[:,cl][argsorted_qdists[i]]
				if curr_qdist > median_qdist:
					diststr = '%.4f ~~' %  curr_qdist
				else:
					diststr = '%.4f' % curr_qdist
				print('\t\t', q_terms[argsorted_qdists[i]], diststr)
			argsorted_adists = np.argsort(a_dists[:,cl])
			argsorted_adists = argsorted_adists[np.in1d(argsorted_adists, np.where(a_assigned)[0])]
			print('\tas:')
			for i in range(min(num_egs,a_assigned.sum())):
				curr_adist = a_dists[:,cl][argsorted_adists[i]]
				if curr_adist > median_adist:
					diststr = '%.4f ~~' %  curr_adist
				else:
					diststr = '%.4f' % curr_adist
				print('\t\t', a_terms[argsorted_adists[i]], diststr)
			print('========================')
	return q_km

def snip_first_dim(lq, a_u, a_s, a_v):
	return lq[:,1:], a_u[:,1:], a_s[1:], a_v[1:]

def assign_clusters(km, lq, a_u, mtx_obj, n_dims):
	km_qdists = km.transform(Normalizer().fit_transform(lq[:,:n_dims]))
	km_qlabels = km.predict(Normalizer().fit_transform(lq[:,:n_dims]))
	km_adists = km.transform(Normalizer().fit_transform(a_u[:,:n_dims]))
	km_alabels = km.predict(Normalizer().fit_transform(a_u[:,:n_dims]))

	motif_df_entries = []
	for idx, motif in enumerate(mtx_obj['q_terms']):
	    entry = {'idx': idx, 'motif': motif, 'cluster': km_qlabels[idx],
	            'count': mtx_obj['q_term_counts'][idx]}
	    entry['cluster_dist'] = km_qdists[idx,entry['cluster']]
	    motif_df_entries.append(entry)
	motif_df = pd.DataFrame(motif_df_entries).set_index('idx')

	aarc_df_entries = []
	for idx, aarc in enumerate(mtx_obj['a_terms']):
	    entry = {'idx': idx, 'aarc': aarc, 'cluster': km_alabels[idx], 
	            'count': mtx_obj['a_term_counts'][idx]}
	    entry['cluster_dist'] = km_adists[idx,entry['cluster']]
	    aarc_df_entries.append(entry)
	aarc_df = pd.DataFrame(aarc_df_entries).set_index('idx')

	q_leaves = build_mtx(mtx_obj,'q',leaves_only=True)
	qdoc_vects = q_leaves.T * Normalizer().fit_transform(parl_lq)
	km_qdoc_dists = km.transform(Normalizer().fit_transform(qdoc_vects[:,:n_dims]))
	km_qdoc_labels = km.predict(Normalizer().fit_transform(qdoc_vects[:,:n_dims]))
	qdoc_df_entries = []
	for idx, qdoc in enumerate(mtx_obj['docs']):
	    entry = {'idx': idx, 'q_idx': qdoc, 'cluster': km_qdoc_labels[idx]}
	    entry['cluster_dist'] = km_qdoc_dists[idx,entry['cluster']]
	    qdoc_df_entries.append(entry)
	qdoc_df = pd.DataFrame(qdoc_df_entries).set_index('idx')

	return motif_df, aarc_df, qdoc_df


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 71)

clusters.extract_clusters

In [11]:
def build_matrix(motif_dir, matrix_dir, question_threshold, answer_threshold):
	'''
		convenience pipeline to build the question answer matrices. 
		motif_dir: wherever extract_motifs wrote to
		matrix_dir: where to put the matrices
		question_threshold: minimum # of questions in which a question motif has to occur to be considered
	'''
	print('building q-a matrices')
	question_fit_file = os.path.join(motif_dir, 'question_fits.json.super')
	answer_arc_file = os.path.join(motif_dir, 'answer_arcs.json')
	superset_file = os.path.join(motif_dir, 'question_supersets_arcset_to_super.json')

	try:
		os.mkdir(matrix_dir)
	except:
		print('matrix dir %s exists!' % matrix_dir)

	outfile = os.path.join(matrix_dir, 'qa_mtx')
	build_joint_matrix(question_fit_file, answer_arc_file,superset_file, outfile, question_threshold, answer_threshold, verbose=5000)



def extract_clusters(matrix_dir,km_file,k=8, d=25,num_egs=10):
	'''
		convenience pipeline to get latent q-a dimensions and clusters. 

		km_file: where to write the kmeans object
		k: num clusters
		d: num latent dims
		
	'''
	matrix_file = os.path.join(matrix_dir, 'qa_mtx')
	q_mtx, a_mtx, mtx_obj = run_simple_pipe(matrix_file)
	lq, a_u, a_s, a_v = run_lowdim_pipe(q_mtx,a_mtx,d)
	km = inspect_kmeans_run(lq,a_u,d,k,mtx_obj['q_terms'], mtx_obj['a_terms'], num_egs=num_egs)
	joblib.dump(km, km_file)



In [12]:
#PIPELINES

pipelines.preprocess_parliament.py

In [13]:
# mostly just examples of how to spacify data
# note that spacy has its own parallelization functionality which I never use, but maybe should...
PARL_DIR = os.path.join(pkg_resources.resource_filename("convokit", ""), 'downloads', 'justine-test')


def iterate_parl_df():
	'''
		here i'm just reading question and answer text from a dataframe i got after
		a lot of data processing (actual data processing code pending).

		and yeah I realize that iterating row by row through a dataframe is slow. but
			probably spacifying things is slower.
	'''

	# question_df = pd.read_csv(os.path.join(PARL_DIR, 'metadata_10.tsv'),
	# 				index_col=0, sep='\t')
	# for row in question_df.itertuples():
	# 	print(row.asked_tbl, row.len_followups, row.official_name, row.major_name, row.minor_name, row.num_interjections)
	# 	yield row.question_text_idx, row.question_text
	# 	yield row.answer_text_idx, row.answer_text

	question_data = json.load(open(os.path.join(PARL_DIR, 'parliament-corpus'), "r"))
	id_of_last_question = ''
	for utterance in question_data:
		if utterance['is_question']:
			id_of_last_question = utterance['id']
		else:
			assert id_of_last_question == utterance['root'] # The if-else is to ensure that answers are
			# processed immediately after their questions. A redundancy to ensure order. Can be commented out.
		yield utterance['id'], utterance['text']


spacy_file = os.path.join(PARL_DIR, 'spacy')
spacify(iterate_parl_df(), spacy_file)

loading spacy NLP
	5000
	10000
	15000
	20000
	25000
	30000
	35000
	40000
	45000
	50000
	55000
	60000
	65000
	70000
	75000
	80000
	85000
	90000
	95000
	100000
	105000
	110000
	115000
	120000
	125000
	130000
	135000
	140000
	145000
	150000
	155000
	160000
	165000
	170000
	175000
	180000
	185000
	190000
	195000
	200000
	205000
	210000
	215000
	220000
	225000
	230000
	235000
	240000
	245000
	250000
	255000
	260000
	265000
	270000
	275000
	280000
	285000
	290000
	295000
	300000
	305000
	310000
	315000
	320000
	325000
	330000
	335000
	340000
	345000
	350000
	355000
	360000
	365000
	370000
	375000
	380000
	385000
	390000
	395000
	400000
	405000
	410000
	415000
	420000
	425000
	430000


pipelines.extract_parliament_motifs.py

In [14]:
def parl_question_iter(question_data): # this should really be a separate generic function call
	# for row in metadata_df.itertuples():
		# yield row.question_text_idx, row.question_text
	for utterance in question_data:
		if utterance['is_question']:
			yield utterance['id'], utterance['text']

def parl_answer_iter(answer_data):
	# for row in metadata_df.itertuples():
		# yield row.answer_text_idx, row.answer_text
	for utterance in answer_data:	
		if utterance['is_answer']:
			yield utterance['id'], utterance['text']


# metadata_df = pd.read_csv(os.path.join(PARL_DIR, 'metadata.tsv'),
                # index_col=0, sep='\t')

data = json.load(open(os.path.join(PARL_DIR, 'parliament-corpus'), "r"))

question_iter = parl_question_iter(data)
answer_iter = parl_answer_iter(data)

motif_dir = os.path.join(PARL_DIR, 'parliament-motifs')
spacy_filename = os.path.join(PARL_DIR, 'spacy')

extract_question_motifs(question_iter, spacy_filename, motif_dir)
extract_answer_arcs(answer_iter, spacy_filename, motif_dir)

running motif extraction pipeline
	motif dir /Users/ishaanjhaveri/Library/Caches/Python-Eggs/convokit-0.0.1-py3.6.egg-tmp/convokit/downloads/justine-test/parliament-motifs exists!
loading spacy vocab
getting question arcs
reading spacy
	5000
	10000
	15000
	20000
	25000
	30000
	35000
	40000
	45000
	50000
	55000
	60000
	65000
	70000
	75000
	80000
	85000
	90000
	95000
	100000
	105000
	110000
	115000
	120000
	125000
	130000
	135000
	140000
	145000
	150000
	155000
	160000
	165000
	170000
	175000
	180000
	185000
	190000
	195000
	200000
	205000
	210000
	215000
	220000
	225000
	230000
	235000
	240000
	245000
	250000
	255000
	260000
	265000
	270000
	275000
	280000
	285000
	290000
	295000
	300000
	305000
	310000
	315000
	320000
	325000
	330000
	335000
	340000
	345000
	350000
	355000
	360000
	365000
	370000
	375000
	380000
	385000
	390000
	395000
	400000
	405000
	410000
	415000
	420000
	425000
	430000
	5000
	10000
	15000
	20000
	25000
	30000
	35000
	40000
	45000
	50000
	55000
	60000
	65000
	70000

	2690000
	2695000
	2700000
	2705000
	2710000
	2715000
	2720000
	2725000
	2730000
	2735000
	2740000
	2745000
	2750000
	2755000
	2760000
	2765000
	2770000
	2775000
	2780000
	2785000
	2790000
	2795000
	2800000
	2805000
	2810000
	2815000
	2820000
	2825000
	2830000
	2835000
	2840000
	2845000
	2850000
	2855000
	2860000
	2865000
	2870000
	2875000
	2880000
	2885000
	2890000
	2895000
	2900000
	2905000
	2910000
	2915000
	2920000
	2925000
	2930000
	2935000
	2940000
	2945000
	2950000
	2955000
	2960000
	2965000
	2970000
	2975000
	2980000
	2985000
	2990000
	2995000
	3000000
	3005000
	3010000
	3015000
	3020000
	3025000
	3030000
	3035000
	3040000
	3045000
	3050000
	3055000
	3060000
	3065000
	3070000
	3075000
	3080000
	3085000
	3090000
	3095000
	3100000
	3105000
	3110000
	3115000
	3120000
	3125000
	3130000
	3135000
	3140000
	3145000
	3150000
	3155000
	3160000
	3165000
	3170000
	3175000
	3180000
	3185000
	3190000
	3195000
	3200000
	3205000
	3210000
	3215000
	3220000
	3225000
	3230000
	3235000
	3240000
	

	1145000
	1150000
	1155000
	1160000
	1165000
	1170000
	1175000
	1180000
	1185000
	1190000
	1195000
	1200000
	1205000
	1210000
	1215000
	1220000
	1225000
	1230000
	1235000
	1240000
	1245000
	1250000
	1255000
	1260000
	1265000
	1270000
	1275000
	1280000
	1285000
	1290000
	1295000
	1300000
	1305000
	1310000
	1315000
	1320000
	1325000
	1330000
	1335000
	1340000
	1345000
	1350000
	1355000
	1360000
	1365000
	1370000
	1375000
	1380000
	1385000
	1390000
	1395000
	1400000
	1405000
	1410000
	1415000
	1420000
	1425000
	1430000
	1435000
	1440000
	1445000
	1450000
	1455000
	1460000
	1465000
	1470000
	1475000
	1480000
	1485000
	1490000
	1495000
	1500000
	1505000
	1510000
	1515000
	1520000
	1525000
	1530000
	1535000
	1540000
	1545000
	1550000
	1555000
	1560000
	1565000
	1570000
	1575000
	1580000
	1585000
	1590000
	1595000
	1600000
	1605000
	1610000
	1615000
	1620000
	1625000
	1630000
	1635000
	1640000
	1645000
	1650000
	1655000
	1660000
	1665000
	1670000
	1675000
	1680000
	1685000
	1690000
	1695000
	

	5705000
	5710000
	5715000
	5720000
	5725000
	5730000
	5735000
	5740000
	5745000
	5750000
	5755000
	5760000
	5765000
	5770000
	5775000
	5780000
	5785000
	5790000
	5795000
	5800000
	5805000
	5810000
	5815000
	5820000
	5825000
	5830000
	5835000
	5840000
	5845000
	5850000
	5855000
	5860000
	5865000
	5870000
	5875000
	5880000
	5885000
	5890000
	5895000
	5900000
	5905000
	5910000
	5915000
	5920000
	5925000
	5930000
	5935000
	5940000
	5945000
	making new entries
	5000
	10000
	15000
	20000
	25000
	30000
	35000
	40000
	45000
	50000
	55000
	60000
	65000
	70000
	75000
	80000
	85000
	90000
	95000
	100000
	105000
	110000
	115000
	120000
	125000
	130000
	135000
	140000
	145000
	150000
	155000
	160000
	165000
	170000
	175000
	180000
	185000
	190000
	195000
	200000
	205000
	210000
	215000
	220000
	225000
	230000
	235000
	240000
	245000
	250000
	255000
	260000
	265000
	270000
	275000
	280000
	285000
	290000
	295000
	300000
	305000
	310000
	315000
	320000
	325000
done motif extraction
running answer ar

pipelines.construct_parliament_matrix

In [15]:
motif_dir = os.path.join(PARL_DIR, 'parliament-motifs')
matrix_dir = os.path.join(PARL_DIR, 'parliament-matrix')
build_matrix(motif_dir,matrix_dir,question_threshold=100, answer_threshold=100)

building q-a matrices
matrix dir /Users/ishaanjhaveri/Library/Caches/Python-Eggs/convokit-0.0.1-py3.6.egg-tmp/convokit/downloads/justine-test/parliament-matrix exists!
	reading arcs and motifs
	5000
	10000
	15000
	20000
	25000
	30000
	35000
	40000
	45000
	50000
	55000
	60000
	65000
	70000
	75000
	80000
	85000
	90000
	95000
	100000
	105000
	110000
	115000
	120000
	125000
	130000
	135000
	140000
	145000
	150000
	155000
	160000
	165000
	170000
	175000
	180000
	185000
	190000
	195000
	200000
	205000
	210000
	215000
	220000
	225000
	230000
	235000
	240000
	245000
	250000
	255000
	260000
	265000
	270000
	275000
	280000
	285000
	290000
	295000
	300000
	305000
	310000
	315000
	320000
	325000
	330000
	335000
	340000
	345000
	350000
	355000
	360000
	365000
	370000
	375000
	380000
	385000
	390000
	395000
	400000
	405000
	410000
	415000
	420000
	425000
	430000
	435000
	440000
	445000
	450000
	455000
	460000
	465000
	470000
	475000
	480000
	485000
	490000
	495000
	500000
	505000
	510000
	515000
	52

pipelines.extract_parliament_clusters.py

In [17]:
matrix_dir = os.path.join(PARL_DIR, 'parliament-matrix')
km_name = os.path.join(PARL_DIR, 'demo_km.pkl')
extract_clusters(matrix_dir, km_name, k=8,d=100,num_egs=10)

reading question tidxes
reading question leaves
reading answer tidxes
reading question didxes
reading answer didxes
reading question terms
reading answer terms
reading docs
done!
8
[327, 204, 245, 325, 374, 73, 399, 322]
2269
2269
cluster 0
	q assigns: 327 median dist: 0.9468
	a assigns: 549 median dist: 0.9878
	qs:
		 ('why>*',) 0.7410
		 ('admit_*',) 0.7857
		 ('why>*', 'why>does') 0.8030
		 ('admit_*', 'admit_will') 0.8288
		 ('show_*', 'show_does') 0.8404
		 ('show_*', 'show_does', 'show_not') 0.8409
		 ('justify_*', 'justify_how') 0.8413
		 ('justify_*',) 0.8431
		 ('does>*', 'show_*') 0.8436
		 ('how>*', 'how>can') 0.8449
	as:
		 remind_* 0.8564
		 is_correct 0.8645
		 find_* 0.8784
		 find_difficult 0.8850
		 is_true 0.8904
		 fell_* 0.8961
		 is_wrong 0.8986
		 find_will 0.8997
		 seems_* 0.8999
		 as>for 0.9028
cluster 1
	q assigns: 204 median dist: 0.9145
	a assigns: 123 median dist: 0.9736
	qs:
		 ('give_*',) 0.6855
		 ('assure_*',) 0.6896
		 ('assure_*', 'assure_will') 0.69