In [124]:
import stats_table_functions as stats
from collections import defaultdict, OrderedDict
import pickle
import numpy as np
import pandas

# Welcome to STAC stats! 
Here we are currently building functions to let you explore the situated and non-situated versions of the stac corpus.
For each command, you can specify which part of the corpus you would like to apply it to. 

Example:

stats.edu_count( ) returns the segment counts in a game. Run the command with "all" to see all games, the name of a game, e.g. "pilot02", to see one game, or a list of game names ["pilot02", "pilot04"]


<font color='green'>
### stats.edu_count( )</font>
<break>
takes either a list of games or 'all' to see the corpus. *Default='all'*


In [None]:
#stats.edu_count('all')

In [None]:
#stats.edu_count(['pilot02', 'pilot14'])

In [None]:
#stats.edu_count('pilot02')

<font color='green'> 
### stats.relation_count( )</font>
<break>
Takes 3 parameters:
1. games: either a list of games or 'all' -*OR*- 'aggregate' to see aggregated counts between situated and spect versions. *Default='all'*
2. version: 'situated', 'spect' or 'both', *default='both'*
3. relation types: a list of types or 'all, *default='all'*

In [None]:
#stats.relation_count('aggregate', 'both', ['Question-answer_pair', 'Comment'])

In [None]:
#stats.relation_count('aggregate', 'both', 'all')

<font color='green'> 
### stats.relation_endpoints( )</font>
<break>
Takes 4 parameters:
 1. games: either a list of games or 'all',  *default='all'*
 2. version: 'situated', 'spect' or 'both', *default='both'*
 3. endpoints: a list of endpoint tuples, e.g. [('cdu', 'edu'), ('edu', edu')], or 'all' for all combinations. *Default = 'all'*
 4. relation types: a list of types, e.g. ['Contrast', 'Result'], or 'all'. *Default='all'*

In [None]:
#stats.relation_endpoints('all', 'both', 'all', 'all')

### dialogues
game -- subdoc -- span_end

In [3]:
# stats.tables.dlgs_situ[:10]
# subset = stats.tables.dlgs_situ[['global_id', 'doc', 'subdoc', 'span_beg', 'span_end']]

pkl_file = open('dlgs_situ.pkl', 'rb')
dlgs_situ = pickle.load(pkl_file)
pkl_file.close()

pkl_file = open('dlgs_spect.pkl', 'rb')
dlgs_spect = pickle.load(pkl_file)
pkl_file.close()

In [4]:
dlgs_situ[:10]
subset = dlgs_situ[['global_id', 'doc', 'subdoc', 'span_beg', 'span_end']]


In [5]:
tuples = [tuple(x) for x in subset.values]

In [6]:
tuples[:3]

[('s2-league5-game1_01_stacutil_1496243576',
  's2-league5-game1',
  '01',
  1,
  2655),
 ('s2-league5-game1_01_stac_1413793066', 's2-league5-game1', '01', 4724, 5163),
 ('s2-league5-game1_01_stac_1413793110', 's2-league5-game1', '01', 5164, 6205)]

game --subdoc -- dialgoue -- turn -- seg id, type, text

In [7]:
dialogue_dict = {}
for t in tuples:
    if t[1] in dialogue_dict.keys():
        if t[2] in dialogue_dict[t[1]].keys():
            dialogue_dict[t[1]][t[2]][(t[3], t[4])] = t[0]
        else:
            dialogue_dict[t[1]][t[2]] = {}
            dialogue_dict[t[1]][t[2]][(t[3], t[4])] = t[0]
    else:
        dialogue_dict[t[1]] = {}
        dialogue_dict[t[1]][t[2]] = {}
        dialogue_dict[t[1]][t[2]][(t[3], t[4])] = t[0]

In [25]:
dialogue_dict.keys()

['s2-league5-game1', 's1-league1-game2']

In [9]:
dialogue_dict['s2-league5-game1']['01'][(1, 2655)]

's2-league5-game1_01_stacutil_1496243576'

In [None]:
#stats.tables.rels_situ.loc[stats.tables.rels_situ['type'] == 'Anaphora'][:5]

# cdu components dict
game -- subdoc -- schema_id -- [member list]

In [None]:
comps_slice = stats.tables.schm_mbrs_situ[['member_id', 'schema_id']]

In [None]:
comps = [tuple(x) for x in comps_slice.values]

In [None]:
comps[:2]

In [None]:
cdu_comps_dict = {}
for s in comps:
    #get game name and subdoc from schema id
    game, subdoc = s[1].split('_')[:2]
    #check if game in dict
    if game in cdu_comps_dict.keys():
        if subdoc in cdu_comps_dict[game].keys():
            cdu_comps_dict[game][subdoc][s[1]].append(s[0])
        else:
            cdu_comps_dict[game][subdoc] = defaultdict(list)
            cdu_comps_dict[game][subdoc][s[1]].append(s[0])
    else:
        cdu_comps_dict[game] = {}
        cdu_comps_dict[game][subdoc] = defaultdict(list)
        cdu_comps_dict[game][subdoc][s[1]].append(s[0])


In [None]:
cdu_comps_dict['pilot14']['02']['pilot14_02_stacnl_1490872151']

In [None]:
output = open('cdu_comps.pkl', 'wb')
pickle.dump(cdu_comps_dict, output)
output.close()

# cdu rels dict

game--subdoc--cdu_id--(component segements)--((relation, source, target), (relation, source, target))


In [None]:
cdu_slice = stats.tables.rels_situ.loc[((stats.tables.rels_situ['source_type'] == 'Complex_discourse_unit') | (stats.tables.rels_situ['target_type'] == 'Complex_discourse_unit')) & (stats.tables.rels_situ['stage'] == 'discourse')][['doc', 'subdoc', 'type', 'source', 'target']]

In [None]:
#cdu_slice
cdu_rels = [tuple(x) for x in cdu_slice.values]

In [None]:
cdu_rels[:2]

In [None]:
cdu_rels_dict = {}
for s in cdu_rels:
    #check if game in dict
    if s[0] in cdu_rels_dict.keys():
        cdu_rels_dict[s[0]][s[1]].append((s[2], s[3], s[4]))
    else:
        #add list dict for relations
        cdu_rels_dict[s[0]] = defaultdict(list)
        #add the relation info
        cdu_rels_dict[s[0]][s[1]].append((s[2], s[3], s[4]))

In [None]:
cdu_rels_dict['pilot14']['01'][0]

In [None]:
#cdu_rels_dict
output = open('cdu_rels.pkl', 'wb')
pickle.dump(cdu_rels_dict, output)
output.close()

# relations dict

game--subdoc--(relation, seg1, seg2)

In [None]:
rel_slice = stats.tables.rels_situ.loc[(stats.tables.rels_situ['source_type'] != 'Complex_discourse_unit') & (stats.tables.rels_situ['target_type'] != 'Complex_discourse_unit') & (stats.tables.rels_situ['stage'] == 'discourse')][['doc', 'subdoc', 'type', 'source', 'target']]

In [None]:
rels = [tuple(x) for x in rel_slice.values]

In [None]:
rels[:2]

In [None]:
rels_dict = {}
for s in rels:
    #check if game in dict
    if s[0] in rels_dict.keys():
        rels_dict[s[0]][s[1]].append((s[2], s[3], s[4]))
    else:
        #add list dict for relations
        rels_dict[s[0]] = defaultdict(list)
        #add the relation info
        rels_dict[s[0]][s[1]].append((s[2], s[3], s[4]))

In [None]:
for r in rels_dict['pilot14']['01']:
    print(r[0])

In [None]:
output = open('rels.pkl', 'wb')
pickle.dump(rels_dict, output)
output.close()

# nodes dict
game --subdoc -- dialgoue -- turn -- seg id, type, text
{ pilot_14 : {'01' : { dialogue : { 'turn 30' : [('pilot14_01_mabrusan_1386857336420', 'Segment', u'well build sts'), ('pilot14_01_mabrusan_1386857354520', 'Segment', u'or put done')]}}}}



In [10]:
# def turn_transform(turn_string):
#     if '.' in turn_string:
#         turn_string = turn_string.split('.')
#         turn_string = turn_string[0] + '.' + ''.join(t for t in turn_string[1:])
#     float_turn = float(turn_string)
#     return float_turn

pkl_file = open('segs_situ.pkl', 'rb')
segs_situ = pickle.load(pkl_file)
pkl_file.close()

pkl_file = open('segs_spect.pkl', 'rb')
segs_spect = pickle.load(pkl_file)
pkl_file.close()

In [11]:
#segs_situ[:10]

In [85]:
seg_slice = segs_situ[['global_id', 'doc', 'subdoc', 'type', 'text', 'turn_id', 'span_end']]

In [86]:
segs = sorted([list(x) for x in seg_slice.values], key=lambda x: (x[1], x[2], x[6]))

In [87]:
segs[:2]

[['s1-league1-game2_01_stac_1468416187',
  's1-league1-game2',
  '01',
  'NonplayerSegment',
  u'william joined the game.',
  '0.0.1',
  38],
 ['s1-league1-game2_01_stac_1468416190',
  's1-league1-game2',
  '01',
  'NonplayerSegment',
  u'william sat down at seat 2.',
  '0.0.2',
  79]]

In [94]:
#USE dialgoue_dict to add dialogue id  to segs list
#transform turn numbers into integer 0-N
#transform dialogue number into integers 0-N
n = 0
dialogue_n = 0
last = None
di_last = None
for s in segs:
    if s[5] == last:
        last = s[5]
        s[5] = n
    else:
        n += 1
        last = s[5]
        s[5] = n
    for pair in dialogue_dict[s[1]][s[2]].keys():
        if pair[0] < s[6] <= pair[1]:
            di_id = dialogue_dict[s[1]][s[2]][pair]
            if di_id == di_last:
                di_last = di_id
                s.append(dialogue_n)
            else:
                dialogue_n += 1
                di_last = di_id
                s.append(dialogue_n)
                
            #s.append(dialogue_dict[s[1]][s[2]][pair])

In [224]:
segs[0]

['s1-league1-game2_01_stac_1468416187',
 's1-league1-game2',
 '01',
 'NonplayerSegment',
 u'william joined the game.',
 1,
 38,
 1,
 1]

In [None]:
#situ_df = pandas.DataFrame([[s[0], s[1], s[2]], s[8]], columns = ['seg_id', 'game', 'subdoc', 'dialogue'])

In [101]:
#print([[s[0], s[1], s[2], s[8]] for s in segs])

In [167]:
segs_df = pandas.DataFrame([[s[0], s[1], s[3], s[6], s[7]] for s in segs], columns=['seg_id', 'game','type', 'span', 'dialogue'])

In [169]:
segs_df.sort_values(['span'])[:5]

Unnamed: 0,seg_id,game,type,span,dialogue
0,s1-league1-game2_01_stac_1468416187,s1-league1-game2,NonplayerSegment,38,1
988,s2-league5-game1_01_stac_1496239934,s2-league5-game1,NonplayerSegment,38,64
162,s1-league1-game2_02_stac_1468416639,s1-league1-game2,NonplayerSegment,56,12
451,s1-league1-game2_04_stac_1468416195,s1-league1-game2,NonplayerSegment,57,35
1232,s2-league5-game1_02_stac_1496239934,s2-league5-game1,NonplayerSegment,57,76


In [59]:
#segs_df[segs_df['type'] == 'Segment'].groupby(['dialogue'])['span'].max()

In [80]:
# segs_df.groupby('dialogue').agg({'span': 'max'})
# & segs_df['turn'].idxmax()] 
gb = segs_df[segs_df['type'] == 'Segment'].groupby(['dialogue'])['seg_id']

In [81]:
maxes = gb.max()

In [83]:
spect_table = maxes.reset_index()

In [84]:
spect_table

Unnamed: 0,dialogue,seg_id
0,1.0,s1-league1-game2_01_stac_1360444616
1,2.0,s1-league1-game2_02_stac_1360184616
2,3.0,s1-league1-game2_03_stac_1360374616
3,4.0,s1-league1-game2_04_stac_1360174617
4,5.0,s1-league1-game2_04_stac_1360444618
5,6.0,s1-league1-game2_04_stac_1360534617
6,7.0,s1-league1-game2_05_stac_1360244617
7,8.0,s1-league1-game2_06_stac_1360114617
8,9.0,s1-league1-game2_06_stac_1360224617
9,10.0,s1-league1-game2_07_stac_1360244617


In [104]:
#situ_df = pandas.DataFrame([[s[0], s[1], s[2], s[8]] for s in segs], columns = ['seg_id', 'game', 'subdoc', 'situ_dialogue'])

In [153]:
situ_df = pandas.DataFrame([[s[0], s[1], s[2], s[3], s[6], s[8]] for s in segs], columns=['seg_id', 'game', 'subdoc', 'type', 'span', 'dialogue_situ'])

In [154]:
situ_df[:10]

Unnamed: 0,seg_id,game,subdoc,type,span,dialogue_situ
0,s1-league1-game2_01_stac_1468416187,s1-league1-game2,1,NonplayerSegment,38,1
1,s1-league1-game2_01_stac_1468416190,s1-league1-game2,1,NonplayerSegment,79,1
2,s1-league1-game2_01_stac_1468416193,s1-league1-game2,1,NonplayerSegment,106,1
3,s1-league1-game2_01_stac_1468416196,s1-league1-game2,1,NonplayerSegment,149,1
4,s1-league1-game2_01_stac_1468416199,s1-league1-game2,1,NonplayerSegment,195,1
5,s1-league1-game2_01_stac_1359744616,s1-league1-game2,1,Segment,218,1
6,s1-league1-game2_01_stac_1359754616,s1-league1-game2,1,Segment,246,1
7,s1-league1-game2_01_stac_1468416208,s1-league1-game2,1,NonplayerSegment,288,1
8,s1-league1-game2_01_stac_1359764616,s1-league1-game2,1,Segment,331,1
9,s1-league1-game2_01_stac_1468416214,s1-league1-game2,1,NonplayerSegment,376,1


In [176]:
situ_range = range(1, situ_df['dialogue_situ'].max()+1)

In [155]:
sgb = situ_df[situ_df['type'] == 'Segment'].groupby(['dialogue_situ'])[['seg_id']]

In [156]:
smaxes = sgb.max()

In [157]:
situ_table = smaxes.reset_index()

In [159]:
situ_table[:5]

Unnamed: 0,dialogue_situ,seg_id,subdoc
0,1,s1-league1-game2_01_stac_1360024616,1
1,5,s1-league1-game2_01_stac_1360224616,1
2,9,s1-league1-game2_01_stac_1360354616,1
3,10,s1-league1-game2_01_stac_1360444616,1
4,12,s1-league1-game2_02_stac_1468416660,2


In [200]:
merged_table = pandas.merge(situ_table, spect_table, on='seg_id', how='outer')

In [201]:
merged_table.fillna(0, inplace=True)

In [192]:
merged_table.sort_values('dialogue', inplace=True)

In [202]:
merged_table

Unnamed: 0,dialogue_situ,seg_id,subdoc,dialogue
0,1.0,s1-league1-game2_01_stac_1360024616,1,0.0
1,5.0,s1-league1-game2_01_stac_1360224616,1,0.0
2,9.0,s1-league1-game2_01_stac_1360354616,1,0.0
3,10.0,s1-league1-game2_01_stac_1360444616,1,1.0
4,12.0,s1-league1-game2_02_stac_1468416660,2,0.0
5,13.0,s1-league1-game2_02_stac_1359934616,2,0.0
6,14.0,s1-league1-game2_02_stac_1360004616,2,0.0
7,16.0,s1-league1-game2_02_stac_1360064616,2,0.0
8,18.0,s1-league1-game2_02_stac_1360184616,2,2.0
9,19.0,s1-league1-game2_03_stac_1359904616,3,0.0


In [203]:
merged = [[x[0], x[3]] for x in merged_table.values]

In [204]:
merged

[[1.0, 0.0],
 [5.0, 0.0],
 [9.0, 0.0],
 [10.0, 1.0],
 [12.0, 0.0],
 [13.0, 0.0],
 [14.0, 0.0],
 [16.0, 0.0],
 [18.0, 2.0],
 [19.0, 0.0],
 [20.0, 0.0],
 [27.0, 0.0],
 [28.0, 0.0],
 [29.0, 3.0],
 [34.0, 0.0],
 [35.0, 0.0],
 [36.0, 0.0],
 [38.0, 0.0],
 [41.0, 0.0],
 [42.0, 0.0],
 [44.0, 5.0],
 [45.0, 6.0],
 [50.0, 0.0],
 [52.0, 7.0],
 [53.0, 0.0],
 [54.0, 0.0],
 [55.0, 0.0],
 [57.0, 0.0],
 [58.0, 9.0],
 [59.0, 0.0],
 [62.0, 11.0],
 [63.0, 12.0],
 [64.0, 0.0],
 [66.0, 0.0],
 [68.0, 0.0],
 [69.0, 0.0],
 [72.0, 0.0],
 [74.0, 0.0],
 [75.0, 13.0],
 [76.0, 0.0],
 [79.0, 0.0],
 [82.0, 0.0],
 [84.0, 0.0],
 [85.0, 0.0],
 [88.0, 0.0],
 [90.0, 0.0],
 [96.0, 0.0],
 [99.0, 0.0],
 [100.0, 0.0],
 [109.0, 0.0],
 [113.0, 19.0],
 [0.0, 4.0],
 [0.0, 8.0],
 [0.0, 10.0],
 [0.0, 14.0],
 [0.0, 15.0],
 [0.0, 16.0],
 [0.0, 17.0],
 [0.0, 18.0]]

In [None]:
#every time you have a case where there is both a situ and a spect dialogue, spe

In [222]:
# subdoc = '01'
# superdoc = None
# superdoc_dict = {}

# for m in merged:
#     subdoc = m[1]
#     if m[0] != 0 and m[2] != 0:
#         #then this is a situation where the the dialogues m[0] and m[2] linguistically match
#         #this subdoc = superdoc 
        
#cut list into lists where m[0] !=0 and m[2]!= 0

superdoc = 1
spect_dict = {}
situ_dict = {}
spect_last = 1
situ_last = 1

for m in merged:
    if m[0] != 0 and m[1] != 0:
        for n in range(situ_last, int(m[0]) + 1):
            situ_dict[n] = superdoc
        situ_last = int(m[0]) + 1
        for n in range(spect_last, int(m[1]) + 1):
            spect_dict[n] = superdoc
        spect_last = int(m[1]) + 1
        superdoc += 1
print(superdoc)
print(spect_last)
print(situ_last)
#get last dialogue numbers for both spect and situ and add last entries to dict 
for n in range(situ_last, situ_last_dialogue + 1):
    situ_dict[n] = superdoc - 1
    
for n in range(spect_last, spect_last_dialogue + 1):
    spect_dict[n] = superdoc - 1

12
20
114


In [223]:
spect_dict

{1: 1,
 2: 2,
 3: 3,
 4: 4,
 5: 4,
 6: 5,
 7: 6,
 8: 7,
 9: 7,
 10: 8,
 11: 8,
 12: 9,
 13: 10,
 14: 11,
 15: 11,
 16: 11,
 17: 11,
 18: 11,
 19: 11,
 20: 11}

In [206]:
big_list

[[[1.0, 0.0], [5.0, 0.0], [9.0, 0.0], [10.0, 1.0]],
 [[12.0, 0.0], [13.0, 0.0], [14.0, 0.0], [16.0, 0.0], [18.0, 2.0]],
 [[19.0, 0.0], [20.0, 0.0], [27.0, 0.0], [28.0, 0.0], [29.0, 3.0]],
 [[34.0, 0.0],
  [35.0, 0.0],
  [36.0, 0.0],
  [38.0, 0.0],
  [41.0, 0.0],
  [42.0, 0.0],
  [44.0, 5.0]],
 [[45.0, 6.0]],
 [[50.0, 0.0], [52.0, 7.0]],
 [[53.0, 0.0], [54.0, 0.0], [55.0, 0.0], [57.0, 0.0], [58.0, 9.0]],
 [[59.0, 0.0], [62.0, 11.0]],
 [[63.0, 12.0]],
 [[64.0, 0.0],
  [66.0, 0.0],
  [68.0, 0.0],
  [69.0, 0.0],
  [72.0, 0.0],
  [74.0, 0.0],
  [75.0, 13.0]],
 [[76.0, 0.0],
  [79.0, 0.0],
  [82.0, 0.0],
  [84.0, 0.0],
  [85.0, 0.0],
  [88.0, 0.0],
  [90.0, 0.0],
  [96.0, 0.0],
  [99.0, 0.0],
  [100.0, 0.0],
  [109.0, 0.0],
  [113.0, 19.0]]]

In [None]:
#change dialogue column to ints
#change nan to null
#change table to list of lists

In [149]:

nodes_dict = {}
for s in segs:

    #check if game in dict
    if s[1] in nodes_dict.keys():
        #check if subdoc in dict
        if s[2] in nodes_dict[s[1]].keys():
            if s[7] in nodes_dict[s[1]][s[2]].keys():
                nodes_dict[s[1]][s[2]][s[7]][s[5]].append((s[0], s[3], s[4]))
            else:
                nodes_dict[s[1]][s[2]][s[7]] = defaultdict(list)
                nodes_dict[s[1]][s[2]][s[7]][s[5]].append((s[0], s[3], s[4]))        
        else:
            #add list dict for subdocs
            nodes_dict[s[1]][s[2]]= OrderedDict()
            nodes_dict[s[1]][s[2]][s[7]]= defaultdict(list)
            nodes_dict[s[1]][s[2]][s[7]][s[5]].append((s[0], s[3], s[4]))
            
    else:
        #add list dict for games
        nodes_dict[s[1]] = OrderedDict()
        nodes_dict[s[1]][s[2]]= OrderedDict()
        
        nodes_dict[s[1]][s[2]][s[7]]= defaultdict(list)
        nodes_dict[s[1]][s[2]][s[7]][s[5]].append((s[0], s[3], s[4]))
        

In [None]:
nodes_dict['pilot14']['10'].keys()

In [None]:
nodes_dict['pilot14']['10']['pilot14_10_stacutil_1486375772'][532]

In [None]:
output = open('nodes.pkl', 'wb')
pickle.dump(nodes_dict, output)
output.close()

In [None]:
import os
import glob
import re

In [None]:
pwd

In [None]:
current_dir= os.getcwd()

In [None]:
if not os.path.exists(current_dir + '/stac_game_graphs/'):
        os.makedirs(current_dir + '/stac_game_graphs/')

In [None]:
ls

In [None]:
current_dir

In [None]:
[x for x in os.walk(current_dir + '/stac_game_graphs/pilot02/')][0]

In [None]:
filesDepth3 = glob.glob('*/*')
dirsDepth3 = filter(lambda f: os.path.isdir(current_dir + '/stac_game_graphs/pilot02/'), filesDepth3)

In [None]:
dirsDepth3

In [None]:
current_dir

In [None]:
tups = [['2_4', 'a-'], ['s-4', 'b_'], ['4_-', '-']]

In [None]:
tups

In [None]:
for t in tups:
    t[0] = re.sub('-', '_', t[0])

In [None]:
tups