# Multi-stage revision graph
A network graph model to study the stages of revisions made from one draft to another at sentence level.

**Copyright (C) 2020 Antonette Shibani.** Available under the Creative Commons Attribution 3.0 Unported License (https://creativecommons.org/licenses/by/3.0/).

Cite work as: _Antonette Shibani (2020) Constructing Automated Revision Graphs: A novel visualization technique to study student writing. In Proceedings of the 21st International Conference on Artificial Intelligence in Education._

#### Input files:
"origtext.html", "Sample_Draft_1.html","Sample_Draft_2.html","Sample_Draft_3.html","Sample_Draft_4.html"

#### Output file:
"MultistageRevisionGraph.html"

## Import necessary packages for analysis

In [None]:
import codecs
import re
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML  # Allows us to create annotated text using HTML and CSS
import tabulate as tb
from IPython.display import Image
import csv
try:
    # Python 2
    from itertools import izip
except ImportError:
    # Python 3
    izip = zip
    
import numpy as np
import pandas as pd
import holoviews as hv 
import networkx as nx
hv.extension('bokeh')
#Interactive Graphs: http://holoviews.org/user_guide/Network_Graphs.html

%opts Graph [width=400 height=400] 

import json                                 # We need to be able to work with JSON
from urllib import request, response        # To create requests to TAP and handle responses from TAP API

from bs4 import BeautifulSoup

## Defining functions required for analysis

### Function to calculate cosine similarity between two texts

In [None]:
import re, math
from collections import Counter

WORD = re.compile(r'\w+')

def calc_cosine(vec1, vec2):
    """Return the cosine similarity of two term-frequency mappings.

    Args:
        vec1: Mapping of term -> count for the first text.
        vec2: Mapping of term -> count for the second text.

    Returns:
        The dot product over the shared terms divided by the product of the
        two vector lengths, or 0.0 when that product is zero.
    """
    shared_terms = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    length1 = math.sqrt(sum(vec1[term] ** 2 for term in vec1.keys()))
    length2 = math.sqrt(sum(vec2[term] ** 2 for term in vec2.keys()))
    magnitude = length1 * length2

    # A zero magnitude would make the division undefined; report no similarity.
    if magnitude:
        return float(dot_product) / magnitude
    return 0.0

def text_to_vector(text):
    """Return a Counter of the word tokens that the WORD pattern finds in ``text``."""
    return Counter(WORD.findall(text))

### Function to clean the text by removing characters leading to wrong sentence tokenization

In [None]:
def cleantext(txt):
    """Normalise text so that sentence tokenization is not thrown off.

    The text is lower-cased, runs of full stops are collapsed to a single
    full stop, a full stop directly after a question mark is dropped, and
    non-breaking spaces are replaced by a blank line.

    Args:
        txt: The raw text to clean.

    Returns:
        The cleaned text as a new string.
    """
    # str methods return a new string, so every result has to be re-assigned;
    # calling them without assignment leaves the text unchanged.
    txt = txt.lower()
    # Collapse repeated full stops first so that "?..." also ends up as "?".
    txt = re.sub(r'\.+', ".", txt)
    txt = txt.replace('?.', '?')
    txt = txt.replace('\xa0', '\n\n')
    return txt

### TAP setup to get analytics

In [None]:
# Connection settings for the TAP (Text Analytics Pipeline) service.
# completeUrl is the address passed to getJsonFromTAP by the query helpers below.
tapUrl = "https://tap.utscic.edu.au/"       # TAP URL
endpoint = "graphql"                        # The query endpoint on TAP
completeUrl = tapUrl + endpoint             # The complete url that the request is posted to
#Note: For the open-source version of Text Analytics Pipeline (TAP) API, see https://github.com/heta-io/tap

#Connect to TAP to get metrics in the form of json output
def getJsonFromTAP(query,text,url):
    """Post a GraphQL query to TAP and return the decoded JSON response.

    Args:
        query: The GraphQL query string.
        text: The text to analyse; sent as the ``text`` query variable.
        url: The complete URL the request is posted to.

    Returns:
        The response body parsed with ``json.loads``. If the request or the
        decoding fails, the error is printed and an empty dict is returned,
        so the result is never a JSON-encoded string.
    """
    variables = {'text': text}
    escapedQuery = query.replace("\n", "\\n") #query.encode('utf8').decode('unicode_escape')
    fullQuery = json.dumps({'query': escapedQuery, 'variables': variables})
    jsonHeader = {'Content-Type':'application/json'}
    tapReq = request.Request(url, data = fullQuery.encode('utf8'), headers = jsonHeader)
    try:
        # The context manager closes the response even if reading fails.
        with request.urlopen(tapReq) as tapResponse:
            body = tapResponse.read().decode('utf8')
        return json.loads(body)
    except Exception as e:
        # Best effort: report the problem and hand back an empty result of
        # the same type as the success path.
        print(e)
        return {}
    
#Get sentences of text from TAP
def markupMoveSentences(para):
    """Return the sentences of ``para`` as reported by TAP.

    Sends the ``Sentences`` annotations query to ``completeUrl`` and collects
    the ``original`` field of every entry under data -> annotations ->
    analytics.
    """
    query = "query Sentences($text: String!){ annotations(text: $text) { analytics {original} }}"
    payload = getJsonFromTAP(query, para, completeUrl)
    analytics = payload.get('data').get('annotations').get('analytics')
    return [entry.get('original') for entry in analytics]

#Rhetorical moves parsing from TAP
def findMoves(rawtext):
    """Return TAP's rhetorical-moves analytics for ``rawtext``.

    Sends the ``RhetoricalMoves`` query (with the "analytic" grammar
    parameter) to ``completeUrl`` and returns the value found under
    data -> moves -> analytics.
    """
    query = "query RhetoricalMoves($text: String) { moves(text:$text,parameters:\"{\\\"grammar\\\":\\\"analytic\\\"}\") {analytics}}"
    payload = getJsonFromTAP(query, rawtext, completeUrl)
    return payload.get('data').get('moves').get('analytics')

### Function to create the list of rhetorical move labels for each sentence to add to nodes csv

In [None]:
def createlabels(moves):
    """Map each sentence's rhetorical moves to a count label.

    Args:
        moves: One collection of move names per sentence.

    Returns:
        A list with exactly one label per entry of ``moves``: 'Zero', 'One',
        'Two' or 'More', according to how many distinct recognised moves the
        sentence contains.
    """
    rhetmoves = {'emph','contribution','novstat','contrast','tempstat','Surprise','nostat','grow','attitude'}
    label = []
    for sentence_moves in moves:
        # Only moves from the recognised set count towards the label.
        matched = len(set(sentence_moves) & rhetmoves)
        if matched == 0:
            # Also covers sentences that only carry unrecognised moves, so the
            # labels stay aligned one-to-one with the sentences.
            label.append('Zero')
        elif matched == 1:
            label.append('One')
        elif matched == 2:
            label.append('Two')
        else:
            label.append('More')
    return label

### Function to create the NODES csv

In [None]:
def create_nodes(origsentenceslist, revsentenceslist, labelmoves):
    """Write the node table for the first two texts to 'data/mynodesfinal.csv'.

    Each sentence becomes one row with the columns x, y, index, sentence and
    label. Original sentences get x = 0.1 and ids t1s1, t1s2, ...; revised
    sentences get x = 0.2 and ids t2s1, t2s2, ... Within each text the y
    positions run downwards from 0.95.

    Args:
        origsentenceslist: Sentences of the original text.
        revsentenceslist: Sentences of the revised text.
        labelmoves: Labels for the original sentences followed by the labels
            for the revised sentences.
    """
    def y_positions(count):
        # Default layout: a fixed grid from 0.95 down in steps of 0.05.
        grid = np.arange(0.95, 0, -0.05)
        if count <= len(grid):
            return ['%.2f' % value for value in grid[:count]]
        # More sentences than grid slots: spread them evenly between 0.95 and
        # 0.05 instead of silently dropping the sentences that do not fit.
        return ['%.4f' % value for value in np.linspace(0.95, 0.05, count)]

    #Creating the combined lists of all sentences, ids and positions for the csv
    sentencelist = origsentenceslist + revsentenceslist
    indexlist = (
        ["t1s" + str(number) for number in range(1, len(origsentenceslist) + 1)]
        + ["t2s" + str(number) for number in range(1, len(revsentenceslist) + 1)]
    )
    x = [0.1] * len(origsentenceslist) + [0.2] * len(revsentenceslist)
    y = y_positions(len(origsentenceslist)) + y_positions(len(revsentenceslist))

    # Written as UTF-8 so the file can be read back regardless of the
    # platform's default encoding.
    with open('data/mynodesfinal.csv', 'w', newline='', encoding='utf-8') as f:
        w = csv.writer(f)
        w.writerow(['x','y','index','sentence','label'])
        w.writerows(zip(x,y,indexlist,sentencelist,labelmoves))
    print("Nodes csv created")

In [None]:
def create_nodes_more(nodedf, newrevsentenceslist, labelmoves, icount):
    """Append the nodes of a further revised text to an existing node table.

    Args:
        nodedf: DataFrame holding the nodes created so far.
        newrevsentenceslist: Sentences of the new revised text.
        labelmoves: One label per sentence of the new text.
        icount: Number of the text being added (3 for the third text, ...).
            Its nodes get x = icount / 10 and ids t<icount>s1, t<icount>s2, ...

    Returns:
        A new DataFrame: ``nodedf`` followed by one row per new sentence with
        the columns x, y, index, sentence and label.
    """
    count = len(newrevsentenceslist)

    # Derive column position and id prefix from icount rather than from a
    # fixed lookup, so any text number is supported (3 -> 0.3 / 't3s', ...).
    x3 = [icount / 10.0] * count
    mystring = 't' + str(icount) + 's'
    indexnew = [mystring + str(number) for number in range(1, count + 1)]

    # Default layout: a fixed grid from 0.95 down in steps of 0.05.
    grid = np.arange(0.95, 0, -0.05)
    if count <= len(grid):
        y = ['%.2f' % value for value in grid[:count]]
    else:
        # More sentences than grid slots: spread them evenly so that every
        # column of the DataFrame below has the same length.
        y = ['%.4f' % value for value in np.linspace(0.95, 0.05, count)]

    #Creating a temporary dataframe with all columns
    tempdf = pd.DataFrame(
        {'x': x3,'y': y,'index': indexnew,'sentence': newrevsentenceslist,'label':labelmoves}
    )

    newdf = pd.concat([nodedf, tempdf], join='outer')
    print(newdf)
    return newdf

#newdf = create_nodes_more(nodes_df, revsentenceslistnew, labelmovesnew, icount)

### Function to create the dataframe for EDGES csv with weights

In [None]:
def create_edges(origsentenceslist, revsentenceslist):
    """Build the weighted edges between original (t1) and revised (t2) sentences.

    Every original sentence is compared with every revised sentence using
    cosine similarity, and an edge is recorded for sufficiently similar pairs:

        similarity >= 0.98        -> weight 1
        0.8 <= similarity < 0.98  -> weight 0.8
        0.6 <  similarity < 0.8   -> weight 0.6

    Args:
        origsentenceslist: Sentences of the original text.
        revsentenceslist: Sentences of the revised text.

    Returns:
        A DataFrame with the columns Start, End and Weight; it has no rows
        when no pair is similar enough.
    """
    # Vectorise every sentence once instead of once per compared pair.
    origvectors = [text_to_vector(sentence) for sentence in origsentenceslist]
    revvectors = [text_to_vector(sentence) for sentence in revsentenceslist]

    templist = []
    for num1, origvector in enumerate(origvectors, start=1):
        for num2, revvector in enumerate(revvectors, start=1):
            #Calculating cosine similarity between individual sentences
            csvalue = calc_cosine(origvector, revvector)

            #thresholds set based on available data - to be validated
            #threshold changed from 1 since a sentence gave 0.99 similarity for the same sentence
            # The elif chain leaves no gap: a similarity of exactly 0.8 gets weight 0.8.
            if csvalue >= 0.98:
                weight = 1
            elif csvalue >= 0.8:
                weight = 0.8
            elif csvalue > 0.6:
                weight = 0.6
            else:
                continue
            templist.append(["t1s" + str(num1), "t2s" + str(num2), weight])

    # Same column names whether or not any edge was found.
    return pd.DataFrame(templist, columns=['Start','End','Weight'])

In [None]:
def create_edges_more(origsentenceslist, revsentenceslist, icount):
    """Build the weighted edges between two consecutive texts of the graph.

    Every sentence of the previous text is compared with every sentence of
    the new text using cosine similarity; each comparison is printed. An edge
    is recorded for sufficiently similar pairs:

        similarity >= 0.98        -> weight 1
        0.8 <= similarity < 0.98  -> weight 0.8
        0.6 <  similarity < 0.8   -> weight 0.6

    Args:
        origsentenceslist: Sentences of the previous text (number icount - 1).
        revsentenceslist: Sentences of the new text (number icount).
        icount: Number of the new text; edges run from t<icount-1>s<i> to
            t<icount>s<j>.

    Returns:
        A DataFrame with the columns Start, End and Weight; it has no rows
        when no pair is similar enough.
    """
    # Derive the id prefixes from icount rather than from a fixed lookup, so
    # any text number is supported (3 -> 't2s' / 't3s', ...).
    edgeindex1 = "t" + str(icount - 1) + "s"
    edgeindex2 = "t" + str(icount) + "s"

    # Vectorise every sentence once instead of once per compared pair.
    origvectors = [text_to_vector(sentence) for sentence in origsentenceslist]
    revvectors = [text_to_vector(sentence) for sentence in revsentenceslist]

    templist = []
    for num1, origvector in enumerate(origvectors, start=1):
        for num2, revvector in enumerate(revvectors, start=1):
            #Calculating cosine similarity between individual sentences
            csvalue = calc_cosine(origvector, revvector)
            myedge1 = edgeindex1 + str(num1)
            myedge2 = edgeindex2 + str(num2)
            print(myedge1, myedge2, csvalue)

            #thresholds set based on available data - to be validated
            #threshold changed from 1 since a sentence gave 0.99 similarity for the same sentence
            # The elif chain leaves no gap: a similarity of exactly 0.8 gets weight 0.8.
            if csvalue >= 0.98:
                weight = 1
            elif csvalue >= 0.8:
                weight = 0.8
            elif csvalue > 0.6:
                weight = 0.6
            else:
                continue
            templist.append([myedge1, myedge2, weight])

    # Same column names whether or not any edge was found.
    return pd.DataFrame(templist, columns=['Start','End','Weight'])

## Reading the given original essay

In [None]:
url1 = "origtext.html"
# Read the original essay inside a context manager so the file handle is
# closed as soon as its content has been loaded.
with codecs.open(url1) as orig_file:
    orig_html = orig_file.read()

soup1 = BeautifulSoup(orig_html, "html.parser")

#kill all script and style elements
for script in soup1(["script", "style"]):
    script.extract()    # rip it out

# get text
origtxt = soup1.get_text()
print(origtxt)
print("Original essay length:")
print(len(origtxt))

# Normalise the text before it is vectorised and split into sentences
origtxt = origtxt.lower()
origtxt = cleantext(origtxt)

#Converting to vector to calculate overall cosine similarity
vector1 = text_to_vector(origtxt)

#Get sentences list from TAP
origsentenceslist = markupMoveSentences(origtxt)
origlen = len(origsentenceslist )

### Creating analytics for the original given essay - rhetorical moves from TAP and creating nodes for the sentences

In [None]:
# Collect the rhetorical moves of every sentence of the original essay.
moves1 = []
for sent in origsentenceslist:
    movesJson = findMoves(sent)
    #print(movesJson)
    try:
        # Keep only the first element of the analytics returned for this sentence
        moves1.append(movesJson[0])
    except IndexError:
        # Nothing to unwrap: keep the (empty) result as it is
        moves1.append(movesJson)

#Creating the list of rhetorical move labels for each sentence to add to nodes csv
labelmoves1 = createlabels(moves1)
print(labelmoves1)
print(len(labelmoves1))

## Create basic graph with one level

### Function to create the basic graph with one level

In [None]:
def render_graph(nodesurl,edgesurl):
    """Build the revision graph from a node CSV and an edge CSV.

    Args:
        nodesurl: Path of the nodes CSV file.
        edgesurl: Path of the edges CSV file.

    Returns:
        The HoloViews graph with the axis ranges and plot options applied.
    """
    # engine='python' was chosen by the author to avoid a unicode error
    node_table = pd.read_csv(nodesurl, engine='python')
    edge_table = pd.read_csv(edgesurl, engine='python')
    sorted_nodes = hv.Nodes(node_table).sort()

    graph = hv.Graph(
        (edge_table, sorted_nodes),
        label="Revision Graph of Given vs Revised text (Sentence level comparison)",
    )
    # Built as before; note that it is not passed to the plot options below.
    palette = ['#000000'] + hv.Cycle('Category20').values

    axis_range = (-0.05, 1.05)
    plot_options = dict(
        color_index='label',
        edge_color_index='Weight',
        width=800,
        height=800,
        show_frame=True,
        xaxis=None,
        yaxis=None,
        node_size=20,
        edge_line_width=1,
        cmap=['tan', 'blue','green'],
        edge_cmap='viridis',
    )
    return graph.redim.range(x=axis_range, y=axis_range).options(**plot_options)


In [None]:
def create_graph(url, origsentenceslist, labelmoves1):
    """Build the first level of the graph: original text versus one revised draft.

    Reads the HTML draft at ``url``, extracts and cleans its text, obtains its
    sentences and their rhetorical moves from TAP, writes the node CSV (via
    ``create_nodes``) and writes the edge CSV to 'data/myedgesfinal.csv'.
    Details of the draft are printed along the way.

    The overall cosine similarity uses the module-level ``vector1`` (the word
    vector of the original text).

    Args:
        url: Path of the revised draft's HTML file.
        origsentenceslist: Sentences of the original text.
        labelmoves1: Labels of the original sentences.

    Returns:
        The list of sentences of the revised draft, for use as the starting
        point of the next level.
    """
    # Read the draft inside a context manager so the file handle is closed.
    with codecs.open(url) as rev_file:
        rev_html = rev_file.read()
    soup2 = BeautifulSoup(rev_html, "html.parser")

    #kill all script and style elements
    for script in soup2(["script", "style"]):
        script.extract()    # rip it out

    # get text
    revtxt = soup2.get_text()

    print("Text Details:\n")

    #Clean the text
    revtxt = revtxt.lower()
    revtxt = cleantext(revtxt)
    print("Revised Text:\n")
    print(revtxt)
    print("Revised essay length:")
    print(len(revtxt))

    # Part of the path before the first underscore
    filecount = url.split('_')[0]
    print(filecount)

    #Calculating overall cosine similarity score between original and revised texts
    vector2 = text_to_vector(revtxt)
    overallcosine = calc_cosine(vector1, vector2)
    print('Cosine: %f' % overallcosine)

    #Getting the list of sentences in the text from TAP
    revsentenceslist = markupMoveSentences(revtxt)

    # Print the count together with its heading (the heading used to be
    # printed on its own).
    print("Number of words in the text:")
    print(len(revtxt.split()))

    #Getting rhetorical moves for all sentences in the revised text
    print("Rhetorical moves in the text:\n")
    moves2 = []
    for sent in revsentenceslist:
        movesJson = findMoves(sent)
        print(movesJson)
        try:
            #append the first element only, e.g. from [['tempstat', 'old']] keep ['tempstat', 'old']
            moves2.append(movesJson[0])
        except IndexError:
            moves2.append(movesJson)

    #Creating the list of rhetorical move labels for each sentence to add to nodes csv
    labelmoves2 = createlabels(moves2)
    print(labelmoves2)
    labelmoves = labelmoves1 + labelmoves2

    #Call function to create the nodes csv
    create_nodes(origsentenceslist,revsentenceslist,labelmoves)

    #Calling function to create the edges dataframe between the two texts
    df = create_edges(origsentenceslist,revsentenceslist)
    df.columns= ['Start','End','Weight']
    print(df)

    #Writing to edges csv
    df.to_csv("data/myedgesfinal.csv", encoding='utf-8', index=False)
    print("Edges csv created")

    return(revsentenceslist) #For next time

In [None]:
import os
path = 'inputfiles'
filelist = os.listdir(path)
print(filelist)

# Keep only files whose name ends with ".html". os.listdir returns names in
# arbitrary order, so sort them to process the drafts in sequence; digit runs
# are compared as numbers so that draft 10 comes after draft 2.
filelist = sorted(
    (name for name in filelist if name.lower().endswith('.html')),
    key=lambda name: [
        # re.split with a capturing group puts the digit runs at odd positions
        int(part) if position % 2 else part
        for position, part in enumerate(re.split(r'(\d+)', name.lower()))
    ],
)
#print(filelist[0])
#print(filelist[len(filelist)-1])
print(len(filelist))

### Create Level 1 Graph

In [None]:
# Build the first level of the graph from the first file in filelist.
print(filelist[0])
filepath = "inputfiles/"
url = filepath + filelist[0]
# The returned sentences are kept as the starting point for the next level
revsentenceslistprior = create_graph(url, origsentenceslist, labelmoves1)

# CSV files holding the graph; also used by the later cells
nodesurl = 'data/mynodesfinal.csv'
edgesurl = 'data/myedgesfinal.csv'

render_graph(nodesurl, edgesurl)

### Function to add multiple levels to the graph

In [None]:
def add_graph_multi(url, origsentenceslist, icount, nodesurl, edgesurl):
    """Add one further level (revised draft) to the existing graph CSV files.

    Reads the HTML draft at ``url``, extracts and cleans its text, obtains its
    sentences and their rhetorical moves from TAP, then appends the new nodes
    to the CSV at ``nodesurl`` and the new edges to the CSV at ``edgesurl``.
    Details of the draft are printed along the way.

    The overall cosine similarity uses the module-level ``vector1`` (the word
    vector of the original text).

    Args:
        url: Path of the revised draft's HTML file.
        origsentenceslist: Sentences of the previous text; the new edges start
            from these.
        icount: Number of the text being added; passed on to
            ``create_nodes_more`` and ``create_edges_more``.
        nodesurl: Path of the nodes CSV file to update.
        edgesurl: Path of the edges CSV file to update.

    Returns:
        The list of sentences of the draft that was added.
    """
    # Read the draft inside a context manager so the file handle is closed.
    with codecs.open(url) as rev_file:
        rev_html = rev_file.read()
    soup2 = BeautifulSoup(rev_html, "html.parser")

    #kill all script and style elements
    for script in soup2(["script", "style"]):
        script.extract()    # rip it out

    # get text
    revtxt = soup2.get_text()

    print("Text Details:\n")

    #Clean the text
    revtxt = revtxt.lower()
    revtxt = cleantext(revtxt)
    print("Revised Text:\n")
    print(revtxt)
    print("Revised essay length:")
    print(len(revtxt))

    # Part of the path before the first underscore
    filecount = url.split('_')[0]
    print(filecount)

    #Calculating overall cosine similarity score between original and revised texts
    vector2 = text_to_vector(revtxt)
    overallcosine = calc_cosine(vector1, vector2)
    print('Cosine: %f' % overallcosine)

    #Getting the list of sentences in the text from TAP
    revsentenceslist = markupMoveSentences(revtxt)

    # Print the count together with its heading (the heading used to be
    # printed on its own).
    print("Number of words in the text:")
    print(len(revtxt.split()))

    #Getting rhetorical moves for all sentences in the revised text
    moves2 = []
    for sent in revsentenceslist:
        movesJson = findMoves(sent)
        print(movesJson)
        try:
            #append the first element only, e.g. from [['tempstat', 'old']] keep ['tempstat', 'old']
            moves2.append(movesJson[0])
        except IndexError:
            moves2.append(movesJson)

    #Creating the list of rhetorical move labels for each sentence to add to nodes csv
    labelmoves = createlabels(moves2)

    # Load the graph built so far
    nodes_df = pd.read_csv(nodesurl, engine='python')
    edges_df = pd.read_csv(edgesurl, engine='python')

    #Call function to extend the nodes table, then write it back
    newdf = create_nodes_more(nodes_df, revsentenceslist, labelmoves, icount)
    newdf.to_csv(nodesurl, encoding = 'utf-8', index=False)
    print("Nodes csv updated")

    #Calling function to create the edges between the previous and the new text
    df = create_edges_more(origsentenceslist,revsentenceslist, icount)
    df.columns= ['Start','End','Weight']
    newedgesdf =  pd.concat([edges_df, df], join='outer')
    newedgesdf.to_csv(edgesurl, encoding='utf-8', index=False)
    print("Edges csv updated")

    #Return the list of sentences from the current revised text
    return(revsentenceslist)

In [None]:
# List the files that remain after the first one (index 0 was used for the level 1 graph).
i=1 #-1 to be done
while(i < len(filelist)):
    print(filelist[i])
    i=i+1

In [None]:
# Add one graph level per remaining file, starting with the second file as text number 3.
i= 1 # start at the second file, since the level 1 graph is already built
icount = 3 # number of the text being added (1 = original, 2 = first draft)
while(i < len(filelist)):
    url = filepath + filelist[i]
    # Edges are drawn from the previously added text to the new one
    revsentenceslistout = add_graph_multi(url, revsentenceslistprior, icount, nodesurl, edgesurl)
    revsentenceslistprior = revsentenceslistout
    # The graph returned here is not stored or displayed
    render_graph(nodesurl, edgesurl)
    i = i+1
    icount = icount+1


In [None]:
nodesurl = 'data/mynodesfinal.csv'
edgesurl = 'data/myedgesfinal.csv'
mygraph = render_graph(nodesurl, edgesurl)

# relabel returns a new object rather than changing mygraph in place, so the
# result is stored; otherwise the graph saved later keeps the old title.
mygraph = mygraph.relabel("Stages in revision from given to revised text (Sentence level comparison)")
mygraph

In [None]:
# Save the graph held in mygraph using the bokeh renderer.
renderer = hv.renderer('bokeh')
renderer.save(mygraph, 'MultistageRevisionGraph') #Saves a html file of the graph

In [None]:
# Show the image file 'graphdescription.png' at a width of 400.
display(Image('graphdescription.png', width=400))