In [10]:
import json
import spacy
import nltk.data
from collections import Counter

In [2]:
nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\meank\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def getJSonObject():
    essayStringList = ''
    with open('../data/data.json', errors='ignore') as f:
        for jsonObj in f:
            essayStringList = essayStringList + jsonObj
    jsonEssayObject = json.loads(essayStringList)
    return jsonEssayObject


def getAllConfirmationBiasEssay():
    allConfirmationBiasEssayList = [d for d in jsonEssaysObject if 'confirmation_bias' in d and d['confirmation_bias']
                                    is True]
    return allConfirmationBiasEssayList


def getAllNonConfirmationBiasEssay():
    allNonConfirmationBiasEssayList = [d for d in jsonEssaysObject if
                                       'confirmation_bias' in d and d['confirmation_bias']
                                       is False]
    return allNonConfirmationBiasEssayList


def getAllSufficientParagraphs():
    paragraphsListEachEssay = [d['paragraphs'] for d in jsonEssaysObject if 'paragraphs' in d]
    sufficientParagraphsList = [t['text'] for d in paragraphsListEachEssay for t in d if
                                'text' in t and t['sufficient'] is True]
    return sufficientParagraphsList


def getAllNonSufficientParagraphs():
    paragraphsListEachEssay = [d['paragraphs'] for d in jsonEssaysObject if 'paragraphs' in d]
    nonSufficientParagraphsList = [t['text'] for d in paragraphsListEachEssay for t in d if
                                   'text' in t and t['sufficient'] is False]
    return nonSufficientParagraphsList


def getAllParagraphsTextList():
    paragraphsListEachEssay = [d['paragraphs'] for d in jsonEssaysObject if 'paragraphs' in d]
    paragraphsList = [t['text'] for d in paragraphsListEachEssay for t in d if 'text' in t]
    return paragraphsList


def gatAllEssaySentences():
    eachEssayTextList = [d['text'] for d in jsonEssaysObject if 'text' in d]
    sentenceList = [sentence for text in eachEssayTextList for sentence in tokenizer.tokenize(text)]
    return sentenceList


def getAllMajorClaims():
    majorClaimsInListEachEssay = [d['major_claim'] for d in jsonEssaysObject if 'major_claim' in d]
    allMajorClaimsList = [eachMajorClaims for eachEssayMajorClaims in majorClaimsInListEachEssay for eachMajorClaims in
                          eachEssayMajorClaims]
    return allMajorClaimsList


def getAllMajorClaimsTokens():
    allMajorClaimsList = getAllMajorClaims()
    allMajorClaimsTextList = [eachMajorClaims['text'] for eachMajorClaims in allMajorClaimsList]
    allMajorClaimsTokens = [token.text for eachMajorClaimsText in allMajorClaimsTextList for token in
                            nlp(eachMajorClaimsText)]
    return allMajorClaimsTokens


def getAllClaims():
    claimsInListEachEssay = [d['claims'] for d in jsonEssaysObject if 'claims' in d]
    allClaimsList = [eachClaims for eachEssayClaims in claimsInListEachEssay for eachClaims in
                     eachEssayClaims]
    return allClaimsList


def getAllClaimsTokens():
    allClaimsList = getAllClaims()
    allClaimsTextList = [eachClaims['text'] for eachClaims in allClaimsList]
    allClaimsTokens = [token.text for eachClaimsText in allClaimsTextList for token in
                       nlp(eachClaimsText)]
    return allClaimsTokens


def getAllPremises():
    premisesInListEachEssay = [d['premises'] for d in jsonEssaysObject if 'premises' in d]
    allPremisesList = [eachPremises for eachEssayPremises in premisesInListEachEssay for eachPremises in
                       eachEssayPremises]
    return allPremisesList


def getAllPremisesTokens():
    allPremisesList = getAllPremises()
    allPremisesTextList = [eachPremises['text'] for eachPremises in allPremisesList]
    allPremisesTokens = [token.text for eachPremisesText in allPremisesTextList for token in
                         nlp(eachPremisesText)]
    return allPremisesTokens


# createEssayObject
jsonEssaysObject = getJSonObject()

In [4]:

# 1.Number of essays, paragraphs, sentences, and tokens
def printNoOfEPST():
    print('Number of essays = ' + str(len(jsonEssaysObject)))
    paragraphsList = getAllParagraphsTextList()
    print('Number of paragraphs = ' + str(len(paragraphsList)))
    sentenceList = gatAllEssaySentences()
    print('Number of Sentences = ' + str(len(sentenceList)))
    tokenList = [token.text for sentence in sentenceList for token in nlp(sentence)]
    print('Number of Tokens = ' + str(len(tokenList)))


printNoOfEPST()
print('-' * 100)

Number of essays = 322
Number of paragraphs = 820
Number of Sentences = 4432
Number of Tokens = 112102
----------------------------------------------------------------------------------------------------


In [5]:
# 2.Number of major claims, claims, premises
def printNoOfMajorClaimsClaimsPremises():
    allMajorClaimsList = getAllMajorClaims()
    print('Number of major claims = ', len(allMajorClaimsList))

    allClaimsList = getAllClaims()
    print('Number of claims = ', len(allClaimsList))

    allPremisesList = getAllPremises()
    print('Number of premises = ', len(allPremisesList))


printNoOfMajorClaimsClaimsPremises()
print('-' * 100)

Number of major claims =  597
Number of claims =  1799
Number of premises =  3023
----------------------------------------------------------------------------------------------------


In [6]:
# 3.Number of essays with and without confirmation bias
def printNoOfWithWithoutConfirmationBiasEssay():
    print('Number of essays with confirmation bias = ', len(getAllConfirmationBiasEssay()))
    print('Number of essays without confirmation bias = ', len(getAllNonConfirmationBiasEssay()))


printNoOfWithWithoutConfirmationBiasEssay()
print('-' * 100)

Number of essays with confirmation bias =  122
Number of essays without confirmation bias =  200
----------------------------------------------------------------------------------------------------


In [7]:
# 4.Number of sufficient and insufficient paragraphs (arguments)
def printNoOfSufficientAndInsufficientParagraph():
    print('Number of sufficient paragraphs = ', len(getAllSufficientParagraphs()))
    print('Number of insufficient paragraphs = ', len(getAllNonSufficientParagraphs()))


printNoOfSufficientAndInsufficientParagraph()
print('-' * 100)

Number of sufficient paragraphs =  538
Number of insufficient paragraphs =  282
----------------------------------------------------------------------------------------------------


In [8]:
# 5.Average number of tokens in major claims, claims, and premises
def printAverageNoOfMajorClaimsClaimsPremisesTokens():
    allMajorClaimsTokens = getAllMajorClaimsTokens()
    allClaimsTokens = getAllMajorClaimsTokens()
    allPremisesTokens = getAllPremisesTokens()
    totalMCPTokens = len(allMajorClaimsTokens) + len(allClaimsTokens) + len(allPremisesTokens)
    print("Average number of tokens in major claims, claims, and premises = ", int(totalMCPTokens / 3))


printAverageNoOfMajorClaimsClaimsPremisesTokens()
print('-' * 100)

Average number of tokens in major claims, claims, and premises =  23586
----------------------------------------------------------------------------------------------------


In [11]:
# The 10 most specific words in major claims, claims, and premises.
def printSpecificWordsInMajorClaimsClaimsPremises():
    allMajorClaimsTokens = removePunctAndStopWords(getAllMajorClaimsTokens())
    allClaimsTokens = removePunctAndStopWords(getAllClaimsTokens())
    allPremisesTokens = removePunctAndStopWords(getAllPremisesTokens())
    mt = allMajorClaimsTokens
    ct = allClaimsTokens
    pt = allPremisesTokens

    # Calculating most specific words for Major Claims

    for item in mt:
        if item in ct or item in pt:
            for it in allMajorClaimsTokens:
                if item == it:
                    allMajorClaimsTokens.remove(it)


    specificWordsMajorClaims = []
    for word1 in Counter(allMajorClaimsTokens).most_common(10):
        specificWordsMajorClaims.append(word1[0])

    # Calculating most specific words for  Claims
    for item in ct:
        if item in mt or item in pt:
            for it in allClaimsTokens:
                if item == it:
                    allClaimsTokens.remove(it)

    
    specificWordsClaims = []
    for word1 in Counter(allClaimsTokens).most_common(10):
        specificWordsClaims.append(word1[0])

    # Calculating most specific words for Major Claims
    for item in pt:
        if item in ct or item in mt:
            for it in allPremisesTokens:
                if item == it:
                    allPremisesTokens.remove(it)


    
    specificWordsPremises = []
    for word1 in Counter(allPremisesTokens).most_common(10):
        specificWordsPremises.append(word1[0])


    print("10 most specific words in major claims:",specificWordsMajorClaims)
    print("10 most specific words in claims:",specificWordsClaims)
    print("10 most specific words in premises:",specificWordsPremises)


def removePunctAndStopWords(list):
    out = []
    for word in list:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False and lexeme.is_punct == False:
            out.append(word.lower())
    return out
printSpecificWordsInMajorClaimsClaimsPremises()
print('-' * 100)

10 most specific words in major claims: ['means', 'attention', 'computers', 'choice', 'classroom', 'adult', 'financial', 'energy', 'helping', 'childhood']
10 most specific words in claims: ['disagree', 'crime', 'cheap', 'agree', 'budgets', 'eat', 'developed', 'rates', 'exercise', 'indispensable']
10 most specific words in premises: ['people', 'good', 'money', 'like', 'help', 'knowledge', 'friends', 'education', 'learn', 'new']
----------------------------------------------------------------------------------------------------
