In [0]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import matplotlib
import numpy as np
import pandas as pd
import fileinput
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [0]:
f = open('matthew.txt')
JohnText = f.read()

In [0]:
f = open('john.txt')
MatthewText = f.read()

In [5]:
print(MatthewText[:20])

1:1 In the beginning


In [0]:
# Chunk the text
def Chunkr(text, size):
  chunkt = []
  tokentext = nltk.sent_tokenize(text)
  j=0
  for i in range(0, len(tokentext), size):
    chunkt.append(" ".join(tokentext[i:i+size]))
    j+=1
  return chunkt

In [0]:
# Gets rid of all of references and newlines
def KeepAlpha(text):
  words = word_tokenize(text)
  for word in words:
    if any(c.isdigit() or c == '\n' for c in word):
      words.remove(word)
    i=0
  stringtext = " ".join(str(e) for e in words)
  return stringtext

In [0]:
# Average Sentence Length (In a standardized rating system)
def AvgSentLength(text):
  words = word_tokenize(text)
  sentences = sent_tokenize(text)
  wordcount = len(words)
  sentcount = len(sentences)
  avglength = (wordcount/sentcount)
  if avglength < 5:
    stdlength = 'small'
  elif avglength >= 5 and avglength < 8:
    stdlength = 'medium'
  else:
    stdlength = 'long'
  return stdlength

In [0]:
# Find the average length of words used
def AvgWordLength(text):
  words = word_tokenize(text)
  wordlength = sum(len(word) for word in words)/len(text.split(" "))

  if wordlength < 4:
    stdlength = 'small'
  elif wordlength >=4 and wordlength < 6:
    stdlength = 'medium'
  else:
    stdlength = 'long'
  return stdlength

In [0]:
# No. of Syllables per word
def SylCount(word):
  word.lower()
  count = 0
  vowels = 'aeiouy'
  if word[0] in vowels:
    count += 1

  for index in range(1, len(word)):
    if word[index] in vowels and word[index-1] not in vowels:
      count+=1
      if word.endswith('e'):
       count -= 1
  if count == 0:
    count += 1
  return count

In [0]:
# Average Syllable Count
def AvgSylCount(text):
  words = (word_tokenize(text))
  wordcount = len(words)
  sylcount = 0
  for i in range(wordcount):
    if words[i] != " ":
      sylcount += SylCount(words[i])
  avg = sylcount/wordcount

  if avg > 1.7:
    avg_std = 'high'
  elif avg <= 1.7 and avg > 1.2:
    avg_std = 'medium'
  else:
    avg_std = 'low'
  return avg_std

In [0]:
# Average Punctuation Count
def PunctFreq(text):
  textlength = len(text)
  punct = {'.','\,',':','\;''-','?'}
  pcount = 0
  for i in range(textlength):
    if text[i] in punct:
      pcount +=1
  freq = pcount/textlength

  if freq < .006:
    freq_std = 'low'
  elif freq >= .006 and freq < .0095:
    freq_std = 'medium'
  else:
    freq_std = 'high'

  return freq_std

In [0]:
def POSTag(text):
  text = text.lower()
  words = nltk.word_tokenize(text)
  taggedWords = nltk.pos_tag(words)
  fd = nltk.FreqDist(tag for (word, tag) in taggedWords)
  sentCount = len(nltk.sent_tokenize(text))
  posrate = len(fd)/sentCount

  if posrate < 3.45:
    posrate_std = 'low'
  elif posrate >= 3.45 and posrate < 3.65:
    posrate_std = 'medium'
  else:
    posrate_std = 'high'

  return posrate_std

In [0]:
# Gunning-Fog Index
def GFIndex(text):
  compwords = 0
  words = word_tokenize(text)
  totalwords = len(words)
  totalsentences = len(sent_tokenize(text))
  
  for i in range(totalwords):
    if len(words[i]) > 6 and SylCount(words[i]) > 3:
      compwords+=1

  gf = 0.4*((totalwords/totalsentences)+100*(compwords/totalwords))

  if gf > 12:
    gf_index = 'college'
  elif gf <= 12 and gf >8:
    gf_index = 'hs'
  else:
    gf_index = 'elementary'
  
  return gf_index

In [0]:
# SMOG Readability Formula
def SMOGIndex(text):
  count = 0
  words = word_tokenize(text)
  for word in words:
    if SylCount(word) > 1:
      count += 1
  totalsentences = len(sent_tokenize(text))
  gl = 1.0430*np.sqrt(count*(30/totalsentences))+3.1291

  if gl > 90:
    gl_index = 'college'
  elif gl <= 90 and gl > 30:
    gl_index = 'hs'
  else:
    gl_index = 'elem'

  return gl_index

In [0]:
# Function Word Frequency
def FunctionFreq(text):
  functionwords = [
    "of", "at", "the", "there", "do","be","have",
    "and","but","or","nor","yet","for","so","has",
    "will","is","been","a","an","since","as","we",
    "thee","thine","ye","his","you","thou","me","i",
    "your","yours","she","he","her","hers","his",
    "anybody","that","when","while","although","whereas",
    "therfore","then","am","my","much","more","either",
    "neither","thus","no","not","it","got","are",
    "almost","always","never","another","however","both",
    "between","by","cannot","can","does","down","during",
    "which","hereby","herein","how","if","in","out",
    "indeed","its","might","lot","most","mostly",
    "meanwhile","meantime","need","near","far","nothing",
    "often","off","once","only","onto","other","others",
    "ought","shall","rather","perhaps","should","some",
    "such","thence","hence","these","too","also","toward",
    "until","us","what","when","whence","where","who","how",
    "whoever","whomever"
    ]
  text.lower()
  words = word_tokenize(text)
  count = 0
  for word in words:
    if word in functionwords:
      count += 1
  funcfreq = count/len(words)

  if funcfreq > .33:
    ff_std = 'high'
  elif funcfreq <= .33 and funcfreq > .25:
    ff_std = 'medium'
  else:
    ff_std = 'low'
  
  return ff_std

In [0]:
# Pronoun Frequency
def PronounFreq(text):
  pronouns = [
    "thy", "thou", "thee", "thine", 
    "ye", "you", "he", "she", "her",
    "hers","his", "my","me", "i",
    "your", "yours", "we","anybody","it"
    "who","whom","others","these", "this", 
    "that"]
  text.lower()
  words = word_tokenize(text)
  count = 0
  for word in words:
    if word in pronouns:
      count += 1
  pronounfreq = count/len(words)

  if pronounfreq > .07:
    pf_std = 'high'
  elif pronounfreq <= .07 and pronounfreq >.05:
    pf_std = 'medium'
  else:
    pf_std = 'low'

  return pf_std

In [0]:
# Frequency of negation words
def NegationFreq(text):
  negations = [
    "no","not","nor","neither","nobody",
    "none","nothing","nowhere","cannot"]
  text.lower()
  words = word_tokenize(text)
  count = 0
  for word in words:
    if word in negations:
      count += 1
  negationfreq = count/len(words)

  if negationfreq > .03:
    nf_std = 'high'
  elif negationfreq <= .03 and negationfreq > .01:
    nf_std = 'medium'
  else:
    nf_std = 'low'
  
  return nf_std

In [0]:
# Freq. of Words with more than 6 letters
def LargeWordFreq(text):
  words = word_tokenize(text)
  count = 0
  for word in words:
    if len(word)>6:
      count +=1
  largefreq = count/len(words)

  if largefreq > 7:
    lf_std = 'high'
  elif largefreq <= 7 and largefreq > 4:
    lf_std = 'medium'
  else:
    lf_std = 'low'
  
  return lf_std

In [0]:
# Freq. of Words with less than 4 letters
def SmallWordFreq(text):
  words = word_tokenize(text)
  count = 0
  for word in words:
    if len(word)<4:
      count =+ 1
  smallfreq = count/len(words)

  if smallfreq > 7:
    sf_std = 'high'
  elif smallfreq <= 7 and smallfreq > 4:
    sf_std = 'medium'
  else:
    sf_std = 'low'
  
  return sf_std

In [0]:
# Count the total amount of different words used
def DiffWordCount(text):
    punct = [',','.','/',';',':','?','!','-','(',')']
    text = text.lower()
    for x in text: 
            if x in punct: 
                text = text.replace(x, "")
    words = nltk.word_tokenize(text)
    fd = nltk.FreqDist(words)
    diffwords = len(fd)

    if diffwords < 90:
      dw_std = 'low'
    elif diffwords >=90 and diffwords < 130:
      dw_std = 'medium'
    else:
      dw_std = 'high'

    return dw_std

In [0]:
alpha_text = {}
chunk_text = {}
size = 7

alpha_text["John"] = KeepAlpha(JohnText)
chunk_text["John"] = Chunkr(alpha_text["John"],size)

alpha_text["Matthew"] = KeepAlpha(MatthewText)
chunk_text["Matthew"] = Chunkr(alpha_text["Matthew"], size)

In [24]:
print("John Chunks:", len(chunk_text["John"]))
print("Matthew Chunks:", len(chunk_text["Matthew"]))

John Chunks: 149
Matthew Chunks: 139


In [25]:
joined_chunks = chunk_text["John"] + chunk_text["Matthew"]
len(joined_chunks)

288

In [64]:
# Creates a vector containing the attribute values for each author
attvector = []
for i in range(len(joined_chunks)):
  avector = []
  avector = [
                  AvgSentLength(str(joined_chunks[i])),
                  AvgWordLength(str(joined_chunks[i])),
                  AvgSylCount(str(joined_chunks[i])),
                  PunctFreq(str(joined_chunks[i])),
                  POSTag(str(joined_chunks[i])),
                  GFIndex(str(joined_chunks[i])),
                  SMOGIndex(str(joined_chunks[i])),
                  PronounFreq(str(joined_chunks[i])),
                  FunctionFreq(str(joined_chunks[i])),
                  NegationFreq(str(joined_chunks[i])),
                  LargeWordFreq(str(joined_chunks[i])),
                  SmallWordFreq(str(joined_chunks[i])),
                  DiffWordCount(str(joined_chunks[i]))
  ]
  attvector.append(avector)
print(attvector[:10])

[['long', 'small', 'medium', 'low', 'high', 'college', 'elem', 'medium', 'low', 'low', 'low', 'low', 'high'], ['long', 'small', 'medium', 'medium', 'high', 'college', 'elem', 'high', 'high', 'low', 'low', 'low', 'medium'], ['long', 'small', 'medium', 'medium', 'medium', 'college', 'elem', 'low', 'medium', 'low', 'low', 'low', 'medium'], ['long', 'small', 'medium', 'medium', 'high', 'college', 'elem', 'medium', 'high', 'low', 'low', 'low', 'high'], ['long', 'small', 'medium', 'high', 'high', 'college', 'elem', 'high', 'medium', 'medium', 'low', 'low', 'high'], ['long', 'small', 'medium', 'high', 'low', 'hs', 'elem', 'medium', 'medium', 'low', 'low', 'low', 'low'], ['long', 'small', 'medium', 'medium', 'medium', 'college', 'elem', 'high', 'medium', 'low', 'low', 'low', 'medium'], ['long', 'small', 'medium', 'medium', 'medium', 'college', 'elem', 'low', 'medium', 'low', 'low', 'low', 'medium'], ['long', 'medium', 'medium', 'high', 'low', 'college', 'elem', 'low', 'medium', 'low', 'low', '

In [0]:
# Test with a chunk from John
test = joined_chunks[0]

In [0]:
labeledChunks = ([(chunk,"John") for chunk in joined_chunks[1:148]] +
                 [(chunk,"Matthew") for chunk in joined_chunks[148:]])
import random
random.shuffle(labeledChunks)

In [0]:
def TextFeatures(text):
  return {'sent_length': AvgSentLength(text),
          'word_length': AvgWordLength(text),
          'syl_count':   AvgSylCount(text),
          'punct_freq':  PunctFreq(text),
          'pos_rate' :   POSTag(text),
          'gf_index':    GFIndex(text),
          'smog_index':  SMOGIndex(text),
          'pronoun_freq':PronounFreq(text),
          'func_freq':   FunctionFreq(text),
          'neg_freq':    NegationFreq(text),
          'bigwords':    LargeWordFreq(text),
          'smallwords':  SmallWordFreq(text),
          'wordcount':   DiffWordCount(text)
          }

In [0]:
featureSet = [(TextFeatures(chunk),author) for (chunk,author) in labeledChunks]
trainSet, testSet = featureSet[30:], featureSet[:30]
classifier = nltk.NaiveBayesClassifier.train(trainSet)

In [82]:
classifier.classify(TextFeatures(KeepAlpha(test)))

'John'

In [83]:
print(nltk.classify.accuracy(classifier,testSet))

0.7333333333333333


In [84]:
classifier.show_most_informative_features(5)

Most Informative Features
              punct_freq = 'medium'         John : Matthe =      3.0 : 1.0
                gf_index = 'college'        John : Matthe =      2.9 : 1.0
               wordcount = 'medium'         John : Matthe =      2.7 : 1.0
            pronoun_freq = 'medium'         John : Matthe =      2.4 : 1.0
               syl_count = 'low'          Matthe : John   =      1.9 : 1.0
