## Building an SVM classifier of News and Opinion pieces based on lemma, pos tag, and linguistic features

#### Based on year 1987 of NYTAC corpus

In [None]:

import glob, os

import numpy as np
from bs4 import BeautifulSoup

i = 0

DocStrings = []  # one (filename, raw file contents) pair per .xml file found

data_dir = 'data\\1987\\1987\\**' # Change to path on your machine

# Walk the data directory recursively and keep the raw contents of every XML file.
for filename in glob.iglob(data_dir, recursive=True):
    if os.path.isfile(filename) and filename.endswith('.xml'):
        print(i, filename)
        i += 1
        # The context manager closes each handle as soon as it has been read,
        # so no file stays open while the rest of the directory is walked.
        with open(filename, "r") as f:
            text = f.read()
        DocStrings.append((filename, text))


Texts = []      # (filename, main text) for every document in the corpus
multiples = []  # filenames whose XML holds more than one full_text block
empties = []    # filenames whose XML holds no full_text block

for i, (name, xml) in enumerate(DocStrings):
    print(i)
    blocks = BeautifulSoup(xml, 'lxml-xml').find_all('block', class_='full_text')
    if len(blocks) == 0:
        empties.append(name)
    elif len(blocks) > 1:
        multiples.append(name)
    # Several blocks are joined with a single space; no block at all gives ''.
    Texts.append((name, ' '.join(b.text for b in blocks)))
    

In [None]:
import pickle

# Write the raw XML strings and the extracted texts to pickle files in the
# working directory.
for pickle_path, payload in (("DocStrings.pckl", DocStrings), ("Texts.pckl", Texts)):
    with open(pickle_path, "wb") as sink:
        pickle.dump(payload, sink)

In [None]:
def category(tax_list):

    # Takes an iterable of taxonomic classifier tags (anything exposing a
    # `.string` attribute) and returns a pair of booleans:
    # (is News, is Opinion).
    #
    # A tag whose `.string` is None is skipped instead of raising
    # AttributeError on `.startswith`.

    labels = [t.string for t in tax_list if t.string is not None]
    nw = any(label.startswith('Top/News') for label in labels)
    op = any(label.startswith('Top/Opinion') for label in labels)
    return (nw, op)

def categorize(filename):

    # Parses the XML file at `filename` and returns the (News, Opinion) pair
    # that `category` derives from its taxonomic classifier tags.

    with open(filename) as handle:
        parsed = BeautifulSoup(handle, 'lxml-xml')
    return category(parsed.find_all('classifier', type='taxonomic_classifier'))

In [None]:

i = 0

categorized = []  # one (filename, (is News, is Opinion)) pair per document

# Initialised here; nothing in this file updates them afterwards.
news_pieces = 0
opinion_pieces = 0

data_dir = 'data\\1987\\1987\\**' # Change to path on your machine

# Second walk over the data directory, this time collecting only the labels.
# Later cells pair `categorized` with the parsed documents by position, so
# this walk has to yield the files in the same order as the first one.
for filename in glob.iglob(data_dir, recursive=True):
    if not (os.path.isfile(filename) and filename.endswith('.xml')):
        continue
    print(i, filename)
    i += 1
    categorized.append((filename, categorize(filename)))

In [None]:
# Save the per-document (News, Opinion) labels to disk.
with open("categorized.pckl", mode="wb") as sink:
    pickle.dump(categorized, sink)


#### Now we begin parsing the texts using the "Stanza" package from Stanford NLP 


In [None]:
import stanza
# Download the English models, then build a pipeline restricted to the
# processors listed here.
stanza.download(lang="en")
nlp = stanza.Pipeline(
    lang='en',
    processors=','.join(['tokenize', 'mwt', 'pos', 'lemma']),
)

In [None]:
import time

# Store a list of the output of Stanza on parsing each of the texts in the corpus

StanzaParses = []
i = 0
t0 = time.time()

# Index of the first text to parse. `n` was not assigned anywhere before this
# loop, so it is set explicitly. Later cells pair the parses with `Texts` and
# `categorized` by position, which only holds when parsing starts at 0.
n = 0

for text in Texts[n:]:

    i += 1

    #### This try - except is where about five defective documents appeared that Stanza could not parse
    #### They will be removed further down below
    # Catching Exception (not everything) lets Ctrl-C stop this long loop.

    try:
        parsed = nlp(text[1])
    except Exception as err:
        StanzaParses.append((text[0], None))
        print(i, (time.time()-t0)/i, text[0], ' exception', repr(err))
    else:
        StanzaParses.append((text[0], parsed))
        print(i, (time.time()-t0)/i, text[0])



### Change the format to lists of dictionaries for convenient writing to and reading from files

### From now on tokens are understood to be dictionaries

In [None]:
        
        
StanzaDicts = []
t0 = time.time()
for i, (name, parsed) in enumerate(StanzaParses, start=1):
    # Documents that failed to parse stay None; every other document becomes
    # a list with one `to_dict()` result per sentence.
    if parsed is None:
        sentences = None
    else:
        sentences = [sentence.to_dict() for sentence in parsed.sentences]
    StanzaDicts.append((name, sentences))
    print(i, (time.time()-t0)/i)

In [None]:
# Save the dictionary form of the parses to disk.
with open("StanzaDicts.pckl", mode="wb") as sink:
    pickle.dump(StanzaDicts, sink)

#### In this part we devise a strategy to separate segments within quotations from text outside of quotes

In [None]:
def quotIndices(sentence):

    # Returns, sorted and without duplicates, the positions in a sentence
    # (a list of token dictionaries) at which a quotation opens or closes.

    found = set()
    last = len(sentence) - 1
    for pos, token in enumerate(sentence):
        word = token["text"]
        if word.startswith(("''", '"')):
            # quote mark at the start of the token: the boundary is this token
            found.add(pos)
        elif word.endswith("'") and pos < last and sentence[pos + 1]["text"].startswith("'"):
            # quote mark split over two tokens: the boundary is the next token
            found.add(pos + 1)
        elif word.endswith('"'):
            # trailing double quote: the boundary falls after this token
            found.add(pos + 1)
        elif '"' in word or "''" in word:
            # quote mark somewhere inside the token
            found.add(pos)
    return sorted(found)

def quotations(sentence):

    # Splits a sentence (a list of tokens) into the tokens outside of quotes
    # and the tokens within quotes, returned as (unquoted, quoted).
    # Both lists are new objects; the input list is never returned itself.

    inds = quotIndices(sentence)
    if not inds:
        return (list(sentence), [])

    unquoted = list(sentence[:inds[0]])
    quoted = []
    # Boundaries alternate: the 1st, 3rd, ... start a quoted stretch and the
    # 2nd, 4th, ... start an unquoted one. The last stretch runs to the end
    # of the sentence.
    stops = inds[1:] + [len(sentence)]
    for order, (start, stop) in enumerate(zip(inds, stops)):
        segment = sentence[start:stop]
        if order % 2 == 0:
            quoted.extend(segment)
        else:
            unquoted.extend(segment)
    return (unquoted, quoted)

In [None]:
# For every document in the corpus, build one (tokens outside quotations,
# tokens within quotations) pair per sentence. Documents without a parse
# keep None in place of the list of pairs.

Quotes = []
t0 = time.time()
for i, (name, sentences) in enumerate(StanzaDicts):
    print(i, (time.time()-t0)/(i+1))
    if sentences is None:
        SentenceQuotes = None
    else:
        SentenceQuotes = [quotations(sentence) for sentence in sentences]
    Quotes.append((name, SentenceQuotes))

In [None]:
# Save the quoted/unquoted split of every document to disk.
with open("Quotes.pckl", mode="wb") as sink:
    pickle.dump(Quotes, sink)

#### We begin compiling a list of the value of each document for each feature

In [None]:
# NOTE(review): NegationWords is not defined anywhere in this file; it has to
# exist in the session before this cell runs.

# Tokens whose lemma is in NegationWords, per sentence.
NormNegation = [
    None if sents is None
    else sum(1 for sent in sents for tok in sent if tok["lemma"] in NegationWords) / len(sents)
    for _, sents in StanzaDicts
]

# The same count, per token.
NegationRatio = [
    None if sents is None
    else sum(1 for sent in sents for tok in sent if tok["lemma"] in NegationWords) / sum(len(sent) for sent in sents)
    for _, sents in StanzaDicts
]

# Tokens whose text is exactly "n't", per sentence.
NormNegSuffix = [
    None if sents is None
    else sum(1 for sent in sents for tok in sent if tok["text"] == "n't") / len(sents)
    for _, sents in StanzaDicts
]

# The same count, per token.
NegSuffixRatio = [
    None if sents is None
    else sum(1 for sent in sents for tok in sent if tok["text"] == "n't") / sum(len(sent) for sent in sents)
    for _, sents in StanzaDicts
]


In [None]:
def feats(token):

    # Turns the "feats" string of a token ('Name=Value' items separated by
    # '|') into a dictionary; a token without that key gives {}.

    if "feats" not in token:
        return {}
    parsed = {}
    for item in token["feats"].split('|'):
        pieces = item.split('=')
        parsed[pieces[0]] = pieces[1]
    return parsed

In [None]:
def FinVerb(token):

    # True when the token's features contain VerbForm=Fin

    return feats(token).get("VerbForm") == 'Fin'

# Finite verbs per sentence for every document; None for unparsed documents.
Complexity = [
    None if sents is None
    else sum(1 for sent in sents for tok in sent if FinVerb(tok)) / len(sents)
    for _, sents in StanzaDicts
]

In [None]:
_END_MARKS = {'.', '?', '!'}
_ALL_MARKS = {'.', '?', '!', ';', ','}

def _mark_share(sents, mark, pool):
    # Among the tokens whose text is in `pool`, the fraction whose text is
    # exactly `mark`; 0 when the document has no token from `pool`.
    hits = 0
    total = 0
    for sent in sents:
        for tok in sent:
            text = tok["text"]
            if text in pool:
                total += 1
                if text == mark:
                    hits += 1
    return hits/total if total else 0

# Share of '?' among the sentence-final marks . ? !
Questions = [
    None if doc[1] is None else _mark_share(doc[1], '?', _END_MARKS)
    for doc in StanzaDicts
]

# Share of '!' among the sentence-final marks . ? !
# Unparsed documents get None in this list (previously the None was appended
# to Questions, which left both lists out of step with StanzaDicts).
Exclamations = [
    None if doc[1] is None else _mark_share(doc[1], '!', _END_MARKS)
    for doc in StanzaDicts
]

# Share of ';' among the marks . ? ! ; ,
Semicolons = [
    None if doc[1] is None else _mark_share(doc[1], ';', _ALL_MARKS)
    for doc in StanzaDicts
]

# Share of ',' among the marks . ? ! ; ,
Commas = [
    None if doc[1] is None else _mark_share(doc[1], ',', _ALL_MARKS)
    for doc in StanzaDicts
]


In [None]:
def matches(text, phrase):

    # Returns the number of non-overlapping times a phrase appears in a given
    # text, scanning left to right.
    #
    # An empty phrase counts as 0 occurrences (the scanning loop used before
    # never advanced on an empty phrase and did not terminate).
    #
    # NOTE(review): this is plain substring matching, so a phrase such as
    # 'as well' is also found inside 'was well'.

    if not phrase:
        return 0
    return text.count(phrase)

In [None]:
SCONJcausal = ['since', 'as', 'though', 'that']   # counted only when tagged SCONJ
CCONJcausal = ['yet']                             # counted only when tagged CCONJ
causal = ['accordingly', 'because', 'hence', 'thus', 'therefore', 'consequently']
MULTIcausal = ['as a result', 'stemming from this', 'as an effect', 'in that case']

def SentCausals(sent):
    # Number of causal connectives in a sentence: single-token matches on the
    # lemma plus multi-word phrases found in the space-joined lemmas.
    singles = 0
    for tok in sent:
        lemma = tok["lemma"]
        upos = tok["upos"]
        if upos == 'SCONJ' and lemma in SCONJcausal:
            singles += 1
        if upos == 'CCONJ' and lemma in CCONJcausal:
            singles += 1
        if lemma in causal:
            singles += 1
    joined = " ".join(tok["lemma"] for tok in sent)
    return singles + sum(matches(joined, phrase) for phrase in MULTIcausal)

# Causal connectives per sentence; None for unparsed documents.
Causals = []
for j, (_, sents) in enumerate(StanzaDicts):
    print(j)
    if sents is None:
        Causals.append(None)
        continue
    Causals.append(sum(SentCausals(sent) for sent in sents)/len(sents))


In [None]:
SCONJtemporal = ['before']   # counted only when tagged SCONJ
temporal = ['first', 'lastly', 'finally', 'later', 'meanwhile', 'now', 'previously', 'since', 'straightaway', 'then', 'until', 'when', 'whenever', 'while', 'soon', 'lastly', 'afterwards', 'after']
MULTItemporal = ['at once', 'at this moment', 'at this point', 'in the end']

def SentTemporals(sent):
    # Number of temporal connectives in a sentence: single-token matches on
    # the lemma plus multi-word phrases found in the space-joined lemmas.
    singles = 0
    for tok in sent:
        lemma = tok["lemma"]
        if tok["upos"] == 'SCONJ' and lemma in SCONJtemporal:
            singles += 1
        if lemma in temporal:
            singles += 1
    joined = " ".join(tok["lemma"] for tok in sent)
    return singles + sum(matches(joined, phrase) for phrase in MULTItemporal)

# Temporal connectives per sentence; None for unparsed documents.
Temporals = []
for j, (_, sents) in enumerate(StanzaDicts):
    print(j)
    if sents is None:
        Temporals.append(None)
        continue
    Temporals.append(sum(SentTemporals(sent) for sent in sents)/len(sents))

In [None]:
contrast = ['alternatively', 'anyway', 'but', 'however', 'instead', 'despite', 'nevertheless', 'although', 'whereas', 'otherwise', 'unlike']
MULTIcontrast = ['in spite of', 'on the contrary', 'contrary to', 'by contrast', 'in contrast', 'even so', 'on the other hand', 'even though']

def SentContrastives(sent):
    # Number of contrastive connectives in a sentence: tokens whose lemma is
    # in `contrast` plus multi-word phrases found in the space-joined lemmas.
    lemmas = [tok["lemma"] for tok in sent]
    singles = sum(1 for lemma in lemmas if lemma in contrast)
    joined = " ".join(lemmas)
    return singles + sum(matches(joined, phrase) for phrase in MULTIcontrast)

# Contrastive connectives per sentence; None for unparsed documents.
Contrastives = []
for j, (_, sents) in enumerate(StanzaDicts):
    print(j)
    if sents is None:
        Contrastives.append(None)
        continue
    Contrastives.append(sum(SentContrastives(sent) for sent in sents)/len(sents))

In [None]:
expansives = ['additionally', 'also',  'even', 'furthermore',  'indeed',  'moreover', 'besides', 'e.g.', 'i.e.']
MULTIexpansives = ['as well', 'in addition', 'let alone', 'not only', 'for example', 'for instance', 'in other words', 'in that', 'that is to say']

def SentExpansives(sent):
    # Number of expansive connectives in a sentence: tokens whose lemma is in
    # `expansives` plus multi-word phrases found in the space-joined lemmas.
    lemmas = [tok["lemma"] for tok in sent]
    singles = sum(1 for lemma in lemmas if lemma in expansives)
    joined = " ".join(lemmas)
    return singles + sum(matches(joined, phrase) for phrase in MULTIexpansives)

# Expansive connectives per sentence; None for unparsed documents.
Expansives = []
for j, (_, sents) in enumerate(StanzaDicts):
    print(j)
    if sents is None:
        Expansives.append(None)
        continue
    Expansives.append(sum(SentExpansives(sent) for sent in sents)/len(sents))

In [None]:
def SentConnectives(sent):
    # Sum of the causal, temporal, contrastive and expansive counts of a sentence
    counters = (SentCausals, SentTemporals, SentContrastives, SentExpansives)
    return sum(counter(sent) for counter in counters)

# Connectives per token (not per sentence); None for unparsed documents.
Connectives = []
for j, (_, sents) in enumerate(StanzaDicts):
    print(j)
    if sents is None:
        Connectives.append(None)
        continue
    connective_total = sum(SentConnectives(sent) for sent in sents)
    token_total = sum(len(sent) for sent in sents)
    Connectives.append(connective_total/token_total)

In [None]:
# Average number of quotes per sentence. A sentence with k quotation
# boundaries counts as (k+1)//2 quotes.
Citations = []
for j, (_, sents) in enumerate(StanzaDicts):
    print(j)
    if sents is None:
        Citations.append(None)
        continue
    quote_total = sum((len(quotIndices(sent))+1)//2 for sent in sents)
    Citations.append(quote_total/len(sents))

# Average number of quoted tokens per quote for each document; 0 when the
# document has no quote at all.
CitationLengths = []
for j, (_, sents) in enumerate(StanzaDicts):
    print(j)
    if sents is None:
        CitationLengths.append(None)
        continue
    NumCitations = sum((len(quotIndices(sent))+1)//2 for sent in sents)
    if NumCitations == 0:
        CitationLengths.append(0)
        continue
    # Quotes is read at the same position as StanzaDicts.
    quoted_tokens = sum(len(pair[1]) for pair in Quotes[j][1])
    CitationLengths.append(quoted_tokens/NumCitations)

In [None]:
def PastVerb(token):
    # True when the token's features contain Tense=Past
    return feats(token).get("Tense") == 'Past'

# Past-tense tokens outside of quotes, per sentence; None for unparsed documents.
Pasts = []
for j, (_, pairs) in enumerate(Quotes):
    print(j)
    if pairs is None:
        Pasts.append(None)
        continue
    # pair[0] holds the unquoted tokens of a sentence
    past_total = sum(1 for pair in pairs for tok in pair[0] if PastVerb(tok))
    Pasts.append(past_total/len(pairs))


In [None]:
def PresentVerb(token):
    # True when the token's features contain Tense=Pres
    return feats(token).get("Tense") == 'Pres'

# Present-tense tokens outside of quotes, per sentence; None for unparsed documents.
Presents = []
for j, (_, pairs) in enumerate(Quotes):
    print(j)
    if pairs is None:
        Presents.append(None)
        continue
    # pair[0] holds the unquoted tokens of a sentence
    present_total = sum(1 for pair in pairs for tok in pair[0] if PresentVerb(tok))
    Presents.append(present_total/len(pairs))


In [None]:
# Lemmas counted as verbs of communication.
communicationVerbs = [
    'address', 'disseminate', 'profile', 'advertise', 'document', 'proofread', 'advise', 'draft', 'publicize',
    'alert', 'edit', 'publish', 'amend', 'email', 'query', 'announce', 'generate', 'question', 'answer', 'emphasize',
    'record', 'arbitrate', 'explain', 'relay', 'articulate', 'express', 'report', 'author', 'frame', 'respond',
    'brand', 'highlight', 'rewrite', 'brief', 'inform', 'scribe', 'broadcast', 'interact', 'chronicle', 'interface',
    'share', 'circulate', 'interpret', 'socialize', 'cite', 'interview', 'specify', 'clarify', 'liaise', 'speak',
    'commend', 'listen', 'suggest', 'communicate', 'log', 'synthesize', 'compile', 'mediate', 'telegraph',
    'consult', 'narrate', 'transcribe', 'contact', 'notate', 'note', 'translate', 'convey', 'notify', 'transmit',
    'convince', 'outline', 'tweet', 'correspond', 'pen', 'verbalize', 'source', 'portray', 'write', 'debate',
    'post', 'define', 'present', 'detail', 'describe', 'proclaim', 'protest', 'demonstrate', 'illustrate',
]

# VERB tokens outside of quotes whose lemma is a communication verb, per
# sentence; None for unparsed documents.
Communications = []
for j, (_, pairs) in enumerate(Quotes):
    print(j)
    if pairs is None:
        Communications.append(None)
        continue
    # pair[0] holds the unquoted tokens of a sentence
    verb_total = sum(
        1
        for pair in pairs
        for tok in pair[0]
        if tok["upos"] == 'VERB' and tok["lemma"] in communicationVerbs
    )
    Communications.append(verb_total/len(pairs))
        

In [None]:
# Average number of modal verbs (xpos MD) outside of quotes per sentence;
# None for unparsed documents.
Modals = []
for j, (_, pairs) in enumerate(Quotes):
    print(j)
    if pairs is None:
        Modals.append(None)
        continue
    # pair[0] holds the unquoted tokens of a sentence
    modal_total = sum(1 for pair in pairs for tok in pair[0] if tok["xpos"] == 'MD')
    Modals.append(modal_total/len(pairs))


In [None]:
# Auxiliary "will" outside of quotes, per sentence. Only AUX tokens whose
# text is exactly 'will' are counted. None for unparsed documents.

Wills = []
for j, (_, pairs) in enumerate(Quotes):
    print(j)
    if pairs is None:
        Wills.append(None)
        continue
    # pair[0] holds the unquoted tokens of a sentence
    will_total = sum(
        1 for pair in pairs for tok in pair[0]
        if tok["upos"] == 'AUX' and tok["text"] == 'will'
    )
    Wills.append(will_total/len(pairs))

In [None]:
def FirstPron(tok):
    # Returns whether the token's features mark it as first person (Person=1)
    return feats(tok).get('Person') == "1"

# First-person tokens outside of quotes, per token of the document.
# None for unparsed documents.
Firsts = []
for j, doc in enumerate(Quotes):
    print(j)
    if doc[1] is None:
        Firsts.append(None)
        continue
    hits = sum(1 for unquoted, _ in doc[1] for tok in unquoted if FirstPron(tok))
    # Every entry of doc[1] is an (unquoted, quoted) pair, so len(entry) is
    # always 2. The token total has to come from the two lists inside it.
    token_total = sum(len(unquoted) + len(quoted) for unquoted, quoted in doc[1])
    Firsts.append(hits/token_total)

In [None]:
def SecondPron(tok):
    # Returns whether the token's features mark it as second person (Person=2)
    return feats(tok).get('Person') == "2"

# Second-person tokens outside of quotes, per token of the document.
# None for unparsed documents.
Seconds = []
for j, doc in enumerate(Quotes):
    print(j)
    if doc[1] is None:
        Seconds.append(None)
        continue
    hits = sum(1 for unquoted, _ in doc[1] for tok in unquoted if SecondPron(tok))
    # Every entry of doc[1] is an (unquoted, quoted) pair, so len(entry) is
    # always 2. The token total has to come from the two lists inside it.
    token_total = sum(len(unquoted) + len(quoted) for unquoted, quoted in doc[1])
    Seconds.append(hits/token_total)

In [None]:
def NumDigits(numString):
    # Number of characters in the string for which str.isdigit() is true
    return sum(1 for ch in numString if ch.isdigit())

# Digit characters inside number tokens (upos NUM or xpos CD), divided by the
# number of tokens in the document. All tokens are used, quoted or not.
Digits = []
for j, (_, sents) in enumerate(StanzaDicts):
    print(j)
    if sents is None:
        Digits.append(None)
        continue
    digit_total = sum(
        NumDigits(tok["text"])
        for sent in sents
        for tok in sent
        if tok["upos"] == 'NUM' or tok["xpos"] == "CD"
    )
    Digits.append(digit_total/sum(len(sent) for sent in sents))

# Number tokens written in digits only, divided by the number of tokens in
# the document. All tokens are used, quoted or not.
Nums = []
for j, (_, sents) in enumerate(StanzaDicts):
    print(j)
    if sents is None:
        Nums.append(None)
        continue
    number_total = sum(
        1
        for sent in sents
        for tok in sent
        if (tok["upos"] == 'NUM' or tok["xpos"] == "CD") and tok["text"].isdigit()
    )
    Nums.append(number_total/sum(len(sent) for sent in sents))


In [None]:
# Interjections (upos INTJ or xpos UH) divided by the number of tokens in the
# document. All tokens are used, quoted or not.
Inters = []
for j, (_, sents) in enumerate(StanzaDicts):
    print(j)
    if sents is None:
        Inters.append(None)
        continue
    interjection_total = sum(
        1 for sent in sents for tok in sent
        if tok["upos"] == 'INTJ' or tok["xpos"] == "UH"
    )
    Inters.append(interjection_total/sum(len(sent) for sent in sents))

In [None]:
def SUBJLexicon(path="subjclueslen1-HLTEMNLP05.tff"):

    # Builds subjectivity lexicon from MPQA clues file
    #
    # path: location of the clues file; defaults to the file name used so far,
    #       resolved against the working directory.
    # Returns {word: {'pos': ..., 'subjectivity': 0.1 or 1}}. When a word
    # occurs on several lines, the last line wins.
    #
    # Each line is split on whitespace and read by position: field 0 is
    # 'type=<weaksubj|strongsubj>', field 2 is 'word1=<word>' and field 3 is
    # 'pos1=<pos>'. Blank lines are skipped.

    weights = {'weaksubj':.1, 'strongsubj':1}
    lex = {}
    with open(path, "r") as f:
        for line in f:
            params = line.split()
            if not params:
                continue
            word = params[2][6:]     # drop 'word1='
            pos = params[3][5:]      # drop 'pos1='
            strength = weights[params[0][5:]]   # drop 'type='
            lex[word] = {'pos':pos, 'subjectivity':strength}
    return lex
    

lexicon = SUBJLexicon()

def lemmatize(word):

    # Runs the Stanza pipeline on a single word and returns the "lemma" of
    # the first entry of the first token of the first sentence.

    first_token = nlp(word).sentences[0].tokens[0]
    return first_token.to_dict()[0]["lemma"]

def inverselem(lexicon):

    # Builds a second lexicon keyed by the lemma of every word of the given
    # lexicon. Words sharing a lemma overwrite each other; the last one wins.

    by_lemma = {}
    for position, (word, entry) in enumerate(lexicon.items()):
        print(position)
        by_lemma[lemmatize(word)] = entry
    return by_lemma

invlex = inverselem(lexicon)

def subjectivity(token, lexicon, invlex):

    # Returns the 'subjectivity' of a token w.r.t. the MPQA lexicon as
    # described in Krüger et al. Lookup order: the token text in `lexicon`,
    # then its lemma in `lexicon`, then its lemma in `invlex`; 0 when none
    # of them matches.

    surface = token["text"]
    if surface in lexicon:
        return lexicon[surface]["subjectivity"]
    lemma = token["lemma"]
    for table in (lexicon, invlex):
        if lemma in table:
            return table[lemma]["subjectivity"]
    return 0
    
        
# Summed subjectivity of the tokens outside of quotes, per token of the
# document. None for unparsed documents.
Subjectivities = []
for j, doc in enumerate(Quotes):
    print(j)
    if doc[1] is None:
        Subjectivities.append(None)
        continue
    # Every entry of doc[1] is an (unquoted, quoted) pair, so len(entry) is
    # always 2. The token total has to come from the two lists inside it.
    token_total = sum(len(unquoted) + len(quoted) for unquoted, quoted in doc[1])
    score = sum(subjectivity(tok, lexicon, invlex) for unquoted, _ in doc[1] for tok in unquoted)
    Subjectivities.append(score/token_total)


# The same measure restricted to adjectives (upos ADJ) outside of quotes.
AdjSubjectivities = []
for j, doc in enumerate(Quotes):
    print(j)
    if doc[1] is None:
        AdjSubjectivities.append(None)
        continue
    token_total = sum(len(unquoted) + len(quoted) for unquoted, quoted in doc[1])
    score = sum(
        subjectivity(tok, lexicon, invlex)
        for unquoted, _ in doc[1]
        for tok in unquoted
        if tok["upos"] == 'ADJ'
    )
    AdjSubjectivities.append(score/token_total)

In [None]:
# Tokens per sentence for every document; None for unparsed documents.
AvgSentLengths = []
for j, (_, sents) in enumerate(StanzaDicts):
    print(j)
    if sents is None:
        AvgSentLengths.append(None)
        continue
    token_total = sum(len(sent) for sent in sents)
    AvgSentLengths.append(token_total/len(sents))

# Characters per token for every document; None for unparsed documents.
AvgTokLengths = []
for j, (_, sents) in enumerate(StanzaDicts):
    print(j)
    if sents is None:
        AvgTokLengths.append(None)
        continue
    char_total = sum(len(tok["text"]) for sent in sents for tok in sent)
    token_total = sum(len(sent) for sent in sents)
    AvgTokLengths.append(char_total/token_total)


#### Here we list all the linguistic features for each document together in one list

In [None]:
# Feature lists copied unchanged into each document's dictionary, in this order.
# NOTE(review): Exclamations and Communications are computed earlier in this
# file but are not part of the feature dictionary.
plain_columns = [
    ("NormNegation", NormNegation),
    ("NegationRatio", NegationRatio),
    ("NormNegSuffix", NormNegSuffix),
    ("NegSuffixRatio", NegSuffixRatio),
    ("Complexity", Complexity),
    ("Questions", Questions),
    ("Semicolons", Semicolons),
    ("Commas", Commas),
    ("Causals", Causals),
    ("Temporals", Temporals),
    ("Contrastives", Contrastives),
    ("Expansives", Expansives),
    ("Connectives", Connectives),
    ("Citations", Citations),
    ("CitationLengths", CitationLengths),
    ("Pasts", Pasts),
    ("Presents", Presents),
    ("Modals", Modals),
    ("Wills", Wills),
    ("Firsts", Firsts),
    ("Seconds", Seconds),
    ("Digits", Digits),
    ("Nums", Nums),
    ("Inters", Inters),
    ("Subjectivities", Subjectivities),
    ("AdjSubjectivities", AdjSubjectivities),
]

LingFeats = []
for j, doc in enumerate(StanzaDicts):
    print(j)
    if doc[1] is None:
        LingFeats.append((doc[0], None))
        continue
    # The two average lengths are stored as reciprocals.
    dictionary = {
        "AvgSentLengths": 1/AvgSentLengths[j],
        "AvgTokLengths": 1/AvgTokLengths[j],
    }
    for key, column in plain_columns:
        dictionary[key] = column[j]
    LingFeats.append((doc[0], dictionary))

In [None]:
# Save the per-document feature dictionaries to disk.
with open("LingFeats.pckl", mode="wb") as sink:
    pickle.dump(LingFeats, sink)

In [None]:
# Split the labelled documents by their (is News, is Opinion) pair.
News, Opinion, Both, Neither = [], [], [], []
buckets = {
    (True, False): News,
    (False, True): Opinion,
    (True, True): Both,
    (False, False): Neither,
}
for d in categorized:
    buckets[d[1]].append(d)

print(len(News), len(Opinion), len(Both), len(Neither))


#### First we evaluate a linear SVM using only lemma frequencies

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
cv = CountVectorizer(max_df=0.95, min_df=0.002,max_features=2000,stop_words='english')
transformer = TfidfTransformer(use_idf = False)
tfidf = TfidfVectorizer(max_df=0.95, min_df=0.002, max_features=2000, stop_words='english')

def _vocabulary(vectorizer):
    # Feature names of a fitted vectorizer as a plain list of str.
    # Uses get_feature_names_out when the installed scikit-learn has it and
    # falls back to get_feature_names otherwise.
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        return vectorizer.get_feature_names()
    return [str(name) for name in getter()]

corpus_texts = [t[1] for t in Texts]

CorpusWordCounts = cv.fit_transform(corpus_texts)
CVWordIndex = _vocabulary(cv)
# fit_transform instead of transform on an unfitted transformer; with
# use_idf=False no idf weights are applied either way.
TFMatrix = transformer.fit_transform(CorpusWordCounts)
TFidfMatrix = tfidf.fit_transform(corpus_texts)
IDFWordIndex = _vocabulary(tfidf)

# Both vectorizers share the same settings, so the two vocabularies are
# expected to be identical.
print(CVWordIndex == IDFWordIndex)

with open("TFMatrix.pckl", "wb") as fp:   #Pickling
    pickle.dump(TFMatrix, fp)
with open("TFidfMatrix.pckl", "wb") as fp:   #Pickling
    pickle.dump(TFidfMatrix, fp)
with open("CVWordIndex.pckl", "wb") as fp:   #Pickling
    pickle.dump(CVWordIndex, fp)
with open("IDFWordIndex.pckl", "wb") as fp:   #Pickling
    pickle.dump(IDFWordIndex, fp)
    
# Positions of the documents Stanza could not parse.
defects = [i for i, doc in enumerate(StanzaDicts) if doc[1] is None]

# Positions of the documents used for classification: parsed, and labelled
# as exactly one of News / Opinion.
Indices = [
    i for i, doc in enumerate(StanzaDicts)
    if doc[1] is not None and sum(categorized[i][1]) == 1
]

# Label 1 = Opinion, 0 = News. `categorized` is a plain list, so it is read
# one position at a time (a list cannot be indexed with a list).
y = [int(categorized[i][1][1]) for i in Indices]
X = TFidfMatrix[Indices]

from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Scaling without centring followed by a linear SVM.
svm = LinearSVC(random_state=0, tol=1e-5)
clf = make_pipeline(StandardScaler(with_mean=False), svm)

from sklearn.model_selection import train_test_split

# TF-IDF features: hold out 10% of the documents and report the accuracy on them.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=433
)
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy) # Got about 97-98 percent accuracy here

# Same evaluation on the term-frequency matrix (no idf weighting).
X = TFMatrix[Indices]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=33
)
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy) # Around 97 percent accuracy



#### Now a linear SVM with only linguistic features

In [None]:
# Feature order comes from the first document that has features; the very
# first document may be one of the unparsed ones, whose entry is None.
LingKeys = list(next(entry for _, entry in LingFeats if entry is not None))
ind2feat = {i : LingKeys[i] for i in range(len(LingKeys))}
feat2ind = {v : k for k,v in ind2feat.items()}

# One row of feature values per document, None for unparsed documents.
LINGmatrix = []
for j, doc in enumerate(LingFeats):
    print(j)
    if doc[1] is None:
        LINGmatrix.append(None)
    else:
        LINGmatrix.append([doc[1][key] for key in LingKeys])

with open("LINGmatrix.pckl", "wb") as fp:   #Pickling
    pickle.dump(LINGmatrix, fp)

# Keep the rows in a 1-D object array (one list or None per document) so it
# can be indexed with `Indices`. Filling it element by element avoids asking
# NumPy to build an array from rows of unequal shape.
ling_rows = LINGmatrix
LINGmatrix = np.empty(len(ling_rows), dtype=object)
for pos, row in enumerate(ling_rows):
    LINGmatrix[pos] = row

# Dense (documents x features) matrix for the selected documents only.
X = np.array([ling_rows[i] for i in Indices], dtype=float)

rows = X.shape[0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=33)

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test)) # Around 94-95 percent accuracy obtained here



#### Evaluate linear SVM using upos, xpos, and combination of both tags

In [None]:
# Every upos tag that occurs anywhere in the parsed documents.
uposTags = {
    tok["upos"]
    for doc in StanzaDicts if doc[1] is not None
    for sent in doc[1]
    for tok in sent
}

def emptyUposDict():

    # Fresh dictionary with every observed upos tag set to zero

    return dict.fromkeys(uposTags, 0)

# Per-document upos tag counts; None for unparsed documents.
UPOScounts = []
for j, (_, sents) in enumerate(StanzaDicts):
    print(j)
    docUpos = None
    if sents is not None:
        docUpos = emptyUposDict()
        for tag in (tok["upos"] for sent in sents for tok in sent):
            docUpos[tag] += 1
    UPOScounts.append(docUpos)

# Every xpos tag that occurs anywhere in the parsed documents.
xposTags = {
    tok["xpos"]
    for doc in StanzaDicts if doc[1] is not None
    for sent in doc[1]
    for tok in sent
}

def emptyXposDict():

    # Fresh dictionary with every observed xpos tag set to zero

    return dict.fromkeys(xposTags, 0)

# Per-document xpos tag counts; None for unparsed documents.
XPOScounts = []
for j, (_, sents) in enumerate(StanzaDicts):
    print(j)
    docXpos = None
    if sents is not None:
        docXpos = emptyXposDict()
        for tag in (tok["xpos"] for sent in sents for tok in sent):
            docXpos[tag] += 1
    XPOScounts.append(docXpos)
    

# Save the raw tag counts of both tag sets to disk.
for pickle_path, payload in (("UPOScounts.pckl", UPOScounts), ("XPOScounts.pckl", XPOScounts)):
    with open(pickle_path, "wb") as sink:
        pickle.dump(payload, sink)
    
def normalize(dic):
    # Divides every value by the sum of all values; the keys stay as they are.
    total = sum(dic.values())
    scaled = {}
    for key, value in dic.items():
        scaled[key] = value/total
    return scaled

# Relative upos tag frequencies per document; None for unparsed documents.
UPOSfreqs = []
for i, counts in enumerate(UPOScounts):
    print(i)
    UPOSfreqs.append(None if counts is None else normalize(counts))

# Relative xpos tag frequencies per document; None for unparsed documents.
XPOSfreqs = []
for i, counts in enumerate(XPOScounts):
    print(i)
    XPOSfreqs.append(None if counts is None else normalize(counts))

# Save both frequency lists to disk.
for pickle_path, payload in (("UPOSfreqs.pckl", UPOSfreqs), ("XPOSfreqs.pckl", XPOSfreqs)):
    with open(pickle_path, "wb") as sink:
        pickle.dump(payload, sink)
    
# Column order comes from the first document that has frequencies; the very
# first document may be an unparsed one, whose entry is None.
uposKeys = list(next(freqs for freqs in UPOSfreqs if freqs is not None))
ind2upos = {i:uposKeys[i] for i in range(len(uposKeys))}
upos2ind = {v:k for k,v in ind2upos.items()}

# One row of upos frequencies per document, None for unparsed documents.
UPOSmatrix = []
for j, doc in enumerate(UPOSfreqs):
    print(j)
    if doc is None:
        UPOSmatrix.append(None)
    else:
        UPOSmatrix.append([doc[tag] for tag in uposKeys])

# Select the classified documents before converting, so NumPy only ever sees
# rows of equal length (the None rows are never in `Indices`).
UPOSmatrix = np.asarray([UPOSmatrix[i] for i in Indices], dtype=float)

# Sanity check against the row count of the linguistic-feature matrix.
print(rows == UPOSmatrix.shape[0])

X = UPOSmatrix

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=33)
clf.fit(X_train, y_train)
# Printed so the score is not lost in the middle of the cell.
print(clf.score(X_test, y_test)) # About 94 percent accuracy

# Column order comes from the first document that has frequencies; the very
# first document may be an unparsed one, whose entry is None.
xposKeys = list(next(freqs for freqs in XPOSfreqs if freqs is not None))
ind2xpos = {i:xposKeys[i] for i in range(len(xposKeys))}
xpos2ind = {v:k for k,v in ind2xpos.items()}

# One row of xpos frequencies per document, None for unparsed documents.
XPOSmatrix = []
for j, doc in enumerate(XPOSfreqs):
    print(j)
    if doc is None:
        XPOSmatrix.append(None)
    else:
        XPOSmatrix.append([doc[tag] for tag in xposKeys])

# Select the classified documents before converting, so NumPy only ever sees
# rows of equal length (the None rows are never in `Indices`).
XPOSmatrix = np.asarray([XPOSmatrix[i] for i in Indices], dtype=float)

# Sanity check against the row count of the linguistic-feature matrix.
print(rows == XPOSmatrix.shape[0])

X = XPOSmatrix

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=33)
clf.fit(X_train, y_train)
# Printed so the score is not lost in the middle of the cell.
print(clf.score(X_test, y_test)) # About 95 percent accuracy


# Now try combining upos and xpos tags


# Columns of both frequency matrices side by side.
X = np.hstack((UPOSmatrix, XPOSmatrix))


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=33)
clf.fit(X_train, y_train)
# Printed like the earlier evaluations so the score also shows when this is
# run as a script.
print(clf.score(X_test, y_test)) # Slightly less than 95 percent accuracy. Worse than xpos alone! However, not by a significant amount


#### Linear SVM with linguistic features and pos tags

In [None]:
# Reduce the linguistic-feature rows to the classified documents and turn
# them into a dense (rows x features) matrix.
# NOTE: this overwrites LINGmatrix, so the cell cannot be run twice in a row.
LINGmatrix = LINGmatrix[Indices]
flatLING = np.concatenate(LINGmatrix)
print(rows == LINGmatrix.shape[0])
LINGmatrix = flatLING.reshape(rows, -1)

# Linguistic features next to both sets of pos tag frequencies.
X = np.hstack((LINGmatrix, UPOSmatrix, XPOSmatrix))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=33)
clf.fit(X_train, y_train)
# Printed like the earlier evaluations so the score also shows when this is
# run as a script.
print(clf.score(X_test, y_test)) # About 96 percent accuracy

#### Linear SVM with linguistic features, lemma frequencies and pos tag frequencies

In [None]:
from scipy.sparse import csr_matrix, hstack

# Put the TF-IDF columns of the classified documents next to the dense
# feature matrix X built in the previous cell. `TFIDFmat` was never defined;
# the TF-IDF matrix in this file is `TFidfMatrix`, and it has to be reduced
# to `Indices` so its rows line up with X.
X = hstack((TFidfMatrix[Indices], csr_matrix(X)), format='csr')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=33)
clf.fit(X_train, y_train)
# Printed like the earlier evaluations so the score also shows when this is
# run as a script.
print(clf.score(X_test, y_test)) # About 97 percent accuracy. Not as good as tfidf alone!