In [0]:
import pandas as pd
import numpy as np
from collections import Counter

s_data = pd.read_csv("subjectivity_data.csv", index_col=0)
data = pd.read_csv("data.csv", index_col=0)

In [0]:
def build_vocab(text, min_count=4):
  '''
  Takes in a 2D list of words and returns 
  set of words occuring min_count or more times
  '''
  wl = []
  for i in text:
    wl += i
  counts = Counter(wl)
  
  vocab = set([i if j >= min_count else '' for i,j in counts.items()])
  if '' in vocab:
    vocab.remove('')
  return vocab, counts

def process(text, vocab):
  '''
  Takes in a Series of texts and vocabulary and returns a dataframe
  with Vocabulary as columns and 1 / 0 as values indicating
  the presence of word in the list
  '''
  text = list(text.apply(lambda x: Counter(set(x) & vocab)))
  return pd.DataFrame(text).fillna(0).astype('bool')

In [0]:
s_proc = s_data.copy()
classes = s_proc['class']
s_proc.text = s_proc.text.apply(lambda x: x.replace('  ', ' ').split())
vocab, _ = build_vocab(s_proc.text, 5)
s_proc = process(s_proc.text, vocab)

In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import KFold

kf = KFold(10, random_state=5, shuffle=True)
acc = []
for train, test in kf.split(s_proc, classes):
  model = MultinomialNB()
  model.fit(s_proc.iloc[train], classes.iloc[train])
  acc.append(model.score(s_proc.iloc[test], classes.iloc[test]))
print(f"10-fold accuracy for ExtractNB: {np.mean(acc) * 100:.2f}")

extractNB = MultinomialNB()
extractNB.fit(s_proc, classes)

10-fold accuracy for ExtractNB: 91.71


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
import string
def negateWords(wordlist):
  '''
  Add the tag
  'NOT' to every word between a negation word (“not”,
  “isn’t”, “didn’t”, etc.) and the first punctuation
  mark following the negation word.
  '''
  new_list = []
  i = 0
  while i < len(wordlist):
    if wordlist[i] == "not" or "n't" in wordlist[i]:
      new_list.append(wordlist[i])
      i += 1
      while wordlist[i] not in string.punctuation:
        new_list.append("NOT_" + wordlist[i])
        i += 1
    new_list.append(wordlist[i])
    i += 1
  return new_list

In [39]:
unigram_data = data.copy()
uni_classes = data['class']
unigram_data.text = unigram_data.text.apply(lambda x: x.replace('\n', '').replace("  ", ' ').split()).apply(negateWords)
unigram_vocab, _ = build_vocab(unigram_data.text)
unigram_data = process(unigram_data.text, unigram_vocab)
unigram_data.head()

Unnamed: 0,character's,tiny,effects,uses,several,she,NOT_more,intelligently,because,shoulder,who,work,unpredictable,her,utilize,NOT_of,that,separate,only,normal,pat,NOT_a,it,sense,appears,NOT_events,but,NOT_care,enter,NOT_profession,recreation,can,them,),to,others,innocent,jasmine,on,make,...,kaufman's,yeager,quills,bilal,bea,`bringing,dead',mizrahi,unzipped,vianne,shell_,labors,patlabor,_ghost,cassie,patti's,caraboo,worrall,parillaud,azazel,hobbes,heartbreakers,survey,geological,tretiak,skulls',blanche,camembert,ffing,marquis,sade,klumps,knicks,angela's,stoppidge,nookey,mandingo,mede,mongkut,hortense
0,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,True,True,False,True,False,False,False,True,False,False,False,True,False,False,False,False,True,False,True,True,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,True,False,True,True,False,True,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,True,True,True,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,True,True,False,True,False,True,True,False,False,False,False,True,True,True,False,False,True,False,False,False,False,True,True,True,True,False,False,False,True,True,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,True,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [40]:
kf2 = KFold(3, random_state=5, shuffle=True)
models = MultinomialNB()
acc = []
print("\tTraining ...")
for train, test in kf.split(unigram_data, uni_classes):
  model.fit(unigram_data.iloc[train], uni_classes.iloc[train])
  acc.append(model.score(unigram_data.iloc[test], uni_classes.iloc[test]))
print(np.mean(acc) * 100)

	Training ...
83.6


In [0]:
def transform(sents, vocab=list(s_proc.columns)):
  '''
  Transform senteces into boolean tuples
  '''
  vects = []
  for sent in sents:
    s = set(sent.split())
    vector = [i in s for i in vocab]
    vects.append(vector)
  return vects

In [0]:
def subjectivity(translist):
  '''
  Indices of sentences ordered based on their subjectivity (descending)
  '''
  return np.argsort(extractNB.predict_proba(translist)[:, list(extractNB.classes_).index('Obj')])

In [80]:
extract_data = data.copy()
extract_data.text = extract_data.text.apply(lambda x: x.split('\n'))
extract_data['subj order'] = extract_data.text.apply(lambda x: subjectivity(transform(x)))
extract_data.head()

Unnamed: 0,text,class,subj order
0,[for those who associate italian cinema with f...,pos,"[25, 9, 12, 23, 1, 19, 8, 10, 2, 0, 20, 14, 15..."
1,[with more and more television shows having ga...,pos,"[30, 6, 34, 25, 32, 33, 4, 10, 27, 35, 11, 3, ..."
2,[it's tough to be an aspiring superhero in cha...,neg,"[20, 19, 17, 15, 13, 16, 10, 14, 12, 21, 0, 4,..."
3,[making a sequel to a widely beloved film is a...,pos,"[62, 71, 72, 46, 35, 70, 3, 61, 67, 73, 69, 9,..."
4,[if you're the type of person who goes on the ...,pos,"[19, 1, 16, 11, 2, 20, 18, 17, 21, 12, 0, 13, ..."
