In [0]:
import pandas as pd
import numpy as np
from collections import Counter

s_data = pd.read_csv("subjectivity_data.csv", index_col=0)
data = pd.read_csv("data.csv", index_col=0)

In [0]:
def build_vocab(text, min_count=4):
  '''
  Takes in a 2D list of words and returns 
  set of words occuring min_count or more times
  '''
  wl = []
  for i in text:
    wl += i
  counts = Counter(wl)
  
  vocab = set([i if j >= min_count else '' for i,j in counts.items()])
  if '' in vocab:
    vocab.remove('')
  return vocab, counts

def process(text, vocab):
  '''
  Takes in a Series of texts and vocabulary and returns a dataframe
  with Vocabulary as columns and 1 / 0 as values indicating
  the presence of word in the list
  '''
  text = list(text.apply(lambda x: Counter(set(x) & vocab)))
  return pd.DataFrame(text).fillna(0).astype('bool')

In [0]:
s_proc = s_data.copy()
classes = s_proc['class']
s_proc.text = s_proc.text.apply(lambda x: x.replace('  ', ' ').split())
vocab, _ = build_vocab(s_proc.text, 5)
s_proc = process(s_proc.text, vocab)

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold

kf = KFold(10, random_state=5, shuffle=True)
acc = []
for train, test in kf.split(s_proc, classes):
  model = MultinomialNB()
  model.fit(s_proc.iloc[train], classes.iloc[train])
  acc.append(model.score(s_proc.iloc[test], classes.iloc[test]))
print(f"10-fold accuracy for ExtractNB: {np.mean(acc) * 100:.2f}")

extractNB = MultinomialNB()
extractNB.fit(s_proc, classes)

10-fold accuracy for ExtractNB: 91.71


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
import string
def negateWords(wordlist):
  '''
  Add the tag
  'NOT' to every word between a negation word (“not”,
  “isn’t”, “didn’t”, etc.) and the first punctuation
  mark following the negation word.
  '''
  new_list = []
  i = 0
  while i < len(wordlist):
    if wordlist[i] == "not" or "n't" in wordlist[i]:
      new_list.append(wordlist[i])
      i += 1
      while i < len(wordlist) and wordlist[i] not in string.punctuation:
        new_list.append("NOT_" + wordlist[i])
        i += 1
    if i < len(wordlist):
      new_list.append(wordlist[i])
    i += 1
  return new_list

In [0]:
def transform(sents, vocab=list(s_proc.columns)):
  '''
  Transform senteces into boolean tuples
  '''
  vects = []
  for sent in sents:
    s = set(sent.split())
    vector = [i in s for i in vocab]
    vects.append(vector)
  return vects

def subjectivity(translist, model=extractNB):
  '''
  Indices of sentences ordered based on their subjectivity (descending)
  '''
  return np.argsort(model.predict_proba(translist)[:, list(model.classes_).index('Obj')])

In [21]:
extract_data = data.copy()
extract_data.text = extract_data.text.apply(lambda x: x.split('\n'))
extract_data['transformed'] = extract_data.text.apply(transform)
extract_data.head()

Unnamed: 0,text,class,transformed
0,[synopsis : the president of a company wants t...,neg,"[[False, False, False, False, True, False, Fal..."
1,"[okay , bear with me y'all , cause first off i...",neg,"[[False, False, False, False, False, False, Fa..."
2,"[around the end of 1998 , a japanese cartoon c...",neg,"[[False, False, False, False, False, False, Fa..."
3,"[the story of us , a rob reiner film , is the ...",pos,"[[False, False, False, False, False, False, Fa..."
4,"[when i was nine , i started buying the cooles...",neg,"[[False, False, False, False, False, False, Fa..."


In [0]:
def kfoldAccuracy(X, y, k=3, model=MultinomialNB()):
  '''
  Trains the model and returns the average k-fold cross validation accuracy in %
  '''
  kf2 = KFold(k, random_state=0, shuffle=True)
  acc = []
  print("\tTraining ...")
  for train, test in kf.split(X, y):
    model.fit(X.iloc[train], y.iloc[train])
    acc.append(model.score(X.iloc[test], y.iloc[test]))
  return np.mean(acc) * 100

In [0]:
import networkx as nx

def mincutExtract(probs, classes=list(extractNB.classes_), c=0.5, T=2, f=lambda d: 1/d**2):
  '''
  Takes in an array probabilities that a sentence is subjective vs objective
  and returns the indexes of subjective sentences, based on minimum graph cut partitioning
  '''
  G = nx.Graph()
  G.add_nodes_from(['S', 'T'])
  G.add_nodes_from(range(len(probs)))
  for n in G.nodes():
    if n != 'S' and n != 'T':
      G.add_edge('S', n, capacity=probs[n][classes.index('Subj')])
      G.add_edge('T', n, capacity=probs[n][classes.index('Obj')])
      for n2 in G.nodes():
        if n2 != 'S' and n2 != 'T' and n2 > n:
          G.add_edge(n, n2, capacity=f(n2 - n) * c if n2 - n <= T else 0)
  return list(nx.minimum_cut(G, 'S', 'T')[1][0] - {'S'})

extract_data['extracts'] = extract_data.transformed.apply(lambda x: mincutExtract(extractNB.predict_proba(x)))

In [0]:
reviews = []
for i in range(len(extract_data.text)):
  sents = extract_data.text[i]
  idxs = extract_data.extracts[i]
  sents = np.array(sents)
  review = " ".join(sents[idxs]).replace("  ", " ").split()
  reviews.append(review)

reviews = pd.Series(reviews)
X = process(reviews, build_vocab(reviews)[0])
y = extract_data['class']

In [25]:
print(f"Naive Bayes Accuracy: {kfoldAccuracy(X, y, k=10):.2f}")

	Training ...
Naive Bayes Accuracy: 85.60
