# *Sentiment Analysis of Movie Reviews*

In [171]:
import numpy as np
import matplotlib.pyplot as plt
from numpy.typing import NDArray
from typing import List, Tuple, Dict

In [155]:
Data = List[Tuple[str, int]]
def load_data(path: str) -> Data:
  data = open(path, encoding='latin-1').read().splitlines() 
  training_data: Data = []
  for sentence in data:
    training_data.append((sentence[3:], int(sentence[:2])))
  return training_data

### Benchmarking load_data function

In [157]:
import timeit
timeit.timeit("load_data('polarity.train')", globals=locals(), number=1000)

1.386632839043159

### Load training data

In [161]:
training_data = load_data('polarity.train')

### Represent $\phi(x)$ as mapping from word $x$ to number of times it appears in the sentence, $\phi(x) = \{x_1: n_1, x_2: n_2, \dots, x_n: n_n\}$

In [176]:
def extractFeatures(s: str) -> Dict[str, int]:
  return {w: s.count(w) for w in set(s.split())}

phi = extractFeatures('random string with repetititions that are random')
phi

{'string': 1, 'are': 1, 'with': 1, 'random': 2, 'that': 1, 'repetititions': 1}

### Dot product of two sparse vectors

In [247]:
SparseVector = Dict[str, int]

def dotProduct(v1: SparseVector, v2: SparseVector):
  if len(v1) < len(v2):
    print('here1')
    return sum(v1[k]*v2[k] for k in v1 if k in v2)
  else: 
    print(f'{len(v1)}')
    print(f'{len(v2)}')
    print('here2')
    for k in v1:
      print(v1[k])
    return sum(v1[k]*v2[k] for k in v2 if k in v1)

v1 = {'bye': 4, 'cool': 3}
v2 = {'hello': 1, 'bye': 2, 'cool': 3}
print(dotProduct(v2, v1))

3
2
here2
17


### Increment weights

In [241]:
def increment(weights: SparseVector, scale: float, v: SparseVector):
  for k in v:
    print(f'{k=}')
    if k in weights:
      weights[k] += scale * v[k]
    else:
      weights[k] = scale * v[k]

In [253]:
weights = {} # features=>weights or more simply words=>weights
eta = 0.5

for t in range(100):
  for x, y in training_data:
    phi = extractFeatures(x)
    print(f'{phi=}')
    tmp = dotProduct(weights, phi) * y
    if tmp < 1:
      increment(weights, -eta*-y, phi)
    print(f'{weights=}')

phi={',': 2, 'poetic': 1, 'dull': 1, 'sadly': 1, 'and': 1, '.': 1, 'earnest': 1, "everything's": 1, '--': 2, 'serious': 1}
here1
k=','
k='poetic'
k='dull'
k='sadly'
k='and'
k='.'
k='earnest'
k="everything's"
k='--'
k='serious'
weights={',': -1.0, 'poetic': -0.5, 'dull': -0.5, 'sadly': -0.5, 'and': -0.5, '.': -0.5, 'earnest': -0.5, "everything's": -0.5, '--': -1.0, 'serious': -0.5}
phi={',': 1, 'day': 1, 'every': 1, 'trouble': 1, 'narratively': 1, 'a': 4, 'plodding': 1, 'mess': 1, '.': 1, 'is': 1}
10
10
here2


KeyError: 'day'