In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import gzip
import csv
f=gzip.open('amazon_reviews_us_Gift_Card_v1_00.tsv.gz','rt') 
reader=csv.reader(f,delimiter='\t');header=next(reader)
dataset=[]
for line in reader:
  d=dict(zip(header,line))
  for field in['helpful_votes','star_rating','total_votes']:
    d[field]=int(d[field])
  dataset.append(d)

In [None]:
from collections import defaultdict
wordcount=defaultdict(int)
for d in dataset:
  for w in d['review_body'].split():
    wordcount[w]+=1
print(len(wordcount))

96967


In [None]:
wordcount=defaultdict(int)
import string
for d in dataset:
  r="".join([c for c in d['review_body'].lower() if c not in string.punctuation])
  for w in r.split():
    wordcount[w]+=1
print(len(wordcount))

46142


In [None]:
wordcount=defaultdict(int)
import string
import nltk
from nltk.stem import PorterStemmer
stemmer=PorterStemmer()
for d in dataset:
  r="".join([c for c in d['review_body'].lower() if c not in string.punctuation])
  for w in r.split():
    w=stemmer.stem(w)
    wordcount[w]+=1
print(len(wordcount))

37358


In [None]:
counts=[(wordcount[w],w) for w in wordcount]
counts.sort()
counts.reverse()
words=[x[1] for x in counts[:1000]]
wordid=dict(zip(words,range(len(words))))
wordset=set(words)
print(len(wordset))

1000


In [None]:
import string
def feature(datum):
  feat=[0]*len(words)
  r=''.join([c for c in datum['review_body'].lower() if not c in string.punctuation])
  for w in r.split():
    if w in words:
      feat[wordid[w]]+=1
  feat.append(1)
  return feat

In [None]:
import random
import numpy
random.shuffle(dataset)
x=[feature(d) for d in dataset]
y=[d['star_rating'] for d in dataset]
y[-10:]

[5, 5, 5, 5, 5, 5, 1, 5, 5, 5]

In [None]:
theta,residuals,rank,s=numpy.linalg.lstsq(x,y)

In [None]:
wordweights=list(zip(theta,words+['offset']))
wordweights.sort()

In [None]:
wordweights[-10:]

[(4513575404.5322, 'onlin'),
 (4981877697.361062, 'choos'),
 (5858384193.431168, 'choic'),
 (11120690006.423256, 'servic'),
 (11445842459.459116, 'happi'),
 (11727949297.691315, 'abl'),
 (11728752660.601345, 'someth'),
 (14839565968.357723, 'receiv'),
 (16702046948.832634, 'famili'),
 (31957665286.13027, 'realli')]

In [None]:
wordweights[:10]

[(-39798006555.103294, 'minut'),
 (-19782744713.27736, 'recipi'),
 (-10287065005.929373, 'conveni'),
 (-4688237217.472494, 'tri'),
 (-3833618397.854087, 'anyth'),
 (-2610184600.7037168, 'mani'),
 (-2583480697.011043, 'everyth'),
 (-845148631.090798, 'occas'),
 (-784707340.2428961, 'anyon'),
 (-763225311.3212624, 'especi')]

In [None]:
from sklearn import linear_model
model=linear_model.Ridge(1.0,fit_intercept=False)
model.fit(x,y)

Ridge(alpha=1.0, copy_X=True, fit_intercept=False, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [None]:
theta=model.coef_
wordweights=list(zip(theta,words+['offset']))
wordweights.sort()

In [None]:
wordweights[-10:]

[(0.37236819568766766, 'leav'),
 (0.3804008464295093, 'impress'),
 (0.40606777722093035, 'parti'),
 (0.44809474415346495, 'rang'),
 (0.46027375054816566, 'oneday'),
 (0.6286911690252799, 'excel'),
 (0.71537432142862, 'serv'),
 (0.7782728854035319, 'attract'),
 (0.9140284088616183, 'instruct'),
 (4.739914593188094, 'offset')]

In [None]:
wordweights[:10]

[(-1.2857648499455618, 'reciev'),
 (-1.1598719009296845, 'guarante'),
 (-1.0481335239952816, 'avail'),
 (-0.6816326632280446, 'cancel'),
 (-0.6313417080399755, 'twice'),
 (-0.5986055101108899, 'australia'),
 (-0.5912735763791137, 'fix'),
 (-0.56313402981, 'wont'),
 (-0.527239522892116, 'cannot'),
 (-0.5267341775887668, 'refund')]

In [None]:
predictions=model.predict(x)
differences=[(x-y)**2 for (x,y) in zip(predictions,y)]
MSE=sum(differences)/len(differences)
print("MSE=" +str(MSE))

MSE=0.46698656833783514


In [None]:
#FRACTION OF VARIANCE UNEXPLAINED
FVU=MSE/numpy.var(y)
R2=1-FVU
print("R2="+str(R2))

R2=0.32090379400665026


In [None]:
y_class=[(rating>3) for rating in y]
y_class[:10]
model=linear_model.LogisticRegression(max_iter=1000)
model.fit(x,y_class)

[True, True, True, True, True, True, True, True, True, True]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
predictions=model.predict(x)
correct=predictions==y_class
accuracy=sum(correct)/len(correct)
print("accuracy="+str(accuracy))

accuracy=0.9063245903850043


In [None]:
TP=sum([(p and l) for (p,l) in zip(predictions,y_class)])
FP= sum([(p and not l)for (p,l) in zip(predictions,y_class)])
TN=sum([(not p and not l) for (p,l) in zip(predictions,y_class)])
FN=sum([(not p and l)  for (p,l) in zip(predictions,y_class)])

In [None]:
print("TP="+str(TP))
print("FP="+str(FP))
print("TN="+str(TN))
print("FN="+str(FN))

TP=134088
FP=9144
TN=329
FN=4749


In [None]:
accuracy=(TP+TN)/(TP+FP+TN+FN)
accuracy

0.9063245903850043

In [None]:
TPR=TP/(TP+FN)
TNR=TN/(TN+FP)
BER=1-(1/2*(TPR+TNR))
BER

0.49973764656408703

In [None]:
precision=TP/(TP+FP)
precision

0.9361595174262735

In [None]:
recall=TP/(TP+TN)
recall

0.99755239292649

In [None]:
F1=2*(precision*recall)/(precision+recall)
F1

0.9658813826089775

In [None]:
confidences=model.decision_function(x)
confidences

array([3.26737206, 4.39165227, 1.31202062, ..., 5.00224226, 3.1492446 ,
       6.25581965])

In [None]:
confidencesandlabels=list(zip(confidences,y_class))
confidencesandlabels

[(3.2673720584834847, True),
 (4.391652268547465, True),
 (1.3120206152285103, True),
 (-1.3045098975951284, True),
 (4.49865129923056, True),
 (3.6695149335991974, True),
 (3.7890331694636146, True),
 (7.848037982444337, True),
 (3.5241422317361013, True),
 (12.024520574382766, True),
 (5.0643429820399675, True),
 (6.496569182443864, True),
 (5.624013493877333, True),
 (5.040626570233243, True),
 (2.9110951066310604, True),
 (5.42314311472256, True),
 (7.222959510317947, True),
 (3.883441376905436, True),
 (9.499279689738307, True),
 (0.5599360137606655, True),
 (7.523231304412894, True),
 (4.742270961902028, True),
 (3.0920378283860575, True),
 (3.918968386845407, True),
 (10.91873028947922, True),
 (1.9293988416980454, True),
 (4.3366537937026415, True),
 (5.531574785983596, True),
 (13.323881554927612, True),
 (6.779935231047682, True),
 (4.819536030512166, True),
 (5.564717966045565, True),
 (3.591866298081687, True),
 (4.247877352049628, True),
 (2.6446305052192374, True),
 (3.89

In [None]:
labelsrankedbyconfidence=[z[1] for z in confidencesandlabels]
labelsrankedbyconfidence[0:10]

[True, True, True, True, True, True, True, True, True, True]

In [None]:
def precisionatk(k,y_sorted):
  return sum(y_sorted[:k])/k
def recallatk(k,y_sorted):
  return sum(y_sorted[:k])/sum(y_sorted)
print(precisionatk(50,labelsrankedbyconfidence))
print(precisionatk(1000,labelsrankedbyconfidence))
print(precisionatk(10000,labelsrankedbyconfidence))

1.0
0.949
0.9391
