In [1]:
from nltk.corpus import opinion_lexicon
from nltk.stem import WordNetLemmatizer
from nltk.probability import ConditionalFreqDist
from nltk.probability import ConditionalProbDist, ELEProbDist #71.50 training accuracy
from nltk.probability import LaplaceProbDist #71.90% training accuracy

import random
import nltk
import numpy as np

from Models import LoadAndTrainModels

def sentiment_feature(word):
	return {'word': word}

def printProbabilitiesFromLogProb(prob_classify_dict):
    positive = prob_classify_dict["positive"]
    negative = prob_classify_dict["negative"]

    print("positive negative log: ", positive)
    print("negative negative log: ", negative)
    
    probs = np.exp([positive, negative])/np.exp([positive, negative]).sum()
    print("positive probability: ", probs[0])
    print("negative probability: ", probs[1])
    
def getProbabilitiesFromLogProb(prob_classify_dict):
    positive = prob_classify_dict["positive"]
    negative = prob_classify_dict["negative"]
    
    probs = np.exp([positive, negative])/np.exp([positive, negative]).sum()
    
    return(probs)

In [2]:
lemmatizer = WordNetLemmatizer()
    
text = opinion_lexicon

In [3]:
positive_words = ([(lemmatizer.lemmatize(word=word), 'positive') for word in text.positive()])
negative_words = ([(lemmatizer.lemmatize(word=word), 'negative') for word in text.negative()])

labeled_words = positive_words + negative_words

In [4]:
random.shuffle(labeled_words)

In [5]:
featuresets = [(sentiment_feature(n), sentiment) for (n, sentiment) in labeled_words]
    
train_amnt = int(len(featuresets)*.8) #80% train data; 20% test data
train_data = featuresets[:train_amnt]
test_data = featuresets[train_amnt:]
assert len(train_data) == train_amnt

In [6]:
featuresets

[({'word': 'flareups'}, 'negative'),
 ({'word': 'lackadaisical'}, 'negative'),
 ({'word': 'vile'}, 'negative'),
 ({'word': 'wobble'}, 'negative'),
 ({'word': 'inelegant'}, 'negative'),
 ({'word': 'blind'}, 'negative'),
 ({'word': 'deceptively'}, 'negative'),
 ({'word': 'irreformable'}, 'negative'),
 ({'word': 'temptingly'}, 'positive'),
 ({'word': 'ominous'}, 'negative'),
 ({'word': 'inexperience'}, 'negative'),
 ({'word': 'insurrection'}, 'negative'),
 ({'word': 'dazzle'}, 'positive'),
 ({'word': 'entanglement'}, 'negative'),
 ({'word': 'irrelevance'}, 'negative'),
 ({'word': 'tenacity'}, 'positive'),
 ({'word': 'dizzing'}, 'negative'),
 ({'word': 'insignificance'}, 'negative'),
 ({'word': 'expeditiously'}, 'positive'),
 ({'word': 'crisis'}, 'negative'),
 ({'word': 'improvement'}, 'positive'),
 ({'word': 'upliftingly'}, 'positive'),
 ({'word': 'trustingly'}, 'positive'),
 ({'word': 'hazard'}, 'negative'),
 ({'word': 'revolutionize'}, 'positive'),
 ({'word': 'cancer'}, 'negative'),
 ({

In [7]:
from nltk.probability import ConditionalFreqDist, ConditionalProbDist
cfdist = ConditionalFreqDist(labeled_words)
cpdist = ConditionalProbDist(cfdist, LaplaceProbDist, bins=2)

In [8]:
cpdist['good'].prob('positive')

0.6666666666666666

In [9]:
from nltk.probability import DictionaryProbDist
    
dict_probs = {'positive': .1, 'negative': .1}
label_probdist = DictionaryProbDist(dict_probs)

classifier = nltk.NaiveBayesClassifier(label_probdist=label_probdist, feature_probdist=cpdist)
classifier = classifier.train(train_data, estimator=LaplaceProbDist)

#classifier.train(train_data)                                      
#classifier.train(test_data)
print("Classifier accuracy percent: ", (nltk.classify.accuracy(classifier, test_data)*100))

Classifier accuracy percent:  72.38586156111928


In [10]:
for x in cpdist:
    print(x)

flareups
lackadaisical
vile
wobble
inelegant
blind
deceptively
irreformable
temptingly
ominous
inexperience
insurrection
dazzle
entanglement
irrelevance
tenacity
dizzing
insignificance
expeditiously
crisis
improvement
upliftingly
trustingly
hazard
revolutionize
cancer
annihilation
clunky
get-rich
condescension
skeptically
crack
disadvantaged
shamefully
threaten
sinisterly
celebrated
crumpled
plea
confused
chastise
plagiarize
irrecoverably
crisper
graft
streaky
painfull
detrimental
sarcastically
clique
tenaciously
incoherently
spoilage
misery
spurn
nauseatingly
dodgey
meaningless
uplift
gallingly
absurdity
monstrosity
unjustifiable
blossom
thrifty
unreal
nebulously
awesomely
recklessly
renown
choppy
terror
belittle
jagged
educated
scratchy
adverse
immature
unipolar
poeticize
vibrates
soreness
effusiveness
ardently
pep
favorited
seriousness
reachable
desolately
erase
dismalness
distraughtness
mesmerizing
catastrophe
disloyalty
unruly
unobserved
nasty
grateful
boundless
menacing
mobster
i

dy
rubbish
set-up
syndrome
stereotypically
oasis
jaded
inhibit
thrilled
hurtful
insidiously
pricier
staunchly
captivating
nemesis
unaffected
disrespectfulness
broken-hearted
bull****
fulminate
dragged
subsidize
massacre
industrious
reasoned
dislike
fallacy
blockhead
thank
appeal
cautionary
sack
bonny
hang
delay
screwed-up
clamor
unusual
swagger
admirer
blurred
shimmering
smooth
shrill
retarded
unfounded
brutality
stooge
invisible
extravagance
integral
restless
genius
slander
visionary
a+
frenzy
impolitic
unbound
inviolable
imperfectly
mastery
scramble
perfection
useable
cramping
inoperable
purify
reprehensibly
poverty
evil
hawkish
impolite
spews
lukewarm
hating
unhealthy
monotonous
accomplished
excited
joyful
goad
avarice
trophy
worn
subjection
gall
sharp
fiery
skimpy
inaccurate
frown
stupify
qualify
doddering
energetic
excessively
obscene
beckons
unexpectedly
counter-attacks
deceitfully
scream
risk
rightfully
insinuating
invidious
harassment
generosity
inspiring
scarily
unneeded
hedon

greatest
prize
heartbreakingly
lumpy
intefere
shit
dazzling
shock
disapointment
unrelenting
patriot
incomplete
seamless
jaw-dropping
violator
vent
soundness
gabble
infested
unfamiliar
stiff
eased
thoughtfulness
stupified
hideous
excellant
revolutionized
lone
discomfititure
preposterously
garbage
crumble
well-being
issue
mangling
wretchedly
galling
vociferously
anti-occupation
burdensomely
weirdly
bickering
readily
standout
jitter
disaffected
doom
squeaky
malevolent
irregular
futilely
sumptuous
falling
heartwarming
excellency
cashbacks
incompatible
acerbically
direness
stodgy
baffled
bungler
stench
advocate
belligerent
peevish
scrambling
ineptly
nifty
clogged
tingle
horrific
disdain
smelling
dauntless
acrimonious
efficient
inviolate
figurehead
ill-defined
distressing
righteously
consistently
static
illegal
goofy
puzzlement
excelled
insecure
boggle
farcical
plot
uncompromising
carelessness
repel
shrug
unbeatable
sensationalize
distrustful
uproariously
first-rate
murderer
adulteration
use

jealousness
disconcerted
sulk
irksomenesses
imposition
personalized
injury
halfheartedly
repugn
overzealously
accomodative
elation
awsome
palatial
contrasty
dynamic
admiring
unmatched
shiny
chill
faithless
perfidity
calumnious
complimentary
drowning
recant
inglorious
nag
precipitous
gutter
risk-free
languid
patronize
standstill
victimize
strangely
uncomfortably
offensiveness
resplendent
miraculous
slanderous
repugnantly
steep
encroach
extinguish
imaginative
pervert
overrun
rogue
guilt
absurdly
crumples
maltreatment
revulsive
congratulate
swelling
supporter
gawk
regrettable
productively
slowest
reputable
infallibly
praising
heaven
superior
rage
congratulation
cheating
rupture
wrongly
wonderously
simplifying
staid
coercion
debaucher
mistakenly
accessible
exhilaration
deadlock
debility
loveless
discontent
masterpiece
fucking
brazenly
warned
sad
autonomous
abominable
humming
salacious
inadvisably
shocked
unacceptable
posh
fictional
swankiest
damper
regard
one-sided
longingly
unprofitable
c

tentatively
friendly
aloof
crushing
idle
recovery
agreeably
fatigued
stately
believeable
malodorous
timely
admonition
fiend
negligence
illogic
delicious
sadden
viewable
skeptic
admonishment
assuring
disputable
receptive
monstrously
daunting
debasement
pandemonium
praiseworthy
exaltingly
phony
scathing
redemption
dismally
lucidly
merciless
dire
grievously
trendy
plotter
unintelligible
deceiving
bitch
encroachment
golden
disaffirm
underdog
annoying
coerce
discouraging
woeful
deteriorate
rational
wail
distressingly
menace
tamper
undaunted
edify
viciously
misleadingly
delighted
immoderate
villainously
impose
impurity
mischief
belated
secretive
alarmed
supporting
wobbled
immorality
subvert
neurotically
hardhearted
ambush
stiffness
worthiness
mortifying
geezer
bright
indistinguishable
deplorably
burning
insatiable
wanton
paradise
enjoyment
discordance
undercutting
wrangle
easygoing
unintelligile
savagery
enfeeble
harshly
impede
triumphantly
nefariously
rip
explosive
angriness
laggy
crushed
f

In [11]:
print("classifying: asdg")
prob_classify = (classifier.prob_classify({"word":"asdg"}))
print(prob_classify._prob_dict)


print("classifying: good")
prob_classify = (classifier.prob_classify({"word":"good"}))
print(prob_classify._prob_dict)

print("classifying: bad")
prob_classify = (classifier.prob_classify({"positive":"bad"}))
print(prob_classify._prob_dict)

classifying: asdg
{'negative': -0.6405606717984611, 'positive': -1.4798085390439812}
classifying: good
{'negative': -0.6405606717984611, 'positive': -1.4798085390439812}
classifying: bad
{'negative': -0.5081764087482131, 'positive': -1.7520026085156815}


In [12]:
for (label,fname) in classifier._feature_probdist:
    print(label, fname)

negative word
positive word


In [13]:
for x in classifier._feature_probdist:
    print(x)

('negative', 'word')
('positive', 'word')


In [14]:
for x in classifier._labels:
    print(x)

negative
positive


In [15]:
prob_classify_dict = classifier.prob_classify({"word":"good"})._prob_dict
positive = prob_classify_dict["positive"]
negative = prob_classify_dict["negative"]

In [16]:
print(positive)
print(negative)

-1.4798085390439812
-0.6405606717984611


In [17]:
probs = np.exp([positive, negative])/np.exp([positive, negative]).sum()

In [18]:
from Make_Enron_Corpus import enronCorpus
enron_corp = enronCorpus()


In [19]:
print(probs)

[0.30169322 0.69830678]


In [20]:
b = {}
avg_sentiment = {}
for d in enron_corp.fileids():
    b[d] = enron_corp.words(d)

In [22]:
for d in enron_corp.fileids():
        
        input_data = b[d]
        relation_word_feature_data = LoadAndTrainModels.applyFeatureSetToWords(input_data) 
        
        if(len(relation_word_feature_data) == 0):
            avg_sentiment[d] = [[0,0]]
        else:
            avg_sentiment[d] = []
            #total_sentiment = 0
            for feature in relation_word_feature_data:               
                prob_classify_dict = classifier.prob_classify(feature)._prob_dict
                avg_sentiment[d].append(getProbabilitiesFromLogProb(prob_classify_dict))

In [25]:
print("file    likelihood that file is all positive")
for file in avg_sentiment:
    #print(file)
    avg_a_prediction_for_extremes = 0
    total_prediction_for_extremes = 0
    count = 0
    
    arr = avg_sentiment[file]
    for a in arr:
        if(a[0] > .5 or a[0] < .5):
            total_prediction_for_extremes += a[0]
            count += 1
    avg_a_prediction_for_extremes = total_prediction_for_extremes/count
    print(file, " : ", avg_a_prediction_for_extremes)

file    likelihood that file is all positive
5_5.txt  :  0.2237706825635456
5_2.txt  :  0.22377068256350843
2_1.txt  :  0.2237706825635513
4_5.txt  :  0.22377068256353264
4_2.txt  :  0.22377068256355084
5_3.txt  :  0.22377068256350482
5_4.txt  :  0.22377068256353783
1_1.txt  :  0.22377068256355065
4_4.txt  :  0.22377068256353924
1_5.txt  :  0.22377068256350627
1_2.txt  :  0.2237706825635501
3_3.txt  :  0.0
2_4.txt  :  0.22377068256354354
4_1.txt  :  0.2237706825635092
3_2.txt  :  0.2237706825635018
3_5.txt  :  0.22377068256351143
1_4.txt  :  0.22377068256353536
5_1.txt  :  0.2237706825635423
2_2.txt  :  0.2237706825635476
2_5.txt  :  0.2237706825635422
