In [2]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np
import os
import re
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D


paths = r'C:\Users\Pisces Khan\OneDrive\Documents\Articles'
filenames = os.listdir(paths)
print(filenames)

#collect all stop words
stopWords = set(stopwords.words('english'))
#update stop words
stopWords.update(['"', "'", ':', '(', ')', '[', ']', '{', '}']) #'.',  ',', '?', '!', ';'

#obtain the noun position in the text
def getNounPositions(type,tagged,lineIn):
    nounPosi={}
    for item in tagged:
        if item[1]==type:
            nounPosi[item[0]]=-1
    
    for key in nounPosi.keys():
        regExpression=r'\b'+key.lower()+r'\b'
        nounsi=[m.start() for m in re.finditer(regExpression, lineIn.lower())]
        nounPosi[key]=nounsi
    return nounPosi

#tokenize and obtain the words
def getWords(sentence):
	return word_tokenize(sentence)
#get tags for each word 
def getTagsForWords(textLn2):
    tokens=word_tokenize(textLn2)
    tagged=pos_tag(tokens)
    return(tagged)
#remove stop word from the word list
def stopwordRemove(wordlist):
	stopword_list = set(stopwords.words('english'))
	return [stopped for stopped in wordlist if stopped not in stopword_list]

#collect top 3 nouns based on their occurances
def getTop3NounAndFreq(NNP):
    top3Noun=[]*3
    nounSortedByFreq=sorted(NNP.items(), key=lambda item: len(item[1]), reverse=True)    
    for ni in range(len(nounSortedByFreq)):
        if ni>2:
            break
        nounNm=nounSortedByFreq[ni][0]
        freqNm=len(nounSortedByFreq[ni][1])
        top3Noun.append((nounNm,freqNm))
    return(top3Noun)

#read the news text from a single file
def readFromSingleFile(Paths,fileNm):
    path=paths+'/'+fileNm
    reviewContent=[]
    dataFile=open(path, encoding="utf8")
    for l in dataFile:
        print(len(l))
        if(len(l)<=1):
            continue
        reviewContent.append(l.rstrip('\n'))
    dataFile.close()
    return reviewContent


#perform 3D column chart of noun compound score and thier occurance
def plotMatrix3DColumnNounCompundScoreOccurence(sentiment_scoresForAnArticle):
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    nElements=len(sentiment_scoresForAnArticle)
    xs=[i for i in range(nElements)]
    ys=[sei[0][0][1] for  sei in sentiment_scoresForAnArticle ]
    zs=[0]*nElements
    dx=[0.4]*nElements
    dy=[0.4]*nElements
    dz=[np.abs(sei[1]['compoundScore']) for  sei in sentiment_scoresForAnArticle ]
    colors_ar=[]
    for aSenti in sentiment_scoresForAnArticle:
        if aSenti[1]['compoundScore']>0:
            colors_ar.append('b')
        elif aSenti[1]['compoundScore']==0:
            colors_ar.append('g')
        else:
            colors_ar.append('r')    
    
    ax.bar3d(xs,ys,zs,dx,dy,dz,color=colors_ar)
    ax.plot([],[],color='b',label='Positve Sentiment')
    ax.plot([],[],color='g',label='Neutral Sentiment')
    ax.plot([],[],color='r',label='Negative Sentiment')
    nounNms=[sei[0][0][0] for sei in sentiment_scoresForAnArticle]
    #ax.set_xlabel('Noun')
    ax.set_xticklabels(nounNms)
    ax.set_ylabel('Num. of Noun Occurences')
    ax.set_zlabel('Compound Score')
    plt.legend()
    plt.show()    

#perform 3D column chart of noun, positive and negative score
def plotMatrix3DColumnNounPosiNegiScore(sentiment_scoresForAnArticle):
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    nElements=len(sentiment_scoresForAnArticle)
    xs=[i for i in range(nElements)]
    ys=[sei[1]['negScore'] for  sei in sentiment_scoresForAnArticle]
    zs=[0]*nElements
    dx=[0.5]*nElements
    dy=[0.005]*nElements
    dz=[sei[1]['posScore'] for  sei in sentiment_scoresForAnArticle ] 
    colors_ar=['b']*nElements
    ax.bar3d(xs,ys,zs,dx,dy,dz,color=colors_ar)
    nounNms=[sei[0][0][0] for sei in sentiment_scoresForAnArticle]
    #ax.set_xlabel('Noun')
    ax.set_ylabel('Negative Score')
    ax.set_zlabel('Positive Score')
    ax.set_xticklabels(nounNms)
    ax.plot([],[],color='b',label='Positve Sentiment')
    plt.legend()
    plt.show()    



#read all news files from given path and perform the sentiment analysis
cnt=0
sentiment_scoresForAnArticle=[]
for filename in filenames:
    cnt=cnt+1
    text=readFromSingleFile(paths,filename)
    lineIn='\n'.join(text)
    tagged=getTagsForWords(lineIn)
    NNP=getNounPositions('NNP',tagged,lineIn)
    top3Noun=getTop3NounAndFreq(NNP)
    review_tokens = [getWords(asdf.lower()) for asdf in text]
    stopped_sent = [stopwordRemove(sentence) for sentence in review_tokens]
    sents = []
    for i in stopped_sent:
        asdf = ''
        for j in i:
            asdf = asdf + j + ' '
        sents.append(asdf)
    sid = SentimentIntensityAnalyzer()
    allReviews=' '.join(sent for sent in sents)
    sentiment_scores=sid.polarity_scores(allReviews)
    sentiment_scoresForAnArticle.append([top3Noun,sentiment_scores])

#perform sentiment informaiton 3D visualization
plotMatrix3DColumnNounPosiNegiScore(sentiment_scoresForAnArticle)
plotMatrix3DColumnNounCompundScoreOccurence(sentiment_scoresForAnArticle)

['negative_0.txt', 'negative_1.txt', 'negative_10.txt', 'negative_11.txt', 'negative_12.txt', 'negative_13.txt', 'negative_14.txt', 'negative_15.txt', 'negative_16.txt', 'negative_17.txt', 'negative_18.txt', 'negative_19.txt', 'negative_2.txt', 'negative_20.txt', 'negative_21.txt', 'negative_22.txt', 'negative_23.txt', 'negative_24.txt', 'negative_25.txt', 'negative_26.txt', 'negative_27.txt', 'negative_28.txt', 'negative_29.txt', 'negative_3.txt', 'negative_30.txt', 'negative_31.txt', 'negative_32.txt', 'negative_33.txt', 'negative_4.txt', 'negative_5.txt', 'negative_6.txt', 'negative_7.txt', 'negative_8.txt', 'negative_9.txt', 'neutral_0.txt', 'neutral_1.txt', 'neutral_10.txt', 'neutral_11.txt', 'neutral_12.txt', 'neutral_13.txt', 'neutral_14.txt', 'neutral_15.txt', 'neutral_16.txt', 'neutral_17.txt', 'neutral_18.txt', 'neutral_19.txt', 'neutral_2.txt', 'neutral_20.txt', 'neutral_21.txt', 'neutral_22.txt', 'neutral_23.txt', 'neutral_24.txt', 'neutral_25.txt', 'neutral_26.txt', 'neutr

128
1
1167
1
913
1
596
1
785
1
67
1
29
186
1
471
1
496
1
563
1
546
1
401
1
549
1
299
1
455
1
524
1
485
1
242
212
1
1
228
1
292
1
231
1
230
1
243
1
590
1
221
1
261
1
371
1
337
1
484
1
268
1
264
214
1
445
1
321
1
303
1
361
1
376
1
278
1
470
1
261
336
1
1
212
1
285
1
514
1
373
1
407
283
1
506
1
257
1
295
1
242
1
460
1
238
1
279
267
1
395
1
172
1
263
1
181
1
311
1
390
1
811
1
407
1
533
1
135
1
658
1
218
1
347
1
476
1
792
1
687
1
508
1
296
1
382
1
317
1
235
1
346
1
569


error: unterminated character set at position 2