In [3]:
import pandas as pd
import numpy as np
import nltk

In [8]:
imdb = pd.read_csv('https://raw.githubusercontent.com/skathirmani/datasets/master/imdb_sentiment.csv')

In [9]:
imdb.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


VADER
---------------

Valence Aware Dictionary sENTIMENT Reasoner

In [20]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sentiment = SentimentIntensityAnalyzer()

In [32]:
sentiment.polarity_scores('i love india')

{'compound': 0.6369, 'neg': 0.0, 'neu': 0.192, 'pos': 0.808}

In [36]:
sentiment.polarity_scores('i LOVE india')['compound']

0.7125

compound calc

In [30]:
# For "love" word, normalization is calculated as below:

score = 3.2
alpha = 15
compound = score / np.sqrt(np.square(score)+alpha)
compound

0.6369499429264264

In [35]:
sentiment.polarity_scores('i love india i hate apple')

{'compound': 0.128, 'neg': 0.374, 'neu': 0.202, 'pos': 0.424}

pos, neg, neu calc

In [1]:
pos = 3.2 + 1
neg = 2.7 + 1
neu = 2
total = pos + neg + neu
pos/total,neg/total,neu/total

(0.42424242424242425, 0.37373737373737376, 0.20202020202020202)

In [5]:
score = (pos - neg)
alpha = 15
compound = score / np.sqrt(np.square(score)+alpha)
compound

0.12803687993289598

In [38]:
def get_sentiment(text):
    compound = sentiment.polarity_scores(text)['compound']
    if compound < 0:
        return 0
    else:
        return 1
imdb['sentiment_vader'] = imdb['review'].apply(get_sentiment)

In [40]:
from sklearn.metrics import accuracy_score
accuracy_score(imdb['sentiment'],imdb['sentiment_vader'])

0.7767379679144385

Topic Modelling
----------
- % of topic used in each document
- Cluster

- LSA
- Matrix Factorization
- LDA (Latent Dirichlet Allocation)

LDA
---------- 
Document - Term (Main Matrix)
- Document - Topic Relationship (Matrix 1)
- Topic - Term Relationship (Matrix 2)

In [42]:
!pip install gensim



You are using pip version 18.1, however version 19.0.3 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [48]:
amazon = pd.read_csv('amazon_reviews_big.csv')
amazon.shape

(100000, 8)

In [49]:
amazon['reviewText'].head()

0    What I recieved is not what is pictured here O...
1    Excellent unit and a pretty simple install usi...
2    I'm enjoying this keyboard, I'm getting anothe...
3    Overall, this is a fantastic camera that I'm e...
4    These work very well with mySamsung PN64D7000 ...
Name: reviewText, dtype: object

In [50]:
docs = amazon['reviewText'].fillna('').str.lower()
docs = docs.str.replace('[^a-z ]','')

docs_clean = []
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['','use','one','like','work'])
stemmer = nltk.stem.PorterStemmer()
for doc in docs:
    words = doc.split(' ')
    words_clean = [stemmer.stem(word) for word in words if word not in stopwords]
    words_clean = [word for word in words_clean if word not in stopwords]
    docs_clean.append(words_clean)

In [51]:
docs_clean[0][0:5]

['reciev', 'pictur', 'advert', 'vidio', 'cabl']

In [52]:
import gensim
dictionary = gensim.corpora.Dictionary(docs_clean)

In [53]:
len(list(dictionary.values())),len(list(dictionary.keys()))

(152945, 152945)

In [54]:
list(zip(dictionary.keys(),dictionary.values()))[0:5]

[(0, 'advert'), (1, 'amazoncom'), (2, 'anyth'), (3, 'attempt'), (4, 'back')]

In [55]:
dictionary.doc2bow(docs_clean[0])[0:5]

[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1)]

In [56]:
docs_bow = []
for doc in docs_clean:
    bow = dictionary.doc2bow(doc)
    docs_bow.append(bow)

In [57]:
docs_bow[0][0:5],docs_bow[1][0:5],docs_bow[2][0:5]

([(0, 1), (1, 2), (2, 1), (3, 1), (4, 1)],
 [(27, 1), (34, 1), (49, 1), (52, 1), (53, 1)],
 [(6, 2), (20, 1), (21, 3), (29, 1), (41, 1)])

In [58]:
lda_model = gensim.models.LdaModel(docs_bow,
                                  id2word = dictionary,
                                  num_topics = 8)

In [59]:
# Output format(Topic_type, % of words in that Topic)
lda_model.get_document_topics(docs_bow[0]),lda_model.get_document_topics(docs_bow[1]),lda_model.get_document_topics(docs_bow[3])

([(0, 0.4490993), (1, 0.049294174), (5, 0.23619917), (7, 0.2574524)],
 [(1, 0.1801925), (2, 0.47745797), (5, 0.33116606)],
 [(1, 0.03998171), (6, 0.9421356)])

In [60]:
doc2topic = pd.DataFrame(lda_model.get_document_topics(docs_bow[0]),
            columns = ['topic_no','prob'])
doc2topic.sort_values('prob', ascending = False).iloc[0]['topic_no']

0.0

In [61]:
topics = []
for bow in docs_bow:
    doc2topic = pd.DataFrame(lda_model.get_document_topics(docs_bow[0]),
            columns = ['topic_no','prob'])
    topic = doc2topic.sort_values('prob', ascending = False)
    topic = topic.iloc[0]['topic_no']
    topics.append(topic)

In [62]:
topics[0:10],len(topics)

([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 100000)

In [63]:
lda_model.print_topics()

[(0,
  '0.024*"cabl" + 0.016*"connect" + 0.015*"batteri" + 0.014*"devic" + 0.012*"charg" + 0.010*"power" + 0.009*"plug" + 0.009*"need" + 0.008*"router" + 0.007*"adapt"'),
 (1,
  '0.023*"tv" + 0.021*"speaker" + 0.015*"sound" + 0.011*"set" + 0.010*"player" + 0.010*"video" + 0.009*"play" + 0.009*"qualiti" + 0.008*"pictur" + 0.008*"great"'),
 (2,
  '0.025*"case" + 0.012*"fit" + 0.010*"ipad" + 0.009*"well" + 0.009*"cover" + 0.009*"screen" + 0.008*"look" + 0.008*"protect" + 0.008*"would" + 0.007*"nice"'),
 (3,
  '0.036*"keyboard" + 0.032*"mous" + 0.026*"button" + 0.022*"monitor" + 0.018*"key" + 0.011*"logitech" + 0.009*"batteri" + 0.008*"feel" + 0.007*"hand" + 0.007*"click"'),
 (4,
  '0.028*"drive" + 0.018*"card" + 0.013*"usb" + 0.012*"comput" + 0.010*"gb" + 0.009*"instal" + 0.008*"window" + 0.007*"run" + 0.007*"hard" + 0.007*"file"'),
 (5,
  '0.014*"get" + 0.011*"time" + 0.009*"would" + 0.008*"unit" + 0.008*"tri" + 0.007*"go" + 0.006*"radio" + 0.006*"thing" + 0.006*"back" + 0.006*"review"')