### Method 1 : Naive Bayes Classifer for Sentiment Analysis  

In [125]:
import pandas

import numpy

from nltk.corpus import stopwords

import nltk

from nltk.tokenize import RegexpTokenizer

from nltk.classify import NaiveBayesClassifier

import nltk.classify.util, nltk.metrics

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

import random


In [54]:
#Reading data

data = pandas.read_csv("Amazon_Unlocked_Mobile_iphone6.csv", encoding='utf-8')

data = data[['Reviews','Rating']]

In [55]:
#polarity assignment

data['polarity'] = numpy.where(data['Rating']>3, 'pos', 'neg')

data['Reviews'] = data['Reviews'].str.lower()

In [56]:
#creating word feature_extraction

def create_word_features(para):

    tokenizer = RegexpTokenizer(r'\w+')

    words = tokenizer.tokenize(para)

    useful_words = [word for word in words if word not in nltk.corpus.stopwords.words("english")]

    base_words = [stemmer.stem(word) for word in useful_words]

    my_dict = dict([(word, True) for word in base_words])

    return my_dict


In [57]:
#Stratified sampling

data_pos= data.loc[(data['polarity'] == 'pos')]

data_neg = data.loc[(data['polarity'] == 'neg')]

data_neg = data_neg.dropna()

data_pos = data_pos.dropna()


In [110]:
pos_reviews = []

for i in data_pos['Reviews']:

    pos_reviews.append((create_word_features(i), "positive"))

print(len(pos_reviews))

pos_reviews


3724


[({'6': True,
   'appl': True,
   'call': True,
   'card': True,
   'contact': True,
   'could': True,
   'discov': True,
   'easili': True,
   'everyth': True,
   'expect': True,
   'good': True,
   'hesit': True,
   'iphon': True,
   'like': True,
   'new': True,
   'phone': True,
   'purchas': True,
   'recommend': True,
   'regist': True,
   'repres': True,
   'seller': True,
   'sim': True,
   'sinc': True,
   'unlock': True,
   'verizon': True,
   'would': True},
  'positive'),
 ({'4': True,
   '6': True,
   'appear': True,
   'awesom': True,
   'believ': True,
   'camera': True,
   'clear': True,
   'crystal': True,
   'differ': True,
   'far': True,
   'fast': True,
   'final': True,
   'freez': True,
   'gold': True,
   'huge': True,
   'iphon': True,
   'last': True,
   'long': True,
   'market': True,
   'phone': True,
   'processor': True,
   'qualiti': True,
   'superior': True,
   'time': True,
   'upgrad': True},
  'positive'),
 ({'16gb': True,
   '6': True,
   'advertis

In [111]:
neg_reviews = []

for i in data_neg['Reviews']:
    
    neg_reviews.append((create_word_features(i), "negative"))
    
print(len(neg_reviews))    


1028


In [112]:
i = random.sample(pos_reviews,  int(len(pos_reviews)*0.70))
j = random.sample(neg_reviews,  int(len(neg_reviews)*0.70))
train = i + j

In [96]:
print(len(train),  len(test))

In [118]:
test = [pos_reviews[k] for k in range(0,len(pos_reviews)) if k not in i] + [neg_reviews[k] for k in range(0,len(neg_reviews)) if k not in j]

Now, we train and test the model using the Naive Bayes Classifier.

In [127]:
#Model training
classifier = NaiveBayesClassifier.train(train)

In [122]:
#Model Validation
accuracy = nltk.classify.util.accuracy(classifier, train)
print(accuracy * 100)
print((1-accuracy) * 100)

88.69172932330827
11.308270676691734


In [123]:
accuracy = nltk.classify.util.accuracy(classifier, test)
print(accuracy * 100)
print((1-accuracy) * 100)


87.83670033670033
12.163299663299664


### Method 2 : Naive Bayes Classifier

In [6]:
import pandas as pd
import nltk
import collections
import numpy as np


In [9]:
phone_data = pd.read_csv('Amazon_Unlocked_Mobile.csv', encoding='utf-8')

x = phone_data['Reviews']
y_num = phone_data['Rating']
y = np.where(y_num>3,'Positive','Negative') 


In [10]:
size = int(len(y)*0.01)

x=x[:size]
y=y[:size]
y_num=y_num[:size]

x_words=collections.Counter(" ".join(str(i)for i in x).split()).most_common(2000)
x_word_features = list(a for (a,b) in x_words)



In [77]:
def review_features(review):
    review_words = set(review)
    features = {}
    for word in x_word_features:
        features['contains({})'.format(word)] = (word in review_words)
    return features
    
review=[(x[n],y_num[n]) for n in range(1,len(x))] 
    
review2=[(x.split(),y) for (x,y) in review]

featuresets = [(review_features(d), c) for (d,c) in review2]

featuresets


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [12]:
train_set2, test_set2 = featuresets[400:], featuresets[:400]

classifier = nltk.NaiveBayesClassifier.train(train_set2)
print(nltk.classify.accuracy(classifier, train_set2))
print(nltk.classify.accuracy(classifier, test_set2))

classifier.show_most_informative_features(30)

0.689858175007
0.5175
Most Informative Features
         contains(waste) = True                1 : 5      =     50.6 : 1.0
       contains(stopped) = True                2 : 5      =     44.4 : 1.0
       contains(showing) = True                3 : 5      =     27.8 : 1.0
          contains(face) = True                3 : 5      =     27.8 : 1.0
  contains(unresponsive) = True                3 : 5      =     27.8 : 1.0
        contains(failed) = True                3 : 5      =     27.8 : 1.0
          contains(dead) = True                1 : 5      =     27.5 : 1.0
         contains(stock) = True                4 : 1      =     24.7 : 1.0
         contains(Tried) = True                1 : 5      =     23.7 : 1.0
         contains(Large) = True                2 : 5      =     23.5 : 1.0
        contains(repair) = True                2 : 5      =     23.5 : 1.0
       contains(people.) = True                2 : 5      =     23.5 : 1.0
       contains(message) = True                3 : 5

### Latent Dirichlet Allocation and Non Negative Matrix Factorization - Topic Modeling

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pandas as pd

In [7]:

#Loading datafiles
iphone7 = pd.read_csv('Amazon_Unlocked_Mobile_iphone7.csv')
iphone6 = pd.read_csv('Amazon_Unlocked_Mobile_iphone6.csv')
#iphone6S = pd.read_csv('Amazon_Unlocked_Mobile_iphone6S.csv')
#GalaxyS6Edge = pd.read_csv('Amazon_Unlocked_Mobile_SamsungGalaxy S6 Edge.csv')
#GalaxyS7Edge = pd.read_csv('Amazon_Unlocked_Mobile_SamsungGalaxy S7 Edge.csv')


In [8]:
# Checking for number of variables.
iphone7.shape
iphone6.shape
#iphone6S.shape
#GalaxyS6Edge.shape
#GalaxyS7Edge.shape

(4752, 6)

In [3]:
## Selecting the column containing reviews
reviews = 'Reviews'


n_samples = len(iphone7)
n_samples = len(iphone6)
#n_samples = len(iphone6S)
#n_samples = len(GalaxyS6Edge)
#n_samples = len(GalaxyS7Edge)

n_topics = 5
n_top_words = 5


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()


data_samples = iphone7[reviews]




In [4]:
# Use tf-idf features for NMF.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english',
                                   analyzer ='word',
                                   ngram_range =(2, 2))
tfidf = tfidf_vectorizer.fit_transform(data_samples)

# Fit the NMF model
nmf = NMF(n_components=n_topics, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)


# Print the topics
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topic #0:
excellent thank ðÿ ðÿ family loves far speed far expensive
Topic #1:
did know android versions color gamut iphone plus know google
Topic #2:
best iphone iphone hands love ios apples best android marshmallow
Topic #3:
jet black silicon case dots appeared quality jet black finish
Topic #4:
ðÿ ðÿ works perfectly perfectly ðÿ time works just received



In [5]:
# Use tf (raw term count) features for LDA.
tf_vectorizer = CountVectorizer(max_df = 0.9,min_df = 2, stop_words='english', 
                                analyzer ='word',
                                ngram_range =(2, 2))
tf = tf_vectorizer.fit_transform(data_samples)



# Fit the LDA model
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=20,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=23)
lda.fit(tf)

# Print the topics
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topic #0:
great phone sim card great product phone did home button
Topic #1:
did know iphone plus android versions color gamut ios 10
Topic #2:
jet black silicon case time described arrived time ok tks
Topic #3:
works perfectly ios 10 best iphone ðÿ ðÿ delivered time
Topic #4:
excellent thank sim card brand new iphone unlocked unlocked phone

