In [1]:
import pandas as pd
import string
import numpy as np
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import joblib

data = pd.read_csv('google_play_store_apps_reviews_training.csv')

def get_wordnet_pos(pos_tag):
    """Map a part-of-speech tag string to a WordNet POS constant.

    Tags starting with 'J', 'V' or 'R' become adjective, verb and adverb
    respectively. Everything else, including tags starting with 'N', is
    treated as a noun.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Noun is both the explicit 'N' mapping and the fallback.
    return wordnet.NOUN
    

# Lazily-built resources shared by every clean_text call.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        # A frozenset gives constant-time membership tests; the original list
        # was re-read and scanned linearly for every token of every review.
        _CLEAN_TEXT_RESOURCES['stop'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES['stop'], _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalise one review into a space-joined string of lemmatised tokens.

    Steps: lower-case, split on any whitespace, strip surrounding punctuation,
    drop tokens containing digits, drop English stop words and empty tokens,
    POS-tag, lemmatise with the POS mapped through ``get_wordnet_pos``, and
    finally drop one-character results.

    Args:
        text: The raw review. ``None`` and float NaN are treated as an empty
            review; any other non-string value is converted with ``str``.

    Returns:
        The cleaned review as a single string ("" when nothing survives).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    # split() with no argument also breaks on tabs/newlines, which split(" ")
    # left glued together inside a single token.
    tokens = [word.strip(string.punctuation) for word in text.lower().split()]
    # remove words that contain numbers
    tokens = [word for word in tokens if not any(c.isdigit() for c in word)]
    stop_words, lemmatizer = _clean_text_resources()
    # remove stop words and empty tokens
    tokens = [word for word in tokens if word and word not in stop_words]
    if not tokens:
        return ""

    # lemmatize each token using its part-of-speech tag
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]
    # remove words with only one letter
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)
# Clean every review; missing reviews are treated as empty text so that
# clean_text always receives a string.
data['review_clean'] = data['review'].fillna('').apply(clean_text)
# Split into training and testing data
x = data['review_clean']
y = data['polarity']
print(x)
x, x_test, y, y_test = train_test_split(x, y, stratify=y, test_size=0.25, random_state=3)
# Vectorize text reviews to numbers. The count matrices stay sparse:
# MultinomialNB accepts sparse input, so .toarray() only cost memory.
vec = CountVectorizer(stop_words='english')
x = vec.fit_transform(x)
x_test = vec.transform(x_test)
model = MultinomialNB()
model.fit(x, y)
# Show the held-out accuracy; previously the value was computed and discarded.
print(model.score(x_test, y_test))
# NOTE(review): only the classifier is persisted. Predicting with the reloaded
# model still requires the fitted `vec` from this cell to be in memory.
filename = 'finalized_model.sav'
joblib.dump(model, filename)


0      privacy least put option appear offline mean p...
1      messenger issue ever since last update initial...
2      profile time wife anybody one post view would ...
3      new feature suck work back button guy make vid...
4      force reload upload pic reply comment last nig...
                             ...                        
886    love loooooooooooooovvved incredible awesome g...
887    time legendary game birthday party level short...
888    ad way heavy listen bad review ad every round ...
889    fun work perfectly well ad annoy think especia...
890    they're everywhere see angry bird everywhere c...
Name: review_clean, Length: 891, dtype: object


['finalized_model.sav']

In [277]:

# Read the persisted classifier back from disk and score it on the held-out
# split created in the training cell.
loaded_model = joblib.load(filename)
result = loaded_model.score(X=x_test, y=y_test)
print(result)


0.8295964125560538


In [None]:
# Load the reviews to classify. This read was commented out, which left
# `df_coc` undefined when the notebook is run on a fresh kernel.
df_coc = pd.read_csv('reviews.csv')
# Missing review bodies are treated as empty text before cleaning.
df_coc['review_clean'] = df_coc['Content'].fillna('').apply(clean_text)


In [None]:

def cal_sentiment_polarity(text):
    """Classify one cleaned review with the reloaded model.

    Args:
        text: A single cleaned review string; it is vectorised with `vec`
            and passed to `loaded_model.predict`.

    Returns:
        "Positive" when the predicted label is 1, "Negative" when it is 0,
        and " " for any other label.
    """
    # predict() returns a one-element array for the single review.
    label = loaded_model.predict(vec.transform([text]))[0]
    if label == 1:
        return "Positive"
    if label == 0:
        return "Negative"
    return " "


df = pd.DataFrame({
    'Uid': list(df_coc.index.values),
    'Reviews': df_coc['review_clean'],
    'Polarity': [cal_sentiment_polarity(x) for x in df_coc['review_clean']],
})
# Derive the tallies from the result column instead of mutating globals inside
# the classifier, so extra calls to cal_sentiment_polarity cannot skew them.
pos = int((df['Polarity'] == "Positive").sum())
neg = int((df['Polarity'] == "Negative").sum())
total = pos + neg
print("Total Numbers of Reviews" , total)
print("Positive Reviews ",pos)
print("Negative Reviews " ,neg)
df.to_csv("sentiment_result1.csv", encoding='utf8', index=False)
df.head(56)



In [None]:
import numpy as np
import matplotlib.pyplot as plt 
# Pie chart of the positive/negative split computed in the previous cell.
labels = ['Positive', 'Negative']  # fixed the misspelled 'Postive' label
# Named so it does not overwrite the training DataFrame `data` from cell 1.
sentiment_counts = [pos, neg]
colors = ['green', 'orange']
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Sentimental Analysis ')
ax.pie(sentiment_counts, labels=labels, colors=colors, autopct='%1.2f%%', startangle=90)
# Equal aspect ratio keeps the pie circular.
ax.axis("equal")
plt.show()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
def show_wordcloud(data, title = None):
    """Render a word cloud for the given text.

    Args:
        data: Either a single string, or an iterable of items (for example a
            pandas Series of reviews) whose string forms are joined with
            spaces. Any other object is converted with ``str``.
        title: Optional figure title; when given it is drawn above the cloud.
    """
    # str() of a pandas Series is its truncated display repr (a few clipped
    # rows plus "Name: ..., Length: ..., dtype: ..."), so the cloud must be
    # built from the joined items rather than from str(data).
    if isinstance(data, str):
        corpus = data
    elif hasattr(data, '__iter__'):
        corpus = " ".join(str(item) for item in data)
    else:
        corpus = str(data)

    wordcloud = WordCloud(
        background_color = 'white',
        max_words = 500,
        max_font_size = 20,
        scale = 3,
        random_state = 42
    ).generate(corpus)

    fig = plt.figure(1, figsize = (20, 20))
    plt.axis('on')
    if title:
        fig.suptitle(title, fontsize = 20)
        fig.subplots_adjust(top = 2.3)

    plt.imshow(wordcloud)
    plt.show()
    
# Draw the word cloud for the cleaned review column (no title).
show_wordcloud(data=df_coc['review_clean'])

In [69]:
# Load the second labelled dataset.
df = pd.read_csv('sentiment12.csv')
# Features are the review text, with missing entries replaced by an empty
# string; targets are the label column.
x = df['text'].fillna('')
y = df['label']
print(x.shape)
print(y.shape)

(37427,)
(37427,)


In [70]:


from sklearn.model_selection import train_test_split
# Hold out a test split (train_test_split's default size) with a fixed seed.
x, x_test, y_train, y_test = train_test_split(x, y, random_state=1)
# Report the shape of each split in the order: train X, test X, train y, test y.
for split in (x, x_test, y_train, y_test):
    print(split.shape)

(28070,)
(9357,)
(28070,)
(9357,)


In [71]:
# Bag-of-words features for the second dataset. The matrices are kept sparse:
# MultinomialNB accepts sparse input, and .toarray() on this many rows would
# allocate a dense rows-by-vocabulary array for no benefit.
vec = CountVectorizer()
x = vec.fit_transform(x)
x_test = vec.transform(x_test)
model = MultinomialNB()
model.fit(x, y_train)
# Held-out accuracy, displayed as the cell's result.
model.score(x_test, y_test)


0.8298599978625628

In [68]:
# Spot-check the classifier on one hand-written review.
sample_review = [' i hate this app because of too much adds ']
model.predict(vec.transform(sample_review))


array([0], dtype=int64)