In [1]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load the labelled Google Play review data into a DataFrame.
TRAINING_CSV = 'google_play_store_apps_reviews_training.csv'
df = pd.read_csv(TRAINING_CSV)


In [2]:
#Clean the data 
def clean_data(df):
    """Return a cleaned copy of the reviews frame, ready for vectorizing.

    Drops the ``package_name`` column (it is not used as a feature) and
    normalizes the ``review`` text by trimming surrounding whitespace and
    lowercasing it. The input frame is left untouched; a new DataFrame is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``review`` column. ``package_name`` is removed when
        present and silently ignored when absent.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``package_name`` and with a normalized ``review``
        column; all other columns are passed through unchanged.

    Raises
    ------
    KeyError
        If ``df`` has no ``review`` column.
    """
    # errors='ignore' keeps this idempotent: the notebook rebinds
    # ``df = clean_data(df)``, so re-running that cell passes in a frame whose
    # package_name column has already been dropped.
    without_package = df.drop(columns='package_name', errors='ignore')
    normalized_review = without_package['review'].str.strip().str.lower()
    # assign() returns a new frame and keeps 'review' in its original position.
    return without_package.assign(review=normalized_review)
# Apply the cleaning step. NOTE: this rebinds ``df``, so the raw frame as
# loaded from disk is no longer available after this cell runs.
df = df.pipe(clean_data)
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


In [3]:
# Separate the feature from the label.
# Feature: the review text.
X = df['review']
# Label: the polarity column is the sentiment (0 = negative, 1 = positive).
y = df['polarity']
# Split the data (not the model) into an 80% training portion and a 20%
# held-out test portion, stratified on the label and seeded for repeatability.
# NOTE: despite their names, X_validation / y_validation hold the *training*
# portion returned by train_test_split.
X_validation, X_test, y_validation, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

In [4]:
# Turn the review text into bag-of-words count vectors for the ML algorithm.
# English stop words are dropped from the vocabulary.
vec = CountVectorizer(stop_words='english')
# Learn the vocabulary from the training portion only, then encode it.
train_counts = vec.fit_transform(X_validation)
# Encode the test portion with that same, already-fitted vocabulary.
test_counts = vec.transform(X_test)
# NOTE(review): .toarray() expands the sparse count matrices into dense
# arrays, which can be memory-hungry for large vocabularies. It may be
# droppable if nothing downstream needs dense input -- confirm before changing.
# NOTE(review): X_validation / X_test are overwritten with numeric arrays
# here, so this cell cannot be re-run without first re-running the split cell.
X_validation = train_counts.toarray()
X_test = test_counts.toarray()

In [5]:
# Model generation: multinomial Naive Bayes on the word-count features.
# (MultinomialNB is imported with the other dependencies in the first cell.)
model_NB = MultinomialNB()
# X_validation / y_validation are the training portion of the split.
model_NB.fit(X_validation, y_validation)

# Observe the model's mean accuracy on the held-out test portion; as the last
# expression in the cell, the score is shown as the cell output.
model_NB.score(X_test, y_test)

0.8547486033519553

In [6]:
# Sanity-check the fitted model on a hand-written review. The text has to go
# through the same fitted vectorizer before it can be passed to predict().
negative_example = ['I hate this app. It sucks.']
model_NB.predict(vec.transform(negative_example))

array([0], dtype=int64)

In [7]:
# Second hand-written review, encoded with the same fitted vectorizer.
positive_example = ['I love Google Play! You guys rock!']
model_NB.predict(vec.transform(positive_example))

array([1], dtype=int64)