# Explore here

In [None]:
# Your code here
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Play Store reviews and preview the first rows
data_df = pd.read_csv('playstore_reviews.csv')
print(data_df.head())

# The package name is not used as a feature below, so remove that column
data_df = data_df.drop(columns=['package_name'])

# Normalise the review text: trim surrounding whitespace, then lowercase
stripped_reviews = data_df['review'].str.strip()
data_df['review'] = stripped_reviews.str.lower()

# Separate the review text (feature) from the polarity column (target),
# then hold out 20% of the rows for testing with a fixed seed.
from sklearn.model_selection import train_test_split
X, y = data_df['review'], data_df['polarity']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=117,
)

# Turn each review into a vector of word counts (English stop words removed)

from sklearn.feature_extraction.text import CountVectorizer

vec_model = CountVectorizer(stop_words='english')

# Learn the vocabulary from the training reviews only; the test reviews are
# encoded with that same vocabulary.
train_counts = vec_model.fit_transform(X_train)
test_counts = vec_model.transform(X_test)

# The vectorizer returns sparse matrices; convert them to dense arrays for
# the Gaussian Naive Bayes model fitted afterwards.
X_train = train_counts.toarray()
X_test = test_counts.toarray()

# Baseline: Gaussian Naive Bayes with default settings

from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training chain.
model = GaussianNB().fit(X_train, y_train)

# Score the baseline on the held-out test reviews
y_pred = model.predict(X_test)

acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Model accuracy score: {acc}')

# Optimize model: 5-fold cross-validated search over var_smoothing,
# selecting the candidate with the best accuracy.
from sklearn.model_selection import GridSearchCV

hyperparams = dict(
    var_smoothing=[1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3],
)

grid = GridSearchCV(
    estimator=model,
    param_grid=hyperparams,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

print(f'Best model parameters: {grid.best_params_}')

# Re-run optimized model
# Build the classifier from the hyperparameters selected by the grid search.
# (Instantiating GaussianNB() with no arguments would silently ignore the
# search result and just reproduce the baseline model.)
opt_model = GaussianNB(**grid.best_params_)
opt_model.fit(X_train, y_train)

# Evaluate the tuned model on the same held-out test set as the baseline
opt_pred = opt_model.predict(X_test)

opt_acc = accuracy_score(y_test, opt_pred)
print(f'Model accuracy score: {opt_acc}')

          package_name                                             review  \
0  com.facebook.katana   privacy at least put some option appear offli...   
1  com.facebook.katana   messenger issues ever since the last update, ...   
2  com.facebook.katana   profile any time my wife or anybody has more ...   
3  com.facebook.katana   the new features suck for those of us who don...   
4  com.facebook.katana   forced reload on uploading pic on replying co...   

   polarity  
0         0  
1         0  
2         0  
3         0  
4         0  
Model accuracy score: 0.770949720670391
Best model parameters: {'var_smoothing': 1e-09}
Model accuracy score: 0.770949720670391
