# Explore here

## 1) Import required libraries

In [None]:
# Your code here
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw reviews file and preview its first five rows
data_df = pd.read_csv(filepath_or_buffer='playstore_reviews.csv')
print(data_df.head(5))

## 2) Drop unnecessary dimensions

In [None]:
# Remove the package_name column in place; it is not used in the later cells
data_df.drop(columns=['package_name'], inplace=True)

## 3) Normalize review text

In [None]:
# Normalise the review text: trim surrounding whitespace, then lowercase
trimmed_reviews = data_df['review'].str.strip()
data_df['review'] = trimmed_reviews.str.lower()

## 4) Split dataset into train and test subsets

In [None]:
from sklearn.model_selection import train_test_split

# Features are the review texts; the target is the `polarity` column
X, y = data_df['review'], data_df['polarity']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=117,
)

## 5) Transform text into word count matrix

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training reviews only (English stop words removed)
vec_model = CountVectorizer(stop_words='english')
vec_model.fit(X_train)

# Encode both splits with that vocabulary and densify the sparse count matrices
X_train = vec_model.transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()


## 6) Apply Gaussian Naive Bayes algorithm

In [None]:
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training are chained
model = GaussianNB().fit(X_train, y_train)

# Predict labels for the held-out test rows
y_pred = model.predict(X_test)


## 7) Evaluate model performance

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Model accuracy score: {acc}')


## 8) Optimize model

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for GaussianNB's var_smoothing hyperparameter
hyperparams = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
}

# 5-fold cross-validated search over the training split, scored by accuracy
grid = GridSearchCV(model, hyperparams, scoring='accuracy', cv=5)
grid.fit(X_train, y_train)

print(f'Best model parameters: {grid.best_params_}')

# Re-run optimized model
# Build the model from the hyperparameters the search selected; a bare
# GaussianNB() would only repeat the baseline and ignore the search result.
opt_model = GaussianNB(**grid.best_params_)
opt_model.fit(X_train, y_train)

opt_pred = opt_model.predict(X_test)

opt_acc = accuracy_score(y_test, opt_pred)
print(f'Model accuracy score: {opt_acc}')