# Explore here

Imports

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

Load the data

In [3]:
# Remote CSV with the Play Store reviews (package_name, review, polarity).
DATA_URL = (
    "https://raw.githubusercontent.com/4GeeksAcademy/"
    "naive-bayes-project-tutorial/main/playstore_reviews.csv"
)

# Read the file into a DataFrame and preview the first rows.
all_data = pd.read_csv(DATA_URL)
all_data.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


This project consists of classifying app reviews from the Google Play Store.
We have three variables: package_name, review and polarity.
- Package_name is the name of the app, which is a categorical variable.
- Review is the review text; it is free-form text rather than a categorical variable, so it has to be vectorized before a model can use it.
- Polarity indicates whether the review is positive (1) or negative (0). It is a binary categorical variable and is the target we want to predict.

Because we want to classify the reviews as negative or positive, we drop the package_name column: only the review text and the polarity label are needed for the classification.

In [4]:
# Drop the app identifier: only the review text and the polarity label are
# used for the classification.
# errors="ignore" keeps this cell idempotent: running it again after the
# column has already been removed no longer raises a KeyError.
all_data = all_data.drop(columns="package_name", errors="ignore")
all_data.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0


Now that the package_name column is deleted, we strip the leading and trailing whitespace from each review and convert all the text to lower case.

In [5]:
# Normalise the review text in two steps: trim leading/trailing whitespace,
# then lower-case the result.
reviews_stripped = all_data["review"].str.strip()
all_data["review"] = reviews_stripped.str.lower()
all_data.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


After this step we split the data set into train (80%) and test (20%) sets.

In [6]:
# Features are the review texts; the target is the polarity label.
X, y = all_data["review"], all_data["polarity"]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Now that the data set is split, we need to vectorize the train and test reviews. We use the CountVectorizer class, which counts how many times each word appears in every review.

In [7]:
# Bag-of-words encoder. stop_words="english" removes scikit-learn's built-in
# list of common English words before counting.
vectorizer = CountVectorizer(stop_words="english")

# Learn the vocabulary from the training reviews only, then encode both
# splits with that same vocabulary (the test set is only transformed).
vectorizer.fit(X_train)
X_train_vect = vectorizer.transform(X_train)
X_test_vect = vectorizer.transform(X_test)

We will use the stop_words= "english" parameter, but as noted in the sklearn documentation, this built-in list is known to cause problems, as it's not really a general solution.

If stop_words= "english" isn't used, the accuracy score of the BernoulliNB classifier (0.8491620111731844) is slightly higher than the accuracy score of MultinomialNB (0.8435754189944135). However, we keep the parameter because we aim to use MultinomialNB, which is the best fit for this use case (the data is discrete).

We are going to try each of the Naive Bayes models:
- GaussianNB: we expect it to be a poor fit, as this classifier is designed for continuous data and in this exercise we are using discrete data.
- MultinomialNB: we expect Multinomial Naive Bayes to fit this exercise best, as this model is well suited to text classification (reviews) and works with occurrence counts.
- BernoulliNB: this classifier works with binary features (word present or absent). It may work, but it is less suitable here because our features are word counts rather than binary values.



BernoulliNB

In [None]:
# Build and train the Bernoulli Naive Bayes classifier in one step
# (fit() returns the fitted estimator itself).
clf_BernoulliNB = BernoulliNB().fit(X_train_vect, y_train)

# Predict the polarity of the held-out test reviews.
y_pred = clf_BernoulliNB.predict(X_test_vect)

# Per-class precision / recall / f1, followed by the overall accuracy.
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.79      0.93      0.85       126
           1       0.70      0.40      0.51        53

    accuracy                           0.77       179
   macro avg       0.74      0.66      0.68       179
weighted avg       0.76      0.77      0.75       179

0.770949720670391


MultinomialNB

In [None]:
# Build and train the Multinomial Naive Bayes classifier with its default
# hyperparameters (fit() returns the fitted estimator itself).
clf_MultinomialNB = MultinomialNB().fit(X_train_vect, y_train)

# Predict the polarity of the held-out test reviews.
y_pred = clf_MultinomialNB.predict(X_test_vect)

# Per-class precision / recall / f1, followed by the overall accuracy.
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87       126
           1       0.73      0.60      0.66        53

    accuracy                           0.82       179
   macro avg       0.79      0.75      0.77       179
weighted avg       0.81      0.82      0.81       179

0.8156424581005587


GaussianNB

In [None]:
# GaussianNB does not accept sparse matrices, so it needs dense arrays.
# The dense copies get their own names instead of overwriting X_train_vect /
# X_test_vect: re-running this cell no longer fails with AttributeError
# (a NumPy array has no .toarray()), and the other cells keep working on the
# original sparse matrices.
X_train_dense = X_train_vect.toarray()
X_test_dense = X_test_vect.toarray()

clf_GaussianNB = GaussianNB()

# Train the model on the dense training matrix
clf_GaussianNB.fit(X_train_dense, y_train)

# Predict using the trained model
y_pred = clf_GaussianNB.predict(X_test_dense)

# Per-class precision / recall / f1, followed by the overall accuracy.
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.85      0.88      0.86       126
           1       0.69      0.62      0.65        53

    accuracy                           0.80       179
   macro avg       0.77      0.75      0.76       179
weighted avg       0.80      0.80      0.80       179

0.8044692737430168


Based on accuracy results:

Accuracy is a metric that measures how often a machine learning model correctly predicts the outcome; the higher the accuracy, the better the model.

- BernoulliNB: 0.770949720670391
- MultinomialNB: 0.8156424581005587
- GaussianNB: 0.8044692737430168

We conclude that the best classifier for this exercise is MultinomialNB. The next step is to try to optimize the model.

In [None]:
# To try to optimize the model we need to find the best suited hyperparameters.
# We use a random search because we don't know in advance which values matter.
hyperparams = {
    "alpha": np.linspace(0.01, 10.0, 200),  # additive smoothing strength
    "fit_prior": [True, False],  # whether to learn class prior probabilities; if False, a uniform prior is used
}

# Create the random search: 50 sampled candidates, 5-fold cross-validation,
# scored by accuracy. A fresh MultinomialNB() is used as the base estimator so
# the search does not depend on the current state of clf_MultinomialNB, which
# is rebound further down in this same cell.
random_search = RandomizedSearchCV(
    MultinomialNB(),
    hyperparams,
    n_iter=50,
    scoring="accuracy",
    cv=5,
    random_state=42,
)

# Run the search on the training data to find the best hyperparameters
best_suited = random_search.fit(X_train_vect, y_train)
print(best_suited.best_params_)

# Retrain MultinomialNB with the hyperparameters the search actually found.
# Reading them from best_params_ (instead of pasting the printed values) keeps
# the model in sync with the search if the data, seed or grid ever change.
clf_MultinomialNB = MultinomialNB(**best_suited.best_params_)

clf_MultinomialNB.fit(X_train_vect, y_train)

# Evaluate the tuned model on the held-out test reviews.
y_pred = clf_MultinomialNB.predict(X_test_vect)

print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

{'fit_prior': False, 'alpha': np.float64(1.917638190954774)}
              precision    recall  f1-score   support

           0       0.86      0.90      0.88       126
           1       0.72      0.64      0.68        53

    accuracy                           0.82       179
   macro avg       0.79      0.77      0.78       179
weighted avg       0.82      0.82      0.82       179

0.8212290502793296


The model improves slightly after the optimization (one more test review out of 179 is classified correctly):
- Accuracy Score prior to optimization was 0.8156424581005587
- Accuracy Score after optimization is 0.8212290502793296

An experiment on a BBC News corpus ("https://www.intechopen.com/chapters/1154729") shows that random forest and logistic regression models are less precise for text classification. That result may only hold for that particular corpus, so random forest and logistic regression may be more precise if we use them in this exercise.