# Naive Bayes Model: Google Play Store Reviews 
## 1. Import Modules 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

## 2. Data Preparation 
### 2.1 Data download from CSV file 

In [3]:
# Read the Play Store reviews CSV directly from its raw GitHub URL into a DataFrame.
data_df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv',
)

### 2.2 Data Inspection 

In [5]:
# Preview the first rows, transposed so that each record is displayed as a column.
data_df.head().T

Unnamed: 0,0,1,2,3,4
package_name,com.facebook.katana,com.facebook.katana,com.facebook.katana,com.facebook.katana,com.facebook.katana
review,privacy at least put some option appear offli...,"messenger issues ever since the last update, ...",profile any time my wife or anybody has more ...,the new features suck for those of us who don...,forced reload on uploading pic on replying co...
polarity,0,0,0,0,0


In [6]:
# Summarise column dtypes and non-null counts to check for missing values.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


### 2.3 Train-test split 

In [7]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# shuffle (and therefore the split) the same on every run.
training_df, testing_df = train_test_split(
    data_df,
    test_size=0.3,
    random_state=315,
)

In [8]:
# Preview the training split (transposed); the column labels shown are the
# original row indices, which are no longer in order after the shuffled split.
training_df.head().T

Unnamed: 0,106,4,589,712,592
package_name,com.linkedin.android,com.facebook.katana,com.evernote,com.opera.mini.native,com.evernote
review,why can't i share my achievements? recently d...,forced reload on uploading pic on replying co...,very user friendly! for those looking for a s...,save image and low performance can not save a...,too much going on its cool it works but ony u...
polarity,0,0,1,0,1


### 2.4 Data Encoding 

In [10]:
# Keep only the raw review text from each split; these Series are what the
# vectorizer is fitted on and applied to.
training_reviews, testing_reviews = training_df['review'], testing_df['review']
training_reviews.head(5)

106     why can't i share my achievements? recently d...
4       forced reload on uploading pic on replying co...
589     very user friendly! for those looking for a s...
712     save image and low performance can not save a...
592     too much going on its cool it works but ony u...
Name: review, dtype: object

In [20]:
# Check the size of the training review Series and that it has no missing entries.
training_reviews.info()

<class 'pandas.core.series.Series'>
Index: 623 entries, 106 to 873
Series name: review
Non-Null Count  Dtype 
--------------  ----- 
623 non-null    object
dtypes: object(1)
memory usage: 9.7+ KB


In [21]:
# Check the size of the testing review Series and that it has no missing entries.
testing_reviews.info()

<class 'pandas.core.series.Series'>
Index: 268 entries, 393 to 699
Series name: review
Non-Null Count  Dtype 
--------------  ----- 
268 non-null    object
dtypes: object(1)
memory usage: 4.2+ KB


In [11]:
# Bag-of-words encoder; English stop words are excluded from the vocabulary.
vector_model = CountVectorizer(stop_words="english")

# Learn the vocabulary from the training reviews only, then encode both splits
# with it so the testing matrix has exactly the same columns as the training one.
# .toarray() converts the sparse result into a dense NumPy array.
training_word_counts = vector_model.fit_transform(training_reviews).toarray()
testing_word_counts = vector_model.transform(testing_reviews).toarray()

print(f'Word count matrix has {training_word_counts.shape[0]} rows and {training_word_counts.shape[1]} columns')

Word count matrix has 623 rows and 3012 columns


In [None]:
# Inspect the encoded word counts of the first training review.
# NOTE(review): this positional lookup relies on training_word_counts still being
# the NumPy array; a later cell rebinds the same name to a DataFrame, so this cell
# must be run before that one.
training_word_counts[0]


array([0, 0, 0, ..., 0, 0, 0], shape=(3012,))

In [27]:
# Label each count column with the token it represents so the matrices are
# readable; both splits share the same vocabulary and therefore the same columns.
feature_names = vector_model.get_feature_names_out()
training_word_counts = pd.DataFrame(data=training_word_counts, columns=feature_names)
testing_word_counts = pd.DataFrame(data=testing_word_counts, columns=feature_names)
training_word_counts.head(5).transpose()

Unnamed: 0,0,1,2,3,4
000,0,0,0,0,0
04,0,0,0,0,0
0x,0,0,0,0,0
10,0,0,0,0,0
100,0,0,0,0,0
...,...,...,...,...,...
żŕ,0,0,0,0,0
żŕľ,0,0,0,0,0
žŕ,0,0,0,0,0
žŕľ,0,0,0,0,0


## 3. Exploratory Data Analysis (EDA)
### 3.1. Baseline model performance 

In [28]:
# Class balance of the training split: number of reviews per polarity label.
training_df['polarity'].value_counts()

polarity
0    414
1    209
Name: count, dtype: int64

In [32]:
# Class balance of the testing split: number of reviews per polarity label.
testing_df['polarity'].value_counts()

polarity
0    170
1     98
Name: count, dtype: int64

In [29]:
# Number of training reviews contributed by each app package.
training_df['package_name'].value_counts()

package_name
com.evernote                  32
com.Slack                     32
org.mozilla.firefox           31
com.facebook.orca             30
com.android.chrome            30
com.facebook.katana           30
com.hamropatro                29
com.dropbox.android           29
com.king.candycrushsaga       28
com.supercell.clashofclans    28
com.tencent.mm                28
com.google.android.talk       27
com.viber.voip                27
com.hamrokeyboard             27
com.linkedin.android          26
com.imangi.templerun2         26
com.opera.mini.native         26
com.shirantech.kantipur       26
com.rovio.angrybirds          25
com.twitter.android           24
com.whatsapp                  24
com.uc.browser.en             21
jabanaki.todo.todoly          17
Name: count, dtype: int64

In [31]:
# Number of polarity-1 reviews in the testing split: the labels seen in this
# data are 0 and 1, so summing them counts the 1s.
sum(testing_df['polarity'])

98

Accuracy of a baseline model that always predicts polarity 0 (the majority class in the training data):

In [33]:
# Baseline: a model that always predicts polarity 0. It is correct exactly on
# the testing rows whose true label is 0, so its accuracy is the share of such
# rows. Counting the matches directly (instead of len - sum(labels)) stays
# correct even if the labels are not strictly 0/1, and uses a vectorised pandas
# comparison rather than the Python builtin sum over a Series.
accuracy = (testing_df['polarity'] == 0).mean() * 100
print(f'Testing accuracy of constant "0" polarity model: {accuracy:.2f}%')

Testing accuracy of constant "0" polarity model: 63.43%


Logistic Regression Model 