# Explore here

In [22]:
import pandas as pd

df = pd.read_csv('/workspaces/naiveb/data/raw/bayes.csv')
df

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


As we can see there are no missing values, so we do not need to filter. We will remove the package name because it is not that relevant.

In [24]:
df = df.drop(['package_name'], axis=1)
df

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0
...,...,...
886,loved it i loooooooooooooovvved it because it...,1
887,all time legendary game the birthday party le...,1
888,ads are way to heavy listen to the bad review...,0
889,fun works perfectly well. ads aren't as annoy...,1


Let's process the text now

In [25]:

df["review"] = df["review"].str.strip().str.lower()
df

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0
...,...,...
886,loved it i loooooooooooooovvved it because it ...,1
887,all time legendary game the birthday party lev...,1
888,ads are way to heavy listen to the bad reviews...,0
889,fun works perfectly well. ads aren't as annoyi...,1


Let's remove the english contractions so that the analysis is easier.

In [26]:
import contractions
df["review"] = df["review"].apply(contractions.fix)
df

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who do n...,0
4,forced reload on uploading pic on replying com...,0
...,...,...
886,loved it i loooooooooooooovvved it because it ...,1
887,all time legendary game the birthday party lev...,1
888,ads are way to heavy listen to the bad reviews...,0
889,fun works perfectly well. ads are not as annoy...,1


Now let's remove the weird characters because we can't really use them

In [27]:
import re
df["review"] = df['review'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
df

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,messenger issues ever since the last update in...,0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who do n...,0
4,forced reload on uploading pic on replying com...,0
...,...,...
886,loved it i loooooooooooooovvved it because it ...,1
887,all time legendary game the birthday party lev...,1
888,ads are way to heavy listen to the bad reviews...,0
889,fun works perfectly well ads are not as annoyi...,1


Let's now divide in train and test 

In [28]:
from sklearn.model_selection import train_test_split

X = df['review']
y = df["polarity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

X_train.to_csv("/workspaces/naiveb/data/raw/NAIVEtrain_X.csv", index=False)
with open("/workspaces/naiveb/data/raw/NAIVEtrain_y.txt", "w") as f:
    f.write(y_train.to_string(index=False))
X_test.to_csv("/workspaces/naiveb/data/raw/NAIVEtest_X.csv", index=False)
with open("/workspaces/naiveb/data/raw/NAIVEtest_y.txt", "w") as f:
    f.write(y_test.to_string(index=False))
X_train.head()

X_train.head()


331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 60 version is to...
704    superfast just as i remember it  opera mini wa...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

In [29]:
y_train.head()

331    0
733    0
382    0
704    1
813    1
Name: polarity, dtype: int64

Now that we have split it, we will create the words matrix.

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

vec_model = CountVectorizer(stop_words = "english")
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

Let's now start the predictions. Based on the information provided, I think that the model that needs to be used is the multinomial.

In [31]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)

In [32]:
y_pred = model.predict(X_test)
y_pred

array([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [33]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.7932960893854749

Let's now optimise the accuracy score.

In [37]:
import numpy as np
param_grid = {
    'force_alpha': [True, False],
    'alpha': np.logspace(-3, 3, 7),  # Vary alpha from 0.001 to 1000
    'fit_prior': [True, False],  # Explore both options for fit_prior
}

In [38]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')


In [39]:
grid_search.fit(X_train, y_train)

print("Best hyperparameters:", grid_search.best_params_)

Best hyperparameters: {'alpha': 0.01, 'fit_prior': True, 'force_alpha': True}


In [40]:
opt_model = grid_search.best_estimator_
y_pred = opt_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8156424581005587

The model is now optimised. 

In [None]:
from pickle import dump

dump(opt_model, open("naive_bayes_default.sav", "wb"))