# MACHINE LEARNING MODEL

---

Libraries and data

In [1]:
# LIBRARIES
import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from pickle import dump

# URL
URL = r'C:\Users\Francesc\Documents\GitHub\Naive-Bayes-Project-Tutorial\data\interim\clean_total_data.csv'
total_data = pd.read_csv(URL)
target = 'polarity'

3.1. Train / test division

In [2]:
from sklearn.model_selection import train_test_split

X = total_data["review"]
y = total_data["polarity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_train.head()

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 60 version is to...
704    superfast just as i remember it  opera mini wa...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

3.2. Word vectorizer

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
vec_model = CountVectorizer(stop_words = "english")
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()

In [4]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_train shape: (712, 3493)
y_train shape: (712,)


3.3. Model Inicialization

Note: Since the model has a binomial distribution due to de word vectorization, the **Multionmial Naive Bayes** model will be used.

In [5]:
model = MultinomialNB()
model.fit(X_train, y_train)

3.4. Model prediction

In [6]:
y_pred = model.predict(X_test)
y_pred


array([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0], dtype=int64)

In [7]:
accuracy_score(y_test, y_pred)

0.7932960893854749

3.5. Model hyperparameters

In [8]:
hyperparams = {
    "alpha": np.linspace(0.01, 20.0, 200),  # Provide specific values for alpha
    "fit_prior": [True, False]  # True or False for fit_prior
}
grid = GridSearchCV(model, hyperparams, scoring = "accuracy", cv = 10)
grid

In [9]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
grid.fit(X_train, y_train)
print(f"Best hyperparameters: {grid.best_params_}")

Best hyperparameters: {'alpha': 2.722211055276382, 'fit_prior': False}


3.6. Optimized performance

In [13]:
model = MultinomialNB(alpha=2.722211055276382, fit_prior=False)
model.fit(X_train, y_train)

In [14]:
accuracy_score(y_test, y_pred)

0.7932960893854749

In [15]:
dump(model, open("naive_bayes_alpha-2.7222_fitprior-false.sav", "wb"))


---
## Conclusions:

1. Since the result was a binarie (the words were vectorized), the model used was multinomial.
2. The initial performance score was 0.793 
3. The model's hyperparameteres were changed according to a gridsearch
4. The second performance was made under the following criteria: 'alpha': 2.722211055276382, 'fit_prior': False
5. The second performance score was the same as the first