In [2]:
import os

import joblib

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

ModuleNotFoundError: No module named 'machine_algo'

In [None]:
from utils import clean_text, DenseTransformer

In [None]:
# Load the tab-separated reviews file. It is read without a header row, so the
# two columns are named explicitly: the review text and its label.
# NOTE(review): if the file contains unbalanced double quotes, the default
# quoting may merge rows -- confirm the row count matches the raw file.
data = pd.read_csv(
    'imdb_labelled.txt',
    sep='\t',
    names=['review', 'label'],
)
data.head()

In [None]:
# Show the text of the first review (positional row 0) as a sanity check.
data['review'].iloc[0]


In [None]:
# Derive a human-readable sentiment column from the label:
# any truthy label becomes "positive", everything else "negative".
data['sentiment'] = [
    "positive" if flag else "negative"
    for flag in data["label"]
]
data.head()

In [None]:
# Number of missing values in each column.
data.isna().sum()


In [None]:
# Bar chart of how many reviews fall into each sentiment class.
#
# value_counts() orders its result by frequency, not by class, so the counts
# are re-indexed into an explicit order before plotting. Pairing
# frequency-ordered counts with hard-coded tick labels would swap the labels
# (and colours) whenever "negative" is the more frequent class.
sentiment_order = ['negative', 'positive']
bar_colors = ['red', 'green']
positions = list(range(len(sentiment_order)))

print(data["sentiment"].value_counts())
print()

# fill_value=0 keeps a (zero-height) bar even if one class is absent.
sentiment_counts = (
    data["sentiment"]
    .value_counts()
    .reindex(sentiment_order, fill_value=0)
)

barlist = plt.bar(positions, sentiment_counts.to_numpy(), color=bar_colors)

plt.title("Frequency of Sentiments")
plt.xticks(positions, sentiment_order)
plt.ylabel('Number of Reviews')
plt.xlabel('Sentiment expressed in Reviews')
plt.show()

In [None]:
# Run every raw review through the project's clean_text helper and keep the
# result in its own column, leaving the original text untouched.
data['clean_review'] = data["review"].map(clean_text)
data.head()

In [None]:
# Bag-of-words view of the cleaned reviews (English stop words removed),
# materialised as a dense DataFrame with one column per vocabulary term.
count_vectorizer = CountVectorizer(stop_words='english')
count_data = count_vectorizer.fit_transform(data["clean_review"])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(count_vectorizer, "get_feature_names_out"):
    count_feature_names = count_vectorizer.get_feature_names_out()
else:
    count_feature_names = count_vectorizer.get_feature_names()

cv_dataframe = pd.DataFrame(count_data.toarray(), columns=count_feature_names)

cv_dataframe.head()

In [None]:
# TF-IDF view of the cleaned reviews (English stop words removed),
# materialised as a dense DataFrame with one column per vocabulary term.
tf_idf_vec = TfidfVectorizer(stop_words='english')
tf_idf_data = tf_idf_vec.fit_transform(data["clean_review"])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tf_idf_vec, "get_feature_names_out"):
    tf_idf_feature_names = tf_idf_vec.get_feature_names_out()
else:
    tf_idf_feature_names = tf_idf_vec.get_feature_names()

tf_idf_dataframe = pd.DataFrame(tf_idf_data.toarray(), columns=tf_idf_feature_names)
tf_idf_dataframe.head()

In [None]:
# Features are the review texts, targets are the labels.
# NOTE(review): this uses the raw 'review' column, not the 'clean_review'
# column built earlier -- confirm that is intentional.
X, y = data['review'], data['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Pass the data into a pipeline with TfidfVectorizer and BernoulliNB

In [None]:
# Two-step text classifier: TF-IDF features (English stop words removed)
# feeding a Bernoulli naive Bayes model.
tfidf_steps = [
    ('tfidf', TfidfVectorizer(stop_words="english")),
    ('classifier', BernoulliNB()),
]
tfidf_b = Pipeline(steps=tfidf_steps)

In [None]:
# Fit the vectorizer and the classifier on the training split only.
tfidf_b.fit(X=X_train, y=y_train)


In [None]:
# Persist the fitted TF-IDF + BernoulliNB pipeline.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout fails here.
tfidf_model_path = "models/bernoulli_naive_bayes_with_tfidf_vectorizer.joblib"
os.makedirs(os.path.dirname(tfidf_model_path), exist_ok=True)
joblib.dump(tfidf_b, tfidf_model_path)

In [None]:
# Predicted labels for the held-out test reviews.
model_pred = tfidf_b.predict(X=X_test)

In [None]:
# Per-class precision / recall / F1 of the TF-IDF pipeline on the test split.
tfidf_report = classification_report(y_true=y_test, y_pred=model_pred)
print(tfidf_report)

In [None]:

# Stratified 5-fold cross-validation of the TF-IDF + BernoulliNB pipeline on
# the full data set (X, y).
#
# cross_val_score fits a fresh clone of `tfidf_b` for every fold, so the
# pipeline that was fitted on X_train (and already saved to disk) is left
# untouched. Re-fitting `tfidf_b` itself inside a manual loop would silently
# replace it with a model trained on the last fold only.
stratifiedKf_predict = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)

# With no explicit `scoring`, the estimator's own score() is used, i.e. the
# same value the pipeline's .score(X_test, y_test) returns for each fold.
accuracy = cross_val_score(tfidf_b, X, y, cv=stratifiedKf_predict)

# Print the output (one line per fold, at most the first 10).
print('list of first 10 possible accuracy')
for index, acc in enumerate(accuracy[:10]):
    print(f"{index+1:3d}. {acc:.4f}")

print('\nMetrics that were obtained from the model:')
print(f' Maximum accuracy: {accuracy.max()*100:.2f}%')
print(f' Minimum Accuracy: {accuracy.min()*100:.2f}%')
print(f' Mean_accuracy: {accuracy.mean()*100:.2f}%')
print(f' Std_accuracy: {accuracy.std()*100:.2f}%')

# Raw per-fold scores followed by a compact one-line summary.
print(accuracy.tolist())
print(f' mean_accuracy = {accuracy.mean():.4f}, std_accuracy = {accuracy.std():.4f}')

# Train the model with CountVectorizer and BernoulliNB

In [None]:
# Two-step text classifier: raw term counts (English stop words removed)
# feeding a Bernoulli naive Bayes model.
bow_steps = [
    ('bow', CountVectorizer(stop_words="english")),
    ('classifier', BernoulliNB()),
]
pipeNB = Pipeline(steps=bow_steps)

In [None]:
# Fit the count vectorizer and the classifier on the training split only.
pipeNB.fit(X=X_train, y=y_train)

In [None]:
# Persist the fitted CountVectorizer + BernoulliNB pipeline.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout fails here.
count_model_path = "models/bernoulli_naive_bayes_with_count_vectorizer.joblib"
os.makedirs(os.path.dirname(count_model_path), exist_ok=True)
joblib.dump(pipeNB, count_model_path)

In [None]:
from sklearn.metrics import classification_report

# Predict the held-out test reviews with the count-vectorizer pipeline and
# report per-class precision / recall / F1.
y_pred = pipeNB.predict(X=X_test)
count_report = classification_report(y_true=y_test, y_pred=y_pred)
print(count_report)

In [None]:
from statistics import mean, stdev 
from sklearn.model_selection import StratifiedKFold 


# Stratified 10-fold cross-validation of the CountVectorizer + BernoulliNB
# pipeline on the full data set (X, y).
#
# cross_val_score fits a fresh clone of `pipeNB` for every fold, so the
# pipeline that was fitted on X_train (and already saved to disk) is left
# untouched. Re-fitting `pipeNB` itself inside a manual loop would silently
# replace it with a model trained on the last fold only.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=100)

# With no explicit `scoring`, the estimator's own score() is used, i.e. the
# same value the pipeline's .score(X_test, y_test) returns for each fold.
accuracy = cross_val_score(pipeNB, X, y, cv=skf)

# Print the output
print('List of first 10 possible accuracy:')
for index, acc in enumerate(accuracy[:10]):
    print(f"{index+1:3d}. {acc:.4f}")

print('\nMetrics that were obtained from this model:')
print(f' Maximum Accuracy:   {accuracy.max()*100:.2f}%')
print(f' Minimum Accuracy:   {accuracy.min()*100:.2f}%')
print(f' Mean Accuracy:   {accuracy.mean()*100:.2f}%')
# Reported as a fraction (not a percentage), unlike the lines above.
print(f' Standard Deviation: {accuracy.std():.4f}')