In [66]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn import naive_bayes
import pickle

In [67]:
def download_stopwords():
    '''This function downloads the already cathegorized words from the natural language toolkit (nlt)'''
    nltk.download('stopwords')

In [68]:
download_stopwords()

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [69]:
def initialising_paths():
    '''This function points where data for modeling is located and where to save the model after modeling'''

    global path_to_data
    global save_the_mode_path
    save_the_mode_path = 'D:/github-repos/nlp-sentiment-analysis/models/nlp_model.pkl'
    path_to_data = 'D:/github-repos/nlp-sentiment-analysis/data/01_raw/reviews.txt'

In [70]:
initialising_paths()

In [71]:
def load_data():
    '''This function loads the model from the reviews text file and sets them into a two column dataframe'''
    global reviews_data
    reviews_data = pd.read_csv(path_to_data, sep='\t', names=['Reviews', 'Comments'])

In [72]:
load_data()

In [73]:
reviews_data

Unnamed: 0,Reviews,Comments
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...
...,...,...
6913,0,Brokeback Mountain was boring.
6914,0,So Brokeback Mountain was really depressing.
6915,0,"As I sit here, watching the MTV Movie Awards, ..."
6916,0,Ok brokeback mountain is such a horrible movie.


In [74]:
def set_language():
    '''Get the english version of the cathegorized text from nltk'''
    global stopset
    stopset = set(stopwords.words('english'))

In [75]:
set_language()

In [76]:
def make_vectorizer():
    '''Initialising the function to turn text into numbers so the computer can understand'''
    global vectorizer
    vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=stopset)

In [77]:
make_vectorizer()

In [78]:
def load_the_features():
    '''This function converts the review text into numbers'''
    global X, y
    X = vectorizer.fit_transform(reviews_data.Comments)
    y = reviews_data.Reviews
    pickle.dump(vectorizer, open('transform', 'wb'))

In [79]:
load_the_features()

In [80]:
def split_the_data():
    '''This funstion set the foundation to train and test the accuracy of the model'''
    global X_train, X_test, y_train, y_test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [81]:
split_the_data()

In [82]:
def teach_the_model():
    '''This function fits the data to the model for training'''
    global clf
    clf = naive_bayes.MultinomialNB()
    clf.fit(X_train, y_train)

In [83]:
teach_the_model()

In [84]:
def get_the_model_accuracy():
    '''This function shows us how well our model will perform on unseen data'''
    pred = clf.predict(X_test)
    return accuracy_score(y_test, pred)*100

In [85]:
get_the_model_accuracy()

97.47109826589595

In [86]:
def save_the_mode():
    '''This function saves the trained model'''
    pickle.dump(clf, open(save_the_mode_path, 'wb'))

In [87]:
save_the_mode()

In [89]:
def get_first_five_sentiments():
    '''This function gets the sentiments from the comments and categorizes them as good or bad'''
    reviews_list = []
    reviews_status = []

    for reviews in reviews_data['Comments'][0:5]:
        if reviews:
            reviews_list.append(reviews)
            '''adding the comments from the reviews text into the model'''
            movie_review_list = np.array([reviews])
            movie_vector = vectorizer.transform(movie_review_list)
            pred = clf.predict(movie_vector)
            reviews_status.append('Good' if pred else 'Bad')

    # Combining reviews and comments into a dictionary
    movie_reviews = {reviews_list[i]: reviews_status[i] for i in range(len(reviews_list))}
    return movie_reviews


In [90]:
get_first_five_sentiments() 

{'The Da Vinci Code book is just awesome.': 'Good',
 "this was the first clive cussler i've ever read, but even books like Relic, and Da Vinci code were more plausible than this.": 'Good',
 'i liked the Da Vinci Code a lot.': 'Good',
 "I liked the Da Vinci Code but it ultimatly didn't seem to hold it's own.": 'Good'}