In [45]:
# import important modules
import numpy as np
import pandas as pd
# sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB # classifier 

from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# text preprocessing modules
from string import punctuation 
# text preprocessing modules
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import re #regular expression
# Download dependency
for dependency in (
    "brown",
    "names",
    "wordnet",
    "averaged_perceptron_tagger",
    "universal_tagset",
):
    nltk.download(dependency)
    
import warnings
warnings.filterwarnings("ignore")
# seeding
np.random.seed(123)

[nltk_data] Downloading package brown to
[nltk_data]     /Users/davisdavid/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     /Users/davisdavid/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/davisdavid/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/davisdavid/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/davisdavid/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [6]:
# load data
data = pd.read_csv("labeledTrainData.tsv", sep='\t')

In [7]:
# show top five rows of data
data.head() 

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [8]:
# check the shape of the data
data.shape 

(25000, 3)

In [9]:
# check missing values in data
data.isnull().sum() 

id           0
sentiment    0
review       0
dtype: int64

In [10]:
# evalute news sentiment distribution
data.sentiment.value_counts() 

1    12500
0    12500
Name: sentiment, dtype: int64

In [13]:
stop_words =  stopwords.words('english')
def text_cleaning(text, remove_stop_words=True, lemmatize_words=True):
    # Clean the text, with the option to remove stop_words and to lemmatize word
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"\'s", " ", text)
    text =  re.sub(r'http\S+',' link ', text)
    text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text) # remove numbers
        
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if lemmatize_words:
        text = text.split()
        lemmatizer = WordNetLemmatizer() 
        lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
        text = " ".join(lemmatized_words)
    
    # Return a list of words
    return(text)

In [16]:
#clean the review
data["cleaned_review"] = data["review"].apply(text_cleaning) 

In [17]:
#split features and target from  data 
X = data["cleaned_review"]
y = data.sentiment.values

In [18]:
# split data into train and validate
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [43]:
# Create a classifier in pipeline
sentiment_classifier = Pipeline(steps=[
                               ('pre_processing',TfidfVectorizer(lowercase=False)),
                                 ('naive_bayes',MultinomialNB())
                                 ])

In [46]:
scores = []
cv = KFold(n_splits=10, random_state=42, shuffle=True)
for train_index, test_index in cv.split(X):
    print("Train Index: ", train_index, "\n")
    print("Test Index: ", test_index)

    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
    sentiment_classifier.fit(X_train, y_train)
    scores.append(sentiment_classifier.score(X_test, y_test))

Train Index:  [    0     1     2 ... 24997 24998 24999] 

Test Index:  [   17    29    30 ... 24974 24983 24988]
Train Index:  [    0     1     2 ... 24996 24997 24998] 

Test Index:  [   44    46    54 ... 24985 24986 24999]
Train Index:  [    0     1     2 ... 24996 24998 24999] 

Test Index:  [    3     6    31 ... 24968 24984 24997]
Train Index:  [    1     2     3 ... 24997 24998 24999] 

Test Index:  [    0     4    19 ... 24970 24971 24992]
Train Index:  [    0     1     2 ... 24997 24998 24999] 

Test Index:  [    5     8    14 ... 24969 24990 24995]
Train Index:  [    0     1     2 ... 24997 24998 24999] 

Test Index:  [    7    23    36 ... 24979 24993 24994]
Train Index:  [    0     1     3 ... 24997 24998 24999] 

Test Index:  [    2    10    12 ... 24946 24987 24991]
Train Index:  [    0     2     3 ... 24997 24998 24999] 

Test Index:  [    1    18    48 ... 24941 24949 24973]
Train Index:  [    0     1     2 ... 24997 24998 24999] 

Test Index:  [   16    24    25 ... 24

In [50]:
print(np.mean(scores))

0.86764


In [39]:
# train the sentiment classifier 
sentiment_classifier.fit(X_train,y_train)

Pipeline(steps=[('pre_processing', TfidfVectorizer(lowercase=False)),
                ('naive_bayes', MultinomialNB())])

In [51]:
# test model performance on valid data 
y_preds = sentiment_classifier.predict(X_valid)

In [52]:
accuracy_score(y_valid,y_preds)

0.9154666666666667

In [24]:
#save model 
import joblib 
joblib.dump(sentiment_classifier, 'model/sentiment_model_pipeline.pkl')

['model/sentiment_model_pipeline.pkl']

In [25]:
#load model 
my_model = joblib.load("model/sentiment_model_pipeline.pkl")

In [31]:
review = text_cleaning("I love the movie from the marvel it was a best movie ever") 

In [32]:
my_model.predict([review]) 

array([1])