# Movie review pipeline

In [40]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# import CountVectorizer, nltk
from sklearn.feature_extraction.text import CountVectorizer
import nltk

In [41]:
# loading all files. 

data = pd.read_csv("all-data.csv", encoding='unicode_escape',names=['Sentiment', 'Text'])
data['Sentiment']

# Split data into training and test sets
sentence_train, sentence_test, y_train, y_test =  train_test_split(data["Text"], data["Sentiment"], test_size = 0.20, shuffle = False)
print(sentence_train)


0       According to Gran , the company has no plans t...
1       Technopolis plans to develop in stages an area...
2       The international electronic industry company ...
3       With the new production plant the company woul...
4       According to the company 's updated strategy f...
                              ...                        
3871    The newspaper 's best sales asset is high qual...
3872    The non-recurring costs caused to Talentum 's ...
3873    The ongoing project where Tekla Structures is ...
3874    The operations to be sold include manufacturin...
3875    The options might include a partial or total d...
Name: Text, Length: 3876, dtype: object


In [42]:
# Set NLTK data path
nltk.data.path.append(r"C:\Users\Rebecka\Documents\Liu\Kurser\Maskininlärning\Project\TNM108")

# Download NLTK data
nltk.download('punkt')

# Create a CountVectorizer instance
#Vzer = CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize, max_features=3000)
Vzer = CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize)

# Transform the training data into a matrix of token counts
sentence_train_counts = Vzer.fit_transform(sentence_train)


# initialize CountVectorizer
#movieVzer= CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize, max_features=3000) # use top 3000 words only. 78.25% acc.
#movieVzer = CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize)         # use all 25K words. Higher accuracy

# fit and tranform using training text 
#sentence_train_counts = movieVzer.fit_transform(sentence_train)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rebecka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [43]:
# Convert raw frequency counts into TF-IDF values
Tfmer = TfidfTransformer()
docs_train_tfidf = Tfmer.fit_transform(sentence_train_counts)

In [44]:
# Using the fitted vectorizer and transformer, tranform the test data
docs_test_counts = Vzer.transform(sentence_test)
docs_test_tfidf = Tfmer.transform(docs_test_counts)

NameError: name 'docs_test' is not defined

In [None]:
# Now ready to build a classifier. 
# We will use Multinominal Naive Bayes as our model
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Train a Multimoda Naive Bayes classifier. Again, we call it "fitting"
clf = MultinomialNB()
clf.fit(docs_train_tfidf, y_train)

In [None]:
#Parameter tuning using grid search
from sklearn.model_selection import GridSearchCV
parameters = {
    'alpha': [0.1, 0.5, 1.0, 5.0, 10.0],
    'class_prior': [None, [0.3, 0.7], [0.4, 0.6]],
    'fit_prior': [True, False],
}

#Use all CPU cores:
gs_clf = GridSearchCV(clf, parameters, cv=5, n_jobs=-1)

alpha:

    alpha is the smoothing parameter for Laplace smoothing. It helps handle unseen words in the test data. Common choices include 0 (no smoothing) and values like 0.1, 1.0, and 10.0.

class_prior:

    This parameter allows you to specify prior probabilities of the classes. If provided, the priors are not adjusted based on the data. It can be set to either None (indicating uniform class priors) or an array-like object representing prior probabilities of the classes.

fit_prior:

    This is a boolean parameter that indicates whether to learn class prior probabilities from the data. If set to True, the algorithm will estimate class priors based on the training data. If set to False, it uses a uniform prior

In [None]:
gs_clf = gs_clf.fit(docs_train_tfidf, y_train)

In [None]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

alpha: 0.5
class_prior: None
fit_prior: True


In [None]:
predicted = gs_clf.predict(docs_test_tfidf)

In [None]:
print(np.mean(predicted == y_test))

0.8175


# Parameter Tuning Using Grid Search

In [None]:
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism','soc.religion.christian','comp.graphics','sci.med']
twenty_train = fetch_20newsgroups(subset='train',categories=categories, shuffle=
True, random_state=42)
twenty_test = fetch_20newsgroups(subset='test',categories=categories, shuffle=True, random_state=42)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', MultinomialNB()),
])

In [None]:
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [None]:
import numpy as np
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
print("multinomialBC accuracy ",np.mean(predicted == twenty_test.target))

multinomialBC accuracy  0.9101198402130493


In [None]:
# training SVM classifier
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42
,max_iter=5, tol=None)),
])
text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)
print("SVM accuracy ",np.mean(predicted == twenty_test.target))

SVM accuracy  0.9101198402130493


In [None]:
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted,
target_names=twenty_test.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.80      0.87       319
         comp.graphics       0.87      0.98      0.92       389
               sci.med       0.94      0.89      0.91       396
soc.religion.christian       0.90      0.95      0.93       398

              accuracy                           0.91      1502
             macro avg       0.91      0.91      0.91      1502
          weighted avg       0.91      0.91      0.91      1502



In [None]:
print(metrics.confusion_matrix(twenty_test.target, predicted))

[[256  11  16  36]
 [  4 380   3   2]
 [  5  35 353   3]
 [  5  11   4 378]]


In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {
'vect__ngram_range': [(1, 1), (1, 2)],
'tfidf__use_idf': (True, False),
'clf__alpha': (1e-2, 1e-3),
}

In [None]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

In [None]:
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [None]:
print(twenty_train.target_names[gs_clf.predict(['God is love'])[0]])

soc.religion.christian


In [None]:
print(gs_clf.best_score_)

0.9175000000000001


In [None]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


# Movie reviews sentimental

In [None]:
import sklearn
from sklearn.datasets import load_files

In [None]:
moviedir = 'movie_reviews'

# loading all files. 
movie = load_files(moviedir, shuffle=True)

In [None]:
len(movie.data)

2000

In [None]:
#target names ("classes") are automatically generated from subfolder names
movie.target_names

['neg', 'pos']

In [None]:
# First file seems to be about a Schwarzenegger movie. 
movie.data[0][:500]

b"arnold schwarzenegger has been an icon for action enthusiasts , since the late 80's , but lately his films have been very sloppy and the one-liners are getting worse . \nit's hard seeing arnold as mr . freeze in batman and robin , especially when he says tons of ice jokes , but hey he got 15 million , what's it matter to him ? \nonce again arnold has signed to do another expensive blockbuster , that can't compare with the likes of the terminator series , true lies and even eraser . \nin this so cal"

# A detour: try out CountVectorizer & TF-IDF

In [None]:
# import CountVectorizer, nltk
from sklearn.feature_extraction.text import CountVectorizer
import nltk

In [None]:
# Turn off pretty printing of jupyter notebook... it generates long lines
%pprint

Pretty printing has been turned OFF


In [None]:
# Three tiny "documents"
docs = ['A rose is a rose is a rose is a rose.',
        'Oh, what a fine day it is.',
        "A day ain't over till it's truly over."]

In [None]:
# Initialize a CountVectorizer to use NLTK's tokenizer instead of its 
#    default one (which ignores punctuation and stopwords). 
# Minimum document frequency set to 1. 
fooVzer = CountVectorizer(min_df=1, tokenizer=nltk.word_tokenize)

In [None]:
# .fit_transform does two things:
# (1) fit: adapts fooVzer to the supplied text data (rounds up top words into vector space) 
# (2) transform: creates and returns a count-vectorized output of docs
docs_counts = fooVzer.fit_transform(docs)

# fooVzer now contains vocab dictionary which maps unique words to indexes
fooVzer.vocabulary_

{'a': 3, 'rose': 12, 'is': 7, '.': 2, 'oh': 10, ',': 1, 'what': 15, 'fine': 6, 'day': 5, 'it': 8, 'ai': 4, "n't": 9, 'over': 11, 'till': 13, "'s": 0, 'truly': 14}

In [None]:
# docs_counts has a dimension of 3 (document count) by 16 (# of unique words)
docs_counts.shape

(3, 16)

In [None]:
# this vector is small enough to view in a full, non-sparse form! 
docs_counts.toarray()

array([[0, 0, 1, 4, 0, 0, 0, 3, 0, 0, 0, 0, 4, 0, 0, 0],
       [0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 2, 0, 1, 1, 0]], dtype=int64)

In [None]:
# Convert raw frequency counts into TF-IDF (Term Frequency -- Inverse Document Frequency) values
from sklearn.feature_extraction.text import TfidfTransformer
fooTfmer = TfidfTransformer()

# Again, fit and transform
docs_tfidf = fooTfmer.fit_transform(docs_counts)

In [None]:
# TF-IDF values
# raw counts have been normalized against document length, 
# terms that are found across many docs are weighted down ('a' vs. 'rose')
docs_tfidf.toarray()

array([[0.        , 0.        , 0.11337964, 0.45351858, 0.        ,
        0.        , 0.        , 0.4379908 , 0.        , 0.        ,
        0.        , 0.        , 0.7678737 , 0.        , 0.        ,
        0.        ],
       [0.        , 0.39427404, 0.2328646 , 0.2328646 , 0.        ,
        0.29985557, 0.39427404, 0.29985557, 0.29985557, 0.        ,
        0.39427404, 0.        , 0.        , 0.        , 0.        ,
        0.39427404],
       [0.30352608, 0.        , 0.17926739, 0.17926739, 0.30352608,
        0.23083941, 0.        , 0.        , 0.23083941, 0.30352608,
        0.        , 0.60705216, 0.        , 0.30352608, 0.30352608,
        0.        ]])

In [None]:
# A list of new documents
newdocs = ["I have a rose and a lily.", "What a beautiful day."]

# This time, no fitting needed: transform the new docs into count-vectorized form
# Unseen words ('lily', 'beautiful', 'have', etc.) are ignored
newdocs_counts = fooVzer.transform(newdocs)
newdocs_counts.toarray()

array([[0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=int64)

# Back to real data: movie reviews

In [None]:
# Split data into training and test sets
from sklearn.model_selection import train_test_split
docs_train, docs_test, y_train, y_test = train_test_split(movie.data, movie.target, 
                                                          test_size = 0.20, random_state = 12)

In [None]:
# initialize CountVectorizer
movieVzer= CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize, max_features=3000) # use top 3000 words only. 78.25% acc.
# movieVzer = CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize)         # use all 25K words. Higher accuracy

# fit and tranform using training text 
docs_train_counts = movieVzer.fit_transform(docs_train)

In [None]:
# 'screen' is found in the corpus, mapped to index 2290
movieVzer.vocabulary_.get('screen')

2291

In [None]:
# Likewise, Mr. Steven Seagal is present...
movieVzer.vocabulary_.get('seagal')

2298

In [None]:
# huge dimensions! 1,600 documents, 3K unique terms. 
docs_train_counts.shape

(1600, 3000)

In [None]:
# Convert raw frequency counts into TF-IDF values
movieTfmer = TfidfTransformer()
docs_train_tfidf = movieTfmer.fit_transform(docs_train_counts)

In [None]:
# Same dimensions, now with tf-idf values instead of raw frequency counts
docs_train_tfidf.shape

(1600, 3000)

# The feature extraction functions and traning data are ready.

    Vectorizer and transformer have been built from the training data
    Training data text was also turned into TF-IDF vector form

## Next up: test data

    You have to prepare the test data using the same feature extraction scheme.



In [None]:
# Using the fitted vectorizer and transformer, tranform the test data
docs_test_counts = movieVzer.transform(docs_test)
docs_test_tfidf = movieTfmer.transform(docs_test_counts)

#### Training and testing a Naive Bayes classifier

In [None]:
# Now ready to build a classifier. 
# We will use Multinominal Naive Bayes as our model
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Train a Multimoda Naive Bayes classifier. Again, we call it "fitting"
clf = MultinomialNB()
clf.fit(docs_train_tfidf, y_train)

MultinomialNB()

In [None]:
# Predict the Test set results, find accuracy
y_pred = clf.predict(docs_test_tfidf)
sklearn.metrics.accuracy_score(y_test, y_pred)

0.7825

In [None]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[164,  42],
       [ 45, 149]], dtype=int64)

#### Trying the classifier on fake movie reviews

In [None]:
# very short and fake movie reviews
reviews_new = ['This movie was excellent', 'Absolute joy ride', 
            'Steven Seagal was terrible', 'Steven Seagal shone through.', 
              'This was certainly a movie', 'Two thumbs up', 'I fell asleep halfway through', 
              "We can't wait for the sequel!!", '!', '?', 'I cannot recommend this highly enough', 
              'instant classic.', 'Steven Seagal was amazing. His performance was Oscar-worthy.']

reviews_new_counts = movieVzer.transform(reviews_new)         # turn text into count vector
reviews_new_tfidf = movieTfmer.transform(reviews_new_counts)  # turn into tfidf vector

In [None]:
# have classifier make a prediction
pred = clf.predict(reviews_new_tfidf)

In [None]:
# print out results
for review, category in zip(reviews_new, pred):
    print('%r => %s' % (review, movie.target_names[category]))

'This movie was excellent' => pos
'Absolute joy ride' => pos
'Steven Seagal was terrible' => neg
'Steven Seagal shone through.' => neg
'This was certainly a movie' => neg
'Two thumbs up' => neg
'I fell asleep halfway through' => neg
"We can't wait for the sequel!!" => neg
'!' => neg
'?' => neg
'I cannot recommend this highly enough' => pos
'instant classic.' => pos
'Steven Seagal was amazing. His performance was Oscar-worthy.' => neg


In [None]:
# Mr. Seagal simply cannot win!

# Final notes

    In practice, you should use TfidfVectorizer, which is CountVectorizer and TfidfTranformer conveniently rolled into one:

    from sklearn.feature_extraction.text import TfidfVectorizer



    Also: It is a popular practice to use pipeline, which pairs up your feature extraction routine with your choice of ML model:

    model = make_pipeline(TfidfVectorizer(), MultinomialNB())

