In [1]:
import numpy as np
from flask import Flask, request, jsonify, render_template
import pickle
import math
import contractions
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder

**Load model and vectorizer**

In [2]:
# Restore the persisted model and the TF-IDF vectorizer from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('best-model.pickle', 'rb') as model_file:
    my_model = pickle.load(model_file)

with open('tfidf_vectorizer.pickle', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

See scikit-learn's notes on the security and maintainability limitations of pickled models: https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


**Pipeline**

In [3]:
# English stop words (most frequent words). Stored as a set so that the
# per-token membership test during stop-word removal is an O(1) lookup
# instead of a linear scan over a list.
stop_words = set(stopwords.words('english'))
# WordNet lemmatizer instance shared by the lemmatization step
lemmatizer = WordNetLemmatizer()

In [4]:
# Expand contractions for better text interpretation and stop-word removal
def expand_contractions(text):
    """Return ``text`` with English contractions expanded.

    Two passes are applied:

    1. Every whitespace-separated token is passed through
       ``contractions.fix`` and the results are re-joined with single spaces.
    2. Any ``'m``, ``'s``, ``'re``, ``'ve``, ``'ll`` or ``'d`` still present
       is replaced (case-insensitively) by its spelled-out word.
    """
    # Pass 1: library-based expansion, token by token.
    joined = ' '.join(contractions.fix(token) for token in text.split())

    # Pass 2: fallback table for suffixes that are still in the text.
    suffix_to_word = {
        "'m": " am",
        "'s": " is",
        "'re": " are",
        "'ve": " have",
        "'ll": " will",
        "'d": " would",
    }
    alternatives = "|".join(map(re.escape, suffix_to_word))
    suffix_re = re.compile(r"\b(" + alternatives + r")\b", re.IGNORECASE)

    def _spell_out(match):
        # Keys are lowercase, so normalise the matched suffix before lookup.
        return suffix_to_word[match.group(0).lower()]

    return suffix_re.sub(_spell_out, joined)

# Remove stop words to reduce dimensionality and improve overall performance
def stopWordsRemoval(sentence):
    """Tokenize ``sentence`` and return it without stop words.

    Tokens are produced by ``word_tokenize``; those found in the module-level
    ``stop_words`` collection are dropped and the remaining tokens are joined
    with single spaces.
    """
    kept_tokens = [
        token for token in word_tokenize(sentence) if token not in stop_words
    ]
    return ' '.join(kept_tokens)

# Lemmatization helper: translate a POS tag into a WordNet part of speech
def get_wordnet_pos(treebank_tag):
    """Map a POS tag to the matching ``wordnet`` part-of-speech constant.

    The decision is based on the tag's prefix: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
    to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
    
# Lemmatization (similar to stemming, but takes each word's part of speech into account)
def lemmatization(sentence):
    """Return ``sentence`` with every token replaced by its lemma.

    The sentence is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``; each token is then lemmatized using the WordNet part of
    speech derived from its tag, and the lemmas are joined with single spaces.
    """
    tagged_tokens = pos_tag(word_tokenize(sentence))
    lemmas = [
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
        for token, tag in tagged_tokens
    ]
    return ' '.join(lemmas)

def text_preprocessing(text):
    """Clean raw text before it is vectorized.

    Steps, applied in this order:

    1. replace ``@handle`` mentions, URLs and ``#hashtags`` with a space;
    2. expand contractions;
    3. lowercase;
    4. remove stop words;
    5. lemmatize;
    6. replace every character that is not an ASCII letter, space or tab
       with a space;
    7. collapse runs of whitespace into single spaces.

    Returns the cleaned string.
    """
    # NOTE(review): presumably these steps mirror the preprocessing used when
    # the model was trained; confirm before changing any pattern or the order.

    # Remove Twitter handles.
    text = re.sub("(@[A-Za-z0-9]+)", " ",text)

    # Remove URLs. Raw string: the previous non-raw literal relied on invalid
    # escape sequences (\w, \/, \S), which newer Python versions warn about.
    text = re.sub(r"(\w+:\/\/\S+)", " ",text)

    # Remove hashtags
    text = re.sub(r'#\w*', ' ' , text)

    # Expand contractions
    text = expand_contractions(text)

    # Convert text to lowercase
    text = text.lower()

    # Remove stop words like are, is, has ...
    text = stopWordsRemoval(text)

    # Reduce each word to its lemma
    text = lemmatization(text)

    # Replace everything except letters, spaces and tabs with a space.
    text = re.sub("([^A-Za-z \t])", " ",text)

    # Remove extra white spaces
    text = " ".join(text.split())

    return text

In [5]:
# Display name for each class label returned by the model
classes = {0: 'Negative', 1: 'Positive'}

In [6]:
def predict_sentiment(text):
    """Return the sentiment display name predicted for ``text``.

    The text is cleaned with ``text_preprocessing``, vectorized with
    ``tfidf_vectorizer`` and classified by ``my_model``; the first (and only)
    predicted label is then looked up in ``classes``.
    """
    # The vectorizer is given a one-element batch.
    cleaned_batch = [text_preprocessing(text)]
    features = tfidf_vectorizer.transform(cleaned_batch)
    predicted_label = my_model.predict(features)[0]
    return classes[predicted_label]

In [7]:
# Quick sanity check of the whole pipeline on a single example sentence
predict_sentiment('This movie is so bad')

'Negative'

In [8]:
# Flask application object on which the route handlers are registered
app = Flask(__name__)

@app.route('/')
def home():
    """Serve the landing page by rendering ``index.html``."""
    return render_template('index.html')

@app.route('/predict', methods=['POST'])
def predict():
    """Classify the submitted text and re-render the page with the result.

    Reads the ``text`` field of the posted form. Blank or whitespace-only
    input is not sent to the model, because a prediction for empty text
    carries no information; the page is rendered with a prompt instead.
    Otherwise ``index.html`` is rendered with ``prediction_text`` set to the
    predicted sentiment.
    """
    # Indexing (rather than .get) keeps the existing behaviour for a missing
    # field: the lookup fails and the request is rejected by the framework.
    text = request.form['text']

    if not text.strip():
        return render_template(
            'index.html',
            prediction_text="Please enter some text to analyze.",
        )

    prediction = predict_sentiment(text)
    return render_template('index.html',prediction_text=f"Sentiment: {prediction}")

# Start Flask's built-in server on all network interfaces (0.0.0.0), port 5000.
# NOTE: this call blocks the cell until the server is stopped.
if __name__ == '__main__':
    app.run(host="0.0.0.0",port=5000)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.1.7:5000
Press CTRL+C to quit
127.0.0.1 - - [10/Dec/2023 00:42:59] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [10/Dec/2023 00:42:59] "GET /static/css/style.css HTTP/1.1" 304 -
127.0.0.1 - - [10/Dec/2023 00:42:59] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [10/Dec/2023 00:43:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [10/Dec/2023 00:43:06] "GET /static/css/style.css HTTP/1.1" 304 -
127.0.0.1 - - [10/Dec/2023 00:43:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [10/Dec/2023 00:43:09] "GET /static/css/style.css HTTP/1.1" 304 -
127.0.0.1 - - [10/Dec/2023 00:43:12] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [10/Dec/2023 00:43:12] "GET /static/css/style.css HTTP/1.1" 304 -
127.0.0.1 - - [10/Dec/2023 00:43:21] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [10/Dec/2023 00:43:21] "GET /static/css/style.css HTTP/1.1" 304 -
