## Unzip all saved models and load the necessary packages

In [12]:
!unzip saved_model.zip

Archive:  saved_model.zip
replace saved_model/count_vector.pickel? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: saved_model/count_vector.pickel  
  inflating: saved_model/nn/variables/variables.data-00000-of-00001  
  inflating: saved_model/nn/variables/variables.index  
  inflating: saved_model/nn/saved_model.pb  
  inflating: saved_model/svc.pkl     
  inflating: saved_model/lstm_encoder.pickel  
  inflating: saved_model/lstm/variables/variables.data-00000-of-00001  
  inflating: saved_model/lstm/variables/variables.index  
  inflating: saved_model/lstm/saved_model.pb  


In [13]:
!pip install vaderSentiment
!pip install tensorflow-datasets




In [14]:
import pickle
import joblib
import string
import pandas as pd
import numpy as np
from string import punctuation
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#nltk
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.tokenize import word_tokenize,sent_tokenize
import numpy as np
# Download the NLTK data packages this notebook relies on.
nltk.download('punkt')  # presumably backs word_tokenize / sent_tokenize
nltk.download('stopwords')  # word lists read through stopwords.words(...)
nltk.download('averaged_perceptron_tagger')  # presumably for pos_tag, which is imported above — confirm it is still needed
nltk.download('wordnet')  # presumably for wordnet / WordNetLemmatizer, imported above — confirm it is still needed

import sklearn
from sklearn.svm import SVC
import tensorflow as tf
from keras.datasets import imdb

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 1. Benchmark - Using VaderSentiment

In [15]:
# Finance-specific words and the valence assigned to each; the dict is merged
# into the VADER lexicon with analyser.lexicon.update(new_words).
# NOTE(review): several magnitudes here (e.g. -100, -200) look far larger than
# ordinary sentiment valences, so a single matching word can presumably saturate
# the compound score — confirm this scale is intentional.
new_words = {
    'crushes': 10,
    'beats': 5,
    'increase':10,
    'increasing':10,
    'long': 50,
    'misses': -5,
    'trouble': -10,
    'falls': -100,
    'drops':-100,
    'dropping':-200,
    'falling':-100,
    
}

In [16]:
analyser = SentimentIntensityAnalyzer()
# Baseline: score a sample headline before the custom words are added.
score = analyser.polarity_scores("Apple Stock Is Falling Again. Why That’s Not a Problem for the Dow.")
print(score)
# Merge the custom word weights into this analyser's lexicon.
analyser.lexicon.update(new_words)
# Score the same headline again so the two printed results can be compared.
score = analyser.polarity_scores("Apple Stock Is Falling Again. Why That’s Not a Problem for the Dow.")
print(score)

{'neg': 0.108, 'neu': 0.74, 'pos': 0.152, 'compound': 0.1675}
{'neg': 0.884, 'neu': 0.096, 'pos': 0.02, 'compound': -0.9992}


# 2. Using an LSTM model trained on the IMDB dataset from TensorFlow

### Preprocessing functions for LSTM model

In [17]:
def pad_to_size(vec, size):
    """Return a copy of ``vec`` right-padded with zeros up to ``size`` items.

    The caller's sequence is left untouched; padding is applied to a fresh
    list. A sequence that already holds ``size`` or more items is returned
    with its length unchanged (nothing is truncated).

    Args:
        vec: Sequence of values to pad.
        size: Desired minimum length of the result.

    Returns:
        A new list containing the items of ``vec`` followed by the zeros
        needed to reach ``size``.
    """
    padded = list(vec)
    # A negative multiplier yields an empty list, so over-long input is kept as is.
    padded.extend([0] * (size - len(padded)))
    return padded

def sample_predict(sample_pred_text, encoder, pad, model, pad_size=64):
    """Encode one piece of text and run it through ``model`` as a batch of one.

    Args:
        sample_pred_text: Raw text to score.
        encoder: Object whose ``encode(text)`` result is used as the model input
            sequence.
        pad: When truthy, the encoded sequence is zero-padded to ``pad_size``
            items with ``pad_to_size`` before prediction.
        model: Object exposing ``predict``; it receives a float32 tensor with a
            leading batch axis of size 1.
        pad_size: Target length used when ``pad`` is truthy. Defaults to 64.

    Returns:
        The value returned by ``model.predict`` for the single-item batch.
    """
    token_ids = encoder.encode(sample_pred_text)

    if pad:
        token_ids = pad_to_size(token_ids, pad_size)

    # Cast to float32 and add the batch axis the model's predict call is given.
    batch = tf.expand_dims(tf.cast(token_ids, tf.float32), 0)
    return model.predict(batch)



## Load model

In [18]:
# NOTE: unpickling can execute arbitrary code — only load encoder files that
# come from a trusted source.
# The context manager closes the file handle as soon as the encoder is read.
with open("saved_model/lstm_encoder.pickel", "rb") as encoder_file:
    encoder_loaded = pickle.load(encoder_file)
lstm_loaded = tf.keras.models.load_model('saved_model/lstm')

# Check its architecture
lstm_loaded.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          523840    
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 128)         66048     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 635,329
Trainable params: 635,329
Non-trainable params: 0
________________________________________________

### Prediction Example by LSTM

In [19]:
# Score one headline with the LSTM alone, without padding (pad=False), and
# print the raw value returned by the model.
new_prediction = sample_predict("Apple Stock Is Falling Again. Why That’s Not a Problem for the Dow.", encoder = encoder_loaded, pad=False, model = lstm_loaded)
print(new_prediction)

[[-0.48384848]]


# 3. Building a Neural Network and training it on the IMDB Dataset

## Text preprocessing

In [20]:
# Width of the bag-of-words vector built by clean_words (its default `dimension`).
TOP_WORDS = 10000
# NOTE(review): stop_words and ps are not referenced by any other visible cell —
# confirm they are still needed.
stop_words = set(stopwords.words("english"))
ps = PorterStemmer()

In [21]:

# Word -> integer index mapping provided by the Keras IMDB dataset module.
word2index = imdb.get_word_index()
# Shift every index up by 3.
# NOTE(review): presumably this matches the index offset used when the training
# data was encoded (low indices reserved for special tokens) — confirm against
# the training notebook.
word2index = {k:(v+3) for k,v in word2index.items()}

def clean_symbols(text):
    """Keep only letters, digits and spaces, then normalise the spacing.

    Every character that is neither alphabetic, a digit, nor a plain space is
    deleted outright (not replaced by a space), so ``"let's"`` becomes
    ``"lets"`` and ``"a\\tb"`` becomes ``"ab"``. Runs of spaces are then
    collapsed and leading/trailing spaces removed.

    Args:
        text: Input string.

    Returns:
        The cleaned string with single spaces between the remaining words.
    """
    # One linear pass over the text instead of one str.replace call per
    # unwanted character.
    kept = ''.join(ch for ch in text if ch.isalpha() or ch.isdigit() or ch == ' ')
    return ' '.join(kept.split())

def clean_words(news, dimension = TOP_WORDS):
    """Turn a piece of text into a ``1 x dimension`` multi-hot row vector.

    The text is cleaned with ``clean_symbols``, lower-cased and tokenized;
    each token found in ``word2index`` sets the position given by its index
    to 1. Tokens missing from ``word2index``, and indices that do not fit in
    the vector, are ignored.

    Args:
        news: Raw text to vectorise.
        dimension: Length of the output vector. Defaults to ``TOP_WORDS``.

    Returns:
        A NumPy array of shape ``(1, dimension)`` holding zeros and ones.
    """
    cleaned = clean_symbols(news).lower()
    indices = [word2index[word] for word in word_tokenize(cleaned) if word in word2index]

    results = np.zeros(dimension)
    for index in indices:
        # Indices at or beyond the vector width are dropped.
        if index < dimension:
            results[index] = 1

    # Reshape with `dimension` rather than the global TOP_WORDS so that a
    # non-default width does not raise.
    return np.reshape(results, (1, dimension))

## Load the model

In [22]:
# Restore the dense network from the 'saved_model/nn' directory.
NN_loaded = tf.keras.models.load_model('saved_model/nn')

# Check its architecture
NN_loaded.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 50)                500050    
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 50)                2550      
_________________________________________________________________
dropout_3 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_7 (Dense)              (None, 16)                816       
_________________________________________________________________
dropout_4 (Dropout)          (None, 16)               

In [23]:
def predict_sentiment(x_test, NN):
    """Score raw text with the bag-of-words network.

    The text is vectorised by ``clean_words`` (vector width ``TOP_WORDS``) and
    passed to ``NN.predict``; that call's result is returned unchanged.

    Args:
        x_test: Raw text to score.
        NN: Model object exposing ``predict``.

    Returns:
        The value returned by ``NN.predict`` for the single vectorised row.
    """
    features = clean_words(x_test, dimension=TOP_WORDS)
    return NN.predict(features)

# Quick check on one sentence; the value returned by the network is displayed as the cell output.
predict_sentiment("it is good and let's have a try", NN_loaded)


array([[0.9595045]], dtype=float32)

# 4. SVC trained on another dataset

In [24]:
# Build the set of tokens that cleanText discards: NLTK's English stop words
# plus every character of string.punctuation.
stop = set(stopwords.words('english'))
stop.update(punctuation)  # updating from a string adds each character as its own entry
# NOTE(review): printing the whole set yields a very long output; consider
# showing len(stop) instead.
print(stop)

{'here', "you'll", 'who', '.', 'our', 'be', 'herself', 'below', 'through', 'more', "she's", 'being', 'their', 'having', "'", '}', 'then', ';', 'of', "you're", 'up', 'why', '$', 'as', "that'll", 't', 'didn', "weren't", 'your', 'haven', 'if', 'yourselves', 'have', '<', 'down', "needn't", '-', 'with', "hasn't", 'won', 'there', 're', 'hers', 'what', "you'd", 'when', '&', '?', '_', 'his', 'from', 'mustn', "shan't", 'doing', 'isn', '`', 'should', 'shouldn', 'me', 'is', ':', 'these', 'but', 'than', "don't", 'him', 'm', '#', 'each', ']', 'too', 'off', '\\', 'wasn', '%', 'against', 'ain', 'hasn', "wouldn't", 'both', 'not', 'so', 'over', 'can', '[', "it's", "should've", "aren't", 'whom', 'he', 'its', 'has', 'above', '>', '(', 'shan', 'while', 'yourself', 'during', "didn't", '^', "isn't", 'just', 'about', 'ma', 'doesn', "mustn't", 'does', 'ourselves', 'yours', 'further', 'after', 've', '{', 'before', 'mightn', '!', 'a', "wasn't", 'once', 'how', ')', "doesn't", '~', 'her', 'did', 'at', 'will', 'on

In [25]:
from bs4 import BeautifulSoup
import re
def cleanText(text):
    """Normalise raw text and return its remaining tokens as a list.

    Steps, in order: extract the text content via BeautifulSoup's lxml parser,
    replace each "|||" with a space, replace anything starting with "http" up
    to the next whitespace by "<URL>", lower-case, delete every letter "x",
    tokenize, and drop tokens contained in the module-level ``stop`` set.

    Args:
        text: Raw input text (may contain markup).

    Returns:
        List of tokens that survived the stop-word/punctuation filter.
    """
    stripped = BeautifulSoup(text, "lxml").text
    without_bars = re.sub(r'\|\|\|', r' ', stripped)
    with_url_tag = re.sub(r'http\S+', r'<URL>', without_bars)
    # NOTE(review): this deletes every "x" (so "next" becomes "net"); presumably
    # it mirrors the preprocessing the vectoriser was fitted with — confirm
    # before changing it.
    normalised = with_url_tag.lower().replace('x', '')

    return [token for token in word_tokenize(normalised) if token.lower() not in stop]
    
def join_text(text):
    """Concatenate an iterable of string tokens into one space-separated string."""
    separator = " "
    return separator.join(text)

## Load the model

In [26]:
# Load the pickled vectoriser (it is later used through its .transform method).
# NOTE: unpickling can execute arbitrary code — only load files that come from
# a trusted source.
# The context manager closes the file handle as soon as the object is read.
with open("saved_model/count_vector.pickel", "rb") as vectorizer_file:
    count_vec_loaded = pickle.load(vectorizer_file)

# Load the saved SVC classifier.
svc_loaded = joblib.load("saved_model/svc.pkl")

In [27]:
def svc_predict(svc, count_vec, news):
    """Return the class probabilities ``svc`` assigns to one news text.

    The text is preprocessed with ``cleanText``, re-joined into a single
    string with ``join_text``, vectorised by ``count_vec.transform`` and passed
    to ``svc.predict_proba``.

    Args:
        svc: Classifier exposing ``predict_proba``.
        count_vec: Vectoriser exposing ``transform``; its result must offer
            ``toarray()``.
        news: Raw news text.

    Returns:
        The value returned by ``svc.predict_proba`` for the single sample
        (one row of per-class probabilities).
    """
    news_processed = join_text(cleanText(news))

    # toarray() gives a plain ndarray. todense() returns numpy.matrix, which
    # recent scikit-learn releases reject as estimator input.
    test_news = count_vec.transform([news_processed]).toarray()
    return svc.predict_proba(test_news)

In [28]:
# Class probabilities for one headline; predict_rating reads the columns as
# [negative, neutral, positive].
print(svc_predict(svc_loaded, count_vec_loaded, "Investors are looking to buy more stocks"))

[[0.06255801 0.72184132 0.21560067]]


# Combine all models together to give the final rating

In [29]:
def predict_rating(news, lstm, NN):
    """Combine VADER, the LSTM, the dense network and the SVC into one rating.

    Each model's score is printed. The SVC's most probable class (index 0 =
    negative, 1 = neutral, 2 = positive) selects the branch; the other three
    scores then refine the label.

    Besides its arguments, this function reads the notebook globals
    ``analyser``, ``encoder_loaded``, ``svc_loaded`` and ``count_vec_loaded``.

    Args:
        news: Text to rate.
        lstm: Model handed to ``sample_predict`` (always with padding enabled).
        NN: Model handed to ``predict_sentiment``.

    Returns:
        A ``(label, score)`` tuple. ``score`` is the SVC positive-class
        probability for positive labels, the negated negative-class
        probability for negative labels, and 0 for "Neutral".
    """
    vader_score = analyser.polarity_scores(news)['compound']
    print(f'result from vadar: {vader_score:.2f}')

    # LSTM prediction, with the input padded.
    lstm_score = sample_predict(news, encoder_loaded, pad=True, model=lstm)[0][0]
    print(f'result from lstm: {lstm_score:.2f}')

    # Dense bag-of-words network prediction.
    nn_score = predict_sentiment(news, NN)[0][0]
    print(f'result from Neural Network: {nn_score:.2f}')

    # SVC class probabilities for the single sample.
    svc_probs = svc_predict(svc_loaded, count_vec_loaded, news)[0]
    print(f"result from SVC: positive:{svc_probs[2]:.2f}, neutral:{svc_probs[1]:.2f}, negative:{svc_probs[0]:.2f}")

    top_class = np.argmax(svc_probs)

    if top_class == 2:
        # SVC says positive: count how many of the other models agree.
        agreement = (vader_score >= 0.05, lstm_score >= 0, nn_score >= 0.5)
        if all(agreement):
            label = "Strongly Positive"
        elif any(agreement):
            label = "Slightly Positive"
        else:
            # NOTE(review): with no agreeing model the label is "Positive",
            # which reads stronger than "Slightly Positive" above — the tier
            # order may be inverted; confirm.
            label = "Positive"
        return label, svc_probs[2]

    if top_class == 0:
        # SVC says negative: same agreement check, mirrored.
        agreement = (vader_score < -0.05, lstm_score < 0, nn_score < 0.5)
        if all(agreement):
            label = "Strongly Negative"
        elif any(agreement):
            label = "Slightly Negative"
        else:
            # NOTE(review): same tier-order question as in the positive branch.
            label = "Negative"
        return label, svc_probs[0] * -1

    # Any other top class (neutral): the VADER score gets the first say.
    if vader_score < -0.05:
        return "Negative", svc_probs[0] * -1
    if vader_score > 0.05:
        return "Positive", svc_probs[2]
    if svc_probs[1] > 0.7:
        return "Neutral", svc_probs[1] * 0
    # Weakly neutral: lean towards the larger of the two remaining classes.
    if svc_probs[0] > svc_probs[2]:
        return "Slightly Negative", svc_probs[0] * -1
    return "Slightly Positive", svc_probs[2]

            
    

# Prediction Example by using the final combined model

In [30]:
# In the recorded run the SVC's top class is neutral, so the VADER score decides
# the label; the numeric part is the negated SVC negative-class probability.
predict_rating("Apple Stock Is Falling Again. Why That’s Not a Problem for the Dow.", lstm_loaded, NN_loaded)

result from vadar: -1.00
result from lstm: -0.31
result from Neural Network: 0.00
result from SVC: positive:0.09, neutral:0.89, negative:0.02


('Negative', -0.02164393735425806)

In [31]:
# Recorded run: neutral SVC top class plus a VADER score above 0.05 gives
# 'Positive', paired with the SVC positive-class probability.
predict_rating("Apple is launching new products in the next month and it displays good prospect. Investors are keen to invest", lstm_loaded, NN_loaded)

result from vadar: 0.77
result from lstm: 2.15
result from Neural Network: 1.00
result from SVC: positive:0.32, neutral:0.65, negative:0.03


('Positive', 0.3216184616392452)

In [32]:
# Recorded run: VADER scores 0.00 and the SVC neutral probability exceeds 0.7,
# so the result is 'Neutral'.
# NOTE(review): for an outage headline this looks like a miss — worth checking.
predict_rating("'Total blackout': Facebook, Instagram crashes across the globe", lstm_loaded, NN_loaded)

result from vadar: 0.00
result from lstm: 3.63
result from Neural Network: 0.90
result from SVC: positive:0.02, neutral:0.94, negative:0.04


('Neutral', 0.0)

In [33]:
# Recorded run: neutral SVC top class; VADER's -1.00 (presumably driven by the
# custom weight for 'falling') decides the 'Negative' label.
predict_rating("The market is falling", lstm_loaded, NN_loaded)

result from vadar: -1.00
result from lstm: 0.48
result from Neural Network: 0.22
result from SVC: positive:0.05, neutral:0.93, negative:0.02


('Negative', -0.018970463804799848)

In [34]:
# Recorded run: neutral SVC top class; the VADER score is below -0.05, so the
# label is 'Negative'.
predict_rating("Vodafone suffers network outage", lstm_loaded, NN_loaded)

result from vadar: -0.48
result from lstm: 0.48
result from Neural Network: 0.11
result from SVC: positive:0.05, neutral:0.93, negative:0.01


('Negative', -0.014486554855328676)

In [35]:
# Recorded run: the SVC's top class is positive; VADER and the LSTM agree but
# the dense network (0.01) does not, which yields 'Slightly Positive'.
predict_rating("Facebook is generating high profit in the current quarter", lstm_loaded, NN_loaded)

result from vadar: 0.44
result from lstm: 0.57
result from Neural Network: 0.01
result from SVC: positive:0.46, neutral:0.30, negative:0.23


('Slightly Positive', 0.4630470420612455)

In [36]:
# Recorded run: the SVC's top class is negative and all three other scores
# agree, which yields 'Strongly Negative'.
predict_rating("Tesla is generating large operating loss in the current quarter", lstm_loaded, NN_loaded)

result from vadar: -0.32
result from lstm: -2.43
result from Neural Network: 0.00
result from SVC: positive:0.07, neutral:0.07, negative:0.86


('Strongly Negative', -0.8609715066051844)

In [38]:
# Recorded run: neutral SVC top class; a VADER score of -0.20 (below -0.05) is
# enough for 'Negative'.
predict_rating("SoftBank is under investigation by the SEC following its risky 'Nasdaq whale' investments (UBER, AAPL, TSLA)", lstm_loaded, NN_loaded)

result from vadar: -0.20
result from lstm: -2.01
result from Neural Network: 0.53
result from SVC: positive:0.09, neutral:0.90, negative:0.01


('Negative', -0.011860595932658261)

In [39]:
# Recorded run: neutral SVC top class; a VADER score of 0.13 (above 0.05) gives
# 'Positive' even though the LSTM output is slightly below zero.
predict_rating("Apple can definitely build a car — but it wouldn't want to sell it in the US (AAPL)", lstm_loaded, NN_loaded)


result from vadar: 0.13
result from lstm: -0.03
result from Neural Network: 0.99
result from SVC: positive:0.06, neutral:0.89, negative:0.05


('Positive', 0.06271062598495668)