<a href="https://colab.research.google.com/github/Arpith01/directional_stock_prediction/blob/master/notebooks/models/SVC_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install colabcode

In [None]:
!pip install fastapi

In [3]:
import unicodedata
import re
import pandas as pd
import numpy as np
import nltk
nltk.downloader.download('vader_lexicon')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer,PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('stopwords')
nltk.download('punkt')
stopwords = nltk.corpus.stopwords.words('english')
from nltk.stem import LancasterStemmer,PorterStemmer


def nonAsciiChar(words):
    """Lower-case each token and reduce it to its ASCII letters.

    Digits, punctuation, underscores and non-ASCII characters (accented
    letters included) are removed outright. A token without any ASCII letter
    becomes the empty string, so the result has one entry per input token.

    Args:
        words: iterable of string tokens.

    Returns:
        list of cleaned tokens, in input order.
    """
    def _clean(token):
        # Pass 1 drops non-word characters and digits; pass 2 drops whatever
        # is still not an ASCII letter (underscores, accented letters, ...).
        stripped = re.sub(r'[\W\d]', '', token.lower())
        letters_only = re.sub('[^a-zA-Z]+', '', stripped)
        # NOTE(review): only ASCII letters are left at this point, so the
        # NFKD fold below looks like a no-op ("café" yields "caf", not
        # "cafe"). Kept as-is so the tokens stay identical to what the
        # pickled vectorizer presumably saw - confirm before reordering.
        folded = unicodedata.normalize('NFKD', letters_only)
        return folded.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_clean(token) for token in words]
def stemWordsRemoval(words):
    """Porter-stem every token and keep only the first occurrence of each stem.

    Args:
        words: iterable of string tokens.

    Returns:
        list of distinct stems, ordered by first appearance.
    """
    porter = PorterStemmer()
    # dict keys keep insertion order, so this de-duplicates the stems while
    # preserving the position at which each one was first seen.
    distinct_stems = dict.fromkeys(porter.stem(token) for token in words)
    return list(distinct_stems)

def stopWordsRemoval(words):
    """Return the tokens of ``words`` that are not in the module-level ``stopwords``.

    ``stopwords`` is the collection bound at the top of this cell from
    ``nltk.corpus.stopwords.words('english')``. The comparison is exact, so
    callers are expected to have normalised the tokens already.
    """
    return [token for token in words if token not in stopwords]

def removeLinks(words):
    """Drop every token whose first character is the letter ``w``.

    NOTE(review): the pattern ``'[www]'`` is a character class equivalent to
    ``[w]``, so this removes *any* token starting with "w" ("week", "wall"),
    not only "www..." link remnants. The behaviour is intentionally left
    untouched here: the pickled vectorizer/model may have been fitted on text
    cleaned the same way, and tightening the pattern only at serving time
    would change the features they receive. Confirm against the training
    pipeline before fixing.

    Args:
        words: iterable of string tokens.

    Returns:
        list of the tokens that do not start with "w", in input order.
    """
    return [token for token in words if re.match('[www]', token) is None]

def spaceRemoval(words):
    """Return the tokens of ``words`` that are not the empty string, in order.

    Only ``''`` is filtered; whitespace-only tokens are kept.
    """
    return [token for token in words if token != '']


def dataExtraction(words):
    """Run a token list through the full cleaning pipeline.

    The stages are applied in this fixed order, each consuming the previous
    stage's output: letter-only normalisation, empty-token removal, stop-word
    removal, stemming with de-duplication, and finally ``removeLinks``.

    Args:
        words: iterable of string tokens.

    Returns:
        list of cleaned tokens.
    """
    pipeline = (
        nonAsciiChar,
        spaceRemoval,
        stopWordsRemoval,
        stemWordsRemoval,
        removeLinks,
    )
    for stage in pipeline:
        words = stage(words)
    return words

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...




[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
def get_sentences(news_article):
    """Extract and clean the lines of an article that mention Amazon or Apple.

    The article is lower-cased, split into sentences with
    ``nltk.sent_tokenize`` and then on newlines. Each resulting line is kept
    once per company whose keyword it contains (plain substring test), so a
    line naming both companies is recorded twice with different flags.

    Args:
        news_article: raw article text.

    Returns:
        ``None`` when no line mentions either company. Otherwise a DataFrame
        with the columns ``text`` (cleaned tokens joined by single spaces)
        and ``is_amazon`` (1 for an Amazon match, 0 for an Apple match). The
        frame can be empty if every matched line is filtered out below.
    """
    amazon_keywords = ['amazon', 'amzn']
    apple_keywords = ['apple', 'aapl']

    sentences = []
    is_amazon = []
    for para in nltk.sent_tokenize(news_article.lower()):
        for sentence in para.split("\n"):
            if any(key in sentence for key in amazon_keywords):
                sentences.append(sentence)
                is_amazon.append(1)
            if any(key in sentence for key in apple_keywords):
                sentences.append(sentence)
                is_amazon.append(0)

    if len(sentences) == 0:
        return None

    sentences_df = pd.DataFrame({'text': sentences, 'is_amazon': is_amazon})

    # Keep only lines that start with a letter, a digit or a space.
    sentences_df = sentences_df[sentences_df['text'].str.match('^[A-Z a-z 0-9]+')]

    # NOTE(review): keep=False discards *every* copy of a repeated
    # (text, is_amazon) row instead of keeping one. Left unchanged because it
    # affects which sentences get a vote - confirm whether that is intended.
    sentences_df = sentences_df.drop_duplicates(keep=False).reset_index(drop=True)

    # Tokenise, clean, and store the cleaned tokens back as a single string.
    sentences_df['words'] = sentences_df.text.apply(word_tokenize)
    sentences_df['words'] = sentences_df.words.apply(dataExtraction)
    sentences_df['text'] = sentences_df.words.apply(lambda words: " ".join(words))
    sentences_df = sentences_df.drop(columns="words")

    return sentences_df

In [5]:
from pydantic import BaseModel
from fastapi import FastAPI

In [6]:
from fastapi.middleware.cors import CORSMiddleware

In [7]:
# Request body of POST /predict.
class Sentence(BaseModel):
  # Raw article text; classify_sentence passes it to get_sentences().
  news_article:str

In [8]:
# Response body of POST /predict. Every field is a string; classify_sentence
# reports prediction "-1" and confidence "0" for a company that no sentence
# was scored for.
class Prediction(BaseModel):
  # Majority-vote class for the Apple sentences, as a string.
  prediction_apple:str
  # Share of Apple sentences that voted for the winning class, as a string.
  confidence_apple:str
  # Majority-vote class for the Amazon sentences, as a string.
  prediction_amazon:str
  # Share of Amazon sentences that voted for the winning class, as a string.
  confidence_amazon:str

In [9]:
# Application object; the route and startup decorators in later cells attach to it.
app = FastAPI()

In [10]:
# Let browser front-ends on any origin call the API.
# Credentials are disabled on purpose: the FastAPI CORS documentation states
# that origins, methods and headers must not be the wildcard "*" when
# allow_credentials is True, and this API uses neither cookies nor auth.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

In [11]:
from google.colab import drive
# Mount Google Drive; the pickled model and vectorizer are read from
# /content/drive/MyDrive/... in the following cells.
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
import pickle
# NOTE(security): pickle.load can execute arbitrary code while
# deserialising - only load files from a trusted location.
path = "/content/drive/MyDrive/new_data_ARS/SVM_LinearSVC_2.sav"
# The context manager closes the file even if unpickling raises.
with open(path, 'rb') as infile:
    SVC_Linear = pickle.load(infile)



In [13]:
# NOTE: despite its name, `saved_model_path` is bound to the unpickled
# SVC_Linear object itself, not to a filesystem path. The startup hook
# load_model_from_file() reads this name.
saved_model_path = SVC_Linear
saved_model_path

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=10)

In [14]:
# NOTE(security): pickle.load can execute arbitrary code while
# deserialising - only load files from a trusted location.
path = "/content/drive/MyDrive/new_data_ARS/vectorizer"
# The context manager closes the file even if unpickling raises.
with open(path, 'rb') as infile:
    vectorizer_dict = pickle.load(infile)

In [15]:
# The pickle holds a dict; the vectorizer used by get_features() is stored
# under the 'vectorizer' key.
vectorizer = vectorizer_dict['vectorizer']

In [16]:
# Display the loaded vectorizer's configuration as the cell output.
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=4000, min_df=10,
                ngram_range=(2, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [17]:
def get_features(X_test):
    """Build the feature matrix for a collection of cleaned sentences.

    Each row is ``[vader_neg, vader_pos, <vectorizer columns...>]``: the two
    VADER polarity scores of the sentence, followed by the dense output of
    the module-level ``vectorizer`` for that sentence.

    Args:
        X_test: iterable of sentence strings (classify_sentence passes a
            pandas Series).

    Returns:
        2-D numpy array with one row per sentence.
    """
    analyzer = SentimentIntensityAnalyzer()
    neg_scores = []
    pos_scores = []

    for text in tqdm(X_test):
        # Score each sentence once and read both components from the result.
        scores = analyzer.polarity_scores(text)
        neg_scores.append(scores["neg"])
        pos_scores.append(scores["pos"])

    # Column order is (neg, pos), exactly what the previous implementation
    # produced (its locals were named the other way round). Do not swap it:
    # the pickled model presumably expects this order - confirm against the
    # training code.
    sentiment_features = np.column_stack((neg_scores, pos_scores))

    ngram_features = vectorizer.transform(X_test).toarray()
    return np.concatenate((sentiment_features, ngram_features), axis=1)

In [18]:
@app.on_event("startup")
def load_model_from_file():
  """Bind the model to the global ``inference_model`` when the app starts.

  Despite the function name, nothing is read from disk here: the value comes
  from the module-level ``saved_model_path``, which an earlier cell assigns
  the already-unpickled ``SVC_Linear`` object.
  """
  global inference_model
  inference_model = saved_model_path


@app.get('/')
def index():
    """Return a short usage hint for the API root."""
    usage_hint = {'message': 'Post an article to this API to get inference'}
    return usage_hint

def _majority_vote(texts):
    """Classify each cleaned sentence and return the majority class with its vote share.

    Args:
        texts: non-empty collection of cleaned sentence strings.

    Returns:
        ``(label, confidence)`` as strings: the class (0 or 1) chosen by most
        sentences and the fraction of sentences that voted for it. On a tie
        the lower label wins, because ``argmax`` returns the first maximum.
    """
    features = get_features(texts)
    raw_predictions = inference_model.predict(features)
    # Anything above 0.5 counts as a vote for class 1, everything else for 0.
    votes = np.where(raw_predictions > 0.5, 1, 0).flatten()
    vote_counts = np.bincount(votes)
    winner = vote_counts.argmax()
    confidence = vote_counts[winner] / np.sum(vote_counts)
    return str(winner), str(confidence)


@app.post('/predict')
def classify_sentence(data:Sentence):
    """Predict a class for Apple and for Amazon from a news article.

    The article is reduced to the sentences mentioning each company; every
    sentence is classified by ``inference_model`` and the per-company result
    is the majority class together with its share of the votes.

    Args:
        data: request body carrying the raw ``news_article`` text.

    Returns:
        A ``Prediction`` whose fields are all strings. A company for which no
        sentence was scored is reported with prediction "-1" and
        confidence "0".
    """
    sentences = get_sentences(data.news_article)

    # Defaults for a company that ends up with no sentence to score.
    prediction_class_amzn, confidence_amzn = "-1", "0"
    prediction_class_apple, confidence_apple = "-1", "0"

    if sentences is not None:
        amazon_sentences = sentences[sentences['is_amazon'] == 1].text
        apple_sentences = sentences[sentences['is_amazon'] == 0].text

        if len(amazon_sentences) != 0:
            prediction_class_amzn, confidence_amzn = _majority_vote(amazon_sentences)

        if len(apple_sentences) != 0:
            prediction_class_apple, confidence_apple = _majority_vote(apple_sentences)

    return Prediction(
        prediction_apple=prediction_class_apple,
        confidence_apple=confidence_apple,
        prediction_amazon=prediction_class_amzn,
        confidence_amazon=confidence_amzn,
    )

In [19]:
from colabcode import ColabCode
# Wrapper used below to serve `app` on port 10000.
# NOTE(review): code=False presumably disables the bundled code-server UI so
# that only the API is exposed - confirm against the colabcode documentation.
server = ColabCode(port=10000, code=False)

In [None]:
# Start serving `app`; the recorded output shows uvicorn listening on port
# 10000 behind a public ngrok URL. The cell keeps running while serving.
server.run_app(app)



INFO:     Started server process [63]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:10000 (Press CTRL+C to quit)


Public URL: NgrokTunnel: "http://fc8ffcd2c751.ngrok.io" -> "http://localhost:10000"
INFO:     72.223.1.23:0 - "GET / HTTP/1.1" 200 OK


100%|██████████| 1/1 [00:00<00:00, 969.78it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK



100%|██████████| 2/2 [00:00<00:00, 295.58it/s]
100%|██████████| 1/1 [00:00<00:00, 245.88it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 674.22it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK



100%|██████████| 11/11 [00:00<00:00, 2309.18it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK



100%|██████████| 11/11 [00:00<00:00, 1813.00it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 579.72it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 817.76it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK





INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK


100%|██████████| 1/1 [00:00<00:00, 419.89it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK



100%|██████████| 11/11 [00:00<00:00, 2005.36it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 620.83it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 609.73it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 1113.73it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 1846.08it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 260.84it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK



