<a href="https://colab.research.google.com/github/Arpith01/directional_stock_prediction/blob/master/notebooks/models/NB_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install colabcode

In [None]:
!pip install fastapi

In [5]:
import unicodedata
import re
import pandas as pd
import numpy as np
import nltk
nltk.downloader.download('vader_lexicon')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer,PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('stopwords')
nltk.download('punkt')
stopwords = nltk.corpus.stopwords.words('english')
from nltk.stem import LancasterStemmer,PorterStemmer


def nonAsciiChar(words):
    """Reduce every token to lower-case ASCII letters.

    Each token is NFKD-normalised and transliterated to ASCII (so an accented
    letter keeps its base letter), lower-cased, and finally stripped of every
    character that is not ``a-z``/``A-Z`` (digits, punctuation, underscores).

    Parameters
    ----------
    words : iterable of str
        Raw tokens.

    Returns
    -------
    list of str
        One cleaned token per input token, in the same order. A token made
        only of digits/punctuation becomes the empty string.
    """
    words_list = []
    for w in words:
        # Transliterate BEFORE stripping. The previous order removed every
        # non-ASCII letter first, which turned the normalisation into a no-op
        # and truncated words ("café" -> "caf" instead of "cafe").
        ascii_word = unicodedata.normalize('NFKD', w).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        # Lower-case after normalisation so compatibility characters that
        # decompose to capitals still end up lower-case.
        words_list.append(re.sub('[^a-zA-Z]+', '', ascii_word.lower()))
    return words_list
def stemWordsRemoval(words):
    """Porter-stem every token and keep only the first occurrence of each stem.

    Returns a list of unique stems in first-seen order.
    """
    porter = PorterStemmer()
    # dict keys keep insertion order, so this de-duplicates the stems while
    # preserving the position of each stem's first appearance.
    unique_stems = dict.fromkeys(porter.stem(token) for token in words)
    return list(unique_stems)

def stopWordsRemoval(words):
    """Return the tokens that are not in the module-level ``stopwords`` list.

    Order and duplicates of the remaining tokens are preserved.
    """
    return [token for token in words if token not in stopwords]

def removeLinks(words):
    """Drop tokens that look like the beginning of a URL.

    A token is removed when it starts with ``www`` or ``http`` (which also
    covers ``https``). All other tokens are returned in their original order.

    The previous pattern ``'[www]'`` was a character class equivalent to
    ``[w]``, so it discarded *every* token starting with the letter "w"
    ("wall", "week", "warn", ...), not just links.

    NOTE(review): if the persisted vectorizer/model were fitted on text cleaned
    with the old pattern, refit them so training and inference preprocessing
    agree.
    """
    return [w for w in words if not re.match(r'www|http', w)]

def spaceRemoval(words):
    """Return ``words`` without its empty-string entries, order preserved."""
    return [token for token in words if token != '']


def dataExtraction(words):
    """Run the token-cleaning pipeline over a list of tokens.

    The steps are applied in this fixed order, each one receiving the output
    of the previous one: ``nonAsciiChar`` -> ``spaceRemoval`` ->
    ``stopWordsRemoval`` -> ``stemWordsRemoval`` -> ``removeLinks``.
    """
    cleaning_steps = (
        nonAsciiChar,
        spaceRemoval,
        stopWordsRemoval,
        stemWordsRemoval,
        removeLinks,
    )
    for step in cleaning_steps:
        words = step(words)
    return words

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
def get_sentences(news_article):
    """Extract and clean the parts of an article that mention Amazon or Apple.

    The article is lower-cased, split into sentences with
    ``nltk.sent_tokenize`` and then split again on newlines. A fragment is kept
    with ``is_amazon == 1`` when it contains an Amazon keyword and with
    ``is_amazon == 0`` when it contains an Apple keyword; a fragment naming
    both companies is therefore kept twice, once per flag. Keywords are matched
    as plain substrings.

    Parameters
    ----------
    news_article : str
        Full text of the article.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when no fragment mentions either company. Otherwise a frame
        with columns ``text`` (tokens cleaned by ``dataExtraction`` and joined
        with single spaces) and ``is_amazon``. The frame can be empty if every
        fragment is removed by the filters below.
    """
    amazon_keywords = ['amazon', 'amzn']
    apple_keywords = ['apple', 'aapl']

    sentences = []
    is_amazon = []
    for para in nltk.sent_tokenize(news_article.lower()):
        for sentence in para.split("\n"):
            if any(key in sentence for key in amazon_keywords):
                sentences.append(sentence)
                is_amazon.append(1)
            if any(key in sentence for key in apple_keywords):
                sentences.append(sentence)
                is_amazon.append(0)

    if len(sentences) == 0:
        return None

    sentences_df = pd.DataFrame({'text': sentences, 'is_amazon': is_amazon})

    # Keep only fragments that start with a letter, digit or space.
    sentences_df = sentences_df[sentences_df['text'].str.match('^[A-Z a-z 0-9]+')]

    # keep=False discards EVERY copy of a repeated (text, is_amazon) row, not
    # just the extras. NOTE(review): presumably meant to drop boilerplate
    # lines; confirm this matches the training pipeline.
    sentences_df = sentences_df.drop_duplicates(keep=False).reset_index(drop=True)

    # Tokenise, clean and re-join each fragment in one pass.
    sentences_df['text'] = sentences_df.text.apply(
        lambda text: " ".join(dataExtraction(word_tokenize(text)))
    )

    return sentences_df

In [7]:
from pydantic import BaseModel
from fastapi import FastAPI

In [8]:
from fastapi.middleware.cors import CORSMiddleware

In [9]:
# Request body accepted by the POST /predict endpoint.
class Sentence(BaseModel):
  # Full text of the news article to analyse.
  news_article:str

In [10]:
# Response body returned by the POST /predict endpoint. Every field is a string;
# the endpoint uses "-1" / "0" as the class / confidence placeholders for a
# company that has no usable sentence in the article.
class Prediction(BaseModel):
  # Majority-vote class for the Apple sentences, or "-1".
  prediction_apple:str
  # Fraction of Apple sentences that voted for that class, or "0".
  confidence_apple:str
  # Majority-vote class for the Amazon sentences, or "-1".
  prediction_amazon:str
  # Fraction of Amazon sentences that voted for that class, or "0".
  confidence_amazon:str

In [11]:
# Application object; the middleware and the routes defined in later cells are registered on it.
app = FastAPI()

In [12]:
# Allow any web page to call this public demo API from the browser.
# Credentials are disabled on purpose: the FastAPI CORS guidance states that
# wildcard origins/methods/headers must not be combined with
# allow_credentials=True, and none of the routes in this notebook read cookies
# or auth headers. If credentialed requests are ever needed, list the trusted
# origins explicitly instead of "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

In [14]:
from google.colab import drive
# Mount Google Drive; the model, vectorizer and CSV paths used in later cells all live under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Naive Bayes

In [15]:
import pickle
path = "/content/drive/MyDrive/new_data_ARS/naive_bayes.sav"
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts that you produced yourself.
# The context manager closes the handle even if unpickling raises.
with open(path, 'rb') as infile:
    Naive_Bayes = pickle.load(infile)



In [16]:
# Alias used by the API startup hook and by the evaluation cells below.
saved_model = Naive_Bayes
# Bare expression: displays the estimator's repr as the cell output.
saved_model

GaussianNB(priors=None, var_smoothing=1e-09)

In [17]:
path = "/content/drive/MyDrive/new_data_ARS/vectorizer"
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts that you produced yourself.
# The context manager closes the handle even if unpickling raises.
with open(path, 'rb') as infile:
    vectorizer_dict = pickle.load(infile)

In [18]:
# The pickled object is a dict; the vectorizer itself is stored under the 'vectorizer' key.
# get_features() reads this module-level name.
vectorizer = vectorizer_dict['vectorizer']

In [19]:
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=4000, min_df=10,
                ngram_range=(2, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [20]:
def get_features(X_test):
  """Build the model input matrix for a collection of cleaned texts.

  For every text the VADER ``neg`` and ``pos`` polarity scores are computed,
  and the module-level ``vectorizer`` produces its count features. The result
  has one row per text with columns ``[neg, pos, <vectorizer features...>]``.

  Parameters
  ----------
  X_test : iterable of str
      Cleaned texts. It is iterated once for the sentiment scores and passed
      once to ``vectorizer.transform``.

  Returns
  -------
  numpy.ndarray
      Dense 2-D array: two sentiment columns followed by the vectorizer columns.
  """
  analyzer = SentimentIntensityAnalyzer()
  neg_scores = []
  pos_scores = []

  for text in tqdm(X_test):
      # Score each text once; the original scored it twice (once per key).
      scores = analyzer.polarity_scores(text)
      neg_scores.append(scores["neg"])
      pos_scores.append(scores["pos"])

  # Column order is deliberately (neg, pos): that is the layout the original
  # code produced despite its swapped variable names, and presumably the one
  # the saved model was trained on.
  sentiment_features = np.column_stack((neg_scores, pos_scores))
  ngram_features = vectorizer.transform(X_test).toarray()

  return np.concatenate((sentiment_features, ngram_features), axis = 1)

In [21]:
@app.on_event("startup")
def load_model_from_file():
  """Startup hook: publish the already-loaded model as the global ``inference_model``.

  Despite the name, nothing is read from disk here; the function only rebinds
  the module-level ``saved_model`` to the name the /predict handler uses.
  """
  global inference_model
  inference_model = saved_model


@app.get('/')
def index():
  # Landing route: returns a one-line usage hint as the JSON body.
  usage_hint = {'message':'Post an article to this API to get inference'}
  return usage_hint

def _majority_vote(sentence_texts):
  """Classify each sentence and return ``(winning_class, confidence)`` as strings.

  The winning class is the most frequent per-sentence prediction; the
  confidence is the share of sentences that voted for it.
  """
  features = get_features(sentence_texts)
  # Binarise the model output at 0.5 so np.bincount receives 0/1 integers.
  votes = np.where(inference_model.predict(features) > 0.5, 1, 0)
  vote_counts = np.bincount(votes.flatten())
  winner = vote_counts.argmax()
  confidence = vote_counts[winner] / np.sum(vote_counts)
  return str(winner), str(confidence)


@app.post('/predict')
def classify_sentence(data:Sentence):
  """Predict a class for Apple and for Amazon from one news article.

  The article is reduced to the sentences that mention each company; every
  sentence is classified and the majority class is reported together with the
  fraction of sentences that agreed with it. A company with no usable
  sentence is reported as prediction "-1" with confidence "0".
  """
  sentences = get_sentences(data.news_article)

  # Placeholders returned for a company that the article does not cover.
  prediction_class_amzn, confidence_amzn = "-1", '0'
  prediction_class_apple, confidence_apple = "-1", '0'

  if sentences is not None:
    amazon_sentences = sentences[sentences['is_amazon'] == 1].text
    apple_sentences = sentences[sentences['is_amazon'] == 0].text

    if len(amazon_sentences) != 0:
      prediction_class_amzn, confidence_amzn = _majority_vote(amazon_sentences)

    if len(apple_sentences) != 0:
      prediction_class_apple, confidence_apple = _majority_vote(apple_sentences)

  return Prediction(
    prediction_apple=prediction_class_apple,
    confidence_apple=confidence_apple,
    prediction_amazon=prediction_class_amzn,
    confidence_amazon=confidence_amzn,
  )

In [22]:
from sklearn.metrics import classification_report, confusion_matrix

# Labelled article CSVs for each company. Later cells read their `text`,
# `label` and `source` columns.
amazon_full = pd.read_csv("/content/drive/MyDrive/Amazon_csvs_published_source/Label46623.csv")

apple_full = pd.read_csv("/content/drive/MyDrive/apple_csvs_published_source/Label30000.csv")

# Drop rows without article text; they cannot be featurised.
amazon_full = amazon_full[amazon_full.text.notnull()]
apple_full = apple_full[apple_full.text.notnull()]

# Reproducible 50% random sample of each set, used below as evaluation data.
# NOTE(review): nothing in this notebook shows these rows were held out from
# training — confirm before trusting the reported metrics.
amazon_test = amazon_full.sample(frac=0.5, random_state=42).reset_index(drop=True)
apple_test = apple_full.sample(frac=0.5, random_state=42).reset_index(drop=True)

In [23]:
def get_counts(test, limit=5000):
  """Print evaluation metrics and per-source counts for the first rows of ``test``.

  Printed, in order: the classification report, the confusion matrix, the
  number of correctly classified rows per source, and the total number of
  evaluated rows per source.

  Parameters
  ----------
  test : pandas.DataFrame
      Frame with ``text``, ``label`` and ``source`` columns.
  limit : int or None, default 5000
      Number of leading rows to evaluate; ``None`` evaluates every row.
  """
  # Slice once instead of repeating `test[:5000]` in every expression.
  subset = test[:limit]

  predictions = saved_model.predict(get_features(subset.text))
  sig_preds = np.where(predictions>=0.5, 1, 0)

  print(classification_report(subset.label, sig_preds))
  print(confusion_matrix(subset.label, sig_preds))

  # Rows whose prediction equals the ground-truth label.
  correct_rows = subset[sig_preds.flatten() == subset.label]
  a_df = pd.DataFrame({"sites":correct_rows.source})
  print(a_df.sites.value_counts())
  print(subset.source.value_counts())

In [24]:
get_counts(apple_test)

100%|██████████| 5000/5000 [00:02<00:00, 2375.87it/s]


              precision    recall  f1-score   support

           0       0.56      0.30      0.39      3007
           1       0.38      0.64      0.47      1993

    accuracy                           0.44      5000
   macro avg       0.47      0.47      0.43      5000
weighted avg       0.49      0.44      0.42      5000

[[ 903 2104]
 [ 719 1274]]
www.yahoo.com             1003
seekingalpha.com           620
www.businesswire.com       243
investingnews.com          169
news.morningstar.com       124
www.thestreet.com           12
www.investors.com            3
seoland.in                   1
freeamericanetwork.com       1
www.fool.com                 1
Name: sites, dtype: int64
www.yahoo.com             3124
seekingalpha.com          1033
www.businesswire.com       332
investingnews.com          256
news.morningstar.com       225
www.thestreet.com           20
freeamericanetwork.com       4
www.investors.com            4
seoland.in                   1
www.fool.com                 1


In [25]:
get_counts(amazon_test)

100%|██████████| 5000/5000 [00:03<00:00, 1485.54it/s]


              precision    recall  f1-score   support

           0       0.72      0.35      0.47      3668
           1       0.26      0.64      0.37      1332

    accuracy                           0.42      5000
   macro avg       0.49      0.49      0.42      5000
weighted avg       0.60      0.42      0.44      5000

[[1273 2395]
 [ 483  849]]
www.yahoo.com               1096
www.investors.com            452
seekingalpha.com             390
www.thestreet.com             98
googlejuices.com              25
finance.yahoo.com              9
news.morningstar.com           8
www.topstocksforum.com         7
www.nasdaq.com                 6
www.financialnewsusa.com       6
chinese.wsj.com                5
www.fool.com                   3
thefly.com                     3
etfdailynews.com               3
virtualmining.com              2
www.msn.com                    2
www.wgmd.com                   1
redliontrader.com              1
www.foxbusiness.com            1
es-us.finanzas.yaho

In [27]:
import json

# Rebuild the evaluation samples. This repeats the loading cell above, so the
# notebook globals amazon_full / apple_full / amazon_test / apple_test are
# rebound here with the same values.
amazon_full = pd.read_csv("/content/drive/MyDrive/Amazon_csvs_published_source/Label46623.csv")
apple_full = pd.read_csv("/content/drive/MyDrive/apple_csvs_published_source/Label30000.csv")

amazon_full = amazon_full[amazon_full.text.notnull()]
apple_full = apple_full[apple_full.text.notnull()]

amazon_test = amazon_full.sample(frac=0.5, random_state=42).reset_index(drop=True)
apple_test = apple_full.sample(frac=0.5, random_state=42).reset_index(drop=True)

# Combine both companies and shuffle the rows reproducibly.
test_data = pd.concat([apple_test, amazon_test])
test_data = test_data.sample(frac=1, random_state=42).reset_index(drop=True)

### Run your testing logic here (generate features...)
predictions = saved_model.predict(get_features(test_data.text))
# sig_preds = tf.sigmoid(predictions)
# Binarise the model output at 0.5.
sig_preds = np.where(predictions>=0.5, 1, 0)

print(classification_report(test_data.label, sig_preds))
print(confusion_matrix(test_data.label, sig_preds))

# One row per article: publishing site, true label, predicted label.
results_df = pd.DataFrame({"site":test_data.source, "ground_truth":test_data.label, "predicted":sig_preds.flatten()})
### Check until here

# 1 when the prediction matches the label, else 0; only `site` and `correct` are kept.
results_df["correct"] = np.where(results_df.ground_truth == results_df.predicted, 1, 0)
results_df = results_df.drop(columns=['ground_truth', 'predicted'])

# Per site: count of rows (support) and sum of the 0/1 flag (number correct).
site_stats = results_df.groupby(["site"]).agg(['count', 'sum'])
# Flatten the two-level column index into 'correct_count' / 'correct_sum'.
site_stats.columns = ['_'.join(col) for col in site_stats.columns.values]
site_stats.reset_index(inplace=True)
site_stats.rename(columns={"correct_count":"support","correct_sum":"correct"}, inplace=True)
# Accuracy as a percentage.
site_stats["accuracy"] = 100 * site_stats.correct/site_stats.support

# Ignore sites with 50 or fewer articles.
site_stats_50 = site_stats[site_stats.support > 50]

top_10_sites = site_stats_50.sort_values(by=['accuracy'], ascending=False).iloc[:10]

# Emit one JSON object per line with a trailing comma; all values are stringified.
# NOTE(review): the loop variable `index` rebinds the notebook-level name of the
# `index` route function defined earlier; the route was registered by its
# decorator when that cell ran, but the name no longer refers to the function.
for index, row in top_10_sites.iterrows():
  d = {"site":str(row.site), "accuracy":str(row.accuracy), "support":str(row.support)}
  print(json.dumps(d)+",")

100%|██████████| 38312/38312 [00:23<00:00, 1631.41it/s]


              precision    recall  f1-score   support

           0       0.65      0.33      0.44     25859
           1       0.31      0.63      0.42     12453

    accuracy                           0.43     38312
   macro avg       0.48      0.48      0.43     38312
weighted avg       0.54      0.43      0.43     38312

[[ 8576 17283]
 [ 4571  7882]]
{"site": "www.businesswire.com", "accuracy": "75.7936507936508", "support": "1008"},
{"site": "www.thestreet.com", "accuracy": "67.99007444168734", "support": "806"},
{"site": "investingnews.com", "accuracy": "63.41756919374248", "support": "831"},
{"site": "seekingalpha.com", "accuracy": "62.60578279266573", "support": "5672"},
{"site": "www.topstocksforum.com", "accuracy": "58.62068965517241", "support": "58"},
{"site": "news.morningstar.com", "accuracy": "56.809815950920246", "support": "815"},
{"site": "finance.yahoo.com", "accuracy": "53.96825396825397", "support": "63"},
{"site": "googlejuices.com", "accuracy": "53.5545023696682

In [26]:
from colabcode import ColabCode
# Server wrapper used to expose the FastAPI app from Colab on port 10000.
# NOTE(review): code=False presumably disables colabcode's bundled code-server — confirm in its docs.
server = ColabCode(port=10000, code=False)

In [None]:
# Start serving `app`; this call blocks the cell and logs each request until the server is shut down.
server.run_app(app)



INFO:     Started server process [359]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:10000 (Press CTRL+C to quit)


Public URL: NgrokTunnel: "http://669ef4cd6374.ngrok.io" -> "http://localhost:10000"
INFO:     72.223.1.23:0 - "GET / HTTP/1.1" 200 OK
INFO:     72.223.1.23:0 - "GET / HTTP/1.1" 200 OK


100%|██████████| 1/1 [00:00<00:00, 544.57it/s]
100%|██████████| 1/1 [00:00<00:00, 540.29it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 169.57it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 388.25it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 182.65it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK



100%|██████████| 1/1 [00:00<00:00, 190.44it/s]

INFO:     72.223.1.23:0 - "POST /predict HTTP/1.1" 200 OK



INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [359]
