## Importing Packages

In [4]:
!pip install easyOCR
!pip install catboost
import easyocr
import matplotlib.pyplot as plt
import cv2
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import pickle as pk
from skimage.measure import compare_ssim
import argparse
import imutils
import statistics

import nltk
import re
import string
from string import punctuation
from nltk.corpus import stopwords # Words which do not contribute to the sentiment analysis
from nltk.tokenize import word_tokenize # Separating sentences in different components, every word and punctuation
from nltk.stem import LancasterStemmer # Converts the words to root words which might or might not make sense
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import keras
from keras import models, layers
from keras.utils import to_categorical
from keras.models import load_model

# Fetch only the NLTK resources the code below relies on. Calling
# nltk.download() with no arguments opens the interactive downloader, which
# stalls any unattended run of this notebook.
#   stopwords         -> stopwords.words('english') in preprocess
#   punkt / punkt_tab -> word_tokenize in preprocess (recent NLTK releases
#                        look for punkt_tab instead of punkt)
# NOTE(review): add 'wordnet' / 'averaged_perceptron_tagger' here if the
# imported WordNetLemmatizer or pos_tag are ever put to use.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
  nltk.download(resource)

# English-only EasyOCR reader; doOCR uses this module-level instance.
reader = easyocr.Reader(['en'])

## Getting Data From Video

### Read Text in Frame

In [None]:
def doOCR(img):
  """Run the module-level EasyOCR reader on an image.

  Args:
    img: the image handed straight to reader.readtext.

  Returns:
    A tuple (words, output). output is the raw list returned by
    reader.readtext; words holds element [1] of every detection in it.
  """
  detections = reader.readtext(img)
  texts = [detection[1] for detection in detections]
  return texts, detections

### Make Bounding Box Around The Text

In [None]:
def boundText(output, img):
  """Draw a black rectangle around every OCR detection.

  Args:
    output: detections as returned by doOCR; element [0] of each detection
      is a sequence of (x, y) corner points.
    img: image to draw on. It is modified in place.

  Returns:
    The same img object with one axis-aligned rectangle per detection.
  """
  # Kept from the original: open a fresh 15x12 figure so that the plt.imshow
  # call that follows in the calling code draws into its own figure.
  plt.figure(figsize = (15, 12))

  for detection in output:
    xs, ys = zip(*detection[0])
    # Cast to int: cv2.rectangle rejects float corner coordinates.
    top_left = (int(min(xs)), int(min(ys)))
    bottom_right = (int(max(xs)), int(max(ys)))
    cv2.rectangle(img, top_left, bottom_right, (0, 0, 0), 4)
  return img

## Detecting Sentiment

### Loading The Set of Positive and Negative Words In Their Root Format

In [7]:
def _read_stemmed_lexicon(path, stemmer):
  """Read a whitespace-separated word file and return the stemmed entries."""
  with open(path, 'r') as handle:
    tokens = handle.read().lower().split()
  # The last character of every token is dropped before stemming, exactly as
  # the original code did. Presumably the word files carry a trailing
  # delimiter on each entry - TODO confirm against the data files.
  return [stemmer.stem(token[:-1]) for token in tokens]


def load_data(pos, neg, info):
  """Load the sentiment word lists and the labelled training headlines.

  Args:
    pos: path to the file of positive words.
    neg: path to the file of negative words.
    info: path to the headline file. The last non-whitespace character of
      each line is taken as the label; everything before it is the headline.

  Returns:
    A tuple (positive, negative, data):
      positive, negative: lists of stemmed words from pos and neg.
      data: DataFrame with a 'headlines' column (stemmed text) and an int64
        'label' column. The positive words are appended as extra rows
        labelled 1 and the negative words as extra rows labelled 0.
  """
  stemmer = LancasterStemmer()
  positive = _read_stemmed_lexicon(pos, stemmer)
  negative = _read_stemmed_lexicon(neg, stemmer)

  train, label = [], []
  with open(info, 'r') as handle:
    for raw_line in handle:
      # Strip the line ending first so a final line without a newline is
      # parsed like every other line; skip blank lines instead of failing.
      line = raw_line.rstrip()
      if not line:
        continue
      label.append(line[-1])
      stems = [stemmer.stem(word) for word in line[:-1].lower().split()]
      # Each stem is followed by a single space, as in the original output.
      train.append(''.join(stem + ' ' for stem in stems))

  train.extend(positive); label.extend([1] * len(positive))
  train.extend(negative); label.extend([0] * len(negative))

  data = pd.DataFrame({'headlines': train, 'label': label})
  # Labels read from the file are characters; the appended ones are ints.
  data['label'] = data['label'].astype('int64')
  return positive, negative, data

### Tokenizing, Removing Punctuation & Stopwords and Stemming The Text

In [8]:
def preprocess(words):
  """Normalise raw text snippets for filtering and vectorising.

  Each snippet is lower-cased and tokenised with word_tokenize; English
  stop words and punctuation tokens are dropped; the remaining tokens are
  stemmed with LancasterStemmer and joined back with single spaces.

  Args:
    words: list of strings.

  Returns:
    A list of processed strings, one per input string, in the same order.
  """
  # A set gives constant-time membership tests inside the token loop.
  removable = set(stopwords.words('english')) | set(punctuation)
  stemmer = LancasterStemmer()

  final_headlines = []
  for text in words:
    tokens = word_tokenize(text.lower())
    kept = [stemmer.stem(token) for token in tokens if token not in removable]
    final_headlines.append(" ".join(kept))

  return final_headlines

### Keeping Only Relevant Text from Each Frame

In [9]:
def filterImportant(final_headlines, positive, negative):
  """Keep only the headlines that mention at least one sentiment word.

  Args:
    final_headlines: iterable of whitespace-separated strings.
    positive: collection of positive words.
    negative: collection of negative words.

  Returns:
    A list, in input order, of the headlines in which at least one whole
    token appears in positive or negative.
  """
  # Build the lookup once; the original scanned both lists for every token.
  vocabulary = set(positive) | set(negative)
  return [
    headline for headline in final_headlines
    if any(token in vocabulary for token in headline.split())
  ]

## Actual Code Running - Per Frame

In [None]:
image_path = '/content/Positive News.png'
img = cv2.imread(image_path)
if img is None:
  # cv2.imread reports an unreadable or missing file by returning None.
  raise FileNotFoundError('Could not read image: ' + image_path)

# doOCR returns the words or phrases in the image, plus the raw output which includes the coordinates of the text
words, output = doOCR(img)

# boundText uses the output of the OCR engine to return the image with a box drawn around each piece of text
img = boundText(output, img)
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

# Loading data to filter out relevant text from the image; it was also used for model building.
# Takes 3 inputs:
# - File with positive words about stock which GENERALLY indicate growing stock - used for filtering and prediction
# - File with negative words about stock which GENERALLY indicate declining stock - used for filtering and prediction
# - File with sample headlines to help train the model - only used for prediction
positive, negative, data = load_data('/content/Positive Words.txt', '/content/Negative Words.txt', # Filtering
                                     '/content/Headlines.txt') # Only to create model, not necessary while running

# Preprocess the words obtained from the frame - takes a LIST OF STRINGS as input
headlines = preprocess(words)

# Filtering out the irrelevant headlines which say nothing about the stock
headlines = filterImportant(headlines, positive, negative)
print('Important Headlines', headlines)

# Loading the TFIDF vectorizer and the sentiment analyzer that were saved at training time.
# NOTE: unpickling can execute arbitrary code - only load model files from a trusted source.
with open('transformer.pkl', 'rb') as model_file:
  vec = pk.load(model_file)
with open('analyzer.pkl', 'rb') as model_file:
  sent = pk.load(model_file)

# Detecting the sentiment of the final headlines; with no relevant headline there is nothing to score
if headlines:
  headlines = vec.transform(headlines)
  print('Prediction:', sent.predict(headlines))
else:
  print('Prediction: no relevant headlines found')

## Working With Video - Main Functions

### Sentiment for Every Frame Captured

In [None]:
def newsSentiment(img):
  """Annotate a frame with its text boxes and an overall sentiment marker.

  The frame is OCR'd, every detection is boxed, and the recognised text is
  preprocessed and filtered with the module-level positive / negative word
  lists. The surviving headlines are scored with the module-level vec and
  sent models, and the median prediction decides the marker drawn in the
  top-left corner: a white square for a truthy median, a black square
  otherwise.

  Args:
    img: frame to analyse. It is drawn on in place.

  Returns:
    The annotated frame. When no relevant headline is found the frame is
    returned with the text boxes only and no sentiment square.
  """
  words, output = doOCR(img)
  img = boundText(output, img)

  headlines = preprocess(words)
  headlines = filterImportant(headlines, positive, negative)
  print(headlines)

  if not headlines:
    # Nothing to score: statistics.median raises StatisticsError on empty
    # data, which would abort the whole video loop on a text-free frame.
    return img

  pred = statistics.median(sent.predict(vec.transform(headlines)))
  # Any non-zero median counts as positive. Presumably predictions are the
  # 0/1 labels built in load_data, in which case an even split (median 0.5)
  # is drawn white as well - TODO confirm this tie-break is intended.
  colour = (255, 255, 255) if pred else (0, 0, 0)
  cv2.rectangle(img, (0, 0), (30, 30), colour, -1)

  return img

### Saving SIGNIFICANTLY different Frames from the video

In [1]:
def save_frames(video, folder, threshold=0.8):
  """Save the frames of a video that differ markedly from the last saved one.

  The first frame is always written as frame0.jpg. Every later frame is
  compared (in grayscale, using compare_ssim) with the most recently saved
  frame, and is written as frame<i>.jpg when the similarity score is below
  threshold.

  Args:
    video: path or source accepted by cv2.VideoCapture.
    folder: output directory; created if it does not exist.
    threshold: similarity score below which a frame counts as different.
      Raise it to capture smaller changes between frames.

  Returns:
    None. Prints "Error Reading Video" when the video cannot be opened or
    yields no frame.
  """
  cap = cv2.VideoCapture(video)

  if not (cap.isOpened()):
    print("Error Reading Video")
    return

  try:
    ok, reference = cap.read()
    if not ok:
      # An opened capture can still fail to deliver a first frame.
      print("Error Reading Video")
      return

    # cv2.imwrite does not create missing directories.
    os.makedirs(folder, exist_ok=True)
    cv2.imwrite(folder + '/frame0' + '.jpg', reference)

    # Grayscale of the last saved frame, refreshed only when a frame is saved.
    reference_gray = cv2.cvtColor(reference, cv2.COLOR_BGR2GRAY)
    i = 1

    while True:
      # Capture frame-by-frame
      ret, frame = cap.read()
      if not ret: # The video is over
        break

      frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
      # Only the score is needed, so the full difference image is not requested.
      score = compare_ssim(reference_gray, frame_gray)

      if score < threshold:
        cv2.imwrite(folder + '/frame' + str(i) + '.jpg', frame)
        i += 1
        reference_gray = frame_gray

      if cv2.waitKey(1) & 0xFF == ord('q'):
        break
  finally:
    # Release the capture even if reading or writing raised.
    cap.release()
    cv2.destroyAllWindows()

### Calling All Functions - MAIN FUNCTION

In [10]:
# Load the vectorizer and the sentiment model saved at training time.
# NOTE: unpickling can execute arbitrary code - only load model files from a trusted source.
with open('transformer.pkl', 'rb') as model_file:
  vec = pk.load(model_file)
with open('analyzer.pkl', 'rb') as model_file:
  sent = pk.load(model_file)

positive, negative, data = load_data('/content/Positive Words.txt', '/content/Negative Words.txt',
                                     '/content/Headlines.txt')

folder = '/content/frames'
video = '/content/rest.mov'
save_frames(video, folder)

print('Captured Video')
# os.listdir returns names in arbitrary order. Sorting by (length, name)
# restores capture order for the frame<N>.jpg files (frame2 before frame10).
for filename in sorted(os.listdir(folder), key=lambda name: (len(name), name)):
  img = cv2.imread(os.path.join(folder, filename))
  if img is None:
    # cv2.imread returns None for files it cannot decode; skip them.
    continue

  # newsSentiment returns the frame with all the text highlighted and a
  # little square on the top left of the screen:
  #   Black Square - Stock price will go down
  #   White Square - Stock price will go up
  img = newsSentiment(img)

  # plt.imshow is used here because cv2.imshow doesn't work on Colab. On a
  # local system, cv2.imshow('News', img) can be used instead to see the
  # frames one after the other like a video; cv2.imshow expects the BGR
  # image, so no colour conversion is needed there.
  plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))