In [2]:
!pip install bnlp_toolkit
!pip install fasttext==0.9.2
!pip install schedule
!pip install transformers

Collecting fasttext==0.9.2
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext==0.9.2)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199773 sha256=e010ca65bc3fef39cf2a6bcfcc75a3d62a28c1afbc76b6593d3b11e67eacc5fc
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.11.1


## Requirements:
##### The following two cells import the libraries and prepare the model and the other files/folders required to run the sentiment prediction program.

##### Please refer to the comments to adjust the variables and to comment out unnecessary lines, depending on the system used to run this code.

In [3]:
import json
from transformers import AutoModel, AutoTokenizer
import torch
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from sklearn.svm import SVC
from pickle import dump, load
from bnlp import NLTKTokenizer
from bnlp.embedding.fasttext import BengaliFasttext
from tqdm import tqdm
import warnings
import datetime
import pytz
import os
import os.path
import re
import threading
import schedule
import time
from zipfile import ZipFile
from bnlp import BengaliCorpus as corpus
from bnlp import CleanText
warnings.filterwarnings("ignore")

# please comment out the following three lines if not running this file on colab
from google.colab import files
# from google.colab import drive
# drive.mount('/content/drive')



class MakeModel(nn.Module):
  """Encoder followed by a single linear classification head.

  Args:
    model: encoder module. It is called with ``input_ids`` and
      ``attention_mask`` keyword arguments and must return an object that
      exposes ``last_hidden_state``.
    k: number of output classes.
  """

  def __init__(self,model,k):
    super().__init__()
    self.bert_model=model
    # The head expects 768 input features, so the encoder's hidden size must be 768.
    self.lin_layer=nn.Linear(768,k)

  def forward(self,input_ids,attention_mask):
    """Return per-class probabilities for every sequence in the batch.

    Only the hidden state at sequence position 0 is fed to the linear head
    (presumably the [CLS] token of a BERT tokenizer; confirm for other encoders).
    """
    out_vect=self.bert_model(input_ids=input_ids,attention_mask=attention_mask)
    lin_op=self.lin_layer(out_vect.last_hidden_state[:,0,:])
    # Name the class axis explicitly: calling softmax without `dim` relies on a
    # deprecated implicit choice and emits a warning.
    return F.softmax(lin_op,dim=-1)


# Tokenizer used by predict() to split an article into sentences.
bnltk = NLTKTokenizer()
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError as soon as the folder exists from a previous run.
os.makedirs('results', exist_ok=True)
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE=torch.device("cuda" if torch.cuda.is_available() else "cpu")

punkt not found. downloading...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
# either of the following two finetuned models can be used or the third SVC model can be used

# model 1
# model_name = 'csebuetnlp/banglabert'
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# PATH = '/content/drive/MyDrive/Bangla_SA/bsenti_model3.pt'       # please adjust this path based on model's storage location
# model = torch.load(PATH)
# model.to(DEVICE)

# model 2
# model_name = 'sagorsarker/bangla-bert-base'
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# PATH = '/content/drive/MyDrive/Bangla_SA/bsenti_model.pt'       # please adjust this path based on model's storage location
# model = torch.load(PATH)
# model.to(DEVICE)

# model 3
# tokenizer/model are only needed by the transformer path (get_sentiment); they
# stay None while the pickled classifier below is the active model.
tokenizer, model = None, None
bft = BengaliFasttext()
# Keep model 1 and model 2 above commented out when using this model, and adjust
# the path to wherever svc_model.pkl is stored.
# NOTE(security): unpickling can execute arbitrary code; only load a file you trust.
# The with-block guarantees the handle is closed even if unpickling fails.
with open('/content/svc_model.pkl','rb') as f2:
  clf = load(f2)


# please modify the two paths below according to your system
dir_path_in = '/content'           # input: directory in which the json files of the news articles are present
dir_path_out = '/content/results'           # output: directory in which the generated sentiment analysis json files are stored
# Make sure the output directory exists before it is listed or written to;
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs(dir_path_out, exist_ok=True)


# the following statements extract a sample of 100 news-article json files and add them to the input directory
# comment them out if you don't want to extract files from news_sample.zip or don't have any such zipfile
articles_fl = "news_sample.zip"    # you may also modify this variable if you have another zipfile
with ZipFile(articles_fl, 'r') as zObject:
  zObject.extractall(path=dir_path_in)


# Snapshot of both directories at setup time.
files_list_in = os.listdir(dir_path_in)
files_list_out = os.listdir(dir_path_out)

# Work on a copy: extending corpus.stopwords directly would mutate the library's
# own list, so every re-run of this cell would append the extra words again.
stopwords = list(corpus.stopwords)
punct = list(corpus.punctuations)
# Every punctuation mark except the last two is stripped from the token stream.
punct2 = punct[:-2]
# Extra stopwords are read from the 'words' column of the spreadsheet.
bsw_df = pd.read_excel('stopwords_bangla.xlsx')
# Blank cells are read as NaN; dropping them keeps sorted() below from
# comparing floats with strings.
sw2 = list(bsw_df['words'].dropna())
stopwords.extend(sw2)
# De-duplicate and sort the merged list.
stopwords = sorted(set(stopwords))
# Text normaliser applied to every article before tokenisation. URLs, e-mail
# addresses, numbers and digits are replaced by a space; punctuation is kept
# here and filtered later via punct2.
clean_text = CleanText(
   fix_unicode=True,
   unicode_norm=True,
   unicode_norm_form="NFKC",
   remove_url=True,
   remove_email=True,
   remove_emoji=True,
   remove_number=True,
   remove_digits=True,
   remove_punct=False,
   replace_with_url=" ",
   replace_with_email=" ",
   replace_with_number=" ",
   replace_with_digit=" ")




### Code for output generation
##### Please choose the required get_sentiment method inside predict() and comment out the other one in both places where it is invoked.

In [10]:
def get_sentiment(sent):
  """Score one piece of text with the fine-tuned transformer model.

  Relies on the module-level ``tokenizer``, ``model`` and ``DEVICE``.

  Args:
    sent: text to classify.

  Returns:
    dict mapping 'Negative', 'Positive' and 'Neutral' to the model output for
    the first item of the batch, rounded to three decimals.
  """
  tokenized_path = tokenizer(sent,truncation=True,padding=True,max_length=500,return_tensors='pt').to(DEVICE)
  input_ids,attn_mask = tokenized_path['input_ids'],tokenized_path['attention_mask']
  # Inference only: skip autograd bookkeeping so no graph is built or retained.
  with torch.no_grad():
    out_vals=model(input_ids,attn_mask)
  scores=out_vals.detach().cpu().tolist()[0]
  # Output index -> label name (assumed to match the label encoding used in training; TODO confirm).
  label_dict = {0:'Negative', 1:'Positive', 2:'Neutral'}
  return {label_dict[i]: round(score,3) for i, score in enumerate(scores)}


def get_sentiment_svm(sent):
  """Score one piece of text with the pickled classifier.

  The text is embedded with ``bft.get_word_vector`` and passed to
  ``clf.predict_proba`` as a single-row batch.

  Args:
    sent: text to classify.

  Returns:
    dict with keys 'Negative', 'Positive' and 'Neutral' holding the first three
    probabilities of that row, rounded to three decimals.
  """
  vector = bft.get_word_vector(sent)
  first_row = clf.predict_proba([vector])[0]
  rounded = [round(p,3) for p in list(first_row)]
  # Assumes the classifier's probability columns come in this label order (TODO confirm against clf.classes_).
  labels = ('Negative', 'Positive', 'Neutral')
  return {name: rounded[idx] for idx, name in enumerate(labels)}


def predict(text,cat):               # please comment out either of the get_sentiment call based on the model you are using
  """Build the sentiment record for one article.

  Each sentence is scored and the scores are averaged. That average is then
  averaged once more with the score of the whole text. When the tokenizer
  returns no sentences, the whole-text score is used on its own (the
  unguarded merge would otherwise raise KeyError on the empty result).

  Args:
    text: pre-processed article text.
    cat: category label, copied into the result unchanged.

  Returns:
    dict with the per-label scores plus 'Sentiment' (highest-scoring label),
    'Category', 'Date' and 'Time of generation' (Asia/Kolkata time zone).
  """
  sents = bnltk.sentence_tokenize(text)
  res_dict = dict()
  for st in sents:
    # curr_dict2 = get_sentiment(st)
    curr_dict2 = get_sentiment_svm(st)
    for k in curr_dict2.keys():
      res_dict[k] = res_dict.get(k, 0) + curr_dict2[k]
  for k in res_dict.keys():
    res_dict[k] = round((res_dict[k]/len(sents)),3)
  # curr_dict2 = get_sentiment(text)
  curr_dict2 = get_sentiment_svm(text)
  if res_dict:
    # Blend the sentence-level average with the whole-text score.
    for k in curr_dict2.keys():
      res_dict[k] = res_dict[k] + curr_dict2[k]
    for k in res_dict.keys():
      res_dict[k] = round((res_dict[k]/2),3)
  else:
    # No sentences were found: fall back to the whole-text score alone.
    res_dict = dict(curr_dict2)
  # Pick the label with the highest score; the first one wins on ties.
  senti, val = None, -1
  for k in res_dict.keys():
    if res_dict[k] > val:
      val = res_dict[k]
      senti = k
  res_dict['Sentiment'] = senti
  res_dict['Category'] = cat
  current_time = datetime.datetime.now(pytz.timezone('Asia/Kolkata'))
  # Date and time components are written without zero padding.
  date_str = str(current_time.day) + '/' + str(current_time.month) + '/' + str(current_time.year)
  # Named time_str so the local does not shadow the imported `time` module.
  time_str = str(current_time.hour) + ':' + str(current_time.minute) + ':' + str(current_time.second)
  res_dict['Date'] = date_str
  res_dict['Time of generation'] = time_str
  return res_dict


def add_to_result(fl_name,content):
  """Predict the sentiment of one article and store it as JSON.

  The title and body are joined, cleaned with ``clean_text``, split on
  whitespace and stripped of stopwords and of the punctuation in ``punct2``.
  The remaining text is passed to ``predict`` together with the article label.

  Args:
    fl_name: name of the output file, created inside ``dir_path_out``.
    content: article dict; the keys 'title', 'body' and 'label' are read.
  """
  article = content['title'] + ' । ' + content['body']
  cleaned = clean_text(article)
  kept = [tok for tok in cleaned.split()
          if tok not in stopwords and tok not in punct2]
  prepared = ' '.join(kept)
  category = content['label']
  result = predict(prepared,category)
  target = '/'.join((dir_path_out, fl_name))
  with open(target, "w") as output:
    json.dump(result, output, indent=2)
  print(f'\nPredicted the sentiment of {fl_name} and added to the output directory !!! ')


def download_result():       # call this function if running this code on colab and need to download all output files zipping together
  """Zip everything under ``dir_path_out`` and hand the archive to Colab for download.

  The archive is written to ``outputs.zip`` in the current working directory.
  That same resolved path is passed to ``files.download``, so the download
  no longer depends on the working directory being ``/content``.
  """
  zip_path = os.path.abspath("outputs.zip")
  # The with-block closes (and finalises) the archive even if a write fails.
  with ZipFile(zip_path, "w") as zf:
    for dirname, _subdirs, fles in os.walk(dir_path_out):
      zf.write(dirname)
      for filename in fles:
        zf.write(os.path.join(dirname, filename))
  files.download(zip_path)


def run_prediction(batch_size=5):
  """Predict sentiment for every input article that has no output file yet.

  Files in ``dir_path_in`` whose name ends in '.json' and that are not already
  present (by name) in ``dir_path_out`` are processed in batches. Each batch
  runs one thread per article and is joined before the next batch starts.

  Args:
    batch_size: maximum number of articles processed concurrently (default 5).

  Raises:
    ValueError: if ``batch_size`` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError('batch_size must be at least 1')
  # Suffix test instead of a regex search: names such as 'x.json.bak' merely
  # contain '.json' and must not be picked up.
  files_list_in = [fl for fl in os.listdir(dir_path_in) if fl.endswith('.json')]
  files_list_out = os.listdir(dir_path_out)
  # Sorted so the processing order is the same on every run.
  new_files = sorted(set(files_list_in) - set(files_list_out))
  for start in range(0, len(new_files), batch_size):
    threadpool = []
    for fl in new_files[start:start + batch_size]:
      fl_path = os.path.join(dir_path_in, fl)
      with open(fl_path, encoding='utf-8') as f:
        content = json.load(f)
      threadpool.append(threading.Thread(target=add_to_result, args=(fl,content,)))
    for t in threadpool:
      t.start()
    # Wait for the whole batch before starting the next one.
    for t in threadpool:
      t.join()


if __name__=="__main__":
  count = 2  # modify the minutes count as per the requirement of how frequently the prediction function should be executed
  schedule.every(count).minutes.do(run_prediction)
  #schedule.every().hour.do(run_prediction)
  # Runs until the cell is interrupted. The short sleep stops the loop from
  # spinning a CPU core at 100% while it waits for the next scheduled run.
  while True:
    schedule.run_pending()
    time.sleep(1)


Predicted the sentiment of a27.json and added to the output directory !!! 

Predicted the sentiment of a75.json and added to the output directory !!! 

Predicted the sentiment of a63.json and added to the output directory !!! 

Predicted the sentiment of a91.json and added to the output directory !!! 

Predicted the sentiment of a50.json and added to the output directory !!! 

Predicted the sentiment of a47.json and added to the output directory !!! 

Predicted the sentiment of a46.json and added to the output directory !!! 

Predicted the sentiment of a16.json and added to the output directory !!! 

Predicted the sentiment of a21.json and added to the output directory !!! 

Predicted the sentiment of a38.json and added to the output directory !!! 

Predicted the sentiment of a35.json and added to the output directory !!! 

Predicted the sentiment of a100.json and added to the output directory !!! 

Predicted the sentiment of a76.json and added to the output directory !!! 

Predicted 

KeyboardInterrupt: ignored

In [12]:
# Count the entries currently present in the output directory.
files_list_out = os.listdir(dir_path_out)
print(len(files_list_out))

103


In [None]:
# Run this cell to download a zip of the predicted files. It only works on
# Google Colab, because download_result() calls google.colab's files.download.
download_result()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>