In [None]:
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
import torch
import numpy as np
import pandas as pd
import re
from torch.utils.data import Dataset, DataLoader
from torch import nn
import nltk
from os import listdir
from os.path import isfile, join
import xml.etree.cElementTree as et
from bs4 import BeautifulSoup
import nltk
import nltk.data
from os import listdir
import pysent3 as ps
from os.path import isfile, join
import eng_spacysentiment
# Load the eng_spacysentiment model object once for the whole notebook;
# processing_file calls it on each text and reads its 'positive' category score.
nlp_spacy = eng_spacysentiment.load()

# nltk_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

class TextDataset(Dataset):
  """Dataset that tokenizes one text per item and pairs it with its target.

  Args:
    text: indexable collection of texts (each item is converted with str()).
    targets: indexable collection of targets, parallel to `text`.
    tokenizer: object exposing `encode_plus(text, **kwargs)`.
    max_len: length every encoding is padded / truncated to.
  """

  def __init__(self, text, targets, tokenizer, max_len):
    self.text, self.targets = text, targets
    self.tokenizer, self.max_len = tokenizer, max_len

  def __len__(self):
    return len(self.text)

  def __getitem__(self, item):
    """Return a dict with the raw text, flattened encodings and a long target tensor."""
    raw_text = str(self.text[item])
    label = self.targets[item]
    # Fixed-length encoding: pad to max_len and truncate anything longer.
    encode_options = dict(
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt',
      truncation=True,
    )
    encoded = self.tokenizer.encode_plus(raw_text, **encode_options)
    sample = {'text_text': raw_text}
    # flatten() turns each returned tensor into a 1-D tensor for this single item.
    for field in ('input_ids', 'attention_mask'):
      sample[field] = encoded[field].flatten()
    sample['targets'] = torch.tensor(label, dtype=torch.long)
    return sample

class SentimentClassifier(nn.Module):
  """BERT encoder followed by dropout and a linear layer with `n_classes` outputs.

  NOTE(review): a model is restored with torch.load elsewhere in this notebook;
  if it was pickled from this class, the attribute names `bert`, `drop` and
  `out` must stay unchanged - confirm before renaming anything here.
  """

  def __init__(self, n_classes):
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    # The linear head is sized from the encoder's configured hidden size.
    hidden_size = self.bert.config.hidden_size
    self.out = nn.Linear(hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the linear-layer output computed from the encoder's second return value."""
    # return_dict=False makes the encoder return a tuple; exactly two items are expected.
    encoder_outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
      return_dict=False,
    )
    _sequence_output, pooled_output = encoder_outputs
    logits = self.out(self.drop(pooled_output))
    return logits


def create_data_loader(df, tokenizer, max_len, batch_size, col):
  """Wrap one text column of `df` in a TextDataset and return a DataLoader over it.

  Args:
    df: DataFrame holding the text column `col` and a `sentiment_value` column.
    tokenizer: tokenizer handed through to TextDataset.
    max_len: encoding length handed through to TextDataset.
    batch_size: number of items per batch.
    col: name of the text column to read.

  Returns:
    A DataLoader using 4 worker processes; no `shuffle` argument is passed.
  """
  dataset = TextDataset(
    df[col].to_numpy(),
    df.sentiment_value.to_numpy(),
    tokenizer,
    max_len,
  )
  loader = DataLoader(dataset, batch_size=batch_size, num_workers=4)
  return loader


def get_predictions(model, data_loader):
  """Run `model` in eval mode over every batch of `data_loader` and collect the results.

  Args:
    model: torch module called as `model(input_ids=..., attention_mask=...)` and
      returning one row of scores per input row.
    data_loader: iterable of batches; each batch is a mapping with the keys
      "text_text", "input_ids", "attention_mask" and "targets".

  Returns:
    Tuple `(texts, predictions, prediction_probs, real_values)`:
      texts: list of the "text_text" entries, in iteration order.
      predictions: 1-D CPU tensor with the index of the highest score per row.
      prediction_probs: CPU tensor of the raw model outputs (no softmax is applied here).
      real_values: 1-D CPU tensor of the "targets" entries.
    For an empty `data_loader` the list and all three tensors are empty.
  """
  model = model.eval()

  # Send inputs to wherever the model's weights live instead of relying on a
  # notebook-level `device` variable; a parameter-less model falls back to CPU.
  try:
    model_device = next(model.parameters()).device
  except StopIteration:
    model_device = torch.device("cpu")

  review_texts = []
  pred_batches = []
  output_batches = []
  target_batches = []

  with torch.no_grad():
    for batch in data_loader:
      outputs = model(
        input_ids=batch["input_ids"].to(model_device),
        attention_mask=batch["attention_mask"].to(model_device)
      )
      _, preds = torch.max(outputs, dim=1)
      review_texts.extend(batch["text_text"])
      # Keep whole batches (moved to CPU right away) rather than per-row tensors.
      pred_batches.append(preds.cpu())
      output_batches.append(outputs.cpu())
      target_batches.append(batch["targets"].cpu())

  # torch.cat / torch.stack reject an empty list, so handle "no batches" explicitly.
  if not pred_batches:
    return (
      review_texts,
      torch.empty(0, dtype=torch.long),
      torch.empty(0),
      torch.empty(0, dtype=torch.long),
    )

  predictions = torch.cat(pred_batches)
  prediction_probs = torch.cat(output_batches)
  real_values = torch.cat(target_batches)
  return review_texts, predictions, prediction_probs, real_values

# Model identifiers. PRE_TRAINED_MODEL_NAME is read by SentimentClassifier.__init__.
PRE_TRAINED_MODEL_NAME = 'bert-large-uncased'
FINBERT_MODEL_NAME = 'finbert'

# Encoding length and batch size that processing_file passes to create_data_loader.
MAX_LEN = 85
BATCH_SIZE = 32

# Presumably training-time settings; none of them is referenced in the cells shown here.
RANDOM_SEED = 0
EPOCHS = 8
LEARNING_RATE = 1e-5
TEST_DATA_PERCENT = 0.30

def get_finbert_sentiment(sentence):
    """Score one sentence with the notebook-level FinBERT model.

    Relies on the globals `tokenizer_finbert` and `finbert_model`, which must be
    loaded before this function is called.

    Args:
        sentence: text handed to `tokenizer_finbert`.

    Returns:
        1-D numpy array: softmax (over dim 1) of the model's first output, for the
        single input row. Per the original author's note the label order is
        {0: 'neutral', 1: 'positive', 2: 'negative'}.
    """
    inputs = tokenizer_finbert(sentence, return_tensors="pt", truncation=True, padding=True)
    # Inference only: no_grad avoids building an autograd graph for every sentence.
    with torch.no_grad():
        logits = finbert_model(**inputs)[0]
        probabilities = torch.softmax(logits, dim=1)
    return probabilities.numpy()[0]


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import glob, os

# Switch the kernel's working directory to the input folder so the pattern can stay relative.
os.chdir("/home/ec2-user/SageMaker/Getting Started/2022.05.25/ahkz_sentiment_classifier/ahkz_sentiment_classifier/multifiles/part2_for_chen")
# File names (relative to that folder) of every CSV found there; later passed to processing_file.
csv_ls = list(glob.glob("*.csv"))

In [None]:
# Root folder of the project; the model checkpoint below is read from ROOT_PATH + 'model/...'.
ROOT_PATH = '/home/ec2-user/SageMaker/Getting Started/2022.05.25/ahkz_sentiment_classifier/ahkz_sentiment_classifier/'
# Use the first CUDA GPU when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Tokenizer read from a local directory; processing_file passes it to create_data_loader.
tokenizer = BertTokenizer.from_pretrained('/home/ec2-user/SageMaker/bert-large-uncased/')
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
# Presumably a whole pickled SentimentClassifier (get_predictions calls it like one) - confirm.
sent_model = torch.load(ROOT_PATH + 'model/3_class_fed_testimony_bert_large_uncased_best_model.pt', map_location = device)  

In [None]:
# FinBERT tokenizer and 3-label sequence classifier, both read from a local directory.
# get_finbert_sentiment uses these two globals; neither is moved to `device` in the cells shown.
tokenizer_finbert = BertTokenizer.from_pretrained('/home/ec2-user/SageMaker/finbert/')
finbert_model = BertForSequenceClassification.from_pretrained('/home/ec2-user/SageMaker/finbert/', num_labels=3)

In [5]:
def processing_file(files, source_folder=None, target_folder=None, column_positions=(4, 12)):
    """Score the text columns of each CSV with four sentiment tools and save the enriched CSV.

    For every selected text column `col` the following columns are appended, in this order:
      finbert_neutral_<col>, finbert_positive_<col>, finbert_negative_<col>  (get_finbert_sentiment)
      ahkz_score_<col>      (class predicted by `sent_model`; class 2 is stored as -1)
      pysent3_Positive_<col>, pysent3_Negative_<col>, pysent3_Polarity_<col>,
      pysent3_Subjectivity_<col>  (pysent3 HIV4 scores)
      spacy_senti_<col>     ('positive' category of `nlp_spacy`)

    Missing cells are treated as empty strings: the FinBERT columns stay empty for
    them, the other scorers are run on ''.

    Relies on notebook globals: get_finbert_sentiment, create_data_loader,
    get_predictions, tokenizer, sent_model, MAX_LEN, BATCH_SIZE, ps, nlp_spacy.

    Args:
        files: iterable of CSV file names, relative to `source_folder`.
        source_folder: input folder prefix (must end with '/'); defaults to the
            original hard-coded `part2_for_chen/` folder.
        target_folder: output folder prefix (must end with '/'); defaults to the
            original hard-coded `part2_for_chen_result/` folder.
        column_positions: zero-based positions of the text columns to score.

    Returns:
        None. A file that fails is reported on stdout (path plus error) and
        skipped; the remaining files are still processed.
    """
    multifiles_root = "/home/ec2-user/SageMaker/Getting Started/2022.05.25/ahkz_sentiment_classifier/ahkz_sentiment_classifier/multifiles/"
    if source_folder is None:
        source_folder = multifiles_root + "part2_for_chen/"
    if target_folder is None:
        target_folder = multifiles_root + "part2_for_chen_result/"

    # The HIV4 scorer does not depend on the file or column: build it once, on first use.
    hiv4 = None

    for file in files:
        source_path = source_folder + file
        try:
            df = pd.read_csv(source_path)
            text_columns = [df.columns[i] for i in column_positions]
            if hiv4 is None:
                hiv4 = ps.HIV4()

            for col in text_columns:
                # read_csv returns NaN (not '') for empty cells; normalise so the
                # blank check below works and every scorer receives a str.
                texts = df[col].fillna('').astype(str)

                # FinBERT
                finbert_columns = ['finbert_neutral_'+col, 'finbert_positive_'+col, 'finbert_negative_'+col]
                finbert_scores = [get_finbert_sentiment(x) if x != '' else [np.nan] * 3 for x in texts]
                df[finbert_columns] = pd.DataFrame(finbert_scores, index=df.index, columns=finbert_columns)

                #ahkz
                # The loader needs a `sentiment_value` column; supply a dummy one in a
                # scratch frame so a same-named column in the input CSV is left untouched.
                model_input = pd.DataFrame({col: texts, 'sentiment_value': 0})
                testimony_data_loader = create_data_loader(model_input, tokenizer, MAX_LEN, BATCH_SIZE, col)
                _, predictions, _, _ = get_predictions(sent_model, testimony_data_loader)
                # Assign the remapped result instead of a chained in-place replace, which
                # does not reliably write back to the frame under pandas copy-on-write.
                df['ahkz_score_'+col] = pd.Series(predictions.numpy(), index=df.index).replace({2: -1})

                #pysent3
                pysent3_scores = [hiv4.get_score(hiv4.tokenize(x)) for x in texts]
                pysent3_df = pd.DataFrame(pysent3_scores, index=df.index).rename(columns={
                    "Positive": "pysent3_Positive_"+col,
                    "Negative": "pysent3_Negative_"+col,
                    "Polarity": "pysent3_Polarity_"+col,
                    "Subjectivity": "pysent3_Subjectivity_"+col,
                })
                df = pd.concat([df, pysent3_df], axis=1)

                #spacy
                df['spacy_senti_'+col] = [nlp_spacy(x).cats['positive'] for x in texts]

            df.to_csv(target_folder + file, index=False)
        except Exception as exc:
            # Best effort per file: report what failed and why, then continue.
            print(f"{source_path}: {type(exc).__name__}: {exc}")

In [6]:
# Score every CSV collected in csv_ls; processing_file writes one result CSV per input file.
processing_file(csv_ls)

In [14]:
# Show the kernel's current working directory (the relative paths in the cells below depend on it).
pwd

'/home/ec2-user/SageMaker/Getting Started/2022.05.25/ahkz_sentiment_classifier/ahkz_sentiment_classifier/multifiles/source'

In [15]:
# Archive into ../breakingnews.zip, recursing from the current directory and keeping only entries matching ../target/*.
# NOTE(review): the -i pattern is applied to entries found under `.`; confirm it matches anything - zipping ../target directly may have been the intent.
!zip -r ../breakingnews.zip . -i ../target/*



In [9]:
# Load one result file for a spot check. The path is relative to the kernel's working directory.
# NOTE(review): processing_file writes to part2_for_chen_result/, not ../target/ - confirm this file is from the current run.
df = pd.read_csv('../target/20120718CNBC_breakingnews_matches.csv')

In [10]:
# Display the first rows of the loaded result file, including the appended score columns.
df.head()

Unnamed: 0,Timestamp 1,date,9am est stamp,fed chair,text,standardized,"policy ind (1=testimony related, 0=not policy related, 99=other",topic indicator,type of news indicator,first timestamp of group,...,spacy_senti_text,finbert_neutral_text.1,finbert_positive_text.1,finbert_negative_text.1,ahkz_score_text.1,pysent3_Positive_text.1,pysent3_Negative_text.1,pysent3_Polarity_text.1,pysent3_Subjectivity_text.1,spacy_senti_text.1
0,130000_05660000,20120718,130000,Bernanke,bernanke: agree that fed needs to be transpar...,bernankeagreethatfedneedstobetransparentandacc...,1,,4,10:34:20,...,6.014177e-06,0.186004,0.014144,0.799853,0,1.0,2.0,-0.333333,0.333333,6.014177e-06
1,130000_05680000,20120718,130000,Bernanke,bernanke: would argue that the fed is already...,bernankewouldarguethatthefedisalreadyquitetran...,1,,4,10:34:40,...,5.653985e-08,0.117009,0.102258,0.780733,0,0.0,3.0,-1.0,0.333333,5.653985e-08
2,130000_05690000,20120718,130000,Bernanke,"bernanke: gao has extensive, broad authority t...",bernankegaohasextensivebroadauthoritytoauditth...,1,,4,10:34:50,...,0.7758295,0.101995,0.011845,0.88616,0,1.0,1.0,0.0,0.222222,0.7758295
3,130000_05800000,20120718,130000,Bernanke,bernanke: it's a mistake to subject monetary p...,bernankeitsamistaketosubjectmonetarypolicydeli...,1,,4,10:36:40,...,0.07477073,0.022089,0.638846,0.339065,0,0.0,2.0,-1.0,0.181818,0.07477073
4,130000_05860000,20120718,130000,Bernanke,bernanke: would be concerning if monetary poli...,bernankewouldbeconcerningifmonetarypolicydelib...,1,,4,10:37:40,...,0.7195666,0.024614,0.092775,0.882611,0,0.0,2.0,-1.0,0.181818,0.7195666
