In [1]:
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
import torch
import numpy as np
import pandas as pd
import re
from torch.utils.data import Dataset, DataLoader
from torch import nn
import nltk
from os import listdir
from os.path import isfile, join
import xml.etree.cElementTree as et
from bs4 import BeautifulSoup
import nltk
import nltk.data
from os import listdir
import pysent3 as ps
from os.path import isfile, join
import eng_spacysentiment
nlp_spacy = eng_spacysentiment.load()

import textstat
import pandas as pd
import glob, os

def sentence_process(text):
    try:
        return [textstat.flesch_reading_ease(text),
                textstat.flesch_kincaid_grade(text),
#                 textstat.smog_index(text),
                textstat.automated_readability_index(text),
                textstat.dale_chall_readability_score(text),
                textstat.gunning_fog(text),
                textstat.polysyllabcount(text)]
    except:
        print(text)
        return ['','','','','','']

# nltk_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

class TextDataset(Dataset):
  def __init__(self, text, targets, tokenizer, max_len):
    self.text = text
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len
  def __len__(self):
    return len(self.text)
  def __getitem__(self, item):
    text = str(self.text[item])
    target = self.targets[item]
    encoding = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding = 'max_length', #pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
      truncation=True
    )
    return {
      'text_text': text,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

class SentimentClassifier(nn.Module):
  def __init__(self, n_classes):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
  def forward(self, input_ids, attention_mask):
    _, pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask, 
      return_dict=False
    )
    output = self.drop(pooled_output)
    return self.out(output)


def create_data_loader(df, tokenizer, max_len, batch_size, col):
  ds = TextDataset(
    text=df[col].to_numpy(),
    targets=df.sentiment_value.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )
  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=4
  )


def get_predictions(model, data_loader):
  model = model.eval()
  review_texts = []
  predictions = []
  prediction_probs = []
  real_values = []
  with torch.no_grad():
    for d in data_loader:
      texts = d["text_text"]
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)
      review_texts.extend(texts)
      predictions.extend(preds)
      prediction_probs.extend(outputs)
      real_values.extend(targets)
  predictions = torch.stack(predictions).cpu()
  prediction_probs = torch.stack(prediction_probs).cpu()
  real_values = torch.stack(real_values).cpu()
  return review_texts, predictions, prediction_probs, real_values

RANDOM_SEED = 0 
PRE_TRAINED_MODEL_NAME = 'bert-large-uncased'
FINBERT_MODEL_NAME = 'finbert'
MAX_LEN = 85 
BATCH_SIZE = 32
EPOCHS = 8
LEARNING_RATE = 1e-5
TEST_DATA_PERCENT = 0.30

def get_finbert_sentiment(sentence):
    ###
    # labels = {0:'neutral', 1:'positive',2:'negative'}
    ###

    inputs = tokenizer_finbert(sentence, return_tensors="pt", truncation=True, padding=True)
    outputs = finbert_model(**inputs)[0]
#     return np.argmax(outputs.detach().numpy())
    return nn.Softmax(dim=1)(outputs).detach().numpy()[0]


C:\Users\czhao\anaconda3\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll
C:\Users\czhao\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [4]:
import glob, os
os.chdir("C:\\Users\\czhao\\OneDrive\\桌面\\uoft\\work_study\\Chen_workingfolder\\Powell_Titles")
csv_ls = []
for file in glob.glob("factiva*.csv"):
    csv_ls.append(file)

In [5]:
csv_ls

['factiva_10days_news_20180227_Powell_direct_quotes.csv',
 'factiva_10days_news_20180717_Powell_direct_quotes.csv',
 'factiva_10days_news_20190226_Powell_direct_quotes.csv',
 'factiva_10days_news_20190710_Powell_direct_quotes.csv']

In [6]:
ROOT_PATH = 'C:\\Users\\czhao\\OneDrive\\桌面\\uoft\\work_study\\ahkz_sentiment_classifier\\ahkz_sentiment_classifier\\ahkz_sentiment_classifier\\'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
sent_model = torch.load(ROOT_PATH + 'model\\3_class_fed_testimony_bert_large_uncased_best_model.pt', map_location = device)  

In [7]:
tokenizer_finbert = BertTokenizer.from_pretrained('ProsusAI/finbert')
finbert_model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert', num_labels=3)

In [8]:
def processing_file(files):
    source_folder = "C:\\Users\\czhao\\OneDrive\\桌面\\uoft\\work_study\\Chen_workingfolder\\Powell_Titles\\"
    target_folder = "C:\\Users\\czhao\\OneDrive\\桌面\\uoft\\work_study\\Chen_workingfolder\\Powell_Titles_Result\\"
    
    column_ls = ['sentence_before', 'sentence_with_the_quote', 'sentence_after', 'concat_sentence']
#     column_ls = ['Title of article']
    
    for file in files:
        try:
            print(source_folder+file)
            df = pd.read_csv(source_folder+file)
            df['concat_sentence'] = df[['sentence_before', 'sentence_with_the_quote', 'sentence_after']].agg(' '.join, axis=1)
        
            for col in column_ls:
                print(col)
                # FinBERT
                df['finbert_score'] = df[col].apply(lambda x: get_finbert_sentiment(x) if x!='' else ['','',''])

                df[['finbert_neutral_'+col,'finbert_positive_'+col,'finbert_negative_'+col]] = \
                                pd.DataFrame(df['finbert_score'].tolist(), index= df.index)

                df = df.drop(['finbert_score'], axis=1)

                #ahkz
                df['sentiment_value'] = 0

                testimony_data_loader = create_data_loader(df, tokenizer, MAX_LEN, BATCH_SIZE, col)
                testimony_texts, predictions, prediction_probs, real_values = get_predictions(sent_model, testimony_data_loader)

                pred_numpy = predictions.numpy()
                pred_probs = prediction_probs.numpy()
                ms_pred = real_values.numpy()

                df['sentiment_head'] = pred_numpy
                df['sentiment_head'].replace({2: -1}, inplace=True)
                df.drop('sentiment_value', axis=1, inplace=True)
                df = df.rename(columns={"sentiment_head": "ahkz_score_"+col})

                #pysent3
                hiv4 = ps.HIV4()
                df['pysent3'] = df[col].apply(lambda x: hiv4.get_score(hiv4.tokenize(x)))

                pysent3_df = df['pysent3'].apply(pd.Series).rename(columns={"Positive": "pysent3_Positive_"+col, \
                                                               "Negative": "pysent3_Negative_"+col, \
                                                               "Polarity": "pysent3_Polarity_"+col, \
                                                               "Subjectivity": "pysent3_Subjectivity_"+col})

                df = pd.concat([df, pysent3_df], axis=1)

                df = df.drop(columns=['pysent3'])
                
                df['spacy_senti_'+col] = df[col].apply(lambda x: nlp_spacy(x).cats['positive'])
                
                df[['flesch_reading_ease_'+column, 'flesch_kincaid_grade_'+column, 'automated_readability_index_'+column, 'dale_chall_readability_score_'+column, 'gunning_fog_'+column, 'polysyllabcount_'+column]] = df.apply(lambda x: sentence_process(x[column]), axis=1, result_type="expand")

                
            print('complete')

            df.to_csv(target_folder + file, index =False)
        except:
            print(source_folder+file)

In [None]:
processing_file(csv_ls)

C:\Users\czhao\OneDrive\桌面\uoft\work_study\Chen_workingfolder\Powell_Titles\factiva_10days_news_20180227_Powell_direct_quotes.csv
sentence_before
C:\Users\czhao\OneDrive\桌面\uoft\work_study\Chen_workingfolder\Powell_Titles\factiva_10days_news_20180227_Powell_direct_quotes.csv
C:\Users\czhao\OneDrive\桌面\uoft\work_study\Chen_workingfolder\Powell_Titles\factiva_10days_news_20180717_Powell_direct_quotes.csv
sentence_before
C:\Users\czhao\OneDrive\桌面\uoft\work_study\Chen_workingfolder\Powell_Titles\factiva_10days_news_20180717_Powell_direct_quotes.csv
C:\Users\czhao\OneDrive\桌面\uoft\work_study\Chen_workingfolder\Powell_Titles\factiva_10days_news_20190226_Powell_direct_quotes.csv
sentence_before


In [15]:
!zip -r ../breakingnews.zip . -i ../target/*

