## Data and Imports

In [None]:
import pandas as pd
from tqdm.notebook import tqdm
from datasets import load_dataset
from transformers import (AutoModel, AutoConfig, AutoTokenizer,AutoModelForSequenceClassification,
                          pipeline, Trainer, TrainingArguments,EarlyStoppingCallback)
from utils.text_processing import (get_summarizer, perform_summarizer, text_filter, 
                                   get_sentiment_model, get_topic_model,compute_metrics)

In [None]:
# Load the raw corpus and run it through the project's text filter.
# NOTE(review): later cells read the `text` and `cnt_len` columns; presumably
# they come from the CSV and/or text_filter -- confirm in utils.text_processing.
raw_data_path = 'data/raw_text_data.csv'
df = text_filter(pd.read_csv(raw_data_path))

## Summary Generation with bert-extractive-summarizer
- [Bert-Extractive-Summarizer](https://github.com/dmmiller612/bert-extractive-summarizer)
- [Bart-Large-CNN](https://huggingface.co/facebook/bart-large-cnn)

In [None]:
# Initialise the summarizer.
# `get_summarizer` is a project helper (utils.text_processing); the object it
# returns is handed to `perform_summarizer` in the summarization loop.
summary_model_name = 'facebook/bart-large-cnn' # pretrained checkpoint hosted on the HuggingFace Hub
summary_model = get_summarizer(summary_model_name)

In [None]:
# Perform summarization.
# Texts whose token count reaches `sum_flag` are condensed repeatedly until the
# summary falls below that budget; shorter texts are kept unchanged.
summarized_texts = []
sum_flag = 250      # token budget (same value as the classifiers' max_length)
shrink_ratio = 0.8  # forwarded to perform_summarizer as `ratio`

# Walk the two columns positionally instead of using df[col][idx]: that form is
# a label-based lookup and would fail (or fetch the wrong row) if text_filter
# dropped rows without resetting the index.
for text_tok_cnts, raw_text in tqdm(zip(df['cnt_len'], df['text']),
                                    total=len(df), desc='Raw Texts'):

    # Short texts need no summary.
    if text_tok_cnts < sum_flag:
        summarized_texts.append(raw_text)
        continue

    summary = perform_summarizer(raw_text, summary_model, ratio=shrink_ratio, return_embeddings=False)
    summary_tok_cnts = len(summary.split())

    while summary_tok_cnts >= sum_flag:
        shorter = perform_summarizer(summary, summary_model, ratio=shrink_ratio, return_embeddings=False)
        shorter_tok_cnts = len(shorter.split())

        # Stop when a pass makes no progress (e.g. the text cannot be reduced
        # any further); without this check the loop would never terminate.
        if shorter_tok_cnts >= summary_tok_cnts:
            break

        summary, summary_tok_cnts = shorter, shorter_tok_cnts

    summarized_texts.append(summary)

# One entry was appended per row, so this assignment lines up positionally.
df['summary'] = summarized_texts

## Sentiment Label and Topic Label Generation
- [(Used in the project) Sentiment Model 1: mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis](https://huggingface.co/mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis)
- [Sentiment Model 2: Jean-Baptiste/roberta-large-financial-news-sentiment-en](https://huggingface.co/Jean-Baptiste/roberta-large-financial-news-sentiment-en)
- [Topic Model: jonaskoenig/topic_classification_04](https://huggingface.co/jonaskoenig/topic_classification_04)

In [None]:
# Financial sentiment model (the checkpoint used in the project).
# Alternative checkpoint: 'Jean-Baptiste/roberta-large-financial-news-sentiment-en'
sentiment_ckpt = 'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis'
financial_tokenizer, financial_model = get_sentiment_model(sentiment_ckpt)

# Topic model; `from_tf=True` is forwarded to the project helper.
topic_ckpt = 'jonaskoenig/topic_classification_04'
topic_tokenizer, topic_model = get_topic_model(topic_ckpt, from_tf=True)

# Tokenizer settings shared by the classification pipelines: every input is
# padded or truncated to exactly 250 tokens.
article_tokenizer_kwargs = dict(
    padding='max_length',
    truncation=True,
    max_length=250,
    add_special_tokens=True,
)

# Sentiment pipeline used to label the articles.
article_classifier = pipeline(
    "sentiment-analysis",
    model=financial_model,
    tokenizer=financial_tokenizer,
    **article_tokenizer_kwargs,
)

In [None]:
# Create Sentiment Labels
# A row keeps the predicted label only when the prediction is confident and
# not 'neutral'; every other row is marked 'NA'.
sentiment_threshold = 0.7
sentiment_labels = []

for text in tqdm(df['text'].values, total=len(df), desc='Sentiment Label Generation'):

    # Top prediction for this text
    prediction = article_classifier(text)[0]
    sentiment_label = prediction['label']
    sentiment_score = prediction['score']

    is_confident = sentiment_score >= sentiment_threshold
    is_polar = sentiment_label != 'neutral'
    sentiment_labels.append(sentiment_label if is_confident and is_polar else 'NA')

df['sentiment'] = sentiment_labels

In [None]:
# Create Topic Labels
# A row keeps the predicted topic only when its score reaches the confidence
# threshold; every other row is marked 'NA'.
topics = []
topic_confidence_threshold = 0.9

# Build the pipeline once, outside the loop. "sentiment-analysis" is the
# text-classification task alias, so it also serves the topic model.
topic_classifier = pipeline("sentiment-analysis",
                            model=topic_model,
                            tokenizer=topic_tokenizer,
                            **article_tokenizer_kwargs)

for summary in tqdm(df['summary'], total=len(df), desc='Topic Label Generation'):

    # Classify the current summary (not a variable left over from another cell).
    prediction = topic_classifier(summary)[0]
    topic_label = prediction['label']
    topic_score = prediction['score']

    topics.append(topic_label if topic_score >= topic_confidence_threshold else 'NA')

df['topic'] = topics

##### Save Data

In [None]:
#### Save to CSV ####
# df.to_csv('data_sample.csv',index=0)

## BERT Model

#### Helper Functions

In [None]:
def num_labels_cnvt(batch):
    """Convert a batch's string sentiment labels into integer class ids.

    Args:
        batch: Mapping with a 'sentiment' entry holding an iterable of the
            strings 'negative' / 'positive'.

    Returns:
        dict: ``{'labels': [...]}`` with 0 for 'negative' and 1 for 'positive',
        in input order.

    Raises:
        KeyError: If a label other than 'negative' or 'positive' is present.
    """
    label_ids = {'negative': 0, 'positive': 1}
    # Plain item lookup so that an unknown label still raises KeyError.
    return {'labels': list(map(label_ids.__getitem__, batch['sentiment']))}

def tokenize(batch):
    """Tokenize the 'text' entry of a batch with the module-level ``tokenizer``.

    Every sequence is padded or truncated to exactly 250 tokens.

    Args:
        batch: Mapping with a 'text' entry holding the raw strings.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    return tokenizer(
        batch['text'],
        truncation=True,
        max_length=250,
        padding='max_length',
    )

def data_encoder(root, train_file, valid_file):
    """Load the train/valid files and return them label-encoded and tokenized.

    Args:
        root: Passed to ``load_dataset`` as ``path``.
        train_file: Data file for the 'train' split.
        valid_file: Data file for the 'valid' split.

    Returns:
        The dataset after ``num_labels_cnvt`` and ``tokenize`` have been mapped
        over it, formatted as torch tensors restricted to the columns
        'input_ids', 'attention_mask' and 'labels'.
    """
    split_files = {'train': train_file, 'valid': valid_file}

    # Load -> integer labels -> token ids, all applied batch-wise.
    encoded = (
        load_dataset(path=root, data_files=split_files)
        .map(num_labels_cnvt, batched=True)
        .map(tokenize, batched=True)
    )

    encoded.set_format(type='torch', columns=["input_ids", "attention_mask", "labels"])
    return encoded


#### Encoding Data and Preparing Model

In [None]:
# Create Label Maps
# id2label is the single source of truth; label2id and num_labels are derived
# from it so the two directions cannot drift apart (the original spelled the
# class 'Postive' in one map and 'Positive' in the other).
id2label = {0: 'Negative', 1: 'Positive'}
label2id = {label: idx for idx, label in id2label.items()}
num_labels = len(id2label)

# Loading Models
# NOTE(review): the model is placed on CPU here, while the training cell passes
# no_cuda=False -- the Trainer presumably handles device placement itself; confirm.
device = 'cpu'
model_ckpt = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt,
                                                           num_labels=num_labels,
                                                           label2id=label2id,
                                                           id2label=id2label).to(device)

# Data Path
root = 'sample_data/'
train = 'sample_train_fold.csv'
valid = 'sample_valid_fold.csv'

# Label-encode and tokenize both splits (uses the `tokenizer` defined above).
encoded_dataset = data_encoder(root, train, valid)

#### Model Training

In [None]:
batch_size = 16
# Number of full batches in the training split, i.e. log about once per epoch.
# Clamped to 1 so a split smaller than one batch does not yield
# logging_steps=0, which step-based logging is expected to reject.
logging_steps = max(1, len(encoded_dataset['train']) // batch_size)
model_output_path = "saved_model/"

training_args = TrainingArguments(output_dir=model_output_path,
                                  num_train_epochs=15,
                                  learning_rate=2e-6,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.0001,

                                  #### EarlyStopping
                                  # Evaluate and checkpoint on the same schedule
                                  # so the best checkpoint can be restored at
                                  # the end of training.
                                  evaluation_strategy='epoch',
                                  save_strategy='epoch',
                                  load_best_model_at_end=True,
                                  # NOTE(review): assumes compute_metrics
                                  # returns an 'f1' entry -- confirm in
                                  # utils.text_processing.
                                  metric_for_best_model='f1',
                                  ##########

                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=False,
                                  no_cuda=False,
                                  log_level="error",
                                  # optim = 'sgd'
                                  )

trainer = Trainer(model=model,
                  tokenizer=tokenizer,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=encoded_dataset['train'],
                  eval_dataset=encoded_dataset['valid'],
                  # Early-stopping patience of 2 evaluations.
                  callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
                  )

results = trainer.train()