## Install Library

In [None]:
# Installation
! pip install bert-extractive-summarizer
! pip install sentencepiece
! pip install datasets
! pip install -U spacy
! python -m spacy download en_core_web_lg
! pip install umap-learn

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from transformers import *
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from summarizer import Summarizer
import umap
import spacy
from sklearn.preprocessing import OneHotEncoder
from tqdm.auto import tqdm
import warnings
import torch
from sklearn.manifold import TSNE
from sklearn.model_selection import ParameterGrid

## Helper Functions

### summarizer

In [None]:
def get_summarizer(model_name):
    """Build an extractive summarizer backed by a Hugging Face checkpoint.

    Args:
        model_name: Checkpoint identifier passed to the ``from_pretrained`` loaders.

    Returns:
        A ``Summarizer`` wrapping the loaded model and tokenizer.
    """
    # The config is loaded first so hidden-state output can be switched on
    # before the model is instantiated from it.
    hf_config = AutoConfig.from_pretrained(model_name)
    hf_config.output_hidden_states = True

    # Keyword arguments are evaluated left to right: tokenizer first, then model.
    return Summarizer(
        custom_tokenizer=AutoTokenizer.from_pretrained(model_name),
        custom_model=AutoModel.from_pretrained(model_name, config=hf_config),
    )

def perform_summarizer(text_body, model, num_sent = 5, return_embeddings = False):
    """Summarise ``text_body`` with ``model`` and optionally embed the summary.

    Args:
        text_body: Text to summarise. Strings are stripped and have newlines
            replaced by spaces; any other value is passed to ``model`` untouched.
        model: Callable accepting ``body``, ``num_sentences`` and ``max_length``
            keywords and exposing ``run_embeddings``.
        num_sent: Number of sentences requested from the summarizer.
        return_embeddings: When true, also return ``model.run_embeddings`` of
            the summary (mean-aggregated).

    Returns:
        The summary, or ``(summary, embeddings)`` when ``return_embeddings`` is true.
    """
    # Explicit type check instead of a bare ``except: pass``: non-string input
    # is still tolerated, but unrelated errors are no longer swallowed.
    if isinstance(text_body, str):
        text_body = text_body.strip().replace('\n', ' ')

    summary = model(body = text_body, num_sentences = num_sent, max_length = 800)
    if not return_embeddings:
        return summary

    # The embedding is computed from the summary, not from the full text.
    embeddings = model.run_embeddings(summary, aggregate = 'mean', num_sentences = num_sent)
    return summary, embeddings

### sentiment analysis model

In [None]:
def get_sentiment_model(model):
    """Load the tokenizer and sequence-classification model for a checkpoint.

    Args:
        model: Checkpoint identifier passed to both ``from_pretrained`` loaders.

    Returns:
        ``(tokenizer, model)``; the tokenizer is created with ``model_max_length=512``.
    """
    checkpoint = model
    sentiment_tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=512)
    sentiment_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return sentiment_tokenizer, sentiment_model

def get_sentiment_prediction(tokenizer,model, text):
    """Classify ``text`` and return the top label with its score.

    Args:
        tokenizer: Tokenizer handed to the classification pipeline.
        model: Model handed to the classification pipeline.
        text: Text to classify. Strings have newlines replaced by spaces and are
            stripped; any other value is classified via its ``str()`` form.

    Returns:
        ``(label, score)`` of the first prediction, score rounded to 4 decimals.
    """
    # Explicit type check instead of a bare ``except: pass`` so that only the
    # intended "not a string" case is tolerated.
    if isinstance(text, str):
        text = text.replace('\n', ' ').strip()

    tokenizer_kwargs = {'padding': True, 'truncation': True, 'max_length': 512}
    # NOTE: the pipeline is rebuilt on every call; hoist it if this is called in a loop.
    classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, **tokenizer_kwargs)
    prediction = classifier(str(text))[0]
    return prediction['label'], round(prediction['score'], 4)

### topic analysis model

In [None]:
def get_topic_model(model):
    """Load the tokenizer and sequence-classification model for a topic checkpoint.

    Args:
        model: Checkpoint identifier passed to both ``from_pretrained`` loaders.

    Returns:
        ``(tokenizer, model)``; the model is loaded with ``from_tf=True``.
    """
    checkpoint = model
    topic_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    topic_classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint, from_tf=True)
    return topic_tokenizer, topic_classifier

def get_topic_prediction(tokenizer,model, text):
    """Classify the topic of ``text`` and return the top label with its score.

    Args:
        tokenizer: Tokenizer handed to the classification pipeline.
        model: Model handed to the classification pipeline.
        text: Text to classify. Strings have newlines replaced by spaces and are
            stripped; any other value is classified via its ``str()`` form.

    Returns:
        ``(label, score)`` of the first prediction, score rounded to 4 decimals.
    """
    # Explicit type check instead of a bare ``except: pass`` so that only the
    # intended "not a string" case is tolerated.
    if isinstance(text, str):
        text = text.replace('\n', ' ').strip()

    tokenizer_kwargs = {'padding': True, 'truncation': True, 'max_length': 512}
    # The task name is kept as "sentiment-analysis": the labels returned come
    # from whichever model is passed in.
    # NOTE: the pipeline is rebuilt on every call; hoist it if this is called in a loop.
    classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, **tokenizer_kwargs)
    prediction = classifier(str(text))[0]
    return prediction['label'], round(prediction['score'], 4)

### load news data

In [None]:
def load_news_data(directory_files: 'str'):
    """Load every ``.csv``, ``.json`` and ``.txt`` file of a directory into DataFrames.

    Args:
        directory_files: Path of the directory to scan (not recursive).

    Returns:
        Dict mapping file name to DataFrame, in sorted file-name order. Files
        with another extension are ignored; files that fail to load are skipped
        with a warning instead of raising.
    """
    directory_path = Path(directory_files)

    file_dict = {}
    # Sorted so the key order does not depend on the OS directory listing.
    for file_name in sorted(os.listdir(directory_path)):
        file_path = directory_path / file_name
        # Path.suffix only matches a real extension; splitting on '.' also
        # matched extension-less names such as "csv".
        suffix = file_path.suffix
        try:
            if suffix == '.csv':
                df = pd.read_csv(file_path)
            elif suffix == '.json':
                try:
                    df = pd.read_json(file_path)
                except Exception:
                    # Fall back to JSON Lines (one object per line).
                    df = pd.read_json(file_path, lines = True)
            elif suffix == '.txt':
                with open(file_path, 'r') as f:
                    # SECURITY: eval() executes arbitrary code contained in the
                    # file. Only use on trusted data; prefer ast.literal_eval or
                    # json if the files contain plain literals.
                    data = eval(f.read())
                df = pd.DataFrame(data).T
            else:
                continue
        except Exception as exc:
            # Best-effort loading is kept, but the reason is no longer hidden.
            warnings.warn(f"load_news_data: skipping {file_name!r}: {exc!r}")
            continue
        file_dict[file_name] = df
    return file_dict

### generate embedding for sampled news

In [None]:
def tokenization_embedding(load_news_data_fn,
                           directory,
                           file_name,
                           perform_summarizer,
                           summarizer_model,
                           num_sent = 2,
                           return_embeddings = True,
                           sample_size = 300,
                           topics = ('Business & Finance',
                                     'Politics & Government',
                                     'Science & Mathematics',
                                     'Computers & Internet'),
                           max_tokens = 100,
                           random_state = None):
    """Sample short articles per topic, summarise them and collect the embeddings.

    Args:
        load_news_data_fn: Callable taking a directory and returning a dict of
            file name -> DataFrame.
        directory: Directory passed to ``load_news_data_fn``.
        file_name: Key of the frame to use; it needs 'text', 'topics' and
            'sentiment' columns.
        perform_summarizer: Callable invoked as ``perform_summarizer(text_body=...,
            model=..., num_sent=..., return_embeddings=True)`` and returning
            ``(summary, embedding)``.
        summarizer_model: Model object forwarded to ``perform_summarizer``.
        num_sent: Number of summary sentences requested.
        return_embeddings: Must be true; this function needs the embeddings.
        sample_size: Number of articles sampled from each topic.
        topics: Topic labels to sample from, in output order.
        max_tokens: Articles with more space-separated tokens are excluded.
        random_state: Seed forwarded to ``DataFrame.sample`` (``None`` = unseeded).

    Returns:
        ``(news_data_sampled, df_text_article_embedding)``: the sampled articles
        with 'token_count', 'summary' and 'text_embedding' columns, and the
        stacked embeddings with 'topics' and 'sentiment' label columns appended.

    Raises:
        ValueError: If ``return_embeddings`` is false, or a topic has fewer
            than ``sample_size`` eligible articles.
        KeyError: If ``file_name`` was not loaded from ``directory``.
    """
    if not return_embeddings:
        # Without embeddings the (summary, embedding) unpacking below cannot work.
        raise ValueError("tokenization_embedding requires return_embeddings=True")

    # NOTE: this silences warnings for the rest of the session (kept as-is).
    warnings.filterwarnings("ignore")
    tqdm.pandas()

    # Use the loader that was passed in rather than a hard-wired global function.
    data_dict = load_news_data_fn(directory)
    if file_name not in data_dict:
        raise KeyError(f"{file_name!r} was not loaded from {directory!r}; "
                       f"available files: {sorted(data_dict)}")

    news_data = data_dict[file_name]
    # .str methods yield NaN for missing text, which then fails the length
    # filter below instead of raising AttributeError.
    news_data = news_data.assign(token_count = news_data['text'].str.split(' ').str.len())

    short_enough = news_data['token_count'] <= max_tokens
    per_topic_samples = []
    for topic in topics:
        candidates = news_data[(news_data['topics'] == topic) & short_enough]
        if len(candidates) < sample_size:
            raise ValueError(f"topic {topic!r} has only {len(candidates)} articles with "
                             f"<= {max_tokens} tokens; sample_size={sample_size} requested")
        per_topic_samples.append(candidates.sample(sample_size, random_state = random_state))
    news_data_sampled = pd.concat(per_topic_samples, ignore_index=True)

    summarized = news_data_sampled['text'].progress_apply(
        lambda x: perform_summarizer(text_body = x, model = summarizer_model,
                                     num_sent = num_sent, return_embeddings = True))
    news_data_sampled['summary'] = [summary for summary, _ in summarized]
    news_data_sampled['text_embedding'] = [embedding for _, embedding in summarized]

    df_text_article_embedding = pd.DataFrame(np.vstack(news_data_sampled['text_embedding'].tolist()))
    # .values: attach labels by position, independent of either frame's index.
    df_text_article_embedding['topics'] = news_data_sampled['topics'].values
    df_text_article_embedding['sentiment'] = news_data_sampled['sentiment'].values

    return news_data_sampled, df_text_article_embedding

### get the logit of BERT sentiment model

In [None]:
def logit_sent(news_data: 'pd.DataFrame', financial_tokenizer, financial_model, summary: bool):
    """Run the sentiment model on each article and collect its raw logits.

    Args:
        news_data: Frame with a 'sentiment' column and the text column to score.
        financial_tokenizer: Callable returning an object with ``input_ids`` for one text.
        financial_model: Callable taking a ``(1, seq_len)`` id tensor; element 0
            of its output is used as the logits.
        summary: Score the 'summary' column when true, otherwise 'text'.

    Returns:
        DataFrame with one row per article: one column per logit, followed by
        a 'sentiment' column copied from ``news_data``.
    """
    text_column = 'summary' if summary else 'text'

    logit_list = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for text in tqdm(news_data[text_column]):
            encoded = financial_tokenizer(text, truncation = True, max_length = 512, padding = True)
            input_ids = torch.tensor(encoded.input_ids).unsqueeze(0)
            logit_list.append(financial_model(input_ids)[0].detach())

    # Each entry is (1, n_labels); concatenating keeps the result 2-D even for
    # a single article, where stack + squeeze collapsed it to 1-D.
    logit_output_all = torch.cat(logit_list, dim = 0)
    df_logit_output = pd.DataFrame(logit_output_all.numpy())
    # .values: assign by position; index alignment produced NaN labels whenever
    # news_data did not carry a default 0..n-1 index.
    df_logit_output['sentiment'] = news_data['sentiment'].values

    return df_logit_output

### visualize sentiment logit

In [None]:
def logit_visualize(df_logit: 'DataFrame', X_transformed: 'array'):
    """Scatter-plot 2-D coordinates coloured by 'positive' / 'negative' sentiment.

    Args:
        df_logit: Frame whose 'sentiment' column labels the rows of ``X_transformed``.
        X_transformed: Two-column array of coordinates, one row per label.

    Rows with any other sentiment value are not drawn.
    """
    # Re-seeds NumPy's global RNG; nothing in this function draws random numbers itself.
    np.random.seed(42)

    points = pd.DataFrame(X_transformed, columns=['d1', 'd2'])
    points['sentiment'] = df_logit['sentiment'].values

    plt.figure(figsize=(7, 7))
    palette = {'positive': '#EC7063', 'negative': '#2980B9'}
    for sentiment_name, hex_color in palette.items():
        subset = points.loc[points['sentiment'] == sentiment_name]
        plt.scatter(subset['d1'], subset['d2'], c=hex_color, label=sentiment_name, s=6)

    plt.legend()
    plt.xlabel('dimension 1')
    plt.ylabel('dimension 2')
    plt.show()

### get the logit of BERT topic model

In [None]:
def logit_topic(news_data: 'pd.DataFrame', topic_tokenizer, topic_model, summary: bool):
    """Run the topic model on each article and collect its raw logits.

    Args:
        news_data: Frame with a 'topics' column and the text column to score.
        topic_tokenizer: Callable returning an object with ``input_ids`` for one text.
        topic_model: Callable taking a ``(1, seq_len)`` id tensor; element 0 of
            its output is used as the logits.
        summary: Score the 'summary' column when true, otherwise 'text'.

    Returns:
        DataFrame with one row per article: one column per logit, followed by
        a 'topics' column copied from ``news_data``.
    """
    text_column = 'summary' if summary else 'text'

    logit_list = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for text in tqdm(news_data[text_column]):
            encoded = topic_tokenizer(text, truncation = True, max_length = 512, padding = True)
            input_ids = torch.tensor(encoded.input_ids).unsqueeze(0)
            logit_list.append(topic_model(input_ids)[0].detach())

    # Each entry is (1, n_labels); concatenating keeps the result 2-D even for
    # a single article, where stack + squeeze collapsed it to 1-D.
    logit_output_all = torch.cat(logit_list, dim = 0)
    df_logit_output = pd.DataFrame(logit_output_all.numpy())
    # .values: assign by position; index alignment produced NaN labels whenever
    # news_data did not carry a default 0..n-1 index.
    df_logit_output['topics'] = news_data['topics'].values

    return df_logit_output

### visualize topic logit

In [None]:
def logit_topic_visualize(df_logit: 'DataFrame', X_transformed: 'array'):
    """Scatter-plot 2-D coordinates coloured by the four news topics.

    Args:
        df_logit: Frame whose 'topics' column labels the rows of ``X_transformed``.
        X_transformed: Two-column array of coordinates, one row per label.

    Rows with a topic outside the four listed below are not drawn.
    """
    # Re-seeds NumPy's global RNG; nothing in this function draws random numbers itself.
    np.random.seed(42)

    points = pd.DataFrame(X_transformed, columns=['d1', 'd2'])
    points['topics'] = df_logit['topics'].values

    plt.figure(figsize=(7, 7))
    palette = {
        'Business & Finance': '#2980B9',
        'Politics & Government': '#27AE60',
        'Science & Mathematics': '#F1C40F',
        'Computers & Internet': '#7F8C8D',
    }
    for topic_name, hex_color in palette.items():
        subset = points.loc[points['topics'] == topic_name]
        plt.scatter(subset['d1'], subset['d2'], c=hex_color, label=topic_name, s=6)

    plt.legend()
    plt.xlabel('dimension 1')
    plt.ylabel('dimension 2')
    plt.show()

## Visualization of Summary Embedding

### initialize function

In [None]:
# Hugging Face checkpoint wrapped by get_summarizer into an extractive Summarizer.
model_name = 'facebook/bart-large-cnn'
summarizer_model = get_summarizer(model_name)

In [None]:
# Load the articles file, sample `sample_size` short articles per topic and
# summarise each one. Returns the sampled frame and a frame of summary
# embeddings whose last two columns are the 'topics' and 'sentiment' labels.
news_data_sampled, df_embedding = tokenization_embedding(load_news_data_fn = load_news_data, 
                                                         directory = '../data/EDA/News Article Text File', 
                                                         file_name = 'articles_2015_2019.csv', 
                                                         perform_summarizer = perform_summarizer,
                                                         summarizer_model = summarizer_model, 
                                                         num_sent = 2, 
                                                         return_embeddings = True, 
                                                         sample_size = 300)

### UMAP

#### reduce the dimensionality of summary embedding to 3 dimensions

In [None]:
# Project the summary embeddings to 3 dimensions with cosine-distance UMAP.
umap_model = umap.UMAP(n_neighbors=30,
                       n_components=3,
                       metric='cosine', 
                       random_state = 42)

# The last two columns of df_embedding are the 'topics' and 'sentiment' labels,
# so they are excluded from the features.
X_transformed_umap = umap_model.fit_transform(df_embedding.iloc[:,:-2])

#### visualize transformed embedding/ label = topic

In [None]:
df_umap = pd.DataFrame(X_transformed_umap, columns = ['d1', 'd2', 'd3'])
df_umap['topic_label'] = df_embedding['topics'].values

fig = plt.figure(figsize = (8, 8))
ax = fig.add_subplot(projection='3d')

# Sorted labels instead of a bare set: set iteration order of strings changes
# between kernel sessions, which re-assigned the colours on every run.
unique_topic = sorted(df_embedding['topics'].dropna().unique())
color = ['#2980B9', '#27AE60', '#F1C40F', '#7F8C8D']

for position, topic_name in enumerate(unique_topic):
    topic_points = df_umap.loc[df_umap['topic_label'] == topic_name]
    # Colours wrap around rather than silently dropping labels beyond the palette.
    ax.scatter(topic_points['d1'],
               topic_points['d2'],
               topic_points['d3'],
               c = color[position % len(color)], label = topic_name)

ax.set_xlabel('dimension 1')
ax.set_ylabel('dimension 2')
ax.set_zlabel('dimension 3')

plt.legend()
plt.show()

#### visualize transformed embedding/ label = sentiment

In [None]:
df_umap = pd.DataFrame(X_transformed_umap, columns = ['d1', 'd2', 'd3'])
df_umap['sentiment_label'] = df_embedding['sentiment'].values

fig = plt.figure(figsize = (8, 8))
ax = fig.add_subplot(projection='3d')

# Sorted labels instead of a bare set: set iteration order of strings changes
# between kernel sessions, which re-assigned the colours on every run.
unique_sentiment = sorted(df_embedding['sentiment'].dropna().unique())
# The first two colours are unchanged; the extra ones ensure a third or fourth
# sentiment class (if the data has one) is drawn instead of silently dropped.
color = ['#2980B9', '#27AE60', '#F1C40F', '#7F8C8D']

for position, sentiment_name in enumerate(unique_sentiment):
    sentiment_points = df_umap.loc[df_umap['sentiment_label'] == sentiment_name]
    ax.scatter(sentiment_points['d1'],
               sentiment_points['d2'],
               sentiment_points['d3'],
               c = color[position % len(color)], label = sentiment_name)

ax.set_xlabel('dimension 1')
ax.set_ylabel('dimension 2')
ax.set_zlabel('dimension 3')

plt.legend()
plt.show()

### tSNE

In [None]:
parameter_dict = {'learning_rate': [50, 100, 150, 200],
                  'perplexity': [10, 30, 50],
                  'early_exaggeration': [20]}
grid = ParameterGrid(parameter_dict)

# Every candidate is fitted with a fixed seed. With init='random' and no
# random_state the KL ranking depended on the state of NumPy's global RNG, so
# the selected parameters changed between runs. 10 is the seed the notebook
# sets before the final fit with the best parameters.
# NOTE(review): KL divergences obtained at different perplexities are not
# strictly comparable — confirm this selection criterion is intended.
KL_list = []
for complete, params in enumerate(grid, start=1):
    TSNE_model = TSNE(n_components=2,
                      init='random',
                      method = 'exact',
                      random_state=10,
                      **params).fit(df_embedding.iloc[:,:-2])
    KL_list.append(TSNE_model.kl_divergence_)
    print(f'finish: {complete}/ {len(grid)}')

In [None]:
# Grid entry whose fit reported the smallest KL divergence.
best_index = int(np.argmin(KL_list))
best_parameter = grid[best_index]
best_parameter

In [None]:
# TSNE and ParameterGrid are already imported in the notebook's import cell,
# so they are not re-imported here.
best_parameter = grid[np.array(KL_list).argmin()]
# TSNE gets no random_state below, so it draws from NumPy's global RNG;
# seeding it here makes this fit repeatable.
np.random.seed(10)

print(best_parameter)
# best_parameter holds exactly learning_rate, perplexity and early_exaggeration.
X_transformed = TSNE(n_components=2,
                     init='random',
                     method = 'exact',
                     **best_parameter).fit_transform(df_embedding.iloc[:,:-2])

In [None]:
# 2-D t-SNE coordinates of the summary embeddings, labelled by topic.
df_tsne = pd.DataFrame(X_transformed, columns=['d1', 'd2'])
df_tsne['topic_label'] = df_embedding['topics'].values

topic_list = ['Business & Finance', 'Politics & Government','Science & Mathematics', 'Computers & Internet']
color = ['#EC7063', '#2980B9', '#27AE60', '#F1C40F']

plt.figure(figsize=(6, 6))
for topic_name, topic_color in zip(topic_list, color):
    topic_points = df_tsne.loc[df_tsne['topic_label'] == topic_name]
    plt.scatter(topic_points['d1'], topic_points['d2'], c=topic_color, label=topic_name, s=5)

plt.legend()
plt.xlabel('dimension 1')
plt.ylabel('dimension 2')
plt.show()

## Sentiment Analysis

### initialize sentiment model

In [None]:
# Tokenizer and sequence-classification model of the FinancialBERT sentiment checkpoint.
financial_tokenizer, financial_model = get_sentiment_model('ahmedrachid/FinancialBERT-Sentiment-Analysis')

### extract logit from the sentiment model

In [None]:
# Sentiment logits of the original article text (summary = False -> 'text' column).
df_logit = logit_sent(news_data = news_data_sampled,
                                financial_tokenizer = financial_tokenizer,
                                financial_model = financial_model, 
                                summary = False)

## Visualization: Original Text + Sentiment

### hyperparameter tuning of tSNE

In [None]:
parameter_dict = {'learning_rate': [50, 100, 150, 200],
                  'perplexity': [10, 30, 50],
                  'early_exaggeration': [20]}
grid = ParameterGrid(parameter_dict)

# Every candidate is fitted with a fixed seed. With init='random' and no
# random_state the KL ranking depended on the state of NumPy's global RNG.
# 10 is the seed the notebook sets before the final fit with the best parameters.
KL_list = []
for complete, params in enumerate(grid, start=1):
    # The last column of df_logit is the 'sentiment' label, not a feature.
    TSNE_model = TSNE(n_components=2,
                      init='random',
                      method = 'exact',
                      random_state=10,
                      **params).fit(df_logit.iloc[:,:-1])
    KL_list.append(TSNE_model.kl_divergence_)
    print(f'finish: {complete}/ {len(grid)}')

### reduce data dimensionality by tSNE

In [None]:
# TSNE gets no random_state, so seeding NumPy's global RNG makes the fit repeatable.
np.random.seed(10)

# Grid entry whose fit reported the smallest KL divergence.
best_parameter = grid[np.array(KL_list).argmin()]
print(best_parameter)

# best_parameter holds exactly learning_rate, perplexity and early_exaggeration.
tsne_org_sent = TSNE(n_components=2, init='random', method='exact', **best_parameter)
X_transformed_org_sent = tsne_org_sent.fit_transform(df_logit.iloc[:, :-1])

### visualize the text representation of sampled data (label: sentiments)

In [None]:
# t-SNE coordinates of the original-text logits, coloured by sentiment label.
logit_visualize(df_logit, X_transformed_org_sent)

## Visualization: Summary + Sentiment

### extract logit of BERT Model

In [None]:
# Sentiment logits of the generated summaries (summary = True -> 'summary' column).
df_logit_summary = logit_sent(news_data = news_data_sampled,
                              financial_tokenizer = financial_tokenizer,
                              financial_model = financial_model,
                              summary = True)

### hyperparameter tuning of tSNE

In [None]:
parameter_dict = {'learning_rate': [50, 100, 150, 200],
                  'perplexity': [10, 30, 50],
                  'early_exaggeration': [20]}
grid = ParameterGrid(parameter_dict)

# Every candidate is fitted with a fixed seed. With init='random' and no
# random_state the KL ranking depended on the state of NumPy's global RNG.
# 10 is the seed the notebook sets before the final fit with the best parameters.
KL_list = []
for complete, params in enumerate(grid, start=1):
    # The last column of df_logit_summary is the 'sentiment' label, not a feature.
    TSNE_model = TSNE(n_components=2,
                      init='random',
                      method = 'exact',
                      random_state=10,
                      **params).fit(df_logit_summary.iloc[:,:-1])
    KL_list.append(TSNE_model.kl_divergence_)
    print(f'finish: {complete}/ {len(grid)}')

### reduce dimensionality by tSNE

In [None]:
# TSNE and ParameterGrid are already imported in the notebook's import cell,
# so they are not re-imported here.
best_parameter = grid[np.array(KL_list).argmin()]
# TSNE gets no random_state below, so it draws from NumPy's global RNG;
# seeding it here makes this fit repeatable.
np.random.seed(10)

print(best_parameter)
# best_parameter holds exactly learning_rate, perplexity and early_exaggeration.
X_transformed_sum = TSNE(n_components=2,
                         init='random',
                         method = 'exact',
                         **best_parameter).fit_transform(df_logit_summary.iloc[:,:-1])

In [None]:
# t-SNE coordinates of the summary logits, coloured by sentiment label.
logit_visualize(df_logit_summary, X_transformed_sum)

## Topic Analysis

In [None]:
# Tokenizer and sequence-classification model of the topic-classification checkpoint.
topic_tokenizer, topic_model = get_topic_model('jonaskoenig/topic_classification_04')

## Visualization: Original Text + Topics

In [None]:
# Topic logits of the original article text (summary = False -> 'text' column).
df_logit_original_topic = logit_topic(news_data = news_data_sampled, topic_tokenizer = topic_tokenizer, topic_model = topic_model, summary =  False)

### hyperparameter tuning of tSNE

In [None]:
parameter_dict = {'learning_rate': [50, 100, 150, 200],
                  'perplexity': [10, 30, 50],
                  'early_exaggeration': [20]}
grid = ParameterGrid(parameter_dict)

# Every candidate is fitted with a fixed seed. With init='random' and no
# random_state the KL ranking depended on the state of NumPy's global RNG.
# 10 is the seed the notebook sets before the final fit with the best parameters.
KL_list = []
for complete, params in enumerate(grid, start=1):
    # The last column of df_logit_original_topic is the 'topics' label, not a feature.
    TSNE_model = TSNE(n_components=2,
                      init='random',
                      method = 'exact',
                      random_state=10,
                      **params).fit(df_logit_original_topic.iloc[:,:-1])
    KL_list.append(TSNE_model.kl_divergence_)
    print(f'finish: {complete}/ {len(grid)}')

### reduce dimensionality by tSNE

In [None]:
# TSNE and ParameterGrid are already imported in the notebook's import cell,
# so they are not re-imported here.
best_parameter = grid[np.array(KL_list).argmin()]
# TSNE gets no random_state below, so it draws from NumPy's global RNG;
# seeding it here makes this fit repeatable.
np.random.seed(10)

print(best_parameter)
# best_parameter holds exactly learning_rate, perplexity and early_exaggeration.
X_transformed_topic = TSNE(n_components=2,
                           init='random',
                           method = 'exact',
                           **best_parameter).fit_transform(df_logit_original_topic.iloc[:,:-1])

In [None]:
# Take the labels from the logit frame the t-SNE coordinates were computed
# from (as the sentiment plots do), not from the separate sampled-articles frame.
logit_topic_visualize(df_logit_original_topic, X_transformed_topic)

## Visualization: Summary + Topics

In [None]:
# Topic logits of the generated summaries (summary = True -> 'summary' column).
df_logit_summary_topic = logit_topic(news_data = news_data_sampled, topic_tokenizer = topic_tokenizer, topic_model = topic_model, summary =  True)

### hyperparameter tuning of tSNE

In [None]:
parameter_dict = {'learning_rate': [50, 100, 150, 200],
                  'perplexity': [10, 30, 50],
                  'early_exaggeration': [20]}
grid = ParameterGrid(parameter_dict)

# Every candidate is fitted with a fixed seed. With init='random' and no
# random_state the KL ranking depended on the state of NumPy's global RNG.
# 10 is the seed the notebook sets before the final fit with the best parameters.
KL_list = []
for complete, params in enumerate(grid, start=1):
    # The last column of df_logit_summary_topic is the 'topics' label, not a feature.
    TSNE_model = TSNE(n_components=2,
                      init='random',
                      method = 'exact',
                      random_state=10,
                      **params).fit(df_logit_summary_topic.iloc[:,:-1])
    KL_list.append(TSNE_model.kl_divergence_)
    print(f'finish: {complete}/ {len(grid)}')

### reduce dimensionality by using tSNE

In [None]:
# Grid entry whose fit reported the smallest KL divergence.
best_parameter = grid[np.array(KL_list).argmin()]

# TSNE gets no random_state, so seeding NumPy's global RNG makes the fit repeatable.
np.random.seed(10)
print(best_parameter)

# best_parameter holds exactly learning_rate, perplexity and early_exaggeration.
tsne_sum_topic = TSNE(n_components=2, init='random', method='exact', **best_parameter)
X_transformed_sum_topic = tsne_sum_topic.fit_transform(df_logit_summary_topic.iloc[:, :-1])

In [None]:
# Take the labels from the logit frame the t-SNE coordinates were computed
# from (as the sentiment plots do), not from the separate sampled-articles frame.
logit_topic_visualize(df_logit_summary_topic, X_transformed_sum_topic)