In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

import plotly.graph_objects as go
from company_info import company_info_list

from helper_functions import filter_data

# Font size applied to the legend text of the plotly figure created later in this notebook
textfont_size = 20

In [None]:
# LOAD DATA
# The file is read in JSON-lines mode: one JSON record per line
data = pd.read_json('data/full_data/nano_esg.json', lines=True)

# Date range strings kept as configuration
start_date, end_date = '2023-01-01', '2024-09-16'

# Every distinct company name occurring in the data
companies = data['company'].unique().tolist()

# Numeric encoding of the sentiment labels and the aspect names used for filtering
sent_dict = dict(positive=1, negative=-1, neutral=0)
aspect_filters = ['environmental', 'social', 'governance']

# A label that is missing from sent_dict raises a KeyError here
data['sentiment_int'] = [sent_dict[label] for label in data['sentiment']]

# For plots: one fixed color per aspect
aspect_colors = dict(environmental='forestgreen', social='cornflowerblue', governance='darkmagenta')

In [None]:
# Set Embedding Model used for BERTopic
# NOTE(review): trust_remote_code=True allows code shipped with the model repository to run locally -
# confirm the model source is trusted (pinning a revision would also help reproducibility).
sentence_model = SentenceTransformer('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True)

In [None]:
# This cell creates topics based on the german summaries + the german keywords returned by the LLM
# In order to get english topic representations, it is possible to use the 'summary_en' field instead of 'summary'
# In this case the topics will be created only based on the english summaries, without the LLM-provided keywords, so their quality might be worse
summary_field = 'summary'
# summary_field = 'summary_en'
if summary_field not in ('summary', 'summary_en'):
    # Fail early: any other value would leave the article list undefined further down
    raise ValueError(f"Unsupported summary_field: {summary_field!r} (expected 'summary' or 'summary_en')")

# Overwrite companies - processing all of them can take a bit of time
companies = ['vw'] #'bayer', 'bmw', 'siemens'
topic_aspect_filters = ['all']

# company_topics[company][aspect] -> dict with the keys
# 'topics', 'probs', 'timestamps', 'articles' and 'topic_info'
company_topics = {}
for company in companies:
    print(company)
    company_data = data[data['company'] == company]

    # Lower-cased company-specific terms that must not appear among the keywords.
    # Built once per company instead of once per keyword.
    blocked_keywords = {term.lower() for term in company_info_list[company]['keyword_filter']}

    aspect_topics = {}
    for aspect in topic_aspect_filters:
        if aspect == 'all':
            aspect_data = company_data
        else:
            aspect_data = company_data[company_data['aspect'] == aspect]

        # remove company name from keywords
        # .assign builds a new frame, so the filtered slice of `data` is never written to
        # (the original in-place column assignment triggered a SettingWithCopyWarning)
        aspect_data = aspect_data.assign(
            keywords=aspect_data['keywords'].apply(
                lambda kws: [kw for kw in kws if kw.lower() not in blocked_keywords]
            )
        )

        timestamps = aspect_data['date'].to_list()
        # Note that we only have the keywords returned by the LLM in german
        if summary_field == 'summary':
            # One document per article: "<summary> - <keyword>, <keyword>, ..."
            articles = [
                summary + ' - ' + ', '.join(kws)
                for summary, kws in zip(aspect_data[summary_field], aspect_data['keywords'])
            ]
        else:  # summary_field == 'summary_en'
            articles = aspect_data[summary_field].to_list()

        if not articles:
            # Nothing to model; the aspect is left out of company_topics[company]
            print(f'No articles for {company} and {aspect} - skipping')
            continue

        # Create a BERTopic model
        topic_model = BERTopic(embedding_model=sentence_model, verbose=True)
        try:
            topics, probs = topic_model.fit_transform(articles)
        except Exception as e:
            # Best effort: report the failure and go on with the next aspect
            print(f'Error for {company} and {aspect}: {e}')
            continue
        topic_info = topic_model.get_topic_info()
        topic_dict = {'topics': topics, 'probs': probs, 'timestamps': timestamps, 'articles': articles, 'topic_info': topic_info}
        aspect_topics[aspect] = topic_dict
    company_topics[company] = aspect_topics

In [None]:
# Determine Mean Relevance, Mean Sentiment and more for each topic

a = 'all'
# Articles dated on or after this day are counted in the 'Recent_Articles' column
recent_cutoff = '2024-08-01'
# Column of the per-company article frame that holds the topic assignment
topic_basis = 'topics'

# company_articles[company] -> the company's articles with the topic assignment added as a column
company_articles = {}
for c in companies:
    if a not in company_topics.get(c, {}):
        # The topic-creation cell leaves out combinations without articles or with a failed fit
        print(f'No topics available for {c} and {a} - skipping')
        continue
    topic_subset = company_topics[c][a]
    # .assign returns a new frame, so the filtered slice of `data` is never written to.
    # The topic list is matched by position: it needs exactly one entry per article of the company.
    c_data = data[data['company'] == c].assign(topics=topic_subset['topics'])

    topic_info = topic_subset['topic_info']

    topic_rel_score = {}
    topic_mean_sent = {}
    topic_aspects = {}
    topic_dates = {}
    num_articles_last_months = {}
    for topic_number in topic_info['Topic']:
        # Select the topic's articles once instead of re-filtering for every statistic
        topic_rows = c_data[c_data[topic_basis] == topic_number]
        topic_rel_score[topic_number] = np.mean(topic_rows['relevance_score'])
        topic_mean_sent[topic_number] = np.mean(topic_rows['sentiment_int'])
        # Share of each aspect among the topic's articles
        topic_aspects[topic_number] = topic_rows[['aspect']].value_counts(normalize=True).to_dict()
        # Mean date: average the integer representation of the dates and convert it back
        topic_dates[topic_number] = pd.Timestamp(topic_rows['date'].astype('int64').mean())
        # The following determines the number of recently published articles for each topic
        # (.count() only counts rows with a non-missing 'volume')
        num_articles_last_months[topic_number] = topic_rows[topic_rows['date'] >= recent_cutoff]['volume'].count()

    # Attach the statistics to the topic table; a plain lookup per topic number replaces the row-wise apply
    topic_numbers = list(topic_info['Topic'])
    topic_info['Mean_Relevance'] = [topic_rel_score[t] for t in topic_numbers]
    topic_info['Mean_Sentiment'] = [topic_mean_sent[t] for t in topic_numbers]
    topic_info['Aspects'] = [topic_aspects[t] for t in topic_numbers]
    topic_info['Mean_Date'] = [topic_dates[t] for t in topic_numbers]
    topic_info['Recent_Articles'] = [num_articles_last_months[t] for t in topic_numbers]

    company_topics[c][a] = topic_subset
    company_articles[c] = c_data

In [None]:
# Select the company and the aspect whose topics are inspected in the cells below
c = 'vw'
a = 'all'
topic_subset = company_topics[c][a]

In [None]:
# Show the 20 most relevant topics, i.e. the rows of the topic table with the highest 'Mean_Relevance'
topic_subset['topic_info'].sort_values('Mean_Relevance', ascending=False).head(20)

### Fig 4: Positive & Negative Articles per Month for the Topic Related to Forced Labor in China's Xinjiang Province

In [None]:
# Select the most relevant topic containing the keyword 'xinjiang'
# Topic determination has random elements, so the resulting graph might differ slightly from the version in the paper
topic_keyword = 'xinjiang'
sorted_topics = topic_subset['topic_info'].sort_values('Mean_Relevance', ascending=False)
matching_topics = sorted_topics[sorted_topics['Name'].str.contains(topic_keyword, case=False)]['Topic']
if matching_topics.empty:
    # Same exception type that indexing the empty result would raise, but with an actionable message
    raise IndexError(
        f"No topic name contains '{topic_keyword}' in this run - "
        "browse the topic info above and set topic_num manually"
    )
topic_num = matching_topics.values[0]

# Alternatively, it is possible to manually select a topic by its number after browsing the topic info above
# topic_num = 6

# Row of the topic table that belongs to the selected topic
selected_topic_info = topic_subset['topic_info'][topic_subset['topic_info']['Topic'] == topic_num]
print(selected_topic_info['Name'])

# Boolean mask over the company's articles; the topic list is matched to the articles by position
topic_data = company_articles[c][np.asarray(topic_subset['topics']) == topic_num]
print(topic_data['aspect'].value_counts(normalize=True))

filter_topic_data = filter_data(topic_data, None, None)


def _monthly_sentiment_sum(articles, aspect, sentiment):
    """Sum 'sentiment_int' per month for the articles of one aspect and one sentiment label.

    articles:  frame with the columns 'aspect', 'sentiment', 'sentiment_int' and 'date'
    aspect:    value the 'aspect' column has to match
    sentiment: value the 'sentiment' column has to match
    Returns a Series indexed by month-end dates.
    """
    selected = articles[(articles['aspect'] == aspect) & (articles['sentiment'] == sentiment)]
    # NOTE(review): pandas >= 2.2 deprecates the 'M' alias in favour of 'ME' - switch once the
    # minimum supported pandas version allows it
    return selected.resample('M', on='date')['sentiment_int'].sum()


topic_aspect_data_pos = {}
topic_aspect_data_neg = {}
topic_aspect_data_neut = {}
for aspect in aspect_filters:
    topic_aspect_data_pos[aspect] = _monthly_sentiment_sum(filter_topic_data, aspect, 'positive')
    topic_aspect_data_neg[aspect] = _monthly_sentiment_sum(filter_topic_data, aspect, 'negative')
    topic_aspect_data_neut[aspect] = _monthly_sentiment_sum(filter_topic_data, aspect, 'neutral')

# Monthly mean relevance of the topic's articles (not drawn in the figure below)
relevance_data = filter_topic_data.resample('M', on='date')['relevance_score'].mean()

######################## FIGURE ########################


def _add_sentiment_bars(figure, monthly_by_aspect, sentiment_label, offsetgroup, showlegend):
    """Add one bar trace per aspect (in aspect_filters order) to `figure`.

    monthly_by_aspect: dict aspect -> monthly Series as built above
    sentiment_label:   text shown in the hover label ('Positive' / 'Negative')
    offsetgroup:       plotly offset group shared by the traces added in this call
    showlegend:        whether the added traces get a legend entry
    """
    for aspect_filter in aspect_filters:
        monthly = monthly_by_aspect[aspect_filter]
        figure.add_trace(go.Bar(
            # Month-end index labels are moved to the start of their month for the x axis
            x=list(monthly.index.to_period('M').to_timestamp()),
            y=monthly,
            name=aspect_filter.title(),
            offsetgroup=offsetgroup,
            marker_color=aspect_colors[aspect_filter],
            hovertemplate=f'{aspect_filter} {sentiment_label}: %{{y}}<extra></extra>',
            showlegend=showlegend,
            yaxis='y1',
        ))


# Create traces for each category (positive and negative stacked bars)
fig = go.Figure()

# Positive bars first, then negative bars. Only the negative traces get legend entries,
# so every aspect is listed exactly once in the legend.
_add_sentiment_bars(fig, topic_aspect_data_pos, 'Positive', offsetgroup=1, showlegend=False)
_add_sentiment_bars(fig, topic_aspect_data_neg, 'Negative', offsetgroup=2, showlegend=True)

# Update layout for visual styling
fig.update_layout(
    barmode='relative',
    title=f"Topic - {selected_topic_info['Representation'].values[0]}",
    xaxis=dict(title='Time'),
    yaxis=dict(title='Number of Neg&Pos Articles'),
    bargap=0.2,
    height=500,
    width=1500,
    legend=dict(
        bgcolor='rgba(255, 255, 255, 0.8)',  # Optional: set a background color for better visibility
        bordercolor='black',  # Optional: set border color
        borderwidth=1,  # Optional: set border width
        font=dict(size=textfont_size),
    ),
)

# Show the plot
fig.show()

### Investigate the two months highlighted in Fig. 4 in the Paper

In [None]:
# For the example in the paper: Look at the summaries released in December 2023
# (the 'sentiment' column shows which of them are positive).
# Half-open bounds select exactly the calendar month, also when the dates carry a time of day;
# the previous bounds included 2023-11-30 and cut off at 2023-12-31 00:00.
filter_topic_data[(filter_topic_data['date'] >= '2023-12-01') & (filter_topic_data['date'] < '2024-01-01')][['date', 'sentiment', 'summary_en']].values

In [None]:
# Same inspection for February 2024.
# 2024 is a leap year, so the month runs through 2024-02-29; half-open bounds cover all of it
# (the previous bounds started on 2024-01-31 and stopped at 2024-02-28 00:00).
filter_topic_data[(filter_topic_data['date'] >= '2024-02-01') & (filter_topic_data['date'] < '2024-03-01')][['date', 'sentiment', 'summary_en']].values