In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

import plotly.graph_objects as go
from company_info import company_info_list

from helper_functions import filter_data

# Font size for plot text; used further down, e.g. for the legend font of the Plotly figure
textfont_size = 20

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# LOAD DATA
# The input file is read in JSON-lines mode (one record per line).
data = pd.read_json('data/full_data/nano_esg.json', lines=True)

# Date bounds kept as strings (not used in the cells immediately below;
# presumably consumed by later filtering - TODO confirm).
start_date, end_date = '2023-01-01', '2024-09-16'

# ESG aspects and the color each one gets in plots.
aspect_filters = ['environmental', 'social', 'governance']
aspect_colors = {
    'environmental': 'forestgreen',
    'social': 'cornflowerblue',
    'governance': 'darkmagenta',
}

# Numeric encoding of the sentiment label; an unknown label raises a KeyError.
sent_dict = {
    'positive': 1,
    'negative': -1,
    'neutral': 0,
}
data['sentiment_int'] = data['sentiment'].map(sent_dict.__getitem__)

# All companies that occur in the data, in order of first appearance.
companies = list(data['company'].unique())

In [4]:
# Embedding model handed to BERTopic as `embedding_model` in the topic-creation cell.
# NOTE(review): trust_remote_code=True allows code shipped with the model repository to run
# locally - confirm the repository is trusted before running this.
sentence_model = SentenceTransformer(
    'jinaai/jina-embeddings-v2-base-de',
    trust_remote_code=True,
)

In [None]:
# This cell creates topics based on the german summaries + the german keywords returned by the LLM
# In order to get english topic representations, it is possible to use the 'summary_en' field instead of 'summary'
# In this case the topics will be created only based on the english summaries, without the LLM-provided keywords, so their quality might be worse
summary_field = 'summary'
# summary_field = 'summary_en'

# Fail early on an unsupported field: otherwise `articles` would be undefined (or stale from a
# previous iteration) when it is used further down.
if summary_field not in ('summary', 'summary_en'):
    raise ValueError(f"Unsupported summary_field {summary_field!r}; expected 'summary' or 'summary_en'")

# Overwrite companies - processing all of them can take a bit of time
companies = ['vw'] #'bayer', 'bmw', 'siemens'
topic_aspect_filters = ['all']

# Result layout: company_topics[company][aspect] -> dict with the keys
# 'topics', 'probs', 'timestamps', 'articles' and 'topic_info'.
company_topics = {}
for company in companies:
    print(company)
    company_data = data[data['company'] == company]

    # Lower-case the company-specific keyword filter once instead of once per keyword.
    blocked_keywords = {keyword.lower() for keyword in company_info_list[company]['keyword_filter']}

    aspect_topics = {}
    for aspect in topic_aspect_filters:
        if aspect == 'all':
            aspect_data = company_data
        else:
            aspect_data = company_data[company_data['aspect'] == aspect]

        # Work on an explicit copy: assigning a column on a filtered slice of `data`
        # triggers pandas' SettingWithCopyWarning and has ambiguous semantics.
        aspect_data = aspect_data.copy()

        #remove company name from keywords
        aspect_data['keywords'] = aspect_data['keywords'].apply(
            lambda keywords: [keyword for keyword in keywords if keyword.lower() not in blocked_keywords]
        )

        timestamps = aspect_data['date'].to_list()
        # Note that we only have the keywords returned by the LLM in german
        if summary_field == 'summary':
            # Document text = summary followed by the comma-separated (filtered) keywords
            articles = [
                summary + ' - ' + ', '.join(keywords)
                for summary, keywords in zip(aspect_data[summary_field], aspect_data['keywords'])
            ]
        else:
            articles = aspect_data[summary_field].to_list()

        if not articles:
            continue

        # Create a BERTopic model
        topic_model = BERTopic(embedding_model=sentence_model, verbose=True)
        try:
            topics, probs = topic_model.fit_transform(articles)
        except Exception as e:
            # Best effort: report the failure and carry on with the next aspect/company.
            # Note that the failed aspect is then missing from company_topics[company].
            print(f'Error for {company} and {aspect}: {e}')
            continue
        topic_info = topic_model.get_topic_info()
        topic_dict = {'topics': topics, 'probs': probs, 'timestamps': timestamps, 'articles': articles, 'topic_info': topic_info}
        aspect_topics[aspect] = topic_dict
    company_topics[company] = aspect_topics

vw


2024-11-10 19:49:53,702 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 227/227 [13:22<00:00,  3.54s/it]
2024-11-10 20:03:16,668 - BERTopic - Embedding - Completed ✓
2024-11-10 20:03:16,670 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-11-10 20:03:52,924 - BERTopic - Dimensionality - Completed ✓
2024-11-10 20:03:52,926 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-11-10 20:03:53,181 - BERTopic - Cluster - Completed ✓
2024-11-10 20:03:53,185 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-11-10 20:03:53,609 - BERTopic - Representation - Completed ✓
20it [00:04,  4.59it/s]


In [None]:
# Determine Mean Relevance, Mean Sentiment and more for each topic
#
# For every company this adds the columns Mean_Relevance, Mean_Sentiment, Aspects, Mean_Date and
# Recent_Articles to the company's topic_info frame and stores the company's articles (with their
# assigned topic number in the 'topics' column) in company_articles[company].

a = 'all'
# Articles published on or after this date count as "recent"
recent_cutoff = '2024-08-01'
# Column of the article frame that holds the topic number of each article
topic_basis = 'topics'

company_articles = {}
for c in companies:
    if a not in company_topics.get(c, {}):
        # The topic-creation cell skips a company/aspect when there are no articles or fitting fails
        print(f'No topics available for {c} and {a} - skipping')
        continue
    topic_subset = company_topics[c][a]

    # Explicit copy: adding a column to a filtered slice of `data` triggers SettingWithCopyWarning
    c_data = data[data['company'] == c].copy()
    if len(topic_subset['topics']) != len(c_data):
        # The topic list is attached positionally, so it must cover exactly the company's articles
        raise ValueError(
            f"Got {len(topic_subset['topics'])} topic assignments for {len(c_data)} articles of {c}; "
            f"topics for aspect {a!r} cannot be aligned with the company data"
        )
    c_data['topics'] = topic_subset['topics']

    info = topic_subset['topic_info']

    topic_rel_score = {}
    topic_mean_sent = {}
    topic_aspects = {}
    topic_dates = {}
    num_articles_last_months = {}
    for topic_number in info['Topic']:
        # Select the articles of this topic once and reuse the selection for every statistic
        topic_rows = c_data[c_data[topic_basis] == topic_number]
        topic_rel_score[topic_number] = topic_rows['relevance_score'].mean()
        topic_mean_sent[topic_number] = topic_rows['sentiment_int'].mean()
        # Share of each aspect within the topic; keys are 1-tuples such as ('social',)
        topic_aspects[topic_number] = topic_rows[['aspect']].value_counts(normalize=True).to_dict()
        # Mean publication date, computed on the integer representation of the timestamps
        topic_dates[topic_number] = pd.Timestamp(topic_rows['date'].astype('int64').mean())
        # The following determines the number of recently published articles for each topic
        num_articles_last_months[topic_number] = topic_rows[topic_rows['date'] >= recent_cutoff]['volume'].count()

    # Attach the per-topic statistics to topic_info by looking each topic number up
    derived_columns = {
        'Mean_Relevance': topic_rel_score,
        'Mean_Sentiment': topic_mean_sent,
        'Aspects': topic_aspects,
        'Mean_Date': topic_dates,
        'Recent_Articles': num_articles_last_months,
    }
    for column_name, per_topic in derived_columns.items():
        info[column_name] = info['Topic'].map(per_topic.__getitem__)

    company_topics[c][a] = topic_subset
    company_articles[c] = c_data

In [10]:
# Company and aspect whose topics are inspected in the cells below
c, a = 'vw', 'all'
topic_subset = company_topics[c][a]

In [11]:
# Show the 20 most relevant topics (highest Mean_Relevance first)
(
    topic_subset['topic_info']
    .sort_values('Mean_Relevance', ascending=False)
    .head(20)
)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,Mean_Relevance,Mean_Sentiment,Aspects,Mean_Date,Recent_Articles
5,4,97,4_xinjiang_zwangsarbeit_uiguren_menschenrechts...,"[xinjiang, zwangsarbeit, uiguren, menschenrech...",[Der Artikel behandelt Chinas Pläne für eine F...,8.340206,-0.659794,"{('social',): 0.9072164948453608, ('governance...",2023-10-26 14:55:30.453607936,0
87,86,22,86_stadler_rupert_bewährungsstrafe_hatz,"[stadler, rupert, bewährungsstrafe, hatz, audi...",[Der frühere Audi-Chef Rupert Stadler gestand ...,8.227273,-1.0,"{('governance',): 0.9545454545454546, ('social...",2023-06-20 21:23:45.500000000,0
7,6,91,6_thermofenster_abgasreinigung_eugh_abschaltei...,"[thermofenster, abgasreinigung, eugh, abschalt...","[Der EuGH entschied am 21. März 2023, dass VW ...",8.208791,-0.912088,"{('governance',): 0.5934065934065934, ('enviro...",2023-07-15 18:03:23.351648768,2
28,27,46,27_winterkorn_martin_braunschweig_dieselaffäre,"[winterkorn, martin, braunschweig, dieselaffär...",[Der frühere VW-Chef Martin Winterkorn musste ...,8.173913,-0.934783,"{('governance',): 1.0}",2023-10-04 07:06:54.239130368,3
56,55,32,55_milliarden_180_investitionen_forschung,"[milliarden, 180, investitionen, forschung, eu...","[Volkswagen plant, seine Investitionen in den ...",8.0625,0.6875,"{('environmental',): 0.8125, ('governance',): ...",2023-08-27 15:55:38.031249920,2
109,108,16,108_co2_based_kohle_scope,"[co2, based, kohle, scope, klimaneutral, emiss...",[Volkswagen ist mit 493 Millionen Tonnen CO2-E...,8.0,0.5,"{('environmental',): 0.9375, ('social',): 0.0625}",2023-08-02 17:00:32.125000192,0
52,51,35,51_abgasnorm_euro_luftqualität_feinstaub,"[abgasnorm, euro, luftqualität, feinstaub, lem...",[VW kritisiert die EU-Pläne zur Euro 7-Abgasno...,8.0,-0.857143,"{('environmental',): 0.9142857142857143, ('soc...",2023-05-13 09:21:55.400000000,0
153,152,11,152_umweltbonus_förderung_bundeswirtschaftsmin...,"[umweltbonus, förderung, bundeswirtschaftsmini...",[Die deutsche Regierung streicht ab dem 18. De...,8.0,-0.363636,"{('environmental',): 1.0}",2023-12-06 05:42:20.272727552,1
138,137,13,137_continental_staatsanwaltschaft_ermittlunge...,"[continental, staatsanwaltschaft, ermittlungen...",[Die Staatsanwaltschaft Hannover ermittelt sei...,8.0,-1.0,"{('governance',): 1.0}",2023-11-13 04:24:10.615384832,0
58,57,32,57_recycling_redwood_salzgitter_lithium,"[recycling, redwood, salzgitter, lithium, batt...",[Volkswagen hat in Salzgitter eine Recyclingan...,7.96875,0.90625,"{('environmental',): 1.0}",2023-08-13 11:12:16.968750080,1


### Fig. 4: Positive & negative articles per month for the topic related to forced labor in China's Xinjiang province

In [46]:
# Select the most relevant topic containing the keyword 'xinjiang'
# Topic determination has random elements, so the resulting graph might differ slightly from the version in the paper
sorted_topics = topic_subset['topic_info'].sort_values('Mean_Relevance', ascending=False)
matching_topics = sorted_topics[sorted_topics['Name'].str.contains('xinjiang', case=False)]['Topic']
if matching_topics.empty:
    # Because of the randomness in topic creation there may be no such topic in a given run
    raise IndexError("No topic name contains 'xinjiang'; pick a topic manually via topic_num")
topic_num = matching_topics.values[0]

# Alternatively, it is possible to manually select a topic by its number after browsing the topic info above
# topic_num = 6

# Row of topic_info describing the selected topic (looked up once, reused for the plot title)
selected_topic = topic_subset['topic_info'][topic_subset['topic_info']['Topic'] == topic_num]
print(selected_topic['Name'])

# Articles assigned to the selected topic; the 'topics' column was attached when the
# per-topic statistics were computed
topic_data = company_articles[c][company_articles[c]['topics'] == topic_num]
print(topic_data['aspect'].value_counts(normalize=True))

# NOTE(review): filter_data comes from helper_functions; None is passed for both remaining
# arguments - presumably this disables filtering, confirm against the helper.
filter_topic_data = filter_data(topic_data, None, None)


def _monthly_sentiment_sum(frame, aspect, sentiment):
    """Return the monthly sum of 'sentiment_int' for one aspect and one sentiment label.

    NOTE(review): the 'M' resample alias is deprecated in recent pandas releases in favour of
    'ME'; it is kept here so the cell keeps working on older pandas versions.
    """
    selection = frame[(frame['aspect'] == aspect) & (frame['sentiment'] == sentiment)]
    return selection.resample('M', on='date')['sentiment_int'].sum()


topic_aspect_data_pos = {aspect: _monthly_sentiment_sum(filter_topic_data, aspect, 'positive') for aspect in aspect_filters}
topic_aspect_data_neg = {aspect: _monthly_sentiment_sum(filter_topic_data, aspect, 'negative') for aspect in aspect_filters}
# Neutral counts and monthly relevance are not drawn below; they are kept available for inspection
topic_aspect_data_neut = {aspect: _monthly_sentiment_sum(filter_topic_data, aspect, 'neutral') for aspect in aspect_filters}

relevance_data = filter_topic_data.resample('M', on='date')['relevance_score'].mean()

######################## FIGURE ########################

# Create traces for each category (positive and negative stacked bars)
fig = go.Figure()

# (hover label, monthly sums per aspect, offsetgroup, showlegend)
# All positive traces are added first, then all negative ones; only the negative traces get a
# legend entry so that each aspect is listed once.
trace_groups = [
    ('Positive', topic_aspect_data_pos, 1, False),
    ('Negative', topic_aspect_data_neg, 2, True),
]
for sentiment_label, monthly_by_aspect, offset_group, show_legend in trace_groups:
    for aspect_filter in aspect_filters:
        monthly_sums = monthly_by_aspect[aspect_filter]
        fig.add_trace(go.Bar(
            # Month-end bin labels are moved to the first day of the month for the x axis
            x=list(monthly_sums.index.to_period('M').to_timestamp()),
            y=monthly_sums,
            name=aspect_filter.title(),
            offsetgroup=offset_group,
            marker_color=aspect_colors[aspect_filter],
            hovertemplate=f'{aspect_filter} {sentiment_label}: %{{y}}<extra></extra>',
            showlegend=show_legend,
            yaxis='y1',
        ))

# Update layout for visual styling
fig.update_layout(
    barmode='relative',
    title=f"Topic - {selected_topic['Representation'].values[0]}",
    xaxis=dict(title='Time'),
    yaxis=dict(title='Number of Neg&Pos Articles'),
    bargap=0.2,
    height=500,
    width=1500,
    legend=dict(
        bgcolor='rgba(255, 255, 255, 0.8)',  # Optional: set a background color for better visibility
        bordercolor='black',  # Optional: set border color
        borderwidth=1,  # Optional: set border width
        font = dict(size = textfont_size),
    ),
)

# Show the plot
fig.show()

5    4_xinjiang_zwangsarbeit_uiguren_menschenrechts...
Name: Name, dtype: object
aspect
social           0.907216
governance       0.082474
environmental    0.010309
Name: proportion, dtype: float64


### Investigate the two months highlighted in Fig. 4 in the Paper

In [None]:
# For the example in the paper: look at the summaries released in December 2023
# (no sentiment filter is applied, so every sentiment label of that month is listed).
# The window is half-open, [2023-12-01, 2024-01-01), so it covers exactly the calendar month:
# no article from 30 November is included and none published during 31 December is dropped.
filter_topic_data[(filter_topic_data['date'] >= '2023-12-01') & (filter_topic_data['date'] < '2024-01-01')][['date', 'sentiment', 'summary_en']].values

array([[Timestamp('2023-12-05 17:13:47'), 'positive',
        'Volkswagen commissioned an audit of its Xinjiang site, which found no signs of forced labor. Conducted by Loening Human Rights & Responsible Business GmbH, it included on-site interviews. Employees are well-paid; the audit was prompted by investor demands.'],
       [Timestamp('2023-12-05 17:18:56'), 'positive',
        "An independent investigation at VW's Urumqi plant found no evidence of forced labor or human rights violations. The review included 197 employees, including 50 Uyghurs. VW emphasizes that the workers are well-qualified and above-average paid."],
       [Timestamp('2023-12-06 06:45:55'), 'positive',
        "Volkswagen's auditors found no evidence of forced labor at the factory in Xinjiang. Markus Löning from Löning Human Rights & Responsible Business confirmed that there are no indications of forced labor among the employees."],
       [Timestamp('2023-12-06 09:52:55'), 'positive',
       [Timestamp('2023-1

In [19]:
# Summaries released in February 2024. The window is half-open, [2024-02-01, 2024-03-01), so it
# covers the whole calendar month, including 29 February (2024 is a leap year) and excluding 31 January.
filter_topic_data[(filter_topic_data['date'] >= '2024-02-01') & (filter_topic_data['date'] < '2024-03-01')][['date', 'sentiment', 'summary_en']].values

array([[Timestamp('2024-02-01 05:13:48'), 'negative',
        "HRW urges Volkswagen to ensure that materials from Xinjiang, potentially linked to forced labor, do not enter its supply chain. VW's audit found no signs of forced labor but was controversial. HRW recommends better mapping of supply chains."],
       [Timestamp('2024-02-02 12:09:54'), 'negative',
        'The article discusses new allegations against Volkswagen in China, which is attempting to clear its name with a controversial report for the capital market.'],
       [Timestamp('2024-02-08 14:46:30'), 'negative',
        "The article discusses Human Rights Watch's allegations against VW regarding forced labor in aluminum production in Xinjiang. VW asserts no forced labor occurs in Urumqi but faces challenges in supply chain verification and mandates high sustainability standards for suppliers."],
       [Timestamp('2024-02-09 14:24:16'), 'negative',
        "The article discusses BASF's sale of stakes in joint ventures in