In [21]:
from re import sub
import numpy as np
import pandas as pd
import spacy
import plotly.express as px


In [75]:
DB = pd.read_csv('dataset.csv')

data = DB[DB.text.str.len()>30]
data = data[['text', 'name','year']]


In [76]:
tots = pd.read_csv('n_of_sentences.csv')


In [77]:
nlp = spacy.load('en_core_web_md', exclude=["tagger", "parser", "senter", "attribute_ruler", "lemmatizer", "ner"])


In [78]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

In [79]:
import umap 
import hdbscan

In [80]:
import torch

In [81]:
from sentence_transformers import SentenceTransformer

In [82]:


embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=torch.device('mps'))

In [83]:

# Convert training set to list of documents


umap_model = umap.UMAP(n_neighbors=50, n_components=15, min_dist=0.075)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=100, min_samples=40, metric='euclidean',
                        prediction_data=True, gen_min_span_tree=True)
# Train the BERTopic model
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english")

# Train the BERTopic model
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    nr_topics=15,
    min_topic_size=8,
    calculate_probabilities=True,
    verbose = True
)

topics, probs = topic_model.fit_transform(data.text)

Batches:   0%|          | 0/1987 [00:00<?, ?it/s]

2023-04-23 15:23:52,474 - BERTopic - Transformed documents to Embeddings
2023-04-23 15:24:56,411 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Avoid using `tokenizers` before the fork if possible
	- Avoid using `tokenizers` before the fork if possible
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environm

2023-04-23 15:25:13,017 - BERTopic - Clustered reduced embeddings
2023-04-23 15:25:37,563 - BERTopic - Reduced number of topics from 47 to 15


In [84]:
topic_model.generate_topic_labels()

['-1_group_management_business',
 '0_energy_emissions_climate',
 '1_sustainability_risk_rights',
 '2_group_women_training',
 '3_tax_million_income',
 '4_board_suppliers_directors',
 '5_transport_cars_railway',
 '6_cables_cable_general cable',
 '7_customer_customers_satisfaction',
 '8_nt_ss_rs',
 '9_innovation_patents_patent',
 '10_biodiversity_species_areas',
 '11_green_bond_green bond',
 '12_circular_circular economy_economy',
 '13_digital_transformation_digital transformation']

In [None]:
topic_model.set_topic_labels(topic_model.generate_topic_labels())

In [85]:
data['labels'] = topic_model.topics_


data['prob'] = [max(i) for i in probs]




In [86]:
data

Unnamed: 0,text,name,year,labels,prob
0,beyond continuity renewal innovation figures o...,ferrari,2021,5,0.084221
1,everything revealed world demonstrates leaders...,ferrari,2021,-1,0.170547
2,broadest across main financial indicators inno...,ferrari,2021,5,0.310011
3,efforts rewarded buy also join vibrant equal s...,ferrari,2021,-1,0.054269
4,marked also refined anniversary special event ...,ferrari,2021,-1,0.068080
...,...,...,...,...,...
63556,analysing assessing identification criteria re...,trenitalia,2021,-1,0.204558
63557,comparing financial disclosures presented sust...,trenitalia,2021,-1,0.261796
63558,understanding processes underlying generation ...,trenitalia,2021,-1,0.114364
63559,also performed selected procedures gather info...,trenitalia,2021,-1,0.070033


In [87]:
data = data[data.labels!=8]

data = data[data.labels!=-1]
data= data[data.prob>0.6]

In [88]:
data

Unnamed: 0,text,name,year,labels,prob
8,track best ever season february racing winning...,ferrari,2021,5,1.00000
9,five podium places third constructor standings...,ferrari,2021,5,1.00000
30,result welfare systems activities institute ac...,ferrari,2021,2,1.00000
46,championship racing presented new model divide...,ferrari,2021,5,1.00000
59,passion racing spirit lives emotions transcend...,ferrari,2021,5,0.66204
...,...,...,...,...,...
63527,indeed waste operations tonnes active member t...,trenitalia,2021,0,1.00000
63528,urban mass want promote transform hazardous sp...,trenitalia,2021,0,1.00000
63529,rise waste production volume urban waste waste...,trenitalia,2021,0,1.00000
63530,special waste due waste demolition rail groups...,trenitalia,2021,0,1.00000


In [89]:
results = []
attr = []
for company in data.name.unique():
    dt = data[data['name'] == company]
    for year in dt.year.unique():
        datii =dt[dt['year'] == year]
        r = []
        for i in data.labels.unique():
            n = tots[(tots.name==company) & (tots.year == year)]['text']
            
            r.append((len(datii[datii['labels'] == i]))/int(n))
        attr.append([company, year])
        results.append(r)
        
DataSet = pd.DataFrame(results,columns=[
 '0_energy_emissions_climate',
 '1_sustainability_risk_rights',
 '2_group_women_training',
 '3_tax_million_income',
 '4_board_suppliers_directors',
 '5_transport_cars_railway',
 '6_cables_cable_general cable',
 '7_customer_customers_satisfaction',
 '9_innovation_patents_patent',
 '10_biodiversity_species_areas',
 '11_green_bond_green bond',
 '12_circular_circular economy_economy',
 '13_digital_transformation_digital transformation'])



DataSet

Unnamed: 0,0_energy_emissions_climate,1_sustainability_risk_rights,2_group_women_training,3_tax_million_income,4_board_suppliers_directors,5_transport_cars_railway,6_cables_cable_general cable,7_customer_customers_satisfaction,9_innovation_patents_patent,10_biodiversity_species_areas,11_green_bond_green bond,12_circular_circular economy_economy,13_digital_transformation_digital transformation
0,0.033288,0.006114,0.012228,0.002038,0.002038,0.002717,0.000679,0.005435,0.002038,0.0,0.0,0.0,0.0
1,0.039764,0.011782,0.016937,0.002946,0.001473,0.002209,0.0,0.007364,0.000736,0.0,0.0,0.0,0.0
2,0.049063,0.006244,0.009813,0.003568,0.00446,0.003568,0.0,0.00446,0.001784,0.0,0.000892,0.0,0.0
3,0.039568,0.010791,0.008094,0.002698,0.003597,0.000899,0.0,0.008993,0.0,0.0,0.0,0.000899,0.0
4,0.052941,0.011765,0.016667,0.002941,0.003922,0.003922,0.0,0.008824,0.00098,0.0,0.0,0.0,0.00098
5,0.001604,0.014435,0.01684,0.0,0.007217,0.002406,0.0,0.008821,0.010425,0.000802,0.0,0.0,0.0
6,0.001647,0.011526,0.020856,0.0,0.007135,0.006037,0.0,0.008782,0.005488,0.000549,0.001098,0.0,0.0
7,0.00189,0.013233,0.018904,0.0,0.005671,0.003781,0.0,0.010397,0.007561,0.0,0.0,0.0,0.0
8,0.0,0.010782,0.019766,0.0,0.006289,0.013477,0.000898,0.005391,0.012579,0.0,0.0,0.0,0.0
9,0.00203,0.008122,0.017259,0.0,0.010152,0.018274,0.001015,0.004061,0.013198,0.0,0.001015,0.0,0.0


In [90]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(DataSet)
scaled = scaler.fit_transform(DataSet)
DataSet = pd.DataFrame(scaled, columns=[
 '0_energy_emissions_climate',
 '1_sustainability_risk_rights',
 '2_group_women_training',
 '3_tax_million_income',
 '4_board_suppliers_directors',
 '5_transport_cars_railway',
 '6_cables_cable_general cable',
 '7_customer_customers_satisfaction',
 '9_innovation_patents_patent',
 '10_biodiversity_species_areas',
 '11_green_bond_green bond',
 '12_circular_circular economy_economy',
 '13_digital_transformation_digital transformation'])

DataSet['ind'] = DataSet.index

names = pd.DataFrame(attr, columns=['name', 'year'])
names['ind'] = names.index
DataSet = DataSet.merge(names, on = 'ind')

DataSet

Unnamed: 0,0_energy_emissions_climate,1_sustainability_risk_rights,2_group_women_training,3_tax_million_income,4_board_suppliers_directors,5_transport_cars_railway,6_cables_cable_general cable,7_customer_customers_satisfaction,9_innovation_patents_patent,10_biodiversity_species_areas,11_green_bond_green bond,12_circular_circular economy_economy,13_digital_transformation_digital transformation,ind,name,year
0,0.628774,0.15077,0.293257,0.255506,0.076602,0.148702,0.137228,0.068005,0.06085,0.0,0.0,0.0,0.0,0,ferrari,2021
1,0.751105,0.308129,0.456988,0.369274,0.055355,0.120889,0.0,0.095615,0.021986,0.0,0.0,0.0,0.0,1,ferrari,2020
2,0.926752,0.154388,0.209257,0.447345,0.167646,0.195262,0.0,0.054057,0.053269,0.0,0.107812,0.0,0.0,2,ferrari,2018
3,0.747402,0.280625,0.149475,0.338224,0.135202,0.049211,0.0,0.118931,0.0,0.0,0.0,0.170763,0.0,3,ferrari,2019
4,1.0,0.307648,0.447599,0.368731,0.147397,0.214597,0.0,0.116509,0.029272,0.0,0.0,0.0,0.030094,4,ferrari,2017
5,0.030295,0.381773,0.453641,0.0,0.271272,0.131649,0.0,0.116475,0.311261,0.182237,0.0,0.0,0.0,5,unipolsai,2020
6,0.031101,0.301015,0.593286,0.0,0.268178,0.330376,0.0,0.115908,0.16387,0.124726,0.132664,0.0,0.0,6,unipolsai,2021
7,0.035707,0.348398,0.525386,0.0,0.213154,0.206889,0.0,0.139029,0.225763,0.0,0.0,0.0,0.0,7,unipolsai,2019
8,0.0,0.280356,0.555389,0.0,0.236391,0.737496,0.181491,0.067376,0.375562,0.0,0.0,0.0,0.0,8,unipolsai,2018
9,0.038353,0.20651,0.468192,0.0,0.381586,1.0,0.205076,0.048341,0.394054,0.0,0.122698,0.0,0.0,9,unipolsai,2017


In [92]:
DataSet.to_csv('finalfr.csv')

In [93]:
import plotly.io as io
io.renderers.default = "browser"


In [94]:
umap_model1 = umap.UMAP(n_neighbors=15, n_components=3, min_dist=0.05)

In [100]:
df = pd.read_csv('finalfr.csv')
df = df.drop('Unnamed: 0', axis = 1)
nameyear = df[['ind', 'name', 'year']]
nameyear
df = df.drop(['ind', 'name', 'year'], axis = 1)


In [101]:
df = pd.DataFrame(umap_model1.fit_transform(df), columns = ['x', 'y', 'z'])
df['ind'] = df.index
df= df.merge(nameyear, on='ind').drop('ind', axis = 1)



In [98]:
df.to_csv('3dscatter.csv')

In [97]:

fig = px.scatter_3d(
    df, x='x', y='y', z='z',
    color=df.name, hover_data = 'year'
)
fig.update_traces(marker_size=8)
fig.show()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
